Diffstat (limited to 'fs')
237 files changed, 5817 insertions, 4893 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 55e108e5e133..1c8dc696d516 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -49,22 +49,20 @@ int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses, void v9fs_cache_inode_get_cookie(struct inode *inode) { - struct v9fs_inode *v9inode; + struct v9fs_inode *v9inode = V9FS_I(inode); struct v9fs_session_info *v9ses; __le32 version; __le64 path; if (!S_ISREG(inode->i_mode)) return; - - v9inode = V9FS_I(inode); - if (WARN_ON(v9inode->fscache)) + if (WARN_ON(v9fs_inode_cookie(v9inode))) return; version = cpu_to_le32(v9inode->qid.version); path = cpu_to_le64(v9inode->qid.path); v9ses = v9fs_inode2v9ses(inode); - v9inode->fscache = + v9inode->netfs_ctx.cache = fscache_acquire_cookie(v9fs_session_cache(v9ses), 0, &path, sizeof(path), @@ -72,5 +70,5 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) i_size_read(&v9inode->vfs_inode)); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", - inode, v9inode->fscache); + inode, v9fs_inode_cookie(v9inode)); } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 08f65c40af4f..e28ddf763b3b 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -623,9 +623,7 @@ static void v9fs_sysfs_cleanup(void) static void v9fs_inode_init_once(void *foo) { struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; -#ifdef CONFIG_9P_FSCACHE - v9inode->fscache = NULL; -#endif + memset(&v9inode->qid, 0, sizeof(v9inode->qid)); inode_init_once(&v9inode->vfs_inode); } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index bc8b30205d36..ec0e8df3b2eb 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -9,6 +9,7 @@ #define FS_9P_V9FS_H #include <linux/backing-dev.h> +#include <linux/netfs.h> /** * enum p9_session_flags - option flags for each 9P session @@ -108,14 +109,15 @@ struct v9fs_session_info { #define V9FS_INO_INVALID_ATTR 0x01 struct v9fs_inode { -#ifdef CONFIG_9P_FSCACHE - struct fscache_cookie *fscache; -#endif + struct { + /* These must be contiguous */ + struct inode vfs_inode; /* the VFS's inode record */ + struct netfs_i_context netfs_ctx; /* Netfslib context */ + }; struct p9_qid qid; unsigned int cache_validity; struct p9_fid *writeback_fid; struct mutex v_mutex; - struct inode vfs_inode; }; static inline struct v9fs_inode *V9FS_I(const struct inode *inode) @@ -126,7 +128,7 @@ static inline struct v9fs_inode *V9FS_I(const struct inode *inode) static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode) { #ifdef CONFIG_9P_FSCACHE - return v9inode->fscache; + return netfs_i_cookie(&v9inode->vfs_inode); #else return NULL; #endif @@ -163,6 +165,7 @@ extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; +extern const struct netfs_request_ops v9fs_req_ops; extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 76956c9d2af9..501128188343 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -28,12 +28,12 @@ #include "fid.h" /** - * v9fs_req_issue_op - Issue a read from 9P + * v9fs_issue_read - Issue a read from 9P * @subreq: The read to make */ -static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq) +static void v9fs_issue_read(struct netfs_io_subrequest *subreq) { - struct netfs_read_request *rreq = subreq->rreq; + struct netfs_io_request 
*rreq = subreq->rreq; struct p9_fid *fid = rreq->netfs_priv; struct iov_iter to; loff_t pos = subreq->start + subreq->transferred; @@ -52,20 +52,21 @@ static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq) } /** - * v9fs_init_rreq - Initialise a read request + * v9fs_init_request - Initialise a read request * @rreq: The read request * @file: The file being read from */ -static void v9fs_init_rreq(struct netfs_read_request *rreq, struct file *file) +static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { struct p9_fid *fid = file->private_data; refcount_inc(&fid->count); rreq->netfs_priv = fid; + return 0; } /** - * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_rreq + * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request * @mapping: unused mapping of request to cleanup * @priv: private data to cleanup, a fid, guaranted non-null. */ @@ -77,21 +78,10 @@ static void v9fs_req_cleanup(struct address_space *mapping, void *priv) } /** - * v9fs_is_cache_enabled - Determine if caching is enabled for an inode - * @inode: The inode to check - */ -static bool v9fs_is_cache_enabled(struct inode *inode) -{ - struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(inode)); - - return fscache_cookie_enabled(cookie) && cookie->cache_priv; -} - -/** * v9fs_begin_cache_operation - Begin a cache operation for a read * @rreq: The read request */ -static int v9fs_begin_cache_operation(struct netfs_read_request *rreq) +static int v9fs_begin_cache_operation(struct netfs_io_request *rreq) { #ifdef CONFIG_9P_FSCACHE struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode)); @@ -102,37 +92,14 @@ static int v9fs_begin_cache_operation(struct netfs_read_request *rreq) #endif } -static const struct netfs_read_request_ops v9fs_req_ops = { - .init_rreq = v9fs_init_rreq, - .is_cache_enabled = v9fs_is_cache_enabled, +const struct netfs_request_ops v9fs_req_ops = { + .init_request = v9fs_init_request, .begin_cache_operation = v9fs_begin_cache_operation, - .issue_op = v9fs_req_issue_op, + .issue_read = v9fs_issue_read, .cleanup = v9fs_req_cleanup, }; /** - * v9fs_vfs_readpage - read an entire page in from 9P - * @file: file being read - * @page: structure to page - * - */ -static int v9fs_vfs_readpage(struct file *file, struct page *page) -{ - struct folio *folio = page_folio(page); - - return netfs_readpage(file, folio, &v9fs_req_ops, NULL); -} - -/** - * v9fs_vfs_readahead - read a set of pages from 9P - * @ractl: The readahead parameters - */ -static void v9fs_vfs_readahead(struct readahead_control *ractl) -{ - netfs_readahead(ractl, &v9fs_req_ops, NULL); -} - -/** * v9fs_release_page - release the private state associated with a page * @page: The page to be released * @gfp: The caller's allocation restrictions @@ -308,8 +275,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. 
*/ - retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata, - &v9fs_req_ops, NULL); + retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata); if (retval < 0) return retval; @@ -370,8 +336,8 @@ static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio) #endif const struct address_space_operations v9fs_addr_operations = { - .readpage = v9fs_vfs_readpage, - .readahead = v9fs_vfs_readahead, + .readpage = netfs_readpage, + .readahead = netfs_readahead, .dirty_folio = v9fs_dirty_folio, .writepage = v9fs_vfs_writepage, .write_begin = v9fs_write_begin, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 84c3cf7dffa5..55367ecb9442 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -231,9 +231,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL); if (!v9inode) return NULL; -#ifdef CONFIG_9P_FSCACHE - v9inode->fscache = NULL; -#endif v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; mutex_init(&v9inode->v_mutex); @@ -250,6 +247,14 @@ void v9fs_free_inode(struct inode *inode) kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); } +/* + * Set parameters for the netfs library + */ +static void v9fs_set_netfs_context(struct inode *inode) +{ + netfs_i_context_init(inode, &v9fs_req_ops); +} + int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t rdev) { @@ -338,6 +343,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, err = -EINVAL; goto error; } + + v9fs_set_netfs_context(inode); error: return err; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index db832cc931c8..f120bcb8bf73 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,6 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); + netfs_i_context_init(inode, NULL); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { diff --git a/fs/afs/file.c b/fs/afs/file.c index 0f9fdb284a20..26292a110a8f 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,13 +19,11 @@ #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); -static int afs_readpage(struct file *file, struct page *page); static int afs_symlink_readpage(struct file *file, struct page *page); static void afs_invalidate_folio(struct folio *folio, size_t offset, size_t length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); -static void afs_readahead(struct readahead_control *ractl); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static void afs_vm_open(struct vm_area_struct *area); static void afs_vm_close(struct vm_area_struct *area); @@ -52,8 +50,8 @@ const struct inode_operations afs_file_inode_operations = { }; const struct address_space_operations afs_file_aops = { - .readpage = afs_readpage, - .readahead = afs_readahead, + .readpage = netfs_readpage, + .readahead = netfs_readahead, .dirty_folio = afs_dirty_folio, .launder_folio = afs_launder_folio, .releasepage = afs_releasepage, @@ -240,7 +238,7 @@ void afs_put_read(struct afs_read *req) static void afs_fetch_data_notify(struct afs_operation *op) { struct afs_read *req = op->fetch.req; - struct netfs_read_subrequest *subreq = req->subreq; + struct netfs_io_subrequest *subreq = req->subreq; int error = op->error; if (error == -ECONNABORTED) @@ -310,7 +308,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) return 
afs_do_sync_operation(op); } -static void afs_req_issue_op(struct netfs_read_subrequest *subreq) +static void afs_issue_read(struct netfs_io_subrequest *subreq) { struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); struct afs_read *fsreq; @@ -359,19 +357,13 @@ static int afs_symlink_readpage(struct file *file, struct page *page) return ret; } -static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file) +static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { rreq->netfs_priv = key_get(afs_file_key(file)); + return 0; } -static bool afs_is_cache_enabled(struct inode *inode) -{ - struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode)); - - return fscache_cookie_enabled(cookie) && cookie->cache_priv; -} - -static int afs_begin_cache_operation(struct netfs_read_request *rreq) +static int afs_begin_cache_operation(struct netfs_io_request *rreq) { #ifdef CONFIG_AFS_FSCACHE struct afs_vnode *vnode = AFS_FS_I(rreq->inode); @@ -396,27 +388,14 @@ static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv) key_put(netfs_priv); } -const struct netfs_read_request_ops afs_req_ops = { - .init_rreq = afs_init_rreq, - .is_cache_enabled = afs_is_cache_enabled, +const struct netfs_request_ops afs_req_ops = { + .init_request = afs_init_request, .begin_cache_operation = afs_begin_cache_operation, .check_write_begin = afs_check_write_begin, - .issue_op = afs_req_issue_op, + .issue_read = afs_issue_read, .cleanup = afs_priv_cleanup, }; -static int afs_readpage(struct file *file, struct page *page) -{ - struct folio *folio = page_folio(page); - - return netfs_readpage(file, folio, &afs_req_ops, NULL); -} - -static void afs_readahead(struct readahead_control *ractl) -{ - netfs_readahead(ractl, &afs_req_ops, NULL); -} - int afs_write_inode(struct inode *inode, struct writeback_control *wbc) { fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode))); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 5964f8aee090..2fe402483ad5 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -54,6 +54,14 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren } /* + * Set parameters for the netfs library + */ +static void afs_set_netfs_context(struct afs_vnode *vnode) +{ + netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops); +} + +/* * Initialise an inode from the vnode status. */ static int afs_inode_init_from_status(struct afs_operation *op, @@ -128,6 +136,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, } afs_set_i_size(vnode, status->size); + afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); @@ -237,6 +246,7 @@ static void afs_apply_status(struct afs_operation *op, * idea of what the size should be that's not the same as * what's on the server. */ + vnode->netfs_ctx.remote_i_size = status->size; if (change_size) { afs_set_i_size(vnode, status->size); inode->i_ctime = t; @@ -420,7 +430,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) struct afs_vnode_cache_aux aux; if (vnode->status.type != AFS_FTYPE_FILE) { - vnode->cache = NULL; + vnode->netfs_ctx.cache = NULL; return; } @@ -430,12 +440,14 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) key.vnode_id_ext[1] = htonl(vnode->fid.vnode_hi); afs_set_cache_aux(vnode, &aux); - vnode->cache = fscache_acquire_cookie( - vnode->volume->cache, - vnode->status.type == AFS_FTYPE_FILE ? 
0 : FSCACHE_ADV_SINGLE_CHUNK, - &key, sizeof(key), - &aux, sizeof(aux), - vnode->status.size); + afs_vnode_set_cache(vnode, + fscache_acquire_cookie( + vnode->volume->cache, + vnode->status.type == AFS_FTYPE_FILE ? + 0 : FSCACHE_ADV_SINGLE_CHUNK, + &key, sizeof(key), + &aux, sizeof(aux), + vnode->status.size)); #endif } @@ -528,6 +540,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) vnode = AFS_FS_I(inode); vnode->cb_v_break = as->volume->cb_v_break, + afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); if (IS_ERR(op)) { @@ -786,11 +799,8 @@ void afs_evict_inode(struct inode *inode) afs_put_wb_key(wbk); } -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(vnode->cache, + fscache_relinquish_cookie(afs_vnode_cache(vnode), test_bit(AFS_VNODE_DELETED, &vnode->flags)); - vnode->cache = NULL; -#endif afs_prune_wb_keys(vnode); afs_put_permits(rcu_access_pointer(vnode->permit_cache)); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index dc5032e10244..7b7ef945dc78 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -207,7 +207,7 @@ struct afs_read { loff_t file_size; /* File size returned by server */ struct key *key; /* The key to use to reissue the read */ struct afs_vnode *vnode; /* The file being read into. */ - struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */ + struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */ afs_dataversion_t data_version; /* Version number returned by server */ refcount_t usage; unsigned int call_debug_id; @@ -619,15 +619,16 @@ enum afs_lock_state { * leak from one inode to another. */ struct afs_vnode { - struct inode vfs_inode; /* the VFS's inode record */ + struct { + /* These must be contiguous */ + struct inode vfs_inode; /* the VFS's inode record */ + struct netfs_i_context netfs_ctx; /* Netfslib context */ + }; struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct mutex io_lock; /* Lock for serialising I/O on this mutex */ struct rw_semaphore validate_lock; /* lock for validating this vnode */ @@ -674,12 +675,20 @@ struct afs_vnode { static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE - return vnode->cache; + return netfs_i_cookie(&vnode->vfs_inode); #else return NULL; #endif } +static inline void afs_vnode_set_cache(struct afs_vnode *vnode, + struct fscache_cookie *cookie) +{ +#ifdef CONFIG_AFS_FSCACHE + vnode->netfs_ctx.cache = cookie; +#endif +} + /* * cached security record for one user's attempt to access a vnode */ @@ -1063,7 +1072,7 @@ extern const struct address_space_operations afs_file_aops; extern const struct address_space_operations afs_symlink_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; -extern const struct netfs_read_request_ops afs_req_ops; +extern const struct netfs_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); diff --git a/fs/afs/super.c b/fs/afs/super.c index 7592c0f469f1..1fea195b0b27 100644 --- 
a/fs/afs/super.c +++ b/fs/afs/super.c @@ -688,13 +688,11 @@ static struct inode *afs_alloc_inode(struct super_block *sb) /* Reset anything that shouldn't leak from one inode to the next. */ memset(&vnode->fid, 0, sizeof(vnode->fid)); memset(&vnode->status, 0, sizeof(vnode->status)); + afs_vnode_set_cache(vnode, NULL); vnode->volume = NULL; vnode->lock_key = NULL; vnode->permit_cache = NULL; -#ifdef CONFIG_AFS_FSCACHE - vnode->cache = NULL; -#endif vnode->flags = 1 << AFS_VNODE_UNSET; vnode->lock_state = AFS_VNODE_LOCK_NONE; diff --git a/fs/afs/write.c b/fs/afs/write.c index e1c17081d18e..6bcf1475511b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -60,8 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata, - &afs_req_ops, NULL); + ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata); if (ret < 0) return ret; @@ -355,9 +354,10 @@ static const struct afs_operation_ops afs_store_data_operation = { static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos, bool laundering) { + struct netfs_i_context *ictx = &vnode->netfs_ctx; struct afs_operation *op; struct afs_wb_key *wbk = NULL; - loff_t size = iov_iter_count(iter), i_size; + loff_t size = iov_iter_count(iter); int ret = -ENOKEY; _enter("%s{%llx:%llu.%u},%llx,%llx", @@ -379,15 +379,13 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t return -ENOMEM; } - i_size = i_size_read(&vnode->vfs_inode); - afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; op->file[0].modification = true; op->store.write_iter = iter; op->store.pos = pos; op->store.size = size; - op->store.i_size = max(pos + size, i_size); + op->store.i_size = max(pos + size, ictx->remote_i_size); op->store.laundering = laundering; op->mtime = vnode->vfs_inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; @@ -1478,7 +1478,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; - req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp)); if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { /* * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then @@ -1553,7 +1552,6 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) return -EINVAL; diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 4188ba3fd8c3..99f9995670ea 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -17,6 +17,7 @@ subdir-ccflags-y += $(condflags) subdir-ccflags-y += -Wno-missing-field-initializers subdir-ccflags-y += -Wno-sign-compare subdir-ccflags-y += -Wno-type-limits +subdir-ccflags-y += -Wno-shift-negative-value obj-$(CONFIG_BTRFS_FS) := btrfs.o diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d78b3a2d04e3..724e8fe06aa0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3329,7 +3329,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->bio_flags = bio_flags; bio->bi_end_io = end_io_func; bio->bi_private = &inode->io_tree; - bio->bi_write_hint = inode->vfs_inode.i_write_hint; bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) diff --git 
a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aa0a60ee26cb..6bfc4343c98d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8296,7 +8296,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * cover the full folio, like invalidating the last folio, we're * still safe to wait for ordered extent to finish. */ - if (!(offset == 0 && length == PAGE_SIZE)) { + if (!(offset == 0 && length == folio_size(folio))) { btrfs_releasepage(&folio->page, GFP_NOFS); return; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 04a88bfe4fcf..998e3f180d90 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -645,7 +645,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, int ret; /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, loff, dst, dst_loff, len); @@ -739,7 +739,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, } /* - * Lock destination range to serialize with concurrent readpages() and + * Lock destination range to serialize with concurrent readahead() and * source range to serialize with relocation. */ btrfs_double_extent_lock(src, off, inode, destoff, len); diff --git a/fs/buffer.c b/fs/buffer.c index ed26cb1d381d..2b5561ae5d0b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -53,7 +53,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - enum rw_hint hint, struct writeback_control *wbc); + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1804,8 +1804,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, - inode->i_write_hint, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); nr_underway++; } bh = next; @@ -1859,8 +1858,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, - inode->i_write_hint, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); nr_underway++; } bh = next; @@ -2354,8 +2352,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) if (err) goto out; - err = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_CONT_EXPAND, &page, &fsdata); + err = pagecache_write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); if (err) goto out; @@ -3004,7 +3001,7 @@ static void end_bio_bh_io_sync(struct bio *bio) } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - enum rw_hint write_hint, struct writeback_control *wbc) + struct writeback_control *wbc) { struct bio *bio; @@ -3030,7 +3027,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); BUG_ON(bio->bi_iter.bi_size != bh->b_size); @@ -3052,7 +3048,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, 0, NULL); + return submit_bh_wbc(op, op_flags, bh, NULL); } 
EXPORT_SYMBOL(submit_bh); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 753986ea1583..9dc81e781f2b 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -138,7 +138,6 @@ static int cachefiles_read(struct netfs_cache_resources *cres, ki->iocb.ki_filp = file; ki->iocb.ki_pos = start_pos + skipped; ki->iocb.ki_flags = IOCB_DIRECT; - ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); ki->iocb.ki_ioprio = get_current_ioprio(); ki->skipped = skipped; ki->object = object; @@ -313,7 +312,6 @@ static int cachefiles_write(struct netfs_cache_resources *cres, ki->iocb.ki_filp = file; ki->iocb.ki_pos = start_pos; ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE; - ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); ki->iocb.ki_ioprio = get_current_ioprio(); ki->object = object; ki->inval_counter = cres->inval_counter; @@ -382,18 +380,18 @@ presubmission_error: * Prepare a read operation, shortening it to a cached/uncached * boundary as appropriate. */ -static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq, +static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq, loff_t i_size) { enum cachefiles_prepare_read_trace why; - struct netfs_read_request *rreq = subreq->rreq; + struct netfs_io_request *rreq = subreq->rreq; struct netfs_cache_resources *cres = &rreq->cache_resources; struct cachefiles_object *object; struct cachefiles_cache *cache; struct fscache_cookie *cookie = fscache_cres_cookie(cres); const struct cred *saved_cred; struct file *file = cachefiles_cres_file(cres); - enum netfs_read_source ret = NETFS_DOWNLOAD_FROM_SERVER; + enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER; loff_t off, to; ino_t ino = file ? file_inode(file)->i_ino : 0; @@ -406,7 +404,7 @@ static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subreque } if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) { - __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); why = cachefiles_trace_read_no_data; goto out_no_object; } @@ -475,7 +473,7 @@ static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subreque goto out; download_and_store: - __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); + __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); out: cachefiles_end_secure(cache, saved_cred); out_no_object: diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f6135c93ce9d..aa25bffd4823 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -182,9 +182,9 @@ static int ceph_releasepage(struct page *page, gfp_t gfp) return 1; } -static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) +static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) { - struct inode *inode = rreq->mapping->host; + struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_layout *lo = &ci->i_layout; u32 blockoff; @@ -199,9 +199,9 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) rreq->len = roundup(rreq->len, lo->stripe_unit); } -static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq) +static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) { - struct inode *inode = subreq->rreq->mapping->host; + struct inode *inode = subreq->rreq->inode; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 objno, objoff; @@ -218,7 +218,7 @@ static void finish_netfs_read(struct ceph_osd_request 
*req) { struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); - struct netfs_read_subrequest *subreq = req->r_priv; + struct netfs_io_subrequest *subreq = req->r_priv; int num_pages; int err = req->r_result; @@ -244,10 +244,63 @@ static void finish_netfs_read(struct ceph_osd_request *req) iput(req->r_inode); } -static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) +static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) { - struct netfs_read_request *rreq = subreq->rreq; - struct inode *inode = rreq->mapping->host; + struct netfs_io_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; + struct ceph_mds_reply_info_parsed *rinfo; + struct ceph_mds_reply_info_in *iinfo; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + struct iov_iter iter; + ssize_t err = 0; + size_t len; + + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + + if (subreq->start >= inode->i_size) + goto out; + + /* We need to fetch the inline data. */ + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_ino1 = ci->i_vino; + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); + req->r_num_caps = 2; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto out; + + rinfo = &req->r_reply_info; + iinfo = &rinfo->targeti; + if (iinfo->inline_version == CEPH_INLINE_NONE) { + /* The data got uninlined */ + ceph_mdsc_put_request(req); + return false; + } + + len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); + if (err == 0) + err = -EFAULT; + + ceph_mdsc_put_request(req); +out: + netfs_subreq_terminated(subreq, err, false); + return true; +} + +static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; @@ -258,6 +311,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) int err = 0; u64 len = subreq->len; + if (ci->i_inline_version != CEPH_INLINE_NONE && + ceph_netfs_issue_op_inline(subreq)) + return; + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, 0, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, @@ -296,6 +353,45 @@ out: dout("%s: result %d\n", __func__, err); } +static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) +{ + struct inode *inode = rreq->inode; + int got = 0, want = CEPH_CAP_FILE_CACHE; + int ret = 0; + + if (rreq->origin != NETFS_READAHEAD) + return 0; + + if (file) { + struct ceph_rw_context *rw_ctx; + struct ceph_file_info *fi = file->private_data; + + rw_ctx = ceph_find_rw_context(fi); + if (rw_ctx) + return 0; + } + + /* + * readahead callers do not necessarily hold Fcb caps + * (e.g. fadvise, madvise). 
+ */ + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); + if (ret < 0) { + dout("start_read %p, error getting cap\n", inode); + return ret; + } + + if (!(got & want)) { + dout("start_read %p, no cache cap\n", inode); + return -EACCES; + } + if (ret == 0) + return -EACCES; + + rreq->netfs_priv = (void *)(uintptr_t)got; + return 0; +} + static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) { struct inode *inode = mapping->host; @@ -306,78 +402,16 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) ceph_put_cap_refs(ci, got); } -static const struct netfs_read_request_ops ceph_netfs_read_ops = { - .is_cache_enabled = ceph_is_cache_enabled, +const struct netfs_request_ops ceph_netfs_ops = { + .init_request = ceph_init_request, .begin_cache_operation = ceph_begin_cache_operation, - .issue_op = ceph_netfs_issue_op, + .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, .cleanup = ceph_readahead_cleanup, }; -/* read a single page, without unlocking it. */ -static int ceph_readpage(struct file *file, struct page *subpage) -{ - struct folio *folio = page_folio(subpage); - struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vino vino = ceph_vino(inode); - size_t len = folio_size(folio); - u64 off = folio_file_pos(folio); - - if (ci->i_inline_version != CEPH_INLINE_NONE) { - /* - * Uptodate inline data should have been added - * into page cache while getting Fcr caps. - */ - if (off == 0) { - folio_unlock(folio); - return -EINVAL; - } - zero_user_segment(&folio->page, 0, folio_size(folio)); - folio_mark_uptodate(folio); - folio_unlock(folio); - return 0; - } - - dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n", - vino.ino, vino.snap, file, off, len, folio, folio_index(folio)); - - return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL); -} - -static void ceph_readahead(struct readahead_control *ractl) -{ - struct inode *inode = file_inode(ractl->file); - struct ceph_file_info *fi = ractl->file->private_data; - struct ceph_rw_context *rw_ctx; - int got = 0; - int ret = 0; - - if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) - return; - - rw_ctx = ceph_find_rw_context(fi); - if (!rw_ctx) { - /* - * readahead callers do not necessarily hold Fcb caps - * (e.g. fadvise, madvise). - */ - int want = CEPH_CAP_FILE_CACHE; - - ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); - if (ret < 0) - dout("start_read %p, error getting cap\n", inode); - else if (!(got & want)) - dout("start_read %p, no cache cap\n", inode); - - if (ret <= 0) - return; - } - netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got); -} - #ifdef CONFIG_CEPH_FSCACHE static void ceph_set_page_fscache(struct page *page) { @@ -1281,45 +1315,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); struct folio *folio = NULL; - pgoff_t index = pos >> PAGE_SHIFT; int r; - /* - * Uninlining should have already been done and everything updated, EXCEPT - * for inline_version sent to the MDS. 
- */ - if (ci->i_inline_version != CEPH_INLINE_NONE) { - unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - if (aop_flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; - folio = __filemap_get_folio(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; - - /* - * The inline_version on a new inode is set to 1. If that's the - * case, then the folio is brand new and isn't yet Uptodate. - */ - r = 0; - if (index == 0 && ci->i_inline_version != 1) { - if (!folio_test_uptodate(folio)) { - WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", - ci->i_inline_version); - r = -EINVAL; - } - goto out; - } - zero_user_segment(&folio->page, 0, folio_size(folio)); - folio_mark_uptodate(folio); - goto out; - } - - r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL, - &ceph_netfs_read_ops, NULL); -out: + r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL); if (r == 0) folio_wait_fscache(folio); if (r < 0) { @@ -1373,8 +1372,8 @@ out: } const struct address_space_operations ceph_aops = { - .readpage = ceph_readpage, - .readahead = ceph_readahead, + .readpage = netfs_readpage, + .readahead = netfs_readahead, .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, @@ -1515,19 +1514,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); - if (ci->i_inline_version != CEPH_INLINE_NONE) { - struct page *locked_page = NULL; - if (off == 0) { - lock_page(page); - locked_page = page; - } - err = ceph_uninline_data(vma->vm_file, locked_page); - if (locked_page) - unlock_page(locked_page); - if (err < 0) - goto out_free; - } - if (off + thp_size(page) <= size) len = thp_size(page); else @@ -1584,11 +1570,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_put_snap_context(snapc); } while (err == 0); - if (ret == VM_FAULT_LOCKED || - ci->i_inline_version != CEPH_INLINE_NONE) { + if (ret == VM_FAULT_LOCKED) { int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1652,16 +1636,30 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, } } -int ceph_uninline_data(struct file *filp, struct page *locked_page) +int ceph_uninline_data(struct file *file) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; - struct page *page = NULL; - u64 len, inline_version; + struct ceph_cap_flush *prealloc_cf; + struct folio *folio = NULL; + u64 inline_version = CEPH_INLINE_NONE; + struct page *pages[1]; int err = 0; - bool from_pagecache = false; + u64 len; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + folio = read_mapping_folio(inode->i_mapping, 0, file); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto out; + } + + folio_lock(folio); spin_lock(&ci->i_ceph_lock); inline_version = ci->i_inline_version; @@ -1672,45 +1670,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) if (inline_version == 1 || /* initial version, no data */ inline_version == CEPH_INLINE_NONE) - goto out; - - if (locked_page) { - page = locked_page; - WARN_ON(!PageUptodate(page)); - } else if (ceph_caps_issued(ci) & - 
(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { - page = find_get_page(inode->i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - from_pagecache = true; - lock_page(page); - } else { - put_page(page); - page = NULL; - } - } - } + goto out_unlock; - if (page) { - len = i_size_read(inode); - if (len > PAGE_SIZE) - len = PAGE_SIZE; - } else { - page = __page_cache_alloc(GFP_NOFS); - if (!page) { - err = -ENOMEM; - goto out; - } - err = __ceph_do_getattr(inode, page, - CEPH_STAT_CAP_INLINE_DATA, true); - if (err < 0) { - /* no inline data */ - if (err == -ENODATA) - err = 0; - goto out; - } - len = err; - } + len = i_size_read(inode); + if (len > folio_size(folio)) + len = folio_size(folio); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, @@ -1718,7 +1682,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out; + goto out_unlock; } req->r_mtime = inode->i_mtime; @@ -1727,7 +1691,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); if (err < 0) - goto out; + goto out_unlock; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, @@ -1736,10 +1700,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out; + goto out_unlock; } - osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); + pages[0] = folio_page(folio, 0); + osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false); { __le64 xattr_buf = cpu_to_le64(inline_version); @@ -1749,7 +1714,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) CEPH_OSD_CMPXATTR_OP_GT, CEPH_OSD_CMPXATTR_MODE_U64); if (err) - goto out_put; + goto out_put_req; } { @@ -1760,7 +1725,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) "inline_version", xattr_buf, xattr_len, 0, 0); if (err) - goto out_put; + goto out_put_req; } req->r_mtime = inode->i_mtime; @@ -1771,19 +1736,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); -out_put: + if (!err) { + int dirty; + + /* Set to CAP_INLINE_NONE and dirty the caps */ + down_read(&fsc->mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + up_read(&fsc->mdsc->snap_rwsem); + if (dirty) + __mark_inode_dirty(inode, dirty); + } +out_put_req: ceph_osdc_put_request(req); if (err == -ECANCELED) err = 0; +out_unlock: + folio_unlock(folio); + folio_put(folio); out: - if (page && page != locked_page) { - if (from_pagecache) { - unlock_page(page); - put_page(page); - } else - __free_pages(page, 0); - } - + ceph_free_cap_flush(prealloc_cf); dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", inode, ceph_vinop(inode), inline_version, err); return err; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 7d22850623ef..ddea99922073 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -29,26 +29,25 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) if (!(inode->i_state & I_NEW)) return; - WARN_ON_ONCE(ci->fscache); + WARN_ON_ONCE(ci->netfs_ctx.cache); - ci->fscache = fscache_acquire_cookie(fsc->fscache, 0, - 
&ci->i_vino, sizeof(ci->i_vino), - &ci->i_version, sizeof(ci->i_version), - i_size_read(inode)); + ci->netfs_ctx.cache = + fscache_acquire_cookie(fsc->fscache, 0, + &ci->i_vino, sizeof(ci->i_vino), + &ci->i_version, sizeof(ci->i_version), + i_size_read(inode)); } -void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) { - struct fscache_cookie *cookie = ci->fscache; - - fscache_relinquish_cookie(cookie, false); + fscache_relinquish_cookie(ceph_fscache_cookie(ci), false); } void ceph_fscache_use_cookie(struct inode *inode, bool will_modify) { struct ceph_inode_info *ci = ceph_inode(inode); - fscache_use_cookie(ci->fscache, will_modify); + fscache_use_cookie(ceph_fscache_cookie(ci), will_modify); } void ceph_fscache_unuse_cookie(struct inode *inode, bool update) @@ -58,9 +57,10 @@ void ceph_fscache_unuse_cookie(struct inode *inode, bool update) if (update) { loff_t i_size = i_size_read(inode); - fscache_unuse_cookie(ci->fscache, &ci->i_version, &i_size); + fscache_unuse_cookie(ceph_fscache_cookie(ci), + &ci->i_version, &i_size); } else { - fscache_unuse_cookie(ci->fscache, NULL, NULL); + fscache_unuse_cookie(ceph_fscache_cookie(ci), NULL, NULL); } } @@ -69,14 +69,14 @@ void ceph_fscache_update(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); loff_t i_size = i_size_read(inode); - fscache_update_cookie(ci->fscache, &ci->i_version, &i_size); + fscache_update_cookie(ceph_fscache_cookie(ci), &ci->i_version, &i_size); } void ceph_fscache_invalidate(struct inode *inode, bool dio_write) { struct ceph_inode_info *ci = ceph_inode(inode); - fscache_invalidate(ceph_inode(inode)->fscache, + fscache_invalidate(ceph_fscache_cookie(ci), &ci->i_version, i_size_read(inode), dio_write ? 
FSCACHE_INVAL_DIO_WRITE : 0); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index b90f3016994d..7255b790a4c1 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -26,14 +26,9 @@ void ceph_fscache_unuse_cookie(struct inode *inode, bool update); void ceph_fscache_update(struct inode *inode); void ceph_fscache_invalidate(struct inode *inode, bool dio_write); -static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) -{ - ci->fscache = NULL; -} - static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) { - return ci->fscache; + return netfs_i_cookie(&ci->vfs_inode); } static inline void ceph_fscache_resize(struct inode *inode, loff_t to) @@ -62,7 +57,7 @@ static inline int ceph_fscache_dirty_folio(struct address_space *mapping, return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci)); } -static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) +static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) { struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); @@ -91,10 +86,6 @@ static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { } -static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) -{ -} - static inline void ceph_fscache_register_inode_cookie(struct inode *inode) { } @@ -144,7 +135,7 @@ static inline bool ceph_is_cache_enabled(struct inode *inode) return false; } -static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) +static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq) { return -ENOBUFS; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b472cd066d1c..f1ad6884d4da 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1915,6 +1915,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, ceph_get_mds_session(session); spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + /* Don't send messages until we get async create reply */ + spin_unlock(&ci->i_ceph_lock); + ceph_put_mds_session(session); + return; + } + if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; retry: @@ -2409,6 +2416,9 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) dout("write_inode %p wait=%d\n", inode, wait); ceph_fscache_unpin_writeback(inode, wbc); if (wait) { + err = ceph_wait_on_async_create(inode); + if (err) + return err; dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, @@ -2439,6 +2449,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, u64 first_tid = 0; u64 last_snap_flush = 0; + /* Don't do anything until create reply comes in */ + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) + return; + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { @@ -4152,7 +4166,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); - ci = ceph_inode(inode); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); @@ -4178,6 +4191,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, } goto flush_cap_releases; } + ci = ceph_inode(inode); /* these will work even if we don't have a cap yet */ switch (op) { diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3cf7c9c1085b..bec3c4549c07 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -175,7 +175,7 @@ static int metrics_latency_show(struct seq_file *s, void *p) struct 
ceph_fs_client *fsc = s->private; struct ceph_client_metric *cm = &fsc->mdsc->metric; struct ceph_metric *m; - s64 total, sum, avg, min, max, sq; + s64 total, avg, min, max, sq; int i; seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); @@ -185,8 +185,7 @@ static int metrics_latency_show(struct seq_file *s, void *p) m = &cm->metric[i]; spin_lock(&m->lock); total = m->total; - sum = m->latency_sum; - avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + avg = m->latency_avg; min = m->latency_min; max = m->latency_max; sq = m->latency_sq_sum; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 133dbd9338e7..eae417d71136 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -145,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, return ERR_PTR(-EAGAIN); } /* reading/filling the cache are serialized by - i_mutex, no need to use page lock */ + i_rwsem, no need to use page lock */ unlock_page(cache_ctl->page); cache_ctl->dentries = kmap(cache_ctl->page); } @@ -155,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, rcu_read_lock(); spin_lock(&parent->d_lock); /* check i_size again here, because empty directory can be - * marked as complete while not holding the i_mutex. */ + * marked as complete while not holding the i_rwsem. */ if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) dentry = cache_ctl->dentries[cache_ctl->index]; else @@ -478,8 +478,11 @@ more: 2 : (fpos_off(rde->offset) + 1); err = note_last_dentry(dfi, rde->name, rde->name_len, next_offset); - if (err) + if (err) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; return err; + } } else if (req->r_reply_info.dir_end) { dfi->next_offset = 2; /* keep last name */ @@ -520,6 +523,12 @@ more: if (!dir_emit(ctx, rde->name, rde->name_len, ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), le32_to_cpu(rde->inode.in->mode) >> 12)) { + /* + * NOTE: Here no need to put the 'dfi->last_readdir', + * because when dir_emit stops us it's most likely + * doesn't have enough memory, etc. So for next readdir + * it will continue. + */ dout("filldir stopping us...\n"); return 0; } @@ -671,7 +680,7 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */ /* .snap dir? */ if (ceph_snap(parent) == CEPH_NOSNAP && diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bbed3224ad68..6c9e837aa1d3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -207,6 +207,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, struct ceph_mount_options *opt = ceph_inode_to_client(&ci->vfs_inode)->mount_options; struct ceph_file_info *fi; + int ret; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, inode->i_mode, isdir ? 
"dir" : "regular"); @@ -240,7 +241,22 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, INIT_LIST_HEAD(&fi->rw_contexts); fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); + if ((file->f_mode & FMODE_WRITE) && + ci->i_inline_version != CEPH_INLINE_NONE) { + ret = ceph_uninline_data(file); + if (ret < 0) + goto error; + } + return 0; + +error: + ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); + ceph_put_fmode(ci, fi->fmode, 1); + kmem_cache_free(ceph_file_cachep, fi); + /* wake up anyone waiting for caps on this inode */ + wake_up_all(&ci->i_cap_wq); + return ret; } /* @@ -516,52 +532,67 @@ static void restore_deleg_ino(struct inode *dir, u64 ino) } } +static void wake_async_create_waiters(struct inode *inode, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + } + ceph_kick_flushing_inode_caps(session, ci); + spin_unlock(&ci->i_ceph_lock); +} + static void ceph_async_create_cb(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct dentry *dentry = req->r_dentry; + struct inode *dinode = d_inode(dentry); + struct inode *tinode = req->r_target_inode; int result = req->r_err ? req->r_err : le32_to_cpu(req->r_reply_info.head->result); + WARN_ON_ONCE(dinode && tinode && dinode != tinode); + + /* MDS changed -- caller must resubmit */ if (result == -EJUKEBOX) goto out; mapping_set_error(req->r_parent->i_mapping, result); if (result) { - struct dentry *dentry = req->r_dentry; - struct inode *inode = d_inode(dentry); int pathlen = 0; u64 base = 0; char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &base, 0); + pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path(path, pathlen); + ceph_dir_clear_complete(req->r_parent); if (!d_unhashed(dentry)) d_drop(dentry); - ceph_inode_shutdown(inode); - - pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<<bad>>" : path, result); - ceph_mdsc_free_path(path, pathlen); + if (dinode) { + mapping_set_error(dinode->i_mapping, result); + ceph_inode_shutdown(dinode); + wake_async_create_waiters(dinode, req->r_session); + } } - if (req->r_target_inode) { - struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); - u64 ino = ceph_vino(req->r_target_inode).ino; + if (tinode) { + u64 ino = ceph_vino(tinode).ino; if (req->r_deleg_ino != ino) pr_warn("%s: inode number mismatch! 
err=%d deleg_ino=0x%llx target=0x%llx\n", __func__, req->r_err, req->r_deleg_ino, ino); - mapping_set_error(req->r_target_inode->i_mapping, result); - spin_lock(&ci->i_ceph_lock); - if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { - ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; - wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); - } - ceph_kick_flushing_inode_caps(req->r_session, ci); - spin_unlock(&ci->i_ceph_lock); + mapping_set_error(tinode->i_mapping, result); + wake_async_create_waiters(tinode, req->r_session); } else if (!result) { pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, req->r_deleg_ino); @@ -1041,7 +1072,6 @@ static void ceph_aio_complete(struct inode *inode, } spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &aio_req->prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1778,12 +1808,6 @@ retry_snap: if (err) goto out; - if (ci->i_inline_version != CEPH_INLINE_NONE) { - err = ceph_uninline_data(file, NULL); - if (err < 0) - goto out; - } - dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) @@ -1845,7 +1869,7 @@ retry_snap: * are pending vmtruncate. So write and vmtruncate * can not run at the same time */ - written = generic_perform_write(file, from, pos); + written = generic_perform_write(iocb, from); if (likely(written >= 0)) iocb->ki_pos = pos + written; ceph_end_io_write(inode); @@ -1855,7 +1879,6 @@ retry_snap: int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -2109,12 +2132,6 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ci->i_inline_version != CEPH_INLINE_NONE) { - ret = ceph_uninline_data(file, NULL); - if (ret < 0) - goto unlock; - } - size = i_size_read(inode); /* Are we punching a hole beyond EOF? 
*/ @@ -2139,7 +2156,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -2532,7 +2548,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); - dst_ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&dst_ci->i_ceph_lock); if (dirty) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 9cfa6c730519..63113e2a4890 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -87,13 +87,13 @@ struct inode *ceph_get_snapdir(struct inode *parent) if (!S_ISDIR(parent->i_mode)) { pr_warn_once("bad snapdir parent type (mode=0%o)\n", parent->i_mode); - return ERR_PTR(-ENOTDIR); + goto err; } if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { pr_warn_once("bad snapdir inode type (mode=0%o)\n", inode->i_mode); - return ERR_PTR(-ENOTDIR); + goto err; } inode->i_mode = parent->i_mode; @@ -113,6 +113,12 @@ struct inode *ceph_get_snapdir(struct inode *parent) } return inode; +err: + if ((inode->i_state & I_NEW)) + discard_new_inode(inode); + else + iput(inode); + return ERR_PTR(-ENOTDIR); } const struct inode_operations ceph_file_iops = { @@ -453,6 +459,9 @@ struct inode *ceph_alloc_inode(struct super_block *sb) dout("alloc_inode %p\n", &ci->vfs_inode); + /* Set parameters for the netfs library */ + netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops); + spin_lock_init(&ci->i_ceph_lock); ci->i_version = 0; @@ -538,9 +547,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); - - ceph_fscache_inode_init(ci); - return &ci->vfs_inode; } @@ -1201,7 +1207,7 @@ out_unlock: /* * splice a dentry to an inode. - * caller must hold directory i_mutex for this to be safe. + * caller must hold directory i_rwsem for this to be safe. */ static int splice_dentry(struct dentry **pdn, struct inode *in) { @@ -1598,7 +1604,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, return idx == 0 ? 
-ENOMEM : 0; } /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ + * i_rwsem, no need to use page lock */ unlock_page(ctl->page); ctl->dentries = kmap(ctl->page); if (idx == 0) @@ -2301,6 +2307,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, return err; } +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int mode = USE_AUTH_MDS; + int err; + char *xattr_value; + size_t xattr_value_len; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode); + if (IS_ERR(req)) { + err = -ENOMEM; + goto out; + } + + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + err = -ENOMEM; + goto put; + } + + ihold(inode); + req->r_inode = inode; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto put; + + xattr_value = req->r_reply_info.xattr_info.xattr_value; + xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len; + + dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); + + err = (int)xattr_value_len; + if (size == 0) + goto put; + + if (xattr_value_len > size) { + err = -ERANGE; + goto put; + } + + memcpy(value, xattr_value, xattr_value_len); +put: + ceph_mdsc_put_request(req); +out: + dout("do_getvxattr result=%d\n", err); + return err; +} + /* * Check inode permissions. We verify we have a valid value for diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d1f154aec249..3e2843e86e27 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -111,10 +111,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; - if (wait) - req->r_wait_for_completion = ceph_lock_wait_for_completion; - - err = ceph_mdsc_do_request(mdsc, inode, req); + err = ceph_mdsc_submit_request(mdsc, inode, req); + if (!err) + err = ceph_mdsc_wait_request(mdsc, req, wait ? 
+ ceph_lock_wait_for_completion : NULL); if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c30eefc0ac19..fa38c013126d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -555,6 +555,28 @@ bad: return -EIO; } +static int parse_reply_info_getvxattr(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + u32 value_len; + + ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */ + ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */ + ceph_decode_skip_32(p, end, bad); /* skip payload length */ + + ceph_decode_32_safe(p, end, value_len, bad); + + if (value_len == end - *p) { + info->xattr_info.xattr_value = *p; + info->xattr_info.xattr_value_len = value_len; + *p = end; + return value_len; + } +bad: + return -EIO; +} + /* * parse extra results */ @@ -570,6 +592,8 @@ static int parse_reply_info_extra(void **p, void *end, return parse_reply_info_readdir(p, end, info, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features, s); + else if (op == CEPH_MDS_OP_GETVXATTR) + return parse_reply_info_getvxattr(p, end, info, features); else return -EIO; } @@ -2178,7 +2202,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | - __GFP_NOWARN, + __GFP_NOWARN | + __GFP_ZERO, order); if (rinfo->dir_entries) break; @@ -2946,15 +2971,16 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, return err; } -static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *req) +int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + ceph_mds_request_wait_callback_t wait_func) { int err; /* wait */ dout("do_request waiting\n"); - if (!req->r_timeout && req->r_wait_for_completion) { - err = req->r_wait_for_completion(mdsc, req); + if (wait_func) { + err = wait_func(mdsc, req); } else { long timeleft = wait_for_completion_killable_timeout( &req->r_completion, @@ -3011,7 +3037,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, /* issue */ err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) - err = ceph_mdsc_wait_request(mdsc, req); + err = ceph_mdsc_wait_request(mdsc, req, NULL); dout("do_request %p done, result %d\n", req, err); return err; } @@ -3097,35 +3123,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) result = le32_to_cpu(head->result); - /* - * Handle an ESTALE - * if we're not talking to the authority, send to them - * if the authority has changed while we weren't looking, - * send to new authority - * Otherwise we just have to return an ESTALE - */ - if (result == -ESTALE) { - dout("got ESTALE on request %llu\n", req->r_tid); - req->r_resend_mds = -1; - if (req->r_direct_mode != USE_AUTH_MDS) { - dout("not using auth, setting for that now\n"); - req->r_direct_mode = USE_AUTH_MDS; - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); - goto out; - } else { - int mds = __choose_mds(mdsc, req, NULL); - if (mds >= 0 && mds != req->r_session->s_mds) { - dout("but auth changed, so resending\n"); - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); - goto out; - } - } - dout("have to return ESTALE on request %llu\n", req->r_tid); - } - - if 
(head->safe) { set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); @@ -4841,7 +4838,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); ceph_cleanup_snapid_map(mdsc); - ceph_cleanup_empty_realms(mdsc); + ceph_cleanup_global_and_empty_realms(mdsc); cancel_work_sync(&mdsc->cap_reclaim_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 97c7f7bfa55f..33497846e47e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -100,6 +100,11 @@ struct ceph_mds_reply_dir_entry { loff_t offset; }; +struct ceph_mds_reply_xattr { + char *xattr_value; + size_t xattr_value_len; +}; + /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, @@ -115,6 +120,7 @@ struct ceph_mds_reply_info_parsed { char *dname; u32 dname_len; struct ceph_mds_reply_lease *dlease; + struct ceph_mds_reply_xattr xattr_info; /* extra */ union { @@ -274,8 +280,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ - const struct cred *r_cred; int r_request_release_offset; + const struct cred *r_cred; struct timespec64 r_stamp; /* for choosing which mds to send this request to */ @@ -296,12 +302,11 @@ struct ceph_mds_request { struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; - + u32 r_readdir_offset; struct page *r_locked_page; int r_dir_caps; int r_num_caps; - u32 r_readdir_offset; unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ @@ -329,7 +334,6 @@ struct ceph_mds_request { struct completion r_completion; struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; - ceph_mds_request_wait_callback_t r_wait_for_completion; struct list_head r_unsafe_item; /* per-session unsafe list item */ long long r_dir_release_cnt; @@ -507,6 +511,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); +int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + ceph_mds_request_wait_callback_t wait_func); extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 0fcba68f9a99..c47347d2e84e 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -8,6 +8,12 @@ #include "metric.h" #include "mds_client.h" +static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) +{ + struct timespec64 t = ktime_to_timespec64(val); + ceph_encode_timespec64(ts, &t); +} + static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { @@ -26,7 +32,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, u64 nr_caps = atomic64_read(&m->total_caps); u32 header_len = sizeof(struct ceph_metric_header); struct ceph_msg *msg; - struct timespec64 ts; s64 sum; s32 items = 0; s32 len; @@ -59,37 +64,40 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the read latency metric */ read = (struct ceph_metric_read_latency *)(cap + 1); read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); - read->header.ver = 1; + read->header.ver = 2; read->header.compat = 1; read->header.data_len 
= cpu_to_le32(sizeof(*read) - header_len); sum = m->metric[METRIC_READ].latency_sum; - jiffies_to_timespec64(sum, &ts); - read->sec = cpu_to_le32(ts.tv_sec); - read->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&read->lat, sum); + ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg); + read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum); + read->count = cpu_to_le64(m->metric[METRIC_READ].total); items++; /* encode the write latency metric */ write = (struct ceph_metric_write_latency *)(read + 1); write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); - write->header.ver = 1; + write->header.ver = 2; write->header.compat = 1; write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); sum = m->metric[METRIC_WRITE].latency_sum; - jiffies_to_timespec64(sum, &ts); - write->sec = cpu_to_le32(ts.tv_sec); - write->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&write->lat, sum); + ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg); + write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum); + write->count = cpu_to_le64(m->metric[METRIC_WRITE].total); items++; /* encode the metadata latency metric */ meta = (struct ceph_metric_metadata_latency *)(write + 1); meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); - meta->header.ver = 1; + meta->header.ver = 2; meta->header.compat = 1; meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); sum = m->metric[METRIC_METADATA].latency_sum; - jiffies_to_timespec64(sum, &ts); - meta->sec = cpu_to_le32(ts.tv_sec); - meta->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&meta->lat, sum); + ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg); + meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum); + meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total); items++; /* encode the dentry lease metric */ @@ -250,6 +258,7 @@ int ceph_metric_init(struct ceph_client_metric *m) metric->size_max = 0; metric->total = 0; metric->latency_sum = 0; + metric->latency_avg = 0; metric->latency_sq_sum = 0; metric->latency_min = KTIME_MAX; metric->latency_max = 0; @@ -307,20 +316,19 @@ void ceph_metric_destroy(struct ceph_client_metric *m) max = new; \ } -static inline void __update_stdev(ktime_t total, ktime_t lsum, - ktime_t *sq_sump, ktime_t lat) +static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg, + ktime_t *sq_sump, ktime_t lat) { - ktime_t avg, sq; - - if (unlikely(total == 1)) - return; - - /* the sq is (lat - old_avg) * (lat - new_avg) */ - avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1)); - sq = lat - avg; - avg = DIV64_U64_ROUND_CLOSEST(lsum, total); - sq = sq * (lat - avg); - *sq_sump += sq; + ktime_t avg; + + if (unlikely(total == 1)) { + *lavg = lat; + } else { + /* the sq is (lat - old_avg) * (lat - new_avg) */ + avg = *lavg + div64_s64(lat - *lavg, total); + *sq_sump += (lat - *lavg)*(lat - avg); + *lavg = avg; + } } void ceph_update_metrics(struct ceph_metric *m, @@ -339,6 +347,7 @@ void ceph_update_metrics(struct ceph_metric *m, METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size); m->latency_sum += lat; METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat); - __update_stdev(total, m->latency_sum, &m->latency_sq_sum, lat); + __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum, + lat); spin_unlock(&m->lock); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index bb45608181e7..0d0c44bd3332 100644 --- a/fs/ceph/metric.h +++ 
b/fs/ceph/metric.h @@ -2,7 +2,7 @@ #ifndef _FS_CEPH_MDS_METRIC_H #define _FS_CEPH_MDS_METRIC_H -#include <linux/types.h> +#include <linux/ceph/types.h> #include <linux/percpu_counter.h> #include <linux/ktime.h> @@ -19,27 +19,39 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_OPENED_INODES, CLIENT_METRIC_TYPE_READ_IO_SIZES, CLIENT_METRIC_TYPE_WRITE_IO_SIZES, - - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES, + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, + + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, }; /* * This will always have the highest metric bit value * as the last element of the array. */ -#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ - CLIENT_METRIC_TYPE_CAP_INFO, \ - CLIENT_METRIC_TYPE_READ_LATENCY, \ - CLIENT_METRIC_TYPE_WRITE_LATENCY, \ - CLIENT_METRIC_TYPE_METADATA_LATENCY, \ - CLIENT_METRIC_TYPE_DENTRY_LEASE, \ - CLIENT_METRIC_TYPE_OPENED_FILES, \ - CLIENT_METRIC_TYPE_PINNED_ICAPS, \ - CLIENT_METRIC_TYPE_OPENED_INODES, \ - CLIENT_METRIC_TYPE_READ_IO_SIZES, \ - CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ - \ - CLIENT_METRIC_TYPE_MAX, \ +#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ + CLIENT_METRIC_TYPE_CAP_INFO, \ + CLIENT_METRIC_TYPE_READ_LATENCY, \ + CLIENT_METRIC_TYPE_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_DENTRY_LEASE, \ + CLIENT_METRIC_TYPE_OPENED_FILES, \ + CLIENT_METRIC_TYPE_PINNED_ICAPS, \ + CLIENT_METRIC_TYPE_OPENED_INODES, \ + CLIENT_METRIC_TYPE_READ_IO_SIZES, \ + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \ + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \ + \ + CLIENT_METRIC_TYPE_MAX, \ } struct ceph_metric_header { @@ -60,22 +72,28 @@ struct ceph_metric_cap { /* metric read latency header */ struct ceph_metric_read_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric write latency header */ struct ceph_metric_write_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric metadata latency header */ struct ceph_metric_metadata_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric dentry lease header */ @@ -140,6 +158,7 @@ struct ceph_metric { u64 size_min; u64 size_max; ktime_t latency_sum; + ktime_t latency_avg; ktime_t latency_sq_sum; ktime_t latency_min; ktime_t latency_max; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index b41e6724c591..322ee5add942 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -121,18 +121,23 @@ static struct ceph_snap_realm *ceph_create_snap_realm( if (!realm) return ERR_PTR(-ENOMEM); - atomic_set(&realm->nref, 1); /* for caller */ + /* Do not release the global dummy snaprealm until unmouting */ + if (ino == CEPH_INO_GLOBAL_SNAPREALM) + atomic_set(&realm->nref, 2); + else + atomic_set(&realm->nref, 1); realm->ino = ino; INIT_LIST_HEAD(&realm->children); 
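The __update_stdev() to __update_mean_and_stdev() change above moves the client latency statistics to a single-pass running update (Welford's method), which is what lets the version 2 latency payloads carry the latency sum, running average, squared-deviation sum and sample count without a second pass over the samples. A minimal stand-alone sketch of the same update, using plain s64 nanoseconds instead of ktime_t and purely illustrative names:

#include <linux/types.h>
#include <linux/math64.h>

/* Illustrative only: a one-pass (Welford) mean/variance accumulator. */
struct lat_stats {
        u64 count;
        s64 sum;
        s64 avg;
        s64 sq_sum;     /* sum of (x - old_avg) * (x - new_avg) */
};

static void lat_stats_update(struct lat_stats *s, s64 lat)
{
        s64 new_avg;

        s->count++;
        s->sum += lat;
        if (s->count == 1) {
                s->avg = lat;
                return;
        }
        new_avg = s->avg + div64_s64(lat - s->avg, (s64)s->count);
        s->sq_sum += (lat - s->avg) * (lat - new_avg);
        s->avg = new_avg;
}

A sample standard deviation can then be estimated as sqrt(sq_sum / (count - 1)) once count > 1, which is presumably how the new sq_sum and count fields in the read/write/metadata latency payloads are consumed on the receiving side.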
INIT_LIST_HEAD(&realm->child_item); INIT_LIST_HEAD(&realm->empty_item); INIT_LIST_HEAD(&realm->dirty_item); + INIT_LIST_HEAD(&realm->rebuild_item); INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); mdsc->num_snap_realms++; - dout("create_snap_realm %llx %p\n", realm->ino, realm); + dout("%s %llx %p\n", __func__, realm->ino, realm); return realm; } @@ -156,7 +161,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, else if (ino > r->ino) n = n->rb_right; else { - dout("lookup_snap_realm %llx %p\n", r->ino, r); + dout("%s %llx %p\n", __func__, r->ino, r); return r; } } @@ -184,7 +189,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, { lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); + dout("%s %p %llx\n", __func__, realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); mdsc->num_snap_realms--; @@ -260,9 +265,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->snap_empty_lock); } -void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) +void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc) { + struct ceph_snap_realm *global_realm; + down_write(&mdsc->snap_rwsem); + global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM); + if (global_realm) + ceph_put_snap_realm(mdsc, global_realm); __cleanup_empty_realms(mdsc); up_write(&mdsc->snap_rwsem); } @@ -292,9 +302,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, if (IS_ERR(parent)) return PTR_ERR(parent); } - dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", - realm->ino, realm, realm->parent_ino, realm->parent, - parentino, parent); + dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino, + realm, realm->parent_ino, realm->parent, parentino, parent); if (realm->parent) { list_del_init(&realm->child_item); ceph_put_snap_realm(mdsc, realm->parent); @@ -320,7 +329,8 @@ static int cmpu64_rev(const void *a, const void *b) * build the snap context for a given realm. 
*/ static int build_snap_context(struct ceph_snap_realm *realm, - struct list_head* dirty_realms) + struct list_head *realm_queue, + struct list_head *dirty_realms) { struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; @@ -334,9 +344,9 @@ static int build_snap_context(struct ceph_snap_realm *realm, */ if (parent) { if (!parent->cached_context) { - err = build_snap_context(parent, dirty_realms); - if (err) - goto fail; + /* add to the queue head */ + list_add(&parent->rebuild_item, realm_queue); + return 1; } num += parent->cached_context->num_snaps; } @@ -349,9 +359,8 @@ static int build_snap_context(struct ceph_snap_realm *realm, realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" - " (unchanged)\n", - realm->ino, realm, realm->cached_context, + dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n", + __func__, realm->ino, realm, realm->cached_context, realm->cached_context->seq, (unsigned int)realm->cached_context->num_snaps); return 0; @@ -390,9 +399,8 @@ static int build_snap_context(struct ceph_snap_realm *realm, sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", - realm->ino, realm, snapc, snapc->seq, - (unsigned int) snapc->num_snaps); + dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino, + realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; @@ -409,8 +417,7 @@ fail: ceph_put_snap_context(realm->cached_context); realm->cached_context = NULL; } - pr_err("build_snap_context %llx %p fail %d\n", realm->ino, - realm, err); + pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err); return err; } @@ -420,13 +427,50 @@ fail: static void rebuild_snap_realms(struct ceph_snap_realm *realm, struct list_head *dirty_realms) { - struct ceph_snap_realm *child; + LIST_HEAD(realm_queue); + int last = 0; + bool skip = false; + + list_add_tail(&realm->rebuild_item, &realm_queue); + + while (!list_empty(&realm_queue)) { + struct ceph_snap_realm *_realm, *child; + + _realm = list_first_entry(&realm_queue, + struct ceph_snap_realm, + rebuild_item); + + /* + * If the last building failed dues to memory + * issue, just empty the realm_queue and return + * to avoid infinite loop. + */ + if (last < 0) { + list_del_init(&_realm->rebuild_item); + continue; + } + + last = build_snap_context(_realm, &realm_queue, dirty_realms); + dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm, + last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); + + /* is any child in the list ? */ + list_for_each_entry(child, &_realm->children, child_item) { + if (!list_empty(&child->rebuild_item)) { + skip = true; + break; + } + } - dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); - build_snap_context(realm, dirty_realms); + if (!skip) { + list_for_each_entry(child, &_realm->children, child_item) + list_add_tail(&child->rebuild_item, &realm_queue); + } - list_for_each_entry(child, &realm->children, child_item) - rebuild_snap_realms(child, dirty_realms); + /* last == 1 means need to build parent first */ + if (last <= 0) + list_del_init(&_realm->rebuild_item); + } } @@ -474,23 +518,15 @@ static bool has_new_snaps(struct ceph_snap_context *o, * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). 
*/ -static void ceph_queue_cap_snap(struct ceph_inode_info *ci) +static void ceph_queue_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap **pcapsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_cap_snap *capsnap; struct ceph_snap_context *old_snapc, *new_snapc; + struct ceph_cap_snap *capsnap = *pcapsnap; struct ceph_buffer *old_blob = NULL; int used, dirty; - capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); - if (!capsnap) { - pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); - return; - } - capsnap->cap_flush.is_capsnap = true; - INIT_LIST_HEAD(&capsnap->cap_flush.i_list); - INIT_LIST_HEAD(&capsnap->cap_flush.g_list); - spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); @@ -511,12 +547,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) as no new writes are allowed to start when pending, so any writes in progress now were started before the previous cap_snap. lucky us. */ - dout("queue_cap_snap %p already pending\n", inode); + dout("%s %p %llx.%llx already pending\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } if (ci->i_wrbuffer_ref_head == 0 && !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { - dout("queue_cap_snap %p nothing dirty|writing\n", inode); + dout("%s %p %llx.%llx nothing dirty|writing\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } @@ -536,20 +574,17 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) } else { if (!(used & CEPH_CAP_FILE_WR) && ci->i_wrbuffer_ref_head == 0) { - dout("queue_cap_snap %p " - "no new_snap|dirty_page|writing\n", inode); + dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } } - dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", - inode, capsnap, old_snapc, ceph_cap_string(dirty), - capsnap->need_flush ? "" : "no_flush"); + dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n", + __func__, inode, ceph_vinop(inode), capsnap, old_snapc, + ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); ihold(inode); - refcount_set(&capsnap->nref, 1); - INIT_LIST_HEAD(&capsnap->ci_item); - capsnap->follows = old_snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = dirty; @@ -579,31 +614,30 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); if (used & CEPH_CAP_FILE_WR) { - dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, + dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR," + " now pending\n", __func__, inode, ceph_vinop(inode), capsnap, old_snapc, old_snapc->seq); capsnap->writing = 1; } else { /* note mtime, size NOW. 
*/ __ceph_finish_cap_snap(ci, capsnap); } - capsnap = NULL; + *pcapsnap = NULL; old_snapc = NULL; update_snapc: - if (ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ci->i_head_snapc = NULL; - } else { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); dout(" new snapc is %p\n", new_snapc); } spin_unlock(&ci->i_ceph_lock); ceph_buffer_put(old_blob); - kfree(capsnap); ceph_put_snap_context(old_snapc); } @@ -632,27 +666,28 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->truncate_size = ci->i_truncate_size; capsnap->truncate_seq = ci->i_truncate_seq; if (capsnap->dirty_pages) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " - "still has %d dirty pages\n", inode, capsnap, - capsnap->context, capsnap->context->seq, - ceph_cap_string(capsnap->dirty), capsnap->size, - capsnap->dirty_pages); + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " + "still has %d dirty pages\n", __func__, inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size, capsnap->dirty_pages); return 0; } /* Fb cap still in use, delay it */ if (ci->i_wb_ref) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " - "used WRBUFFER, delaying\n", inode, capsnap, - capsnap->context, capsnap->context->seq, - ceph_cap_string(capsnap->dirty), capsnap->size); + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " + "used WRBUFFER, delaying\n", __func__, inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); capsnap->writing = 1; return 0; } ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", - inode, capsnap, capsnap->context, + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", + __func__, inode, ceph_vinop(inode), capsnap, capsnap->context, capsnap->context->seq, ceph_cap_string(capsnap->dirty), capsnap->size); @@ -671,8 +706,9 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) { struct ceph_inode_info *ci; struct inode *lastinode = NULL; + struct ceph_cap_snap *capsnap = NULL; - dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + dout("%s %p %llx inode\n", __func__, realm, realm->ino); spin_lock(&realm->inodes_with_caps_lock); list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { @@ -682,13 +718,35 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); lastinode = inode; - ceph_queue_cap_snap(ci); + + /* + * Allocate the capsnap memory outside of ceph_queue_cap_snap() + * to reduce very possible but unnecessary frequently memory + * allocate/free in this loop. 
+ */ + if (!capsnap) { + capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); + if (!capsnap) { + pr_err("ENOMEM allocating ceph_cap_snap on %p\n", + inode); + return; + } + } + capsnap->cap_flush.is_capsnap = true; + refcount_set(&capsnap->nref, 1); + INIT_LIST_HEAD(&capsnap->cap_flush.i_list); + INIT_LIST_HEAD(&capsnap->cap_flush.g_list); + INIT_LIST_HEAD(&capsnap->ci_item); + + ceph_queue_cap_snap(ci, &capsnap); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); - dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); + if (capsnap) + kmem_cache_free(ceph_cap_snap_cachep, capsnap); + dout("%s %p %llx done\n", __func__, realm, realm->ino); } /* @@ -707,14 +765,16 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, __le64 *prior_parent_snaps; /* encoded */ struct ceph_snap_realm *realm = NULL; struct ceph_snap_realm *first_realm = NULL; - int invalidate = 0; + struct ceph_snap_realm *realm_to_rebuild = NULL; + int rebuild_snapcs; int err = -ENOMEM; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("update_snap_trace deletion=%d\n", deletion); + dout("%s deletion=%d\n", __func__, deletion); more: + rebuild_snapcs = 0; ceph_decode_need(&p, e, sizeof(*ri), bad); ri = p; p += sizeof(*ri); @@ -738,10 +798,10 @@ more: err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); if (err < 0) goto fail; - invalidate += err; + rebuild_snapcs += err; if (le64_to_cpu(ri->seq) > realm->seq) { - dout("update_snap_trace updating %llx %p %lld -> %lld\n", + dout("%s updating %llx %p %lld -> %lld\n", __func__, realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); /* update realm parameters, snap lists */ realm->seq = le64_to_cpu(ri->seq); @@ -763,22 +823,30 @@ more: if (realm->seq > mdsc->last_snap_seq) mdsc->last_snap_seq = realm->seq; - invalidate = 1; + rebuild_snapcs = 1; } else if (!realm->cached_context) { - dout("update_snap_trace %llx %p seq %lld new\n", + dout("%s %llx %p seq %lld new\n", __func__, realm->ino, realm, realm->seq); - invalidate = 1; + rebuild_snapcs = 1; } else { - dout("update_snap_trace %llx %p seq %lld unchanged\n", + dout("%s %llx %p seq %lld unchanged\n", __func__, realm->ino, realm, realm->seq); } - dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, - realm, invalidate, p, e); + dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, + realm, rebuild_snapcs, p, e); + + /* + * this will always track the uppest parent realm from which + * we need to rebuild the snapshot contexts _downward_ in + * hierarchy. 
+ */ + if (rebuild_snapcs) + realm_to_rebuild = realm; - /* invalidate when we reach the _end_ (root) of the trace */ - if (invalidate && p >= e) - rebuild_snap_realms(realm, &dirty_realms); + /* rebuild_snapcs when we reach the _end_ (root) of the trace */ + if (realm_to_rebuild && p >= e) + rebuild_snap_realms(realm_to_rebuild, &dirty_realms); if (!first_realm) first_realm = realm; @@ -814,7 +882,7 @@ fail: ceph_put_snap_realm(mdsc, realm); if (first_realm) ceph_put_snap_realm(mdsc, first_realm); - pr_err("update_snap_trace error %d\n", err); + pr_err("%s error %d\n", __func__, err); return err; } @@ -831,7 +899,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) struct inode *inode; struct ceph_mds_session *session = NULL; - dout("flush_snaps\n"); + dout("%s\n", __func__); spin_lock(&mdsc->snap_flush_lock); while (!list_empty(&mdsc->snap_flush_list)) { ci = list_first_entry(&mdsc->snap_flush_list, @@ -846,7 +914,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->snap_flush_lock); ceph_put_mds_session(session); - dout("flush_snaps done\n"); + dout("%s done\n", __func__); } /** @@ -928,8 +996,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, trace_len = le32_to_cpu(h->trace_len); p += sizeof(*h); - dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, - ceph_snap_op_name(op), split, trace_len); + dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, + mds, ceph_snap_op_name(op), split, trace_len); mutex_lock(&session->s_mutex); inc_session_sequence(session); @@ -989,13 +1057,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, */ if (ci->i_snap_realm->created > le64_to_cpu(ri->created)) { - dout(" leaving %p in newer realm %llx %p\n", - inode, ci->i_snap_realm->ino, + dout(" leaving %p %llx.%llx in newer realm %llx %p\n", + inode, ceph_vinop(inode), ci->i_snap_realm->ino, ci->i_snap_realm); goto skip_inode; } - dout(" will move %p to split realm %llx %p\n", - inode, realm->ino, realm); + dout(" will move %p %llx.%llx to split realm %llx %p\n", + inode, ceph_vinop(inode), realm->ino, realm); ceph_get_snap_realm(mdsc, realm); ceph_change_snap_realm(inode, realm); @@ -1038,7 +1106,7 @@ skip_inode: return; bad: - pr_err("corrupt snap message from mds%d\n", mds); + pr_err("%s corrupt snap message from mds%d\n", __func__, mds); ceph_msg_dump(msg); out: if (locked_rwsem) @@ -1071,7 +1139,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, } spin_unlock(&mdsc->snapid_map_lock); if (exist) { - dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + dout("%s found snapid map %llx -> %x\n", __func__, + exist->snap, exist->dev); return exist; } @@ -1115,11 +1184,13 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, if (exist) { free_anon_bdev(sm->dev); kfree(sm); - dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + dout("%s found snapid map %llx -> %x\n", __func__, + exist->snap, exist->dev); return exist; } - dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); + dout("%s create snapid map %llx -> %x\n", __func__, + sm->snap, sm->dev); return sm; } diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 573bb9556fb5..e36e8948e728 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; + case CEPH_MDS_OP_GETVXATTR: return "getvxattr"; case 
CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; case CEPH_MDS_OP_RMXATTR: return "rmxattr"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4a3b77d049c7..e6987d295079 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -865,6 +865,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) */ struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_cap_snap_cachep; struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; @@ -893,6 +894,9 @@ static int __init init_caches(void) ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD); if (!ceph_cap_cachep) goto bad_cap; + ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD); + if (!ceph_cap_snap_cachep) + goto bad_cap_snap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); if (!ceph_cap_flush_cachep) @@ -932,6 +936,8 @@ bad_file: bad_dentry: kmem_cache_destroy(ceph_cap_flush_cachep); bad_cap_flush: + kmem_cache_destroy(ceph_cap_snap_cachep); +bad_cap_snap: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); @@ -948,6 +954,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_cap_snap_cachep); kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0bd97aea2319..20ceab74e871 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -17,13 +17,11 @@ #include <linux/posix_acl.h> #include <linux/refcount.h> #include <linux/security.h> +#include <linux/netfs.h> +#include <linux/fscache.h> #include <linux/ceph/libceph.h> -#ifdef CONFIG_CEPH_FSCACHE -#include <linux/fscache.h> -#endif - /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ @@ -231,7 +229,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) if (refcount_dec_and_test(&capsnap->nref)) { if (capsnap->xattr_blob) ceph_buffer_put(capsnap->xattr_blob); - kfree(capsnap); + kmem_cache_free(ceph_cap_snap_cachep, capsnap); } } @@ -318,6 +316,11 @@ struct ceph_inode_xattrs_info { * Ceph inode. 
*/ struct ceph_inode_info { + struct { + /* These must be contiguous */ + struct inode vfs_inode; + struct netfs_i_context netfs_ctx; /* Netfslib context */ + }; struct ceph_vino i_vino; /* ceph ino + snap */ spinlock_t i_ceph_lock; @@ -428,11 +431,6 @@ struct ceph_inode_info { struct work_struct i_work; unsigned long i_work_mask; - -#ifdef CONFIG_CEPH_FSCACHE - struct fscache_cookie *fscache; -#endif - struct inode vfs_inode; /* at end */ }; static inline struct ceph_inode_info * @@ -884,6 +882,8 @@ struct ceph_snap_realm { struct list_head dirty_item; /* if realm needs new context */ + struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */ + /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; @@ -939,7 +939,7 @@ extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_msg *msg); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); -extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc); extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, u64 snap); @@ -1049,6 +1049,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode) /* xattr.c */ int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); @@ -1213,8 +1214,9 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci, /* addr.c */ extern const struct address_space_operations ceph_aops; +extern const struct netfs_request_ops ceph_netfs_ops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); -extern int ceph_uninline_data(struct file *filp, struct page *locked_page); +extern int ceph_uninline_data(struct file *file); extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index fcf7dfdecf96..afec84088471 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -923,10 +923,13 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_xattr *xattr; - struct ceph_vxattr *vxattr = NULL; + struct ceph_vxattr *vxattr; int req_mask; ssize_t err; + if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto handle_non_vxattrs; + /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr) { @@ -945,8 +948,14 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, err = -ERANGE; } return err; + } else { + err = ceph_do_getvxattr(inode, name, value, size); + /* this would happen with a new client and old server combo */ + if (err == -EOPNOTSUPP) + err = -ENODATA; + return err; } - +handle_non_vxattrs: req_mask = __get_request_mask(inode); spin_lock(&ci->i_ceph_lock); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index ea00e1a91250..9d334816eac0 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -94,7 +94,7 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) 
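The __ceph_getxattr() change above forwards ceph.* names that the client's local vxattr table does not recognise to the new GETVXATTR MDS op via ceph_do_getvxattr(), mapping -EOPNOTSUPP back to -ENODATA for servers that predate the op. ceph_do_getvxattr() follows the usual xattr size-probe convention (size == 0 returns the value length, a short buffer returns -ERANGE); the wrapper below is hypothetical, purely to illustrate the calling pattern, and assumes GFP_KERNEL is acceptable in the caller's context:

/*
 * Hypothetical helper, for illustration only: fetch a virtual xattr of
 * unknown size using the probe-then-read convention of ceph_do_getvxattr().
 * Assumes the usual ceph xattr.c includes.
 */
static ssize_t get_vxattr_alloc(struct inode *inode, const char *name,
                                char **out)
{
        ssize_t len;
        char *buf;

        len = ceph_do_getvxattr(inode, name, NULL, 0);  /* probe the length */
        if (len < 0)
                return len;

        buf = kmalloc(len, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        len = ceph_do_getvxattr(inode, name, buf, len);
        if (len < 0) {          /* includes -ERANGE if the value grew */
                kfree(buf);
                return len;
        }
        *out = buf;
        return len;
}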
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), le32_to_cpu(tcon->fsAttrInfo.Attributes), le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), - tcon->tidStatus); + tcon->status); if (dev_type == FILE_DEVICE_DISK) seq_puts(m, " type: DISK "); else if (dev_type == FILE_DEVICE_CD_ROM) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d1211ad4e85b..a47fa44b6d52 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -699,14 +699,14 @@ static void cifs_umount_begin(struct super_block *sb) tcon = cifs_sb_master_tcon(cifs_sb); spin_lock(&cifs_tcp_ses_lock); - if ((tcon->tc_count > 1) || (tcon->tidStatus == CifsExiting)) { + if ((tcon->tc_count > 1) || (tcon->status == TID_EXITING)) { /* we have other mounts to same share or we have already tried to force umount this and woken up all waiting network requests, nothing to do */ spin_unlock(&cifs_tcp_ses_lock); return; } else if (tcon->tc_count == 1) - tcon->tidStatus = CifsExiting; + tcon->status = TID_EXITING; spin_unlock(&cifs_tcp_ses_lock); /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 48b343d03430..8de977c359b1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -16,6 +16,7 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/utsname.h> +#include <linux/netfs.h> #include "cifs_fs_sb.h" #include "cifsacl.h" #include <crypto/internal/hash.h> @@ -115,10 +116,18 @@ enum statusEnum { CifsInNegotiate, CifsNeedSessSetup, CifsInSessSetup, - CifsNeedTcon, - CifsInTcon, - CifsNeedFilesInvalidate, - CifsInFilesInvalidate +}; + +/* associated with each tree connection to the server */ +enum tid_status_enum { + TID_NEW = 0, + TID_GOOD, + TID_EXITING, + TID_NEED_RECON, + TID_NEED_TCON, + TID_IN_TCON, + TID_NEED_FILES_INVALIDATE, /* currently unused */ + TID_IN_FILES_INVALIDATE }; enum securityEnum { @@ -852,13 +861,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb) #define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) #define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) -/* - * The default wsize is 1M. find_get_pages seems to return a maximum of 256 - * pages in a single call. With PAGE_SIZE == 4k, this means we can fill - * a single wsize request with a single call. - */ #define CIFS_DEFAULT_IOSIZE (1024 * 1024) -#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) /* * Windows only supports a max of 60kb reads and 65535 byte writes. 
Default to @@ -1038,7 +1041,7 @@ struct cifs_tcon { char *password; /* for share-level security */ __u32 tid; /* The 4 byte tree id */ __u16 Flags; /* optional support bits */ - enum statusEnum tidStatus; + enum tid_status_enum status; atomic_t num_smbs_sent; union { struct { @@ -1402,6 +1405,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); */ struct cifsInodeInfo { + struct { + /* These must be contiguous */ + struct inode vfs_inode; /* the VFS's inode record */ + struct netfs_i_context netfs_ctx; /* Netfslib context */ + }; bool can_cache_brlcks; struct list_head llist; /* locks helb by this inode */ /* @@ -1432,10 +1440,6 @@ struct cifsInodeInfo { u64 uniqueid; /* server inode number */ u64 createtime; /* creation time on server */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */ -#ifdef CONFIG_CIFS_FSCACHE - struct fscache_cookie *fscache; -#endif - struct inode vfs_inode; struct list_head deferred_closes; /* list of deferred closes */ spinlock_t deferred_lock; /* protection on deferred list */ bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 68b9a436af4b..aeba371c4c70 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -123,18 +123,6 @@ */ #define CIFS_SESS_KEY_SIZE (16) -/* - * Size of the smb3 signing key - */ -#define SMB3_SIGN_KEY_SIZE (16) - -/* - * Size of the smb3 encryption/decryption key storage. - * This size is big enough to store any cipher key types. - */ -#define SMB3_ENC_DEC_KEY_SIZE (32) - -#define CIFS_CLIENT_CHALLENGE_SIZE (8) #define CIFS_SERVER_CHALLENGE_SIZE (8) #define CIFS_HMAC_MD5_HASH_SIZE (16) #define CIFS_CPHTXT_SIZE (16) @@ -1658,7 +1646,7 @@ struct smb_t2_rsp { #define SMB_FIND_FILE_ID_FULL_DIR_INFO 0x105 #define SMB_FIND_FILE_ID_BOTH_DIR_INFO 0x106 #define SMB_FIND_FILE_UNIX 0x202 -#define SMB_FIND_FILE_POSIX_INFO 0x064 +/* #define SMB_FIND_FILE_POSIX_INFO 0x064 */ typedef struct smb_com_transaction2_qpi_req { struct smb_hdr hdr; /* wct = 14+ */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 071e2f21a7db..47e927c4ff8d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -75,12 +75,11 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if (tcon->ses->status != CifsGood || - tcon->tidStatus != CifsNeedReconnect) { + if ((tcon->ses->status != CifsGood) || (tcon->status != TID_NEED_RECON)) { spin_unlock(&cifs_tcp_ses_lock); return; } - tcon->tidStatus = CifsInFilesInvalidate; + tcon->status = TID_IN_FILES_INVALIDATE; spin_unlock(&cifs_tcp_ses_lock); /* list all files open on tree connection and mark them invalid */ @@ -100,8 +99,8 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) mutex_unlock(&tcon->crfid.fid_mutex); spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsInFilesInvalidate) - tcon->tidStatus = CifsNeedTcon; + if (tcon->status == TID_IN_FILES_INVALIDATE) + tcon->status = TID_NEED_TCON; spin_unlock(&cifs_tcp_ses_lock); /* @@ -136,7 +135,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * have tcon) are allowed as we start force umount */ spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsExiting) { + if (tcon->status == TID_EXITING) { if (smb_command != SMB_COM_WRITE_ANDX && smb_command != SMB_COM_OPEN_ANDX && smb_command != SMB_COM_TREE_DISCONNECT) { @@ -597,7 +596,7 @@ CIFSSMBNegotiate(const unsigned int xid, set_credits(server, server->maxReq); /* probably no need to store and check maxvcs */ 
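The anonymous struct with the "These must be contiguous" comment (here in cifsInodeInfo, and likewise in ceph_inode_info above) is required because netfslib does not keep a pointer to its per-inode context: it derives the context address from the inode itself, and the fscache cookie now lives in netfs_ctx.cache, which is why cifs_inode_cookie() can simply return netfs_i_cookie(). Roughly, the helpers behave like this simplified sketch (config guards omitted; the layout assumption is the point):

/*
 * Simplified sketch of the netfs helpers this layout exists for; the
 * in-tree versions live in <linux/netfs.h> and carry config guards.
 */
static inline struct netfs_i_context *netfs_i_context(struct inode *inode)
{
        /* The context is assumed to sit immediately after the VFS inode. */
        return (struct netfs_i_context *)(inode + 1);
}

static inline struct fscache_cookie *netfs_i_cookie(struct inode *inode)
{
        return netfs_i_context(inode)->cache;   /* NULL until a cookie is set */
}

That is also why the old per-filesystem fscache pointer fields can be dropped: anything holding the inode can reach the cookie without a filesystem-specific accessor.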
server->maxBuf = le32_to_cpu(pSMBr->MaxBufferSize); - /* set up max_read for readpages check */ + /* set up max_read for readahead check */ server->max_read = server->maxBuf; server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9964c3634322..ee3b7c15e884 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -245,7 +245,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { tcon->need_reconnect = true; - tcon->tidStatus = CifsNeedReconnect; + tcon->status = TID_NEED_RECON; } if (ses->tcon_ipc) ses->tcon_ipc->need_reconnect = true; @@ -2207,7 +2207,7 @@ get_ses_fail: static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { - if (tcon->tidStatus == CifsExiting) + if (tcon->status == TID_EXITING) return 0; if (strncmp(tcon->treeName, ctx->UNC, MAX_TREE_SIZE)) return 0; @@ -3513,6 +3513,9 @@ static int connect_dfs_target(struct mount_ctx *mnt_ctx, const char *full_path, struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; char *oldmnt = cifs_sb->ctx->mount_options; + cifs_dbg(FYI, "%s: full_path=%s ref_path=%s target=%s\n", __func__, full_path, ref_path, + dfs_cache_get_tgt_name(tit)); + rc = dfs_cache_get_tgt_referral(ref_path, tit, &ref); if (rc) goto out; @@ -3611,13 +3614,18 @@ static int __follow_dfs_link(struct mount_ctx *mnt_ctx) if (rc) goto out; - /* Try all dfs link targets */ + /* Try all dfs link targets. If an I/O fails from currently connected DFS target with an + * error other than STATUS_PATH_NOT_COVERED (-EREMOTE), then retry it from other targets as + * specified in MS-DFSC "3.1.5.2 I/O Operation to Target Fails with an Error Other Than + * STATUS_PATH_NOT_COVERED." 
+ */ for (rc = -ENOENT, tit = dfs_cache_get_tgt_iterator(&tl); tit; tit = dfs_cache_get_next_tgt(&tl, tit)) { rc = connect_dfs_target(mnt_ctx, full_path, mnt_ctx->leaf_fullpath + 1, tit); if (!rc) { rc = is_path_remote(mnt_ctx); - break; + if (!rc || rc == -EREMOTE) + break; } } @@ -3691,7 +3699,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; rc = is_path_remote(&mnt_ctx); - if (rc == -EREMOTE) + if (rc) rc = follow_dfs_link(&mnt_ctx); if (rc) goto error; @@ -4478,12 +4486,12 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); if (tcon->ses->status != CifsGood || - (tcon->tidStatus != CifsNew && - tcon->tidStatus != CifsNeedTcon)) { + (tcon->status != TID_NEW && + tcon->status != TID_NEED_TCON)) { spin_unlock(&cifs_tcp_ses_lock); return 0; } - tcon->tidStatus = CifsInTcon; + tcon->status = TID_IN_TCON; spin_unlock(&cifs_tcp_ses_lock); tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL); @@ -4524,13 +4532,13 @@ out: if (rc) { spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsInTcon) - tcon->tidStatus = CifsNeedTcon; + if (tcon->status == TID_IN_TCON) + tcon->status = TID_NEED_TCON; spin_unlock(&cifs_tcp_ses_lock); } else { spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsInTcon) - tcon->tidStatus = CifsGood; + if (tcon->status == TID_IN_TCON) + tcon->status = TID_GOOD; spin_unlock(&cifs_tcp_ses_lock); tcon->need_reconnect = false; } @@ -4546,24 +4554,24 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); if (tcon->ses->status != CifsGood || - (tcon->tidStatus != CifsNew && - tcon->tidStatus != CifsNeedTcon)) { + (tcon->status != TID_NEW && + tcon->status != TID_NEED_TCON)) { spin_unlock(&cifs_tcp_ses_lock); return 0; } - tcon->tidStatus = CifsInTcon; + tcon->status = TID_IN_TCON; spin_unlock(&cifs_tcp_ses_lock); rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); if (rc) { spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsInTcon) - tcon->tidStatus = CifsNeedTcon; + if (tcon->status == TID_IN_TCON) + tcon->status = TID_NEED_TCON; spin_unlock(&cifs_tcp_ses_lock); } else { spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsInTcon) - tcon->tidStatus = CifsGood; + if (tcon->status == TID_IN_TCON) + tcon->status = TID_GOOD; spin_unlock(&cifs_tcp_ses_lock); tcon->need_reconnect = false; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 60f43bff7ccb..d511a78383c3 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4210,13 +4210,19 @@ cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; + /* Wait for the page to be written to the cache before we allow it to + * be modified. We then assume the entire page will need writing back. 
+ */ #ifdef CONFIG_CIFS_FSCACHE if (PageFsCache(page) && wait_on_page_fscache_killable(page) < 0) return VM_FAULT_RETRY; #endif - lock_page(page); + wait_on_page_writeback(page); + + if (lock_page_killable(page) < 0) + return VM_FAULT_RETRY; return VM_FAULT_LOCKED; } diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 33af72e0ac0c..a638b29e9062 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -103,7 +103,7 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); - cifsi->fscache = + cifsi->netfs_ctx.cache = fscache_acquire_cookie(tcon->fscache, 0, &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), @@ -126,22 +126,15 @@ void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) void cifs_fscache_release_inode_cookie(struct inode *inode) { struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct fscache_cookie *cookie = cifs_inode_cookie(inode); - if (cifsi->fscache) { - cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - fscache_relinquish_cookie(cifsi->fscache, false); - cifsi->fscache = NULL; + if (cookie) { + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie); + fscache_relinquish_cookie(cookie, false); + cifsi->netfs_ctx.cache = NULL; } } -static inline void fscache_end_operation(struct netfs_cache_resources *cres) -{ - const struct netfs_cache_ops *ops = fscache_operation_valid(cres); - - if (ops) - ops->end_operation(cres); -} - /* * Fallback page reading interface. */ diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 55129908e2c1..52355c0912ae 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -61,7 +61,7 @@ void cifs_fscache_fill_coherency(struct inode *inode, static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { - return CIFS_I(inode)->fscache; + return netfs_i_cookie(inode); } static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 60d853c92f6a..2f9e7d2f81b6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -49,7 +49,7 @@ static void cifs_set_ops(struct inode *inode) inode->i_fop = &cifs_file_ops; } - /* check if server can support readpages */ + /* check if server can support readahead */ if (cifs_sb_master_tcon(cifs_sb)->ses->server->max_read < PAGE_SIZE + MAX_CIFS_HDR_SIZE) inode->i_data.a_ops = &cifs_addr_ops_smallbuf; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 56598f7dbe00..afaf59c22193 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -116,7 +116,7 @@ tconInfoAlloc(void) } atomic_inc(&tconInfoAllocCount); - ret_buf->tidStatus = CifsNew; + ret_buf->status = TID_NEW; ++ret_buf->tc_count; INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 298458404252..55758b9ec877 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -107,7 +107,7 @@ struct negotiate_message { SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ struct ntlmssp_version Version; /* SECURITY_BUFFER */ - char DomainString[0]; + char DomainString[]; /* followed by WorkstationString */ } __packed; diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 4125fd113cfb..82e916ad167c 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -41,15 +41,4 @@ #define END_OF_CHAIN 4 #define RELATED_REQUEST 8 -#define SMB2_SIGNATURE_SIZE (16) -#define SMB2_NTLMV2_SESSKEY_SIZE (16) -#define SMB2_HMACSHA256_SIZE (32) -#define SMB2_CMACAES_SIZE (16) -#define SMB3_SIGNKEY_SIZE (16) -#define SMB3_GCM128_CRYPTKEY_SIZE 
(16) -#define SMB3_GCM256_CRYPTKEY_SIZE (32) - -/* Maximum buffer size value we can send with 1 credit */ -#define SMB2_MAX_BUFFER_SIZE 65536 - #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index b25623e3fe3d..c653beb735b8 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -203,7 +203,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) { if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 || - pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) { + pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) { /* error packets have 9 byte structure size */ cifs_dbg(VFS, "Invalid response size %u for command %d\n", le16_to_cpu(pdu->StructureSize2), command); @@ -303,7 +303,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr) /* error responses do not have data area */ if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED && (((struct smb2_err_rsp *)shdr)->StructureSize) == - SMB2_ERROR_STRUCTURE_SIZE2) + SMB2_ERROR_STRUCTURE_SIZE2_LE) return NULL; /* @@ -478,11 +478,11 @@ smb2_get_lease_state(struct cifsInodeInfo *cinode) __le32 lease = 0; if (CIFS_CACHE_WRITE(cinode)) - lease |= SMB2_LEASE_WRITE_CACHING; + lease |= SMB2_LEASE_WRITE_CACHING_LE; if (CIFS_CACHE_HANDLE(cinode)) - lease |= SMB2_LEASE_HANDLE_CACHING; + lease |= SMB2_LEASE_HANDLE_CACHING_LE; if (CIFS_CACHE_READ(cinode)) - lease |= SMB2_LEASE_READ_CACHING; + lease |= SMB2_LEASE_READ_CACHING_LE; return lease; } @@ -832,8 +832,8 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve rc = __smb2_handle_cancelled_cmd(tcon, le16_to_cpu(hdr->Command), le64_to_cpu(hdr->MessageId), - le64_to_cpu(rsp->PersistentFileId), - le64_to_cpu(rsp->VolatileFileId)); + rsp->PersistentFileId, + rsp->VolatileFileId); if (rc) cifs_put_tcon(tcon); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 891b11576e55..db23f5b404ba 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -897,8 +897,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, atomic_inc(&tcon->num_remote_opens); o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; - oparms.fid->persistent_fid = le64_to_cpu(o_rsp->PersistentFileId); - oparms.fid->volatile_fid = le64_to_cpu(o_rsp->VolatileFileId); + oparms.fid->persistent_fid = o_rsp->PersistentFileId; + oparms.fid->volatile_fid = o_rsp->VolatileFileId; #ifdef CONFIG_CIFS_DEBUG2 oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ @@ -1192,17 +1192,12 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb) { int rc; - __le16 *utf16_path; struct kvec rsp_iov = {NULL, 0}; int buftype = CIFS_NO_BUFFER; struct smb2_query_info_rsp *rsp; struct smb2_file_full_ea_info *info = NULL; - utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); - if (!utf16_path) - return -ENOMEM; - - rc = smb2_query_info_compound(xid, tcon, utf16_path, + rc = smb2_query_info_compound(xid, tcon, path, FILE_READ_EA, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, @@ -1235,7 +1230,6 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, le32_to_cpu(rsp->OutputBufferLength), ea_name); qeas_exit: - kfree(utf16_path); free_rsp_buf(buftype, rsp_iov.iov_base); return rc; } @@ -1295,7 +1289,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, * the new EA. If not we should not add it since we * would not be able to even read the EAs back. 
*/ - rc = smb2_query_info_compound(xid, tcon, utf16_path, + rc = smb2_query_info_compound(xid, tcon, path, FILE_READ_EA, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, @@ -1643,6 +1637,7 @@ smb2_ioctl_query_info(const unsigned int xid, unsigned int size[2]; void *data[2]; int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR; + void (*free_req1_func)(struct smb_rqst *r); vars = kzalloc(sizeof(*vars), GFP_ATOMIC); if (vars == NULL) @@ -1652,27 +1647,29 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) - goto e_fault; - + if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) { + rc = -EFAULT; + goto free_vars; + } if (qi.output_buffer_length > 1024) { - kfree(vars); - return -EINVAL; + rc = -EINVAL; + goto free_vars; } if (!ses || !server) { - kfree(vars); - return -EIO; + rc = -EIO; + goto free_vars; } if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - buffer = memdup_user(arg + sizeof(struct smb_query_info), - qi.output_buffer_length); - if (IS_ERR(buffer)) { - kfree(vars); - return PTR_ERR(buffer); + if (qi.output_buffer_length) { + buffer = memdup_user(arg + sizeof(struct smb_query_info), qi.output_buffer_length); + if (IS_ERR(buffer)) { + rc = PTR_ERR(buffer); + goto free_vars; + } } /* Open */ @@ -1710,45 +1707,45 @@ smb2_ioctl_query_info(const unsigned int xid, rc = SMB2_open_init(tcon, server, &rqst[0], &oplock, &oparms, path); if (rc) - goto iqinf_exit; + goto free_output_buffer; smb2_set_next_command(tcon, &rqst[0]); /* Query */ if (qi.flags & PASSTHRU_FSCTL) { /* Can eventually relax perm check since server enforces too */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) { rc = -EPERM; - else { - rqst[1].rq_iov = &vars->io_iov[0]; - rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; - - rc = SMB2_ioctl_init(tcon, server, - &rqst[1], - COMPOUND_FID, COMPOUND_FID, - qi.info_type, true, buffer, - qi.output_buffer_length, - CIFSMaxBufSize - - MAX_SMB2_CREATE_RESPONSE_SIZE - - MAX_SMB2_CLOSE_RESPONSE_SIZE); + goto free_open_req; } + rqst[1].rq_iov = &vars->io_iov[0]; + rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID, + qi.info_type, true, buffer, qi.output_buffer_length, + CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); + free_req1_func = SMB2_ioctl_free; } else if (qi.flags == PASSTHRU_SET_INFO) { /* Can eventually relax perm check since server enforces too */ - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) { rc = -EPERM; - else { - rqst[1].rq_iov = &vars->si_iov[0]; - rqst[1].rq_nvec = 1; - - size[0] = 8; - data[0] = buffer; - - rc = SMB2_set_info_init(tcon, server, - &rqst[1], - COMPOUND_FID, COMPOUND_FID, - current->tgid, - FILE_END_OF_FILE_INFORMATION, - SMB2_O_INFO_FILE, 0, data, size); + goto free_open_req; + } + if (qi.output_buffer_length < 8) { + rc = -EINVAL; + goto free_open_req; } + rqst[1].rq_iov = &vars->si_iov[0]; + rqst[1].rq_nvec = 1; + + /* MS-FSCC 2.4.13 FileEndOfFileInformation */ + size[0] = 8; + data[0] = buffer; + + rc = SMB2_set_info_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID, + current->tgid, FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + free_req1_func = SMB2_set_info_free; } else if (qi.flags == PASSTHRU_QUERY_INFO) { rqst[1].rq_iov = &vars->qi_iov[0]; rqst[1].rq_nvec = 1; @@ -1759,6 +1756,7 @@ smb2_ioctl_query_info(const unsigned int xid, qi.info_type, 
qi.additional_information, qi.input_buffer_length, qi.output_buffer_length, buffer); + free_req1_func = SMB2_query_info_free; } else { /* unknown flags */ cifs_tcon_dbg(VFS, "Invalid passthru query flags: 0x%x\n", qi.flags); @@ -1766,7 +1764,7 @@ smb2_ioctl_query_info(const unsigned int xid, } if (rc) - goto iqinf_exit; + goto free_open_req; smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); @@ -1777,14 +1775,14 @@ smb2_ioctl_query_info(const unsigned int xid, rc = SMB2_close_init(tcon, server, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); if (rc) - goto iqinf_exit; + goto free_req_1; smb2_set_related(&rqst[2]); rc = compound_send_recv(xid, ses, server, flags, 3, rqst, resp_buftype, rsp_iov); if (rc) - goto iqinf_exit; + goto out; /* No need to bump num_remote_opens since handle immediately closed */ if (qi.flags & PASSTHRU_FSCTL) { @@ -1794,18 +1792,22 @@ smb2_ioctl_query_info(const unsigned int xid, qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount); if (qi.input_buffer_length > 0 && le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length - > rsp_iov[1].iov_len) - goto e_fault; + > rsp_iov[1].iov_len) { + rc = -EFAULT; + goto out; + } if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) - goto e_fault; + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto out; + } if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info), (const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset), qi.input_buffer_length)) - goto e_fault; + rc = -EFAULT; } else { pqi = (struct smb_query_info __user *)arg; qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; @@ -1813,28 +1815,30 @@ smb2_ioctl_query_info(const unsigned int xid, qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength); if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) - goto e_fault; + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto out; + } if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) - goto e_fault; + rc = -EFAULT; } - iqinf_exit: - cifs_small_buf_release(rqst[0].rq_iov[0].iov_base); - cifs_small_buf_release(rqst[1].rq_iov[0].iov_base); - cifs_small_buf_release(rqst[2].rq_iov[0].iov_base); +out: free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); - kfree(vars); + SMB2_close_free(&rqst[2]); +free_req_1: + free_req1_func(&rqst[1]); +free_open_req: + SMB2_open_free(&rqst[0]); +free_output_buffer: kfree(buffer); +free_vars: + kfree(vars); return rc; - -e_fault: - rc = -EFAULT; - goto iqinf_exit; } static ssize_t @@ -2407,8 +2411,8 @@ again: cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc); goto qdf_free; } - fid->persistent_fid = le64_to_cpu(op_rsp->PersistentFileId); - fid->volatile_fid = le64_to_cpu(op_rsp->VolatileFileId); + fid->persistent_fid = op_rsp->PersistentFileId; + fid->volatile_fid = op_rsp->VolatileFileId; /* Anything else than ENODATA means a genuine error */ if (rc && rc != -ENODATA) { @@ -2646,7 +2650,7 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) */ int smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, - __le16 *utf16_path, u32 desired_access, + const char *path, u32 desired_access, u32 class, u32 type, u32 output_len, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb) @@ -2664,6 +2668,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, 
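The smb2_ioctl_query_info() rework above replaces the single iqinf_exit/e_fault exit with a ladder of labels (free_vars, free_output_buffer, free_open_req, free_req_1, out) plus a saved free_req1_func pointer, so each allocation is released exactly once on every path. A compact userspace sketch of that cleanup-ladder idiom, with plain malloc()s standing in for the request buffers:

#include <stdlib.h>

static int do_work(void)
{
	char *vars, *buffer, *req;
	int rc = -1;

	vars = malloc(64);
	if (!vars)
		return -1;

	buffer = malloc(128);
	if (!buffer)
		goto free_vars;

	req = malloc(256);
	if (!req)
		goto free_buffer;

	rc = 0;			/* the actual work would happen here */

	free(req);
free_buffer:
	free(buffer);
free_vars:
	free(vars);
	return rc;		/* every path unwinds in reverse order */
}

int main(void)
{
	return do_work();
}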
struct cifs_open_parms oparms; struct cifs_fid fid; int rc; + __le16 *utf16_path; + struct cached_fid *cfid = NULL; + + if (!path) + path = ""; + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -2672,6 +2684,8 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; memset(rsp_iov, 0, sizeof(rsp_iov)); + rc = open_cached_dir(xid, tcon, path, cifs_sb, &cfid); + memset(&open_iov, 0, sizeof(open_iov)); rqst[0].rq_iov = open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; @@ -2693,15 +2707,29 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, rqst[1].rq_iov = qi_iov; rqst[1].rq_nvec = 1; - rc = SMB2_query_info_init(tcon, server, - &rqst[1], COMPOUND_FID, COMPOUND_FID, - class, type, 0, - output_len, 0, - NULL); + if (cfid) { + rc = SMB2_query_info_init(tcon, server, + &rqst[1], + cfid->fid->persistent_fid, + cfid->fid->volatile_fid, + class, type, 0, + output_len, 0, + NULL); + } else { + rc = SMB2_query_info_init(tcon, server, + &rqst[1], + COMPOUND_FID, + COMPOUND_FID, + class, type, 0, + output_len, 0, + NULL); + } if (rc) goto qic_exit; - smb2_set_next_command(tcon, &rqst[1]); - smb2_set_related(&rqst[1]); + if (!cfid) { + smb2_set_next_command(tcon, &rqst[1]); + smb2_set_related(&rqst[1]); + } memset(&close_iov, 0, sizeof(close_iov)); rqst[2].rq_iov = close_iov; @@ -2713,9 +2741,15 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, goto qic_exit; smb2_set_related(&rqst[2]); - rc = compound_send_recv(xid, ses, server, - flags, 3, rqst, - resp_buftype, rsp_iov); + if (cfid) { + rc = compound_send_recv(xid, ses, server, + flags, 1, &rqst[1], + &resp_buftype[1], &rsp_iov[1]); + } else { + rc = compound_send_recv(xid, ses, server, + flags, 3, rqst, + resp_buftype, rsp_iov); + } if (rc) { free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); if (rc == -EREMCHG) { @@ -2729,11 +2763,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, *buftype = resp_buftype[1]; qic_exit: + kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_query_info_free(&rqst[1]); SMB2_close_free(&rqst[2]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + if (cfid) + close_cached_dir(cfid); return rc; } @@ -2743,13 +2780,12 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, { struct smb2_query_info_rsp *rsp; struct smb2_fs_full_size_info *info = NULL; - __le16 utf16_path = 0; /* Null - open root of share */ struct kvec rsp_iov = {NULL, 0}; int buftype = CIFS_NO_BUFFER; int rc; - rc = smb2_query_info_compound(xid, tcon, &utf16_path, + rc = smb2_query_info_compound(xid, tcon, "", FILE_READ_ATTRIBUTES, FS_FULL_SIZE_INFORMATION, SMB2_O_INFO_FILESYSTEM, @@ -4293,12 +4329,12 @@ static __le32 map_oplock_to_lease(u8 oplock) { if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) - return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING; + return SMB2_LEASE_WRITE_CACHING_LE | SMB2_LEASE_READ_CACHING_LE; else if (oplock == SMB2_OPLOCK_LEVEL_II) - return SMB2_LEASE_READ_CACHING; + return SMB2_LEASE_READ_CACHING_LE; else if (oplock == SMB2_OPLOCK_LEVEL_BATCH) - return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING | - SMB2_LEASE_WRITE_CACHING; + return SMB2_LEASE_HANDLE_CACHING_LE | SMB2_LEASE_READ_CACHING_LE | + SMB2_LEASE_WRITE_CACHING_LE; return 0; } @@ -4360,7 +4396,7 @@ smb2_parse_lease_buf(void *buf, 
unsigned int *epoch, char *lease_key) struct create_lease *lc = (struct create_lease *)buf; *epoch = 0; /* not used */ - if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE) return SMB2_OPLOCK_LEVEL_NOCHANGE; return le32_to_cpu(lc->lcontext.LeaseState); } @@ -4371,7 +4407,7 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) struct create_lease_v2 *lc = (struct create_lease_v2 *)buf; *epoch = le16_to_cpu(lc->lcontext.Epoch); - if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE) return SMB2_OPLOCK_LEVEL_NOCHANGE; if (lease_key) memcpy(lease_key, &lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); @@ -5814,8 +5850,8 @@ struct smb_version_values smb20_values = { .protocol_id = SMB20_PROT_ID, .req_capabilities = 0, /* MBZ */ .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5835,8 +5871,8 @@ struct smb_version_values smb21_values = { .protocol_id = SMB21_PROT_ID, .req_capabilities = 0, /* MBZ on negotiate req until SMB3 dialect */ .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5856,8 +5892,8 @@ struct smb_version_values smb3any_values = { .protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING, .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5877,8 +5913,8 @@ struct smb_version_values smbdefault_values = { .protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING, .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5898,8 +5934,8 @@ struct smb_version_values smb30_values = { .protocol_id = SMB30_PROT_ID, .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING, .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + 
.shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5919,8 +5955,8 @@ struct smb_version_values smb302_values = { .protocol_id = SMB302_PROT_ID, .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING, .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, @@ -5940,8 +5976,8 @@ struct smb_version_values smb311_values = { .protocol_id = SMB311_PROT_ID, .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING, .large_lock_type = 0, - .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, - .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7e7909b1ae11..1b7ad0c09566 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -163,7 +163,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, return 0; spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus == CifsExiting) { + if (tcon->status == TID_EXITING) { /* * only tree disconnect, open, and write, * (and ulogoff which does not have tcon) @@ -2734,13 +2734,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, goto err_free_req; } - trace_smb3_posix_mkdir_done(xid, le64_to_cpu(rsp->PersistentFileId), - tcon->tid, - ses->Suid, CREATE_NOT_FILE, - FILE_WRITE_ATTRIBUTES); + trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid, ses->Suid, + CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); - SMB2_close(xid, tcon, le64_to_cpu(rsp->PersistentFileId), - le64_to_cpu(rsp->VolatileFileId)); + SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId); /* Eventually save off posix specific response info and timestaps */ @@ -3009,14 +3006,12 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } else if (rsp == NULL) /* unlikely to happen, but safer to check */ goto creat_exit; else - trace_smb3_open_done(xid, le64_to_cpu(rsp->PersistentFileId), - tcon->tid, - ses->Suid, oparms->create_options, - oparms->desired_access); + trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid, ses->Suid, + oparms->create_options, oparms->desired_access); atomic_inc(&tcon->num_remote_opens); - oparms->fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId); - oparms->fid->volatile_fid = le64_to_cpu(rsp->VolatileFileId); + oparms->fid->persistent_fid = rsp->PersistentFileId; + oparms->fid->volatile_fid = rsp->VolatileFileId; oparms->fid->access = oparms->desired_access; #ifdef CONFIG_CIFS_DEBUG2 oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId); @@ -3313,8 +3308,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (rc) return rc; - req->PersistentFileId = cpu_to_le64(persistent_fid); - req->VolatileFileId = cpu_to_le64(volatile_fid); + req->PersistentFileId = persistent_fid; + 
req->VolatileFileId = volatile_fid; if (query_attrs) req->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB; else @@ -3677,8 +3672,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, if (rc) return rc; - req->PersistentFileId = cpu_to_le64(persistent_fid); - req->VolatileFileId = cpu_to_le64(volatile_fid); + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; /* See note 354 of MS-SMB2, 64K max */ req->OutputBufferLength = cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE); @@ -3858,12 +3853,14 @@ void smb2_reconnect_server(struct work_struct *work) tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); if (!tcon) { resched = true; - list_del_init(&ses->rlist); - cifs_put_smb_ses(ses); + list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { + list_del_init(&ses->rlist); + cifs_put_smb_ses(ses); + } goto done; } - tcon->tidStatus = CifsGood; + tcon->status = TID_GOOD; tcon->retry = false; tcon->need_reconnect = false; @@ -3951,8 +3948,8 @@ SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst, if (rc) return rc; - req->PersistentFileId = cpu_to_le64(persistent_fid); - req->VolatileFileId = cpu_to_le64(volatile_fid); + req->PersistentFileId = persistent_fid; + req->VolatileFileId = volatile_fid; iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; @@ -4033,8 +4030,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, shdr = &req->hdr; shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid); - req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid); + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; req->ReadChannelInfoOffset = 0; /* reserved */ req->ReadChannelInfoLength = 0; /* reserved */ req->Channel = 0; /* reserved */ @@ -4094,8 +4091,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, */ shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF); - req->PersistentFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); - req->VolatileFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); + req->PersistentFileId = (u64)-1; + req->VolatileFileId = (u64)-1; } } if (remaining_bytes > io_parms->length) @@ -4307,21 +4304,19 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); trace_smb3_read_err(xid, - le64_to_cpu(req->PersistentFileId), + req->PersistentFileId, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length, rc); } else - trace_smb3_read_done(xid, - le64_to_cpu(req->PersistentFileId), - io_parms->tcon->tid, ses->Suid, - io_parms->offset, 0); + trace_smb3_read_done(xid, req->PersistentFileId, io_parms->tcon->tid, + ses->Suid, io_parms->offset, 0); free_rsp_buf(resp_buftype, rsp_iov.iov_base); cifs_small_buf_release(req); return rc == -ENODATA ? 
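Throughout these hunks the cpu_to_le64()/le64_to_cpu() wrappers around PersistentFileId and VolatileFileId disappear: the client never interprets those handles, it only echoes them back to the server, so they can be carried in wire byte order end to end (the fields are now plain __u64 marked "opaque endianness"). A userspace sketch of why the double conversion was a no-op, using glibc's htole64()/le64toh(); the sample value is arbitrary:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t wire_fid = 0x1122334455667788ULL;	/* as it arrived */

	/* Old style: convert to host order, then back for the next request. */
	uint64_t host_fid = le64toh(wire_fid);
	uint64_t resent   = htole64(host_fid);

	/* New style: treat the handle as opaque and resend it unchanged. */
	printf("round-trip identical: %s\n",
	       resent == wire_fid ? "yes" : "no");
	return 0;
}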
0 : rc; } else trace_smb3_read_done(xid, - le64_to_cpu(req->PersistentFileId), + req->PersistentFileId, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length); @@ -4463,8 +4458,8 @@ smb2_async_writev(struct cifs_writedata *wdata, shdr = (struct smb2_hdr *)req; shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid); - req->PersistentFileId = cpu_to_le64(wdata->cfile->fid.persistent_fid); - req->VolatileFileId = cpu_to_le64(wdata->cfile->fid.volatile_fid); + req->PersistentFileId = wdata->cfile->fid.persistent_fid; + req->VolatileFileId = wdata->cfile->fid.volatile_fid; req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = 0; @@ -4562,7 +4557,7 @@ smb2_async_writev(struct cifs_writedata *wdata, if (rc) { trace_smb3_write_err(0 /* no xid */, - le64_to_cpu(req->PersistentFileId), + req->PersistentFileId, tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes, rc); kref_put(&wdata->refcount, release); @@ -4615,8 +4610,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid); - req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid); + req->PersistentFileId = io_parms->persistent_fid; + req->VolatileFileId = io_parms->volatile_fid; req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = 0; @@ -4645,7 +4640,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc) { trace_smb3_write_err(xid, - le64_to_cpu(req->PersistentFileId), + req->PersistentFileId, io_parms->tcon->tid, io_parms->tcon->ses->Suid, io_parms->offset, io_parms->length, rc); @@ -4654,7 +4649,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, } else { *nbytes = le32_to_cpu(rsp->DataLength); trace_smb3_write_done(xid, - le64_to_cpu(req->PersistentFileId), + req->PersistentFileId, io_parms->tcon->tid, io_parms->tcon->ses->Suid, io_parms->offset, *nbytes); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 33cfd0a1adf1..d8c4388b190d 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -56,16 +56,6 @@ struct smb2_rdma_crypto_transform { #define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL -#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) - -struct smb2_err_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; - __le16 Reserved; /* MBZ */ - __le32 ByteCount; /* even if zero, at least one byte follows */ - __u8 ErrorData[1]; /* variable length */ -} __packed; - #define SYMLINK_ERROR_TAG 0x4c4d5953 struct smb2_symlink_err_rsp { @@ -139,47 +129,6 @@ struct share_redirect_error_context_rsp { #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 #define SMB2_LEASE_WRITE_CACHING_HE 0x04 -#define SMB2_LEASE_NONE cpu_to_le32(0x00) -#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01) -#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02) -#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04) - -#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x00000002) -#define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET cpu_to_le32(0x00000004) - -#define SMB2_LEASE_KEY_SIZE 16 - -struct lease_context { - u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; - __le32 LeaseState; - __le32 LeaseFlags; - __le64 LeaseDuration; -} __packed; - -struct lease_context_v2 { - u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; - __le32 LeaseState; - __le32 LeaseFlags; - __le64 LeaseDuration; - __le64 ParentLeaseKeyLow; - __le64 ParentLeaseKeyHigh; - __le16 Epoch; - __le16 Reserved; -} __packed; - -struct create_lease { - struct create_context ccontext; - __u8 Name[8]; - struct 
lease_context lcontext; -} __packed; - -struct create_lease_v2 { - struct create_context ccontext; - __u8 Name[8]; - struct lease_context_v2 lcontext; - __u8 Pad[4]; -} __packed; - struct create_durable { struct create_context ccontext; __u8 Name[8]; @@ -192,13 +141,6 @@ struct create_durable { } Data; } __packed; -struct create_posix { - struct create_context ccontext; - __u8 Name[16]; - __le32 Mode; - __u32 Reserved; -} __packed; - /* See MS-SMB2 2.2.13.2.11 */ /* Flags */ #define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002 @@ -287,12 +229,6 @@ struct copychunk_ioctl { __u32 Reserved2; } __packed; -/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */ -struct file_zero_data_information { - __le64 FileOffset; - __le64 BeyondFinalZero; -} __packed; - struct copychunk_ioctl_rsp { __le32 ChunksWritten; __le32 ChunkBytesWritten; @@ -338,11 +274,6 @@ struct fsctl_get_integrity_information_rsp { __le32 ClusterSizeInBytes; } __packed; -struct file_allocated_range_buffer { - __le64 file_offset; - __le64 length; -} __packed; - /* Integrity ChecksumAlgorithm choices for above */ #define CHECKSUM_TYPE_NONE 0x0000 #define CHECKSUM_TYPE_CRC64 0x0002 @@ -351,53 +282,6 @@ struct file_allocated_range_buffer { /* Integrity flags for above */ #define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 -/* Reparse structures - see MS-FSCC 2.1.2 */ - -/* struct fsctl_reparse_info_req is empty, only response structs (see below) */ - -struct reparse_data_buffer { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __u8 DataBuffer[]; /* Variable Length */ -} __packed; - -struct reparse_guid_data_buffer { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __u8 ReparseGuid[16]; - __u8 DataBuffer[]; /* Variable Length */ -} __packed; - -struct reparse_mount_point_data_buffer { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __le16 SubstituteNameOffset; - __le16 SubstituteNameLength; - __le16 PrintNameOffset; - __le16 PrintNameLength; - __u8 PathBuffer[]; /* Variable Length */ -} __packed; - -#define SYMLINK_FLAG_RELATIVE 0x00000001 - -struct reparse_symlink_data_buffer { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __le16 SubstituteNameOffset; - __le16 SubstituteNameLength; - __le16 PrintNameOffset; - __le16 PrintNameLength; - __le32 Flags; - __u8 PathBuffer[]; /* Variable Length */ -} __packed; - -/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */ - - /* See MS-DFSC 2.2.2 */ struct fsctl_get_dfs_referral_req { __le16 MaxReferralLevel; @@ -413,22 +297,6 @@ struct network_resiliency_req { } __packed; /* There is no buffer for the response ie no struct network_resiliency_rsp */ - -struct validate_negotiate_info_req { - __le32 Capabilities; - __u8 Guid[SMB2_CLIENT_GUID_SIZE]; - __le16 SecurityMode; - __le16 DialectCount; - __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */ -} __packed; - -struct validate_negotiate_info_rsp { - __le32 Capabilities; - __u8 Guid[SMB2_CLIENT_GUID_SIZE]; - __le16 SecurityMode; - __le16 Dialect; /* Dialect in use for the connection */ -} __packed; - #define RSS_CAPABLE cpu_to_le32(0x00000001) #define RDMA_CAPABLE cpu_to_le32(0x00000002) @@ -464,14 +332,6 @@ struct compress_ioctl { __le16 CompressionState; /* See cifspdu.h for possible flag values */ } __packed; -struct duplicate_extents_to_file { - __u64 PersistentFileHandle; /* source file handle, opaque endianness */ - __u64 VolatileFileHandle; - __le64 SourceFileOffset; - __le64 TargetFileOffset; - __le64 
ByteCount; /* Bytes to be copied */ -} __packed; - /* * Maximum number of iovs we need for an ioctl request. * [0] : struct smb2_ioctl_req @@ -479,370 +339,11 @@ struct duplicate_extents_to_file { */ #define SMB2_IOCTL_IOV_SIZE 2 -struct smb2_ioctl_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 57 */ - __u16 Reserved; - __le32 CtlCode; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 InputOffset; - __le32 InputCount; - __le32 MaxInputResponse; - __le32 OutputOffset; - __le32 OutputCount; - __le32 MaxOutputResponse; - __le32 Flags; - __u32 Reserved2; - __u8 Buffer[]; -} __packed; - -struct smb2_ioctl_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 57 */ - __u16 Reserved; - __le32 CtlCode; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 InputOffset; - __le32 InputCount; - __le32 OutputOffset; - __le32 OutputCount; - __le32 Flags; - __u32 Reserved2; - /* char * buffer[] */ -} __packed; - -#define SMB2_LOCKFLAG_SHARED_LOCK 0x0001 -#define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002 -#define SMB2_LOCKFLAG_UNLOCK 0x0004 -#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010 - -struct smb2_lock_element { - __le64 Offset; - __le64 Length; - __le32 Flags; - __le32 Reserved; -} __packed; - -struct smb2_lock_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 48 */ - __le16 LockCount; - /* - * The least significant four bits are the index, the other 28 bits are - * the lock sequence number (0 to 64). See MS-SMB2 2.2.26 - */ - __le32 LockSequenceNumber; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - /* Followed by at least one */ - struct smb2_lock_element locks[1]; -} __packed; - -struct smb2_lock_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -struct smb2_echo_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __u16 Reserved; -} __packed; - -struct smb2_echo_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __u16 Reserved; -} __packed; - -/* search (query_directory) Flags field */ -#define SMB2_RESTART_SCANS 0x01 -#define SMB2_RETURN_SINGLE_ENTRY 0x02 -#define SMB2_INDEX_SPECIFIED 0x04 -#define SMB2_REOPEN 0x10 - -#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2 - -/* - * Valid FileInformation classes. 
- * - * Note that these are a subset of the (file) QUERY_INFO levels defined - * later in this file (but since QUERY_DIRECTORY uses equivalent numbers - * we do not redefine them here) - * - * FileDirectoryInfomation 0x01 - * FileFullDirectoryInformation 0x02 - * FileIdFullDirectoryInformation 0x26 - * FileBothDirectoryInformation 0x03 - * FileIdBothDirectoryInformation 0x25 - * FileNamesInformation 0x0C - * FileIdExtdDirectoryInformation 0x3C - */ - -struct smb2_query_directory_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 33 */ - __u8 FileInformationClass; - __u8 Flags; - __le32 FileIndex; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le16 FileNameOffset; - __le16 FileNameLength; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -struct smb2_query_directory_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -/* Possible InfoType values */ -#define SMB2_O_INFO_FILE 0x01 -#define SMB2_O_INFO_FILESYSTEM 0x02 -#define SMB2_O_INFO_SECURITY 0x03 -#define SMB2_O_INFO_QUOTA 0x04 - -/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */ -#define OWNER_SECINFO 0x00000001 -#define GROUP_SECINFO 0x00000002 -#define DACL_SECINFO 0x00000004 -#define SACL_SECINFO 0x00000008 -#define LABEL_SECINFO 0x00000010 -#define ATTRIBUTE_SECINFO 0x00000020 -#define SCOPE_SECINFO 0x00000040 -#define BACKUP_SECINFO 0x00010000 -#define UNPROTECTED_SACL_SECINFO 0x10000000 -#define UNPROTECTED_DACL_SECINFO 0x20000000 -#define PROTECTED_SACL_SECINFO 0x40000000 -#define PROTECTED_DACL_SECINFO 0x80000000 - -/* Flags used for FileFullEAinfo */ -#define SL_RESTART_SCAN 0x00000001 -#define SL_RETURN_SINGLE_ENTRY 0x00000002 -#define SL_INDEX_SPECIFIED 0x00000004 - -struct smb2_query_info_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 41 */ - __u8 InfoType; - __u8 FileInfoClass; - __le32 OutputBufferLength; - __le16 InputBufferOffset; - __u16 Reserved; - __le32 InputBufferLength; - __le32 AdditionalInformation; - __le32 Flags; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __u8 Buffer[1]; -} __packed; - -struct smb2_query_info_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -/* - * Maximum number of iovs we need for a set-info request. 
- * The largest one is rename/hardlink - * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info - * [1] : path - * [2] : compound padding - */ -#define SMB2_SET_INFO_IOV_SIZE 3 - -struct smb2_set_info_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 33 */ - __u8 InfoType; - __u8 FileInfoClass; - __le32 BufferLength; - __le16 BufferOffset; - __u16 Reserved; - __le32 AdditionalInformation; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __u8 Buffer[1]; -} __packed; - -struct smb2_set_info_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 2 */ -} __packed; - -struct smb2_oplock_break { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 24 */ - __u8 OplockLevel; - __u8 Reserved; - __le32 Reserved2; - __u64 PersistentFid; - __u64 VolatileFid; -} __packed; - -#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) - -struct smb2_lease_break { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 44 */ - __le16 Epoch; - __le32 Flags; - __u8 LeaseKey[16]; - __le32 CurrentLeaseState; - __le32 NewLeaseState; - __le32 BreakReason; - __le32 AccessMaskHint; - __le32 ShareMaskHint; -} __packed; - -struct smb2_lease_ack { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 36 */ - __le16 Reserved; - __le32 Flags; - __u8 LeaseKey[16]; - __le32 LeaseState; - __le64 LeaseDuration; -} __packed; - /* - * PDU infolevel structure definitions + * PDU query infolevel structure definitions * BB consider moving to a different header */ -/* File System Information Classes */ -#define FS_VOLUME_INFORMATION 1 /* Query */ -#define FS_LABEL_INFORMATION 2 /* Local only */ -#define FS_SIZE_INFORMATION 3 /* Query */ -#define FS_DEVICE_INFORMATION 4 /* Query */ -#define FS_ATTRIBUTE_INFORMATION 5 /* Query */ -#define FS_CONTROL_INFORMATION 6 /* Query, Set */ -#define FS_FULL_SIZE_INFORMATION 7 /* Query */ -#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ -#define FS_DRIVER_PATH_INFORMATION 9 /* Local only */ -#define FS_VOLUME_FLAGS_INFORMATION 10 /* Local only */ -#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */ -#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. 
Query */ - -struct smb2_fs_full_size_info { - __le64 TotalAllocationUnits; - __le64 CallerAvailableAllocationUnits; - __le64 ActualAvailableAllocationUnits; - __le32 SectorsPerAllocationUnit; - __le32 BytesPerSector; -} __packed; - -#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001 -#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002 -#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004 -#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008 - -/* sector size info struct */ -struct smb3_fs_ss_info { - __le32 LogicalBytesPerSector; - __le32 PhysicalBytesPerSectorForAtomicity; - __le32 PhysicalBytesPerSectorForPerf; - __le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity; - __le32 Flags; - __le32 ByteOffsetForSectorAlignment; - __le32 ByteOffsetForPartitionAlignment; -} __packed; - -/* volume info struct - see MS-FSCC 2.5.9 */ -#define MAX_VOL_LABEL_LEN 32 -struct smb3_fs_vol_info { - __le64 VolumeCreationTime; - __u32 VolumeSerialNumber; - __le32 VolumeLabelLength; /* includes trailing null */ - __u8 SupportsObjects; /* True if eg like NTFS, supports objects */ - __u8 Reserved; - __u8 VolumeLabel[]; /* variable len */ -} __packed; - -/* partial list of QUERY INFO levels */ -#define FILE_DIRECTORY_INFORMATION 1 -#define FILE_FULL_DIRECTORY_INFORMATION 2 -#define FILE_BOTH_DIRECTORY_INFORMATION 3 -#define FILE_BASIC_INFORMATION 4 -#define FILE_STANDARD_INFORMATION 5 -#define FILE_INTERNAL_INFORMATION 6 -#define FILE_EA_INFORMATION 7 -#define FILE_ACCESS_INFORMATION 8 -#define FILE_NAME_INFORMATION 9 -#define FILE_RENAME_INFORMATION 10 -#define FILE_LINK_INFORMATION 11 -#define FILE_NAMES_INFORMATION 12 -#define FILE_DISPOSITION_INFORMATION 13 -#define FILE_POSITION_INFORMATION 14 -#define FILE_FULL_EA_INFORMATION 15 -#define FILE_MODE_INFORMATION 16 -#define FILE_ALIGNMENT_INFORMATION 17 -#define FILE_ALL_INFORMATION 18 -#define FILE_ALLOCATION_INFORMATION 19 -#define FILE_END_OF_FILE_INFORMATION 20 -#define FILE_ALTERNATE_NAME_INFORMATION 21 -#define FILE_STREAM_INFORMATION 22 -#define FILE_PIPE_INFORMATION 23 -#define FILE_PIPE_LOCAL_INFORMATION 24 -#define FILE_PIPE_REMOTE_INFORMATION 25 -#define FILE_MAILSLOT_QUERY_INFORMATION 26 -#define FILE_MAILSLOT_SET_INFORMATION 27 -#define FILE_COMPRESSION_INFORMATION 28 -#define FILE_OBJECT_ID_INFORMATION 29 -/* Number 30 not defined in documents */ -#define FILE_MOVE_CLUSTER_INFORMATION 31 -#define FILE_QUOTA_INFORMATION 32 -#define FILE_REPARSE_POINT_INFORMATION 33 -#define FILE_NETWORK_OPEN_INFORMATION 34 -#define FILE_ATTRIBUTE_TAG_INFORMATION 35 -#define FILE_TRACKING_INFORMATION 36 -#define FILEID_BOTH_DIRECTORY_INFORMATION 37 -#define FILEID_FULL_DIRECTORY_INFORMATION 38 -#define FILE_VALID_DATA_LENGTH_INFORMATION 39 -#define FILE_SHORT_NAME_INFORMATION 40 -#define FILE_SFIO_RESERVE_INFORMATION 44 -#define FILE_SFIO_VOLUME_INFORMATION 45 -#define FILE_HARD_LINK_INFORMATION 46 -#define FILE_NORMALIZED_NAME_INFORMATION 48 -#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 -#define FILE_STANDARD_LINK_INFORMATION 54 -#define FILE_ID_INFORMATION 59 -#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 - -struct smb2_file_internal_info { - __le64 IndexNumber; -} __packed; /* level 6 Query */ - -struct smb2_file_rename_info { /* encoding of request for level 10 */ - __u8 ReplaceIfExists; /* 1 = replace existing target with new */ - /* 0 = fail if target already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) 
*/ - __le32 FileNameLength; - char FileName[]; /* New name to be assigned */ - /* padding - overall struct size must be >= 24 so filename + pad >= 6 */ -} __packed; /* level 10 Set */ - -struct smb2_file_link_info { /* encoding of request for level 11 */ - __u8 ReplaceIfExists; /* 1 = replace existing link with new */ - /* 0 = fail if link already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; - char FileName[]; /* Name to be assigned to new link */ -} __packed; /* level 11 Set */ - struct smb2_file_full_ea_info { /* encoding of response for level 15 */ __le32 next_entry_offset; __u8 flags; @@ -851,38 +352,6 @@ struct smb2_file_full_ea_info { /* encoding of response for level 15 */ char ea_data[]; /* \0 terminated name plus value */ } __packed; /* level 15 Set */ -/* - * This level 18, although with struct with same name is different from cifs - * level 0x107. Level 0x107 has an extra u64 between AccessFlags and - * CurrentByteOffset. - */ -struct smb2_file_all_info { /* data block encoding of response to level 18 */ - __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le32 Attributes; - __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */ - __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ - __le64 EndOfFile; /* size ie offset to first free byte in file */ - __le32 NumberOfLinks; /* hard links */ - __u8 DeletePending; - __u8 Directory; - __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */ - __le64 IndexNumber; - __le32 EASize; - __le32 AccessFlags; - __le64 CurrentByteOffset; - __le32 Mode; - __le32 AlignmentRequirement; - __le32 FileNameLength; - char FileName[1]; -} __packed; /* level 18 Query */ - -struct smb2_file_eof_info { /* encoding of request for level 10 */ - __le64 EndOfFile; /* new end of file value */ -} __packed; /* level 20 Set */ - struct smb2_file_reparse_point_info { __le64 IndexNumber; __le32 Tag; @@ -935,6 +404,8 @@ struct create_posix_rsp { struct cifs_sid group; /* var-sized on the wire */ } __packed; +#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2 + /* * SMB2-only POSIX info level for query dir * @@ -966,31 +437,6 @@ struct smb2_posix_info { */ } __packed; -/* Level 100 query info */ -struct smb311_posix_qinfo { - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 EndOfFile; - __le64 AllocationSize; - __le32 DosAttributes; - __le64 Inode; - __le32 DeviceId; - __le32 Zero; - /* beginning of POSIX Create Context Response */ - __le32 HardLinks; - __le32 ReparseTag; - __le32 Mode; - u8 Sids[]; - /* - * var sized owner SID - * var sized group SID - * le32 filenamelength - * u8 filename[] - */ -} __packed; - /* * Parsed version of the above struct. 
Allows direct access to the * variable length fields diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 4a7062fd1c26..a69f1eed1cfe 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -283,7 +283,7 @@ extern int smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec); extern int smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, - __le16 *utf16_path, u32 desired_access, + const char *path, u32 desired_access, u32 class, u32 type, u32 output_len, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb); diff --git a/fs/coredump.c b/fs/coredump.c index 7ed7d601e5e0..ebc43f960b64 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -31,7 +31,6 @@ #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> -#include <linux/tracehook.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 4fcca79f39ae..526a4c1bed99 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -248,7 +248,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); * which must still be locked and not uptodate. Normally, blocksize == * PAGE_SIZE and the whole page is decrypted at once. * - * This is for use by the filesystem's ->readpages() method. + * This is for use by the filesystem's ->readahead() method. * * Return: 0 on success; -errno on failure */ @@ -389,7 +389,7 @@ static struct page *dax_busy_page(void *entry) } /* - * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page + * dax_lock_page - Lock the DAX entry corresponding to a page * @page: The page whose entry we want to lock * * Context: Process context. diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 2f117c57160d..3dcf0b8b4e93 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -450,6 +450,11 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. + * + * NOTE: it's expected that most callers should _ignore_ the errors returned + * by this function. Other debugfs functions handle the fact that the "dentry" + * passed to them could be an error and they don't crash in that case. + * Drivers should generally work fine even if debugfs fails to init anyway. */ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, @@ -551,6 +556,11 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size); * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. + * + * NOTE: it's expected that most callers should _ignore_ the errors returned + * by this function. Other debugfs functions handle the fact that the "dentry" + * passed to them could be an error and they don't crash in that case. + * Drivers should generally work fine even if debugfs fails to init anyway. 
*/ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { diff --git a/fs/direct-io.c b/fs/direct-io.c index 38bca4980a1c..aef06e607b40 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -402,9 +402,6 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_end_io = dio_bio_end_aio; else bio->bi_end_io = dio_bio_end_io; - - bio->bi_write_hint = dio->iocb->ki_hint; - sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } diff --git a/fs/exec.c b/fs/exec.c index 8256e8bb9ad3..e3e55d5e0be1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -56,7 +56,6 @@ #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> -#include <linux/tracehook.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> @@ -1309,12 +1308,6 @@ int begin_new_exec(struct linux_binprm * bprm) if (retval) goto out_unlock; - /* - * Ensure that the uaccess routines can actually operate on userspace - * pointers: - */ - force_uaccess_begin(); - if (me->flags & PF_KTHREAD) free_kthread_struct(me); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 619e5b4bed10..c6800b880920 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -203,7 +203,8 @@ struct exfat_mount_options { /* on error: continue, panic, remount-ro */ enum exfat_error_mode errors; unsigned utf8:1, /* Use of UTF-8 character set */ - discard:1; /* Issue discard requests on deletions */ + discard:1, /* Issue discard requests on deletions */ + keep_last_dots:1; /* Keep trailing periods in paths */ int time_offset; /* Offset of timestamps from UTC (in minutes) */ }; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index d890fd34bb2d..2f5130059236 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -218,8 +218,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) if (exfat_free_cluster(inode, &clu)) return -EIO; - exfat_clear_volume_dirty(sb); - return 0; } diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index af4eb39cc0c3..a02a04a993bf 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -65,11 +65,14 @@ static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags) return ret; } -/* returns the length of a struct qstr, ignoring trailing dots */ -static unsigned int exfat_striptail_len(unsigned int len, const char *name) +/* returns the length of a struct qstr, ignoring trailing dots if necessary */ +static unsigned int exfat_striptail_len(unsigned int len, const char *name, + bool keep_last_dots) { - while (len && name[len - 1] == '.') - len--; + if (!keep_last_dots) { + while (len && name[len - 1] == '.') + len--; + } return len; } @@ -83,7 +86,8 @@ static int exfat_d_hash(const struct dentry *dentry, struct qstr *qstr) struct super_block *sb = dentry->d_sb; struct nls_table *t = EXFAT_SB(sb)->nls_io; const unsigned char *name = qstr->name; - unsigned int len = exfat_striptail_len(qstr->len, qstr->name); + unsigned int len = exfat_striptail_len(qstr->len, qstr->name, + EXFAT_SB(sb)->options.keep_last_dots); unsigned long hash = init_name_hash(dentry); int i, charlen; wchar_t c; @@ -104,8 +108,10 @@ static int exfat_d_cmp(const struct dentry *dentry, unsigned int len, { struct super_block *sb = dentry->d_sb; struct nls_table *t = EXFAT_SB(sb)->nls_io; - unsigned int alen = exfat_striptail_len(name->len, name->name); - unsigned int blen = exfat_striptail_len(len, str); + unsigned int alen = exfat_striptail_len(name->len, name->name, + EXFAT_SB(sb)->options.keep_last_dots); + 
unsigned int blen = exfat_striptail_len(len, str, + EXFAT_SB(sb)->options.keep_last_dots); wchar_t c1, c2; int charlen, i; @@ -136,7 +142,8 @@ static int exfat_utf8_d_hash(const struct dentry *dentry, struct qstr *qstr) { struct super_block *sb = dentry->d_sb; const unsigned char *name = qstr->name; - unsigned int len = exfat_striptail_len(qstr->len, qstr->name); + unsigned int len = exfat_striptail_len(qstr->len, qstr->name, + EXFAT_SB(sb)->options.keep_last_dots); unsigned long hash = init_name_hash(dentry); int i, charlen; unicode_t u; @@ -161,8 +168,11 @@ static int exfat_utf8_d_cmp(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct super_block *sb = dentry->d_sb; - unsigned int alen = exfat_striptail_len(name->len, name->name); - unsigned int blen = exfat_striptail_len(len, str); + unsigned int alen = exfat_striptail_len(name->len, name->name, + EXFAT_SB(sb)->options.keep_last_dots); + unsigned int blen = exfat_striptail_len(len, str, + EXFAT_SB(sb)->options.keep_last_dots); + unicode_t u_a, u_b; int charlen, i; @@ -416,13 +426,25 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path, struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); + int pathlen = strlen(path); - /* strip all trailing periods */ - namelen = exfat_striptail_len(strlen(path), path); + /* + * get the length of the pathname excluding + * trailing periods, if any. + */ + namelen = exfat_striptail_len(pathlen, path, false); + if (EXFAT_SB(sb)->options.keep_last_dots) { + /* + * Do not allow the creation of files with names + * ending with period(s). + */ + if (!lookup && (namelen < pathlen)) + return -EINVAL; + namelen = pathlen; + } if (!namelen) return -ENOENT; - - if (strlen(path) > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE)) + if (pathlen > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE)) return -ENAMETOOLONG; /* @@ -554,7 +576,6 @@ static int exfat_create(struct user_namespace *mnt_userns, struct inode *dir, exfat_set_volume_dirty(sb); err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE, &info); - exfat_clear_volume_dirty(sb); if (err) goto unlock; @@ -812,7 +833,6 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) /* This doesn't modify ei */ ei->dir.dir = DIR_DELETED; - exfat_clear_volume_dirty(sb); inode_inc_iversion(dir); dir->i_mtime = dir->i_atime = current_time(dir); @@ -846,7 +866,6 @@ static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, exfat_set_volume_dirty(sb); err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR, &info); - exfat_clear_volume_dirty(sb); if (err) goto unlock; @@ -976,7 +995,6 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) goto unlock; } ei->dir.dir = DIR_DELETED; - exfat_clear_volume_dirty(sb); inode_inc_iversion(dir); dir->i_mtime = dir->i_atime = current_time(dir); @@ -1311,7 +1329,6 @@ del_out: */ new_ei->dir.dir = DIR_DELETED; } - exfat_clear_volume_dirty(sb); out: return ret; } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 9f892903419a..8ca21e7917d1 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -100,7 +100,6 @@ static int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flags) { struct exfat_sb_info *sbi = EXFAT_SB(sb); struct boot_sector *p_boot = (struct boot_sector *)sbi->boot_bh->b_data; - bool sync; /* retain persistent-flags */ new_flags |= sbi->vol_flags_persistent; @@ -119,16 +118,11 @@ static int exfat_set_vol_flags(struct 
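The exfat changes above thread a keep_last_dots flag through exfat_striptail_len() so the new mount option can preserve trailing periods in names instead of always stripping them. A standalone userspace copy of that helper to show the effect; the helper body is transcribed from the hunk above, while main() and the sample name are illustrative:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Same logic as exfat_striptail_len() in the hunk above. */
static unsigned int striptail_len(unsigned int len, const char *name,
				  bool keep_last_dots)
{
	if (!keep_last_dots) {
		while (len && name[len - 1] == '.')
			len--;
	}
	return len;
}

int main(void)
{
	const char *name = "report.txt...";

	printf("default:        \"%.*s\"\n",
	       (int)striptail_len(strlen(name), name, false), name);
	printf("keep_last_dots: \"%.*s\"\n",
	       (int)striptail_len(strlen(name), name, true), name);
	return 0;
}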
super_block *sb, unsigned short new_flags) p_boot->vol_flags = cpu_to_le16(new_flags); - if ((new_flags & VOLUME_DIRTY) && !buffer_dirty(sbi->boot_bh)) - sync = true; - else - sync = false; - set_buffer_uptodate(sbi->boot_bh); mark_buffer_dirty(sbi->boot_bh); - if (sync) - sync_dirty_buffer(sbi->boot_bh); + __sync_dirty_buffer(sbi->boot_bh, REQ_SYNC | REQ_FUA | REQ_PREFLUSH); + return 0; } @@ -174,6 +168,8 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",errors=remount-ro"); if (opts->discard) seq_puts(m, ",discard"); + if (opts->keep_last_dots) + seq_puts(m, ",keep_last_dots"); if (opts->time_offset) seq_printf(m, ",time_offset=%d", opts->time_offset); return 0; @@ -217,6 +213,7 @@ enum { Opt_charset, Opt_errors, Opt_discard, + Opt_keep_last_dots, Opt_time_offset, /* Deprecated options */ @@ -243,6 +240,7 @@ static const struct fs_parameter_spec exfat_parameters[] = { fsparam_string("iocharset", Opt_charset), fsparam_enum("errors", Opt_errors, exfat_param_enums), fsparam_flag("discard", Opt_discard), + fsparam_flag("keep_last_dots", Opt_keep_last_dots), fsparam_s32("time_offset", Opt_time_offset), __fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated, NULL), @@ -297,6 +295,9 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_discard: opts->discard = 1; break; + case Opt_keep_last_dots: + opts->keep_last_dots = 1; + break; case Opt_time_offset: /* * Make the limit 24 just in case someone invents something diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e48aaf52ce93..f6a19f6d9f6d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -753,8 +753,12 @@ static loff_t ext2_max_size(int bits) res += 1LL << (bits-2); res += 1LL << (2*(bits-2)); res += 1LL << (3*(bits-2)); + /* Compute how many metadata blocks are needed */ + meta_blocks = 1; + meta_blocks += 1 + ppb; + meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ - if (res < upper_limit) + if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 80414dcba6e1..1db12847a83b 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -55,13 +55,13 @@ struct ext4_fc_del_range { struct ext4_fc_dentry_info { __le32 fc_parent_ino; __le32 fc_ino; - __u8 fc_dname[0]; + __u8 fc_dname[]; }; /* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ struct ext4_fc_inode { __le32 fc_ino; - __u8 fc_raw_inode[0]; + __u8 fc_raw_inode[]; }; /* Value structure for tag EXT4_FC_TAG_TAIL. 
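The ext2_max_size() fix above counts the indirect blocks themselves (single, double and triple indirect trees) against the upper limit, instead of only the data blocks they map. A small worked example of that arithmetic for 4 KiB blocks, where bits = 12 and each indirect block holds 2^(bits-2) = 1024 pointers; the program is illustrative, not the kernel code:

#include <stdio.h>

int main(void)
{
	int bits = 12;					/* 4 KiB blocks */
	unsigned long long ppb = 1ULL << (bits - 2);	/* 1024 pointers/block */

	/* Data blocks reachable via ext2's 12 direct + 1/2/3-level indirects. */
	unsigned long long data = 12 + ppb + ppb * ppb + ppb * ppb * ppb;

	/* Indirect blocks needed to map that data (the new meta_blocks term). */
	unsigned long long meta = 1;			/* single indirect   */
	meta += 1 + ppb;				/* double indirect   */
	meta += 1 + ppb + ppb * ppb;			/* triple indirect   */

	printf("data blocks:  %llu\n", data);
	printf("meta blocks:  %llu\n", meta);
	printf("total blocks: %llu\n", data + meta);
	return 0;
}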
*/ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8bd66cdc41be..6feb07e3e1eb 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -267,7 +267,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, goto out; current->backing_dev_info = inode_to_bdi(inode); - ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); + ret = generic_perform_write(iocb, from); current->backing_dev_info = NULL; out: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1ce13f69fbec..13740f2d0e61 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3589,7 +3589,7 @@ const struct iomap_ops ext4_iomap_report_ops = { static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { - WARN_ON_ONCE(!page_has_buffers(&folio->page)); + WARN_ON_ONCE(!folio_buffers(folio)); folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 17bb78ebd784..495ce59fb4ad 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -373,7 +373,6 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { if (io->io_wbc->sync_mode == WB_SYNC_ALL) io->io_bio->bi_opf |= REQ_SYNC; - io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint; submit_bio(io->io_bio); } io->io_bio = NULL; @@ -418,10 +417,8 @@ static void io_submit_add_bh(struct ext4_io_submit *io, submit_and_retry: ext4_io_submit(io); } - if (io->io_bio == NULL) { + if (io->io_bio == NULL) io_submit_init_bio(io, bh); - io->io_bio->bi_write_hint = inode->i_write_hint; - } ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 1aa26d6634fc..af491e170c4a 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -109,7 +109,7 @@ static void verity_work(struct work_struct *work) struct bio *bio = ctx->bio; /* - * fsverity_verify_bio() may call readpages() again, and although verity + * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a8fc4fa511a8..f5366feea82d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -456,7 +456,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, folio_mark_uptodate(folio); if (!folio_test_dirty(folio)) { filemap_dirty_folio(mapping, folio); - inc_page_count(F2FS_P_SB(&folio->page), F2FS_DIRTY_META); + inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); set_page_private_reference(&folio->page); return true; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b3131fc8762d..8e0c2e773c8d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -164,7 +164,7 @@ static void f2fs_verify_bio(struct work_struct *work) bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* - * fsverity_verify_bio() may call readpages() again, and while verity + * fsverity_verify_bio() may call readahead() again, and while verity * will be disabled for this, decryption and/or decompression may still * be needed, resulting in another bio_post_read_ctx being allocated. 
* So to prevent deadlocks we need to release the current ctx to the @@ -428,8 +428,6 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; - bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, - fio->type, fio->temp); } iostat_alloc_and_bind_ctx(sbi, bio, NULL); @@ -2394,7 +2392,7 @@ static void f2fs_readahead(struct readahead_control *rac) if (!f2fs_is_compress_backend_ready(inode)) return; - /* If the file has inline data, skip readpages */ + /* If the file has inline data, skip readahead */ if (f2fs_has_inline_data(inode)) return; @@ -3573,7 +3571,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping, f2fs_update_dirty_folio(inode, folio); return true; } - return true; + return false; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 68ddf4c7ca64..5b89af0f27f0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4448,7 +4448,7 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, return -EOPNOTSUPP; current->backing_dev_info = inode_to_bdi(inode); - ret = generic_perform_write(file, from, iocb->ki_pos); + ret = generic_perform_write(iocb, from); current->backing_dev_info = NULL; if (ret > 0) { @@ -4482,10 +4482,8 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); const bool do_opu = f2fs_lfs_mode(sbi); - const int whint_mode = F2FS_OPTION(sbi).whint_mode; const loff_t pos = iocb->ki_pos; const ssize_t count = iov_iter_count(from); - const enum rw_hint hint = iocb->ki_hint; unsigned int dio_flags; struct iomap_dio *dio; ssize_t ret; @@ -4518,8 +4516,6 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, if (do_opu) f2fs_down_read(&fi->i_gc_rwsem[READ]); } - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = WRITE_LIFE_NOT_SET; /* * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of @@ -4542,8 +4538,6 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, ret = iomap_dio_complete(dio); } - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = hint; if (do_opu) f2fs_up_read(&fi->i_gc_rwsem[READ]); f2fs_up_read(&fi->i_gc_rwsem[WRITE]); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0b6e741e94a0..c45d341dcf6e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2146,11 +2146,11 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, folio_mark_uptodate(folio); #ifdef CONFIG_F2FS_CHECK_FS if (IS_INODE(&folio->page)) - f2fs_inode_chksum_set(F2FS_P_SB(&folio->page), &folio->page); + f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page); #endif if (!folio_test_dirty(folio)) { filemap_dirty_folio(mapping, folio); - inc_page_count(F2FS_P_SB(&folio->page), F2FS_DIRTY_NODES); + inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); set_page_private_reference(&folio->page); return true; } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index c4a274285858..249825017da7 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -722,7 +722,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \ if (name_len >= sizeof(d1->d_name)) \ name_len = sizeof(d1->d_name) - 1; \ \ - if (put_user(0, d2->d_name) || \ + if (put_user(0, &d2->d_name[0]) || \ put_user(0, &d2->d_reclen) || \ copy_to_user(d1->d_name, name, name_len) || \ put_user(0, d1->d_name + name_len) || \ diff --git a/fs/fcntl.c b/fs/fcntl.c index 9c6c6a3e2de5..f15d885b9796 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -291,22 +291,6 
@@ static long fcntl_rw_hint(struct file *file, unsigned int cmd, u64 h; switch (cmd) { - case F_GET_FILE_RW_HINT: - h = file_write_hint(file); - if (copy_to_user(argp, &h, sizeof(*argp))) - return -EFAULT; - return 0; - case F_SET_FILE_RW_HINT: - if (copy_from_user(&h, argp, sizeof(h))) - return -EFAULT; - hint = (enum rw_hint) h; - if (!rw_hint_valid(hint)) - return -EINVAL; - - spin_lock(&file->f_lock); - file->f_write_hint = hint; - spin_unlock(&file->f_lock); - return 0; case F_GET_RW_HINT: h = inode->i_write_hint; if (copy_to_user(argp, &h, sizeof(*argp))) @@ -431,8 +415,6 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_GET_RW_HINT: case F_SET_RW_HINT: - case F_GET_FILE_RW_HINT: - case F_SET_FILE_RW_HINT: err = fcntl_rw_hint(filp, cmd, arg); break; default: diff --git a/fs/file.c b/fs/file.c index 97d212a9b814..ee9317346702 100644 --- a/fs/file.c +++ b/fs/file.c @@ -87,6 +87,21 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); } +/* + * Note how the fdtable bitmap allocations very much have to be a multiple of + * BITS_PER_LONG. This is not only because we walk those things in chunks of + * 'unsigned long' in some places, but simply because that is how the Linux + * kernel bitmaps are defined to work: they are not "bits in an array of bytes", + * they are very much "bits in an array of unsigned long". + * + * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied + * by that "1024/sizeof(ptr)" before, we already know there are sufficient + * clear low bits. Clang seems to realize that, gcc ends up being confused. + * + * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, + * let's consider it documentation (and maybe a test-case for gcc to improve + * its code generation ;) + */ static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; @@ -102,6 +117,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) nr /= (1024 / sizeof(struct file *)); nr = roundup_pow_of_two(nr + 1); nr *= (1024 / sizeof(struct file *)); + nr = ALIGN(nr, BITS_PER_LONG); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. Deal @@ -269,6 +285,19 @@ static unsigned int count_open_files(struct fdtable *fdt) return i; } +/* + * Note that a sane fdtable size always has to be a multiple of + * BITS_PER_LONG, since we have bitmaps that are sized by this. + * + * 'max_fds' will normally already be properly aligned, but it + * turns out that in the close_range() -> __close_range() -> + * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end + * up having a 'max_fds' value that isn't already aligned. + * + * Rather than make close_range() have to worry about this, + * just make that BITS_PER_LONG alignment be part of a sane + * fdtable size. Becuase that's really what it is. 
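The fs/file.c comments above boil down to one invariant: any fd-table size handed to the bitmap code must be a whole number of longs. A tiny user-space sketch of the rounding that both call sites now apply (sane_fdtable_size() follows just below); ALIGN is written out here since this runs outside the kernel:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(long))
/* Round x up to the next multiple of a (a must be a power of two). */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int sizes[] = { 1, 20, 64, 100, 255 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		unsigned int nr = sizes[i];
		/* Never hand the bitmap code a table that is not a
		 * whole number of unsigned longs. */
		printf("%3u fds -> fdtable size %3lu\n",
		       nr, (unsigned long)ALIGN(nr, BITS_PER_LONG));
	}
	return 0;
}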
+ */ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) { unsigned int count; @@ -276,7 +305,7 @@ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) count = count_open_files(fdt); if (max_fds < NR_OPEN_DEFAULT) max_fds = NR_OPEN_DEFAULT; - return min(count, max_fds); + return ALIGN(min(count, max_fds), BITS_PER_LONG); } /* diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index f121c21590dc..ed1c9ed737f2 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -71,17 +71,6 @@ static inline void fscache_see_cookie(struct fscache_cookie *cookie, } /* - * io.c - */ -static inline void fscache_end_operation(struct netfs_cache_resources *cres) -{ - const struct netfs_cache_ops *ops = fscache_operation_valid(cres); - - if (ops) - ops->end_operation(cres); -} - -/* * main.c */ extern unsigned fscache_debug; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index eac4984cc753..488b460e046f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -627,7 +627,7 @@ struct fuse_conn { /** Connection successful. Only set in INIT */ unsigned conn_init:1; - /** Do readpages asynchronously? Only set in INIT */ + /** Do readahead asynchronously? Only set in INIT */ unsigned async_read:1; /** Return an unique read error after abort. Only set in INIT */ diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index df58966bc874..33cde4bbccdc 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -170,7 +170,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, #else if (flags & FUSE_IOCTL_COMPAT) { inarg.flags |= FUSE_IOCTL_32BIT; -#ifdef CONFIG_X86_X32 +#ifdef CONFIG_X86_X32_ABI if (in_x32_syscall()) inarg.flags |= FUSE_IOCTL_COMPAT_X32; #endif diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d67108489148..39080b2d6cf8 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -606,9 +606,9 @@ out: return ret; } -static inline __be64 *gfs2_indirect_init(struct metapath *mp, - struct gfs2_glock *gl, unsigned int i, - unsigned offset, u64 bn) +static inline void gfs2_indirect_init(struct metapath *mp, + struct gfs2_glock *gl, unsigned int i, + unsigned offset, u64 bn) { __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + ((i > 1) ? 
sizeof(struct gfs2_meta_header) : @@ -621,7 +621,6 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp, gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); ptr += offset; *ptr = cpu_to_be64(bn); - return ptr; } enum alloc_state { @@ -2146,7 +2145,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) ret = do_shrink(inode, newsize); out: - gfs2_rs_delete(ip, NULL); + gfs2_rs_delete(ip); gfs2_qa_put(ip); return ret; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 8c39a8571b1f..22b41acfbbc3 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -706,7 +706,7 @@ static int gfs2_release(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) { if (gfs2_rs_active(&ip->i_res)) - gfs2_rs_delete(ip, &inode->i_writecount); + gfs2_rs_delete(ip); gfs2_qa_put(ip); } return 0; @@ -775,8 +775,7 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, size_t *window_size) { size_t count = iov_iter_count(i); - char __user *p; - int pages = 1; + size_t size, offs; if (likely(!count)) return false; @@ -785,18 +784,20 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, if (!iter_is_iovec(i)) return false; + size = PAGE_SIZE; + offs = offset_in_page(i->iov[0].iov_base + i->iov_offset); if (*prev_count != count || !*window_size) { - int pages, nr_dirtied; + size_t nr_dirtied; - pages = min_t(int, BIO_MAX_VECS, DIV_ROUND_UP(count, PAGE_SIZE)); + size = ALIGN(offs + count, PAGE_SIZE); + size = min_t(size_t, size, SZ_1M); nr_dirtied = max(current->nr_dirtied_pause - - current->nr_dirtied, 1); - pages = min(pages, nr_dirtied); + current->nr_dirtied, 8); + size = min(size, nr_dirtied << PAGE_SHIFT); } *prev_count = count; - p = i->iov[0].iov_base + i->iov_offset; - *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p); + *window_size = size - offs; return true; } @@ -851,9 +852,9 @@ retry_under_glock: leftover = fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(gh); if (leftover != window_size) { - if (!gfs2_holder_queued(gh)) - goto retry; - goto retry_under_glock; + if (gfs2_holder_queued(gh)) + goto retry_under_glock; + goto retry; } } if (gfs2_holder_queued(gh)) @@ -920,9 +921,9 @@ retry_under_glock: leftover = fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); if (leftover != window_size) { - if (!gfs2_holder_queued(gh)) - goto retry; - goto retry_under_glock; + if (gfs2_holder_queued(gh)) + goto retry_under_glock; + goto retry; } } out: @@ -950,20 +951,19 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * and retry. 
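The reworked should_fault_in_pages() above sizes the fault-in window in bytes rather than as a page count: it rounds the in-page start offset plus the remaining count up to whole pages, caps that at 1 MiB and at the caller's dirty-throttling headroom (at least 8 pages), then subtracts the start offset again. A stand-alone sketch of that calculation, with the kernel constants stubbed out and dirty_headroom standing in for current->nr_dirtied_pause - current->nr_dirtied; names here are illustrative only:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define SZ_1M      (1UL << 20)
#define ALIGN(x, a)       (((x) + (a) - 1) & ~((a) - 1))
#define offset_in_page(p) ((size_t)(p) & (PAGE_SIZE - 1))

static size_t fault_in_window(size_t buf_addr, size_t count, long dirty_headroom)
{
	size_t offs = offset_in_page(buf_addr);
	size_t size = ALIGN(offs + count, PAGE_SIZE);
	size_t nr_dirtied = dirty_headroom > 8 ? (size_t)dirty_headroom : 8;

	if (size > SZ_1M)
		size = SZ_1M;                   /* hard cap of 1 MiB */
	if (size > nr_dirtied << PAGE_SHIFT)
		size = nr_dirtied << PAGE_SHIFT; /* respect dirty throttling */
	return size - offs;                      /* window past the start offset */
}

int main(void)
{
	printf("%zu\n", fault_in_window(0x1234, 5u << 20, 32));   /* capped by headroom */
	printf("%zu\n", fault_in_window(0x1234, 5u << 20, 4096)); /* capped at 1 MiB */
	printf("%zu\n", fault_in_window(0, 3000, 4096));          /* small request */
	return 0;
}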
*/ - if (iocb->ki_flags & IOCB_DIRECT) { - ret = gfs2_file_direct_read(iocb, to, &gh); - if (likely(ret != -ENOTBLK)) - return ret; - iocb->ki_flags &= ~IOCB_DIRECT; - } + if (iocb->ki_flags & IOCB_DIRECT) + return gfs2_file_direct_read(iocb, to, &gh); + + pagefault_disable(); iocb->ki_flags |= IOCB_NOIO; ret = generic_file_read_iter(iocb, to); iocb->ki_flags &= ~IOCB_NOIO; + pagefault_enable(); if (ret >= 0) { if (!iov_iter_count(to)) return ret; written = ret; - } else { + } else if (ret != -EFAULT) { if (ret != -EAGAIN) return ret; if (iocb->ki_flags & IOCB_NOWAIT) @@ -989,12 +989,11 @@ retry_under_glock: leftover = fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(&gh); if (leftover != window_size) { - if (!gfs2_holder_queued(&gh)) { - if (written) - goto out_uninit; - goto retry; - } - goto retry_under_glock; + if (gfs2_holder_queued(&gh)) + goto retry_under_glock; + if (written) + goto out_uninit; + goto retry; } } if (gfs2_holder_queued(&gh)) @@ -1068,12 +1067,11 @@ retry_under_glock: gfs2_holder_disallow_demote(gh); if (leftover != window_size) { from->count = min(from->count, window_size - leftover); - if (!gfs2_holder_queued(gh)) { - if (read) - goto out_uninit; - goto retry; - } - goto retry_under_glock; + if (gfs2_holder_queued(gh)) + goto retry_under_glock; + if (read && !(iocb->ki_flags & IOCB_DIRECT)) + goto out_uninit; + goto retry; } } out_unlock: @@ -1083,6 +1081,7 @@ out_uninit: gfs2_holder_uninit(gh); if (statfs_gh) kfree(statfs_gh); + from->count = orig_count - read; return read ? read : ret; } @@ -1497,7 +1496,6 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) if (error != GLR_TRYFAILED) break; fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; - fl_gh->gh_error = 0; msleep(sleeptime); } if (error) { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6b23399eaee0..630c6550eacf 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -542,7 +542,7 @@ restart: * some reason. If this holder is the head of the list, it * means we have a blocked holder at the head, so return 1. 
*/ - if (gh->gh_list.prev == &gl->gl_holders) + if (list_is_first(&gh->gh_list, &gl->gl_holders)) return 1; do_error(gl, 0); break; @@ -669,6 +669,8 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) /* Check for state != intended state */ if (unlikely(state != gl->gl_target)) { + if (gh && (ret & LM_OUT_CANCELED)) + gfs2_holder_wake(gh); if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { /* move to back of queue and try next entry */ if (ret & LM_OUT_CANCELED) { @@ -1259,7 +1261,6 @@ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; - gh->gh_error = 0; gh->gh_iflags = 0; gfs2_glock_hold(gl); } @@ -1565,6 +1566,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) if (test_bit(GLF_LRU, &gl->gl_flags)) gfs2_glock_remove_from_lru(gl); + gh->gh_error = 0; spin_lock(&gl->gl_lockref.lock); add_to_queue(gh); if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && @@ -1691,6 +1693,14 @@ void gfs2_glock_dq(struct gfs2_holder *gh) struct gfs2_glock *gl = gh->gh_gl; spin_lock(&gl->gl_lockref.lock); + if (list_is_first(&gh->gh_list, &gl->gl_holders) && + !test_bit(HIF_HOLDER, &gh->gh_iflags)) { + spin_unlock(&gl->gl_lockref.lock); + gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); + } + __gfs2_glock_dq(gh); spin_unlock(&gl->gl_lockref.lock); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 89905f4f29bb..c8ec876f33ea 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -131,7 +131,21 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_glock *io_gl; - error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, + &ip->i_gl); + if (unlikely(error)) + goto fail; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, + &io_gl); + if (unlikely(error)) + goto fail; + + if (blktype != GFS2_BLKST_UNLINKED) + gfs2_cancel_delete_work(io_gl); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, + &ip->i_iopen_gh); + gfs2_glock_put(io_gl); if (unlikely(error)) goto fail; @@ -161,16 +175,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags); - error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); - if (unlikely(error)) - goto fail; - if (blktype != GFS2_BLKST_UNLINKED) - gfs2_cancel_delete_work(io_gl); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); - gfs2_glock_put(io_gl); - if (unlikely(error)) - goto fail; - /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. 
*/ inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); inode->i_atime.tv_nsec = 0; @@ -716,13 +720,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); BUG_ON(error); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (error) goto fail_gunlock2; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); + if (error) + goto fail_gunlock3; + error = gfs2_trans_begin(sdp, blocks, 0); if (error) - goto fail_gunlock2; + goto fail_gunlock3; if (blocks > 1) { ip->i_eattr = ip->i_no_addr + 1; @@ -731,10 +739,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, init_dinode(dip, ip, symname); gfs2_trans_end(sdp); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); - if (error) - goto fail_gunlock2; - glock_set_object(ip->i_gl, ip); glock_set_object(io_gl, ip); gfs2_set_iop(inode); @@ -745,14 +749,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (default_acl) { error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) - goto fail_gunlock3; + goto fail_gunlock4; posix_acl_release(default_acl); default_acl = NULL; } if (acl) { error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) - goto fail_gunlock3; + goto fail_gunlock4; posix_acl_release(acl); acl = NULL; } @@ -760,11 +764,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, &gfs2_initxattrs, NULL); if (error) - goto fail_gunlock3; + goto fail_gunlock4; error = link_dinode(dip, name, ip, &da); if (error) - goto fail_gunlock3; + goto fail_gunlock4; mark_inode_dirty(inode); d_instantiate(dentry, inode); @@ -782,9 +786,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, unlock_new_inode(inode); return error; -fail_gunlock3: +fail_gunlock4: glock_clear_object(ip->i_gl, ip); glock_clear_object(io_gl, ip); +fail_gunlock3: gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_gunlock2: gfs2_glock_put(io_gl); @@ -793,7 +798,7 @@ fail_free_inode: if (free_vfs_inode) /* else evict will do the put for us */ gfs2_glock_put(ip->i_gl); } - gfs2_rs_delete(ip, NULL); + gfs2_rs_deltree(&ip->i_res); gfs2_qa_put(ip); fail_free_acls: posix_acl_release(default_acl); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 50578f881e6d..2559a79cf14b 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -261,6 +261,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, int req; u32 lkf; char strname[GDLM_STRNAME_BYTES] = ""; + int error; req = make_mode(gl->gl_name.ln_sbd, req_state); lkf = make_flags(gl, flags, req); @@ -279,8 +280,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, * Submit the actual lock request. 
*/ - return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, +again: + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + if (error == -EBUSY) { + msleep(20); + goto again; + } + return error; } static void gdlm_put_lock(struct gfs2_glock *gl) @@ -312,8 +319,14 @@ static void gdlm_put_lock(struct gfs2_glock *gl) return; } +again: error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, NULL, gl); + if (error == -EBUSY) { + msleep(20); + goto again; + } + if (error) { fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n", gl->gl_name.ln_type, diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4ae1eefae616..6ba51cbb94cf 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -491,7 +491,6 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO); bio_clone_blkg_association(new, prev); new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_write_hint = prev->bi_write_hint; bio_chain(new, prev); submit_bio(prev); return new; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 0fb3c01bc557..801ad9f4f2be 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -680,13 +680,14 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) /** * gfs2_rs_delete - delete a multi-block reservation * @ip: The inode for this reservation - * @wcount: The inode's write count, or NULL * */ -void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) +void gfs2_rs_delete(struct gfs2_inode *ip) { + struct inode *inode = &ip->i_inode; + down_write(&ip->i_rw_mutex); - if ((wcount == NULL) || (atomic_read(wcount) <= 1)) + if (atomic_read(&inode->i_writecount) <= 1) gfs2_rs_deltree(&ip->i_res); up_write(&ip->i_rw_mutex); } @@ -922,15 +923,15 @@ static int read_rindex_entry(struct gfs2_inode *ip) spin_lock_init(&rgd->rd_rsspin); mutex_init(&rgd->rd_mutex); - error = compute_bitstructs(rgd); - if (error) - goto fail; - error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) goto fail; + error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~GFS2_RDF_PREFERRED; if (rgd->rd_data > sdp->sd_max_rg_data) @@ -944,6 +945,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) } error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: gfs2_glock_put(rgd->rd_gl); fail: @@ -1415,7 +1417,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp) start = r.start >> bs_shift; end = start + (r.len >> bs_shift); - minlen = max_t(u64, r.minlen, + minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize); + minlen = max_t(u64, minlen, q->limits.discard_granularity) >> bs_shift; if (end <= start || minlen > sdp->sd_max_rg_data) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3e2ca1fb4305..46dd94e9e085 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -45,7 +45,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern void gfs2_rs_delete(struct gfs2_inode *ip); extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index cf9cf66522b3..bdb773e5c88f 100644 --- a/fs/gfs2/super.c 
+++ b/fs/gfs2/super.c @@ -1396,7 +1396,7 @@ out: truncate_inode_pages_final(&inode->i_data); if (ip->i_qadata) gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0); - gfs2_rs_delete(ip, NULL); + gfs2_rs_deltree(&ip->i_res); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); diff --git a/fs/internal.h b/fs/internal.h index fb2c2ea807d7..08503dc68d2b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -74,7 +74,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, * namespace.c */ extern struct vfsmount *lookup_mnt(const struct path *); -extern int finish_automount(struct vfsmount *, struct path *); +extern int finish_automount(struct vfsmount *, const struct path *); extern int sb_prepare_remount_readonly(struct super_block *); diff --git a/fs/io-wq.c b/fs/io-wq.c index 5b93fa67d346..32aeb2c581c5 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -13,7 +13,7 @@ #include <linux/slab.h> #include <linux/rculist_nulls.h> #include <linux/cpu.h> -#include <linux/tracehook.h> +#include <linux/task_work.h> #include <linux/audit.h> #include <uapi/linux/io_uring.h> @@ -522,7 +522,9 @@ static bool io_flush_signals(void) { if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) { __set_current_state(TASK_RUNNING); - tracehook_notify_signal(); + clear_notify_signal(); + if (task_work_pending(current)) + task_work_run(); return true; } return false; diff --git a/fs/io_uring.c b/fs/io_uring.c index 496a2af7d12c..a8413f006417 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -79,7 +79,6 @@ #include <linux/task_work.h> #include <linux/pagemap.h> #include <linux/io_uring.h> -#include <linux/tracehook.h> #include <linux/audit.h> #include <linux/security.h> @@ -612,6 +611,7 @@ struct io_sr_msg { int msg_flags; int bgid; size_t len; + size_t done_io; }; struct io_open { @@ -782,6 +782,7 @@ enum { REQ_F_SKIP_LINK_CQES_BIT, REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, + REQ_F_PARTIAL_IO_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -844,6 +845,8 @@ enum { REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), /* double poll may active */ REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), + /* request has already done partial IO */ + REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), }; struct async_poll { @@ -924,7 +927,6 @@ struct io_kiocb { struct io_wq_work_node comp_list; atomic_t refs; atomic_t poll_refs; - struct io_kiocb *link; struct io_task_work io_task_work; /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; @@ -932,9 +934,11 @@ struct io_kiocb { struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ void *async_data; - /* custom credentials, valid IFF REQ_F_CREDS is set */ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; + /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ + struct io_kiocb *link; + /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; struct io_wq_work work; }; @@ -963,6 +967,7 @@ struct io_op_def { /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; + unsigned poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; /* do prep async if is going to be punted */ @@ -1057,6 +1062,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .poll_exclusive = 1, }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -1331,6 +1337,8 @@ static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) { + lockdep_assert_held(&req->ctx->completion_lock); + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return 0; return __io_put_kbuf(req, &req->ctx->io_buffers_comp); @@ -1363,6 +1371,8 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); spin_unlock(&ctx->completion_lock); } else { + lockdep_assert_held(&req->ctx->uring_lock); + cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); } @@ -1383,7 +1393,7 @@ static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, return NULL; } -static void io_kbuf_recycle(struct io_kiocb *req) +static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -1391,6 +1401,12 @@ static void io_kbuf_recycle(struct io_kiocb *req) if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return; + /* don't recycle if we already did IO to this buffer */ + if (req->flags & REQ_F_PARTIAL_IO) + return; + + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); lockdep_assert_held(&ctx->uring_lock); @@ -1399,6 +1415,9 @@ static void io_kbuf_recycle(struct io_kiocb *req) list_add(&buf->list, &bl->buf_list); req->flags &= ~REQ_F_BUFFER_SELECTED; req->kbuf = NULL; + + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); } static bool io_match_task(struct io_kiocb *head, struct task_struct *task, @@ -2105,6 +2124,12 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res, } } io_req_put_rsrc(req, ctx); + /* + * Selected buffer deallocation in io_clean_op() assumes that + * we don't hold ->completion_lock. Clean them here to avoid + * deadlocks. 
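The io_kbuf_recycle() change above makes the helper take ctx->uring_lock itself when it is entered with IO_URING_F_UNLOCKED, rather than assuming every caller already holds it. A minimal sketch of that conditional-locking shape, using pthread and made-up names in place of the io_uring types, purely to show the idiom:

#include <pthread.h>
#include <stdio.h>

#define F_UNLOCKED 0x1   /* stand-in for IO_URING_F_UNLOCKED */

struct ctx {
	pthread_mutex_t lock;   /* stand-in for ctx->uring_lock */
	int cached_buffers;
};

/* If the caller entered us without the lock held (F_UNLOCKED set),
 * take and drop it here; otherwise trust that the caller holds it. */
static void recycle_buffer(struct ctx *ctx, unsigned issue_flags)
{
	if (issue_flags & F_UNLOCKED)
		pthread_mutex_lock(&ctx->lock);

	ctx->cached_buffers++;   /* put the buffer back on the cache */

	if (issue_flags & F_UNLOCKED)
		pthread_mutex_unlock(&ctx->lock);
}

int main(void)
{
	struct ctx ctx = { .lock = PTHREAD_MUTEX_INITIALIZER, .cached_buffers = 0 };

	recycle_buffer(&ctx, F_UNLOCKED);   /* helper locks for us */
	pthread_mutex_lock(&ctx.lock);
	recycle_buffer(&ctx, 0);            /* caller already holds the lock */
	pthread_mutex_unlock(&ctx.lock);

	printf("cached=%d\n", ctx.cached_buffers);
	return 0;
}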
+ */ + io_put_kbuf_comp(req); io_dismantle_req(req); io_put_task(req->task, 1); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); @@ -2149,7 +2174,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res) static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); - io_req_complete_post(req, res, io_put_kbuf(req, 0)); + io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); } static void io_req_complete_fail_submit(struct io_kiocb *req) @@ -2438,6 +2463,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); + prefetch(container_of(next, struct io_kiocb, io_task_work.node)); + if (req->ctx != *ctx) { if (unlikely(!*uring_locked && *ctx)) ctx_commit_and_unlock(*ctx); @@ -2470,6 +2497,8 @@ static void handle_tw_list(struct io_wq_work_node *node, struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); + prefetch(container_of(next, struct io_kiocb, io_task_work.node)); + if (req->ctx != *ctx) { ctx_flush_and_put(*ctx, locked); *ctx = req->ctx; @@ -2750,9 +2779,11 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline bool io_run_task_work(void) { - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { __set_current_state(TASK_RUNNING); - tracehook_notify_signal(); + clear_notify_signal(); + if (task_work_pending(current)) + task_work_run(); return true; } @@ -2973,8 +3004,12 @@ static bool io_rw_should_reissue(struct io_kiocb *req) static bool __io_complete_rw_common(struct io_kiocb *req, long res) { - if (req->rw.kiocb.ki_flags & IOCB_WRITE) + if (req->rw.kiocb.ki_flags & IOCB_WRITE) { kiocb_end_write(req); + fsnotify_modify(req->file); + } else { + fsnotify_access(req->file); + } if (unlikely(res != req->result)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { @@ -3941,7 +3976,6 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file)); return io_prep_rw(req, sqe); } @@ -4439,9 +4473,6 @@ static int io_msg_ring_prep(struct io_kiocb *req, sqe->splice_fd_in || sqe->buf_index || sqe->personality)) return -EINVAL; - if (req->file->f_op != &io_uring_fops) - return -EBADFD; - req->msg.user_data = READ_ONCE(sqe->off); req->msg.len = READ_ONCE(sqe->len); return 0; @@ -4451,14 +4482,18 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx; struct io_msg *msg = &req->msg; - int ret = -EOVERFLOW; bool filled; + int ret; + + ret = -EBADFD; + if (req->file->f_op != &io_uring_fops) + goto done; + ret = -EOVERFLOW; target_ctx = req->file->private_data; spin_lock(&target_ctx->completion_lock); - filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, - IORING_CQE_F_MSG); + filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); io_commit_cqring(target_ctx); spin_unlock(&target_ctx->completion_lock); @@ -4467,6 +4502,9 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) ret = 0; } +done: + if (ret < 0) + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4537,6 +4575,8 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) req->sync.len); if (ret < 0) req_set_fail(req); + else + 
fsnotify_modify(req->file); io_req_complete(req, ret); return 0; } @@ -5419,12 +5459,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif + sr->done_io = 0; return 0; } +static bool io_net_retry(struct socket *sock, int flags) +{ + if (!(flags & MSG_WAITALL)) + return false; + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} + static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr iomsg, *kmsg; + struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; struct io_buffer *kbuf; unsigned flags; @@ -5467,6 +5516,11 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } req_set_fail(req); } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { req_set_fail(req); @@ -5476,6 +5530,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); return 0; } @@ -5526,12 +5584,23 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } req_set_fail(req); } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { out_free: req_set_fail(req); } + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags)); return 0; } @@ -5569,9 +5638,6 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; - if (req->file->f_flags & O_NONBLOCK) - req->flags |= REQ_F_NOWAIT; - if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) @@ -5801,7 +5867,7 @@ struct io_poll_table { }; #define IO_POLL_CANCEL_FLAG BIT(31) -#define IO_POLL_REF_MASK ((1u << 20)-1) +#define IO_POLL_REF_MASK GENMASK(30, 0) /* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. 
We can @@ -6035,10 +6101,13 @@ static void io_poll_cancel_req(struct io_kiocb *req) io_poll_execute(req, 0, 0); } +#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) +#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_kiocb *req = wait->private; + struct io_kiocb *req = wqe_to_req(wait); struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, wait); __poll_t mask = key_to_poll(key); @@ -6076,7 +6145,10 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); poll->head = NULL; - req->flags &= ~REQ_F_SINGLE_POLL; + if (wqe_is_double(wait)) + req->flags &= ~REQ_F_DOUBLE_POLL; + else + req->flags &= ~REQ_F_SINGLE_POLL; } __io_poll_execute(req, mask, poll->events); } @@ -6088,6 +6160,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, struct io_poll_iocb **poll_ptr) { struct io_kiocb *req = pt->req; + unsigned long wqe_private = (unsigned long) req; /* * The file being polled uses multiple waitqueues for poll handling @@ -6113,6 +6186,8 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = -ENOMEM; return; } + /* mark as double wq entry */ + wqe_private |= 1; req->flags |= REQ_F_DOUBLE_POLL; io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; @@ -6123,7 +6198,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, req->flags |= REQ_F_SINGLE_POLL; pt->nr_entries++; poll->head = head; - poll->wait.private = req; + poll->wait.private = (void *) wqe_private; if (poll->events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(head, &poll->wait); @@ -6150,7 +6225,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req, INIT_HLIST_NODE(&req->hash_node); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; - poll->wait.private = req; ipt->pt._key = mask; ipt->req = req; @@ -6238,7 +6312,8 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) } else { mask |= POLLOUT | POLLWRNORM; } - + if (def->poll_exclusive) + mask |= EPOLLEXCLUSIVE; if (!(issue_flags & IO_URING_F_UNLOCKED) && !list_empty(&ctx->apoll_cache)) { apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, @@ -6254,6 +6329,8 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; + io_kbuf_recycle(req, issue_flags); + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); if (ret || ipt.error) return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; @@ -6281,6 +6358,7 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { if (io_match_task_safe(req, tsk, cancel_all)) { + hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } @@ -7075,8 +7153,11 @@ fail: static void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_BUFFER_SELECTED) + if (req->flags & REQ_F_BUFFER_SELECTED) { + spin_lock(&req->ctx->completion_lock); io_put_kbuf_comp(req); + spin_unlock(&req->ctx->completion_lock); + } if (req->flags & REQ_F_NEED_CLEANUP) { switch (req->opcode) { @@ -7505,11 +7586,9 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req) * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ - io_kbuf_recycle(req); io_queue_async_work(req, NULL); break; case IO_APOLL_OK: - io_kbuf_recycle(req); break; } @@ -8042,7 +8121,7 @@ static int io_sq_thread(void *data) } prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!io_sqd_events_pending(sqd) && !current->task_works) { + if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { bool needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { @@ -8053,6 +8132,13 @@ static int io_sq_thread(void *data) needs_sched = false; break; } + + /* + * Ensure the store of the wakeup flag is not + * reordered with the load of the SQ tail + */ + smp_mb(); + if (io_sqring_entries(ctx)) { needs_sched = false; break; @@ -8782,6 +8868,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fput(fpl->fp[i]); } else { kfree_skb(skb); + free_uid(fpl->user); kfree(fpl); } @@ -11097,7 +11184,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, hlist_for_each_entry(req, list, hash_node) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - req->task->task_works != NULL); + task_work_pending(req->task)); } seq_puts(m, "CqOverflowList:\n"); diff --git a/fs/ioctl.c b/fs/ioctl.c index 090bf47606ab..80ac36aea913 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -173,7 +173,7 @@ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, if (*len == 0) return -EINVAL; - if (start > maxbytes) + if (start >= maxbytes) return -EFBIG; /* diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index dffd2cac8113..8ce8720093b9 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -435,18 +435,17 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { struct iomap_page *iop = to_iomap_page(folio); struct inode *inode = folio->mapping->host; - size_t len; unsigned first, last, i; if (!iop) return false; - /* Limit range to this folio */ - len = min(folio_size(folio) - from, count); + /* Caller's range may extend past the end of this folio */ + count = min(folio_size(folio) - from, count); - /* First and last blocks in range within page */ + /* First and last blocks in range within folio */ first = from >> inode->i_blkbits; - last = (from + len - 1) >> inode->i_blkbits; + last = (from + count - 1) >> inode->i_blkbits; for (i = first; i <= last; i++) if (!test_bit(i, iop->uptodate)) @@ -765,7 +764,7 @@ again: * same page as we're writing to, without it being marked * up-to-date. 
*/ - if (unlikely(fault_in_iov_iter_readable(i, bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { status = -EFAULT; break; } @@ -1222,7 +1221,6 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = sector; - bio->bi_write_hint = inode->i_write_hint; wbc_init_bio(wbc, bio); ioend = container_of(bio, struct iomap_ioend, io_inline_bio); @@ -1253,7 +1251,6 @@ iomap_chain_bio(struct bio *prev) new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); bio_clone_blkg_association(new, prev); new->bi_iter.bi_sector = bio_end_sector(prev); - new->bi_write_hint = prev->bi_write_hint; bio_chain(prev, new); bio_get(prev); /* for iomap_finish_ioend */ diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 67cf9c16f80c..b08f5dc31780 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -315,7 +315,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); - bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index b288c8ae1236..837cd55fd4c5 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -415,13 +415,15 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); ret = -EIO; - goto out_free; + goto out_sum_exit; } jffs2_calc_trigger_levels(c); return 0; + out_sum_exit: + jffs2_sum_exit(c); out_free: kvfree(c->blocks); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2ac410477c4f..71f03a5d36ed 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -603,8 +603,8 @@ out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); kvfree(c->blocks); - out_inohash: jffs2_clear_xattr_subsystem(c); + out_inohash: kfree(c->inocache_list); out_wbuf: jffs2_flash_cleanup(c); diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 2e4a86763c07..93a2951538ce 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -18,11 +18,11 @@ #include <linux/mutex.h> struct jffs2_inode_info { - /* We need an internal mutex similar to inode->i_mutex. + /* We need an internal mutex similar to inode->i_rwsem. Unfortunately, we can't used the existing one, because either the GC would deadlock, or we'd have to release it before letting GC proceed. 
Or we'd have to put ugliness - into the GC code so it didn't attempt to obtain the i_mutex + into the GC code so it didn't attempt to obtain the i_rwsem for the inode(s) which are already locked */ struct mutex sem; diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index b676056826be..29671e33a171 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -136,7 +136,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) if (!s) { JFFS2_WARNING("Can't allocate memory for summary\n"); ret = -ENOMEM; - goto out; + goto out_buf; } } @@ -275,13 +275,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) } ret = 0; out: + jffs2_sum_reset_collected(s); + kfree(s); + out_buf: if (buf_size) kfree(flashbuf); #ifndef __ECOS else mtd_unpoint(c->mtd, 0, c->mtd->size); #endif - kfree(s); return ret; } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 27be2e8ba237..d1943a7b4b04 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -146,12 +146,13 @@ void jfs_evict_inode(struct inode *inode) dquot_initialize(inode); if (JFS_IP(inode)->fileset == FILESYSTEM_I) { + struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap; truncate_inode_pages_final(&inode->i_data); if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); - if (JFS_SBI(inode->i_sb)->ipimap) + if (ipimap && JFS_IP(ipimap)->i_imap) diFree(inode); /* diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 91f4ec93dab1..d8502f4989d9 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -148,6 +148,7 @@ static const s8 budtab[256] = { * 0 - success * -ENOMEM - insufficient memory * -EIO - i/o error + * -EINVAL - wrong bmap data */ int dbMount(struct inode *ipbmap) { @@ -179,6 +180,12 @@ int dbMount(struct inode *ipbmap) bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); + if (!bmp->db_numag) { + release_metapage(mp); + kfree(bmp); + return -EINVAL; + } + bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index e6d9772ddb4c..61a8edc4ba8b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -971,6 +971,15 @@ void kernfs_destroy_root(struct kernfs_root *root) } /** + * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root + * @root: root to use to lookup + */ +struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) +{ + return root->kn; +} + +/** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 9414a7a60a9f..88423069407c 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -120,13 +120,8 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; - } else { - /* - * The same behavior and code as single_open(). Returns - * !NULL if pos is at the beginning; otherwise, NULL. - */ - return NULL + !*ppos; } + return single_start(sf, ppos); } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) @@ -1002,7 +997,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, #endif /* - * kn->attr.ops is accesible only while holding active ref. We + * kn->attr.ops is accessible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. 
*/ diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index f9cc912c31e1..eeaa779b929c 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -31,6 +31,24 @@ struct kernfs_iattrs { atomic_t user_xattr_size; }; +struct kernfs_root { + /* published fields */ + struct kernfs_node *kn; + unsigned int flags; /* KERNFS_ROOT_* flags */ + + /* private fields, do not use outside kernfs proper */ + struct idr ino_idr; + u32 last_id_lowbits; + u32 id_highbits; + struct kernfs_syscall_ops *syscall_ops; + + /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ + struct list_head supers; + + wait_queue_head_t deactivate_waitq; + struct rw_semaphore kernfs_rwsem; +}; + /* +1 to avoid triggering overflow warning when negating it */ #define KN_DEACTIVATED_BIAS (INT_MIN + 1) @@ -122,7 +140,6 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ -extern struct rw_semaphore kernfs_rwsem; extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index 71bfb7de4472..ebe6ca08467a 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -241,7 +241,7 @@ struct ksmbd_rpc_command { struct ksmbd_spnego_authen_request { __u32 handle; __u16 spnego_blob_len; /* the length of spnego_blob */ - __u8 spnego_blob[0]; /* + __u8 spnego_blob[]; /* * the GSS token from SecurityBuffer of * SMB2 SESSION SETUP request */ diff --git a/fs/ksmbd/ntlmssp.h b/fs/ksmbd/ntlmssp.h index adaf4c0cbe8f..f13153c18b4e 100644 --- a/fs/ksmbd/ntlmssp.h +++ b/fs/ksmbd/ntlmssp.h @@ -95,7 +95,7 @@ struct security_buffer { struct target_info { __le16 Type; __le16 Length; - __u8 Content[0]; + __u8 Content[]; } __packed; struct negotiate_message { @@ -108,7 +108,7 @@ struct negotiate_message { * struct security_buffer for version info not present since we * do not set the version is present flag */ - char DomainString[0]; + char DomainString[]; /* followed by WorkstationString */ } __packed; @@ -140,7 +140,7 @@ struct authenticate_message { * struct security_buffer for version info not present since we * do not set the version is present flag */ - char UserString[0]; + char UserString[]; } __packed; struct ntlmv2_resp { diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 077b8761d099..23871b18a429 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -656,8 +656,8 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) rsp->OplockLevel = SMB2_OPLOCK_LEVEL_NONE; rsp->Reserved = 0; rsp->Reserved2 = 0; - rsp->PersistentFid = cpu_to_le64(fp->persistent_id); - rsp->VolatileFid = cpu_to_le64(fp->volatile_id); + rsp->PersistentFid = fp->persistent_id; + rsp->VolatileFid = fp->volatile_id; inc_rfc1001_len(work->response_buf, 24); diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index 2e12f6d8483b..4cd03d661df0 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -585,7 +585,7 @@ static int __init ksmbd_server_init(void) if (ret) goto err_crypto_destroy; - pr_warn_once("The ksmbd server is experimental, use at your own risk.\n"); + pr_warn_once("The ksmbd server is experimental\n"); return 0; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 67e8e28e3fc3..3bf6c56c654c 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -377,12 +377,8 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work) * command in the compound request */ if (req->Command 
== SMB2_CREATE && rsp->Status == STATUS_SUCCESS) { - work->compound_fid = - le64_to_cpu(((struct smb2_create_rsp *)rsp)-> - VolatileFileId); - work->compound_pfid = - le64_to_cpu(((struct smb2_create_rsp *)rsp)-> - PersistentFileId); + work->compound_fid = ((struct smb2_create_rsp *)rsp)->VolatileFileId; + work->compound_pfid = ((struct smb2_create_rsp *)rsp)->PersistentFileId; work->compound_sid = le64_to_cpu(rsp->SessionId); } @@ -2129,7 +2125,7 @@ static noinline int create_smb2_pipe(struct ksmbd_work *work) rsp->EndofFile = cpu_to_le64(0); rsp->FileAttributes = FILE_ATTRIBUTE_NORMAL_LE; rsp->Reserved2 = 0; - rsp->VolatileFileId = cpu_to_le64(id); + rsp->VolatileFileId = id; rsp->PersistentFileId = 0; rsp->CreateContextsOffset = 0; rsp->CreateContextsLength = 0; @@ -3157,8 +3153,8 @@ int smb2_open(struct ksmbd_work *work) rsp->Reserved2 = 0; - rsp->PersistentFileId = cpu_to_le64(fp->persistent_id); - rsp->VolatileFileId = cpu_to_le64(fp->volatile_id); + rsp->PersistentFileId = fp->persistent_id; + rsp->VolatileFileId = fp->volatile_id; rsp->CreateContextsOffset = 0; rsp->CreateContextsLength = 0; @@ -3865,9 +3861,7 @@ int smb2_query_dir(struct ksmbd_work *work) goto err_out2; } - dir_fp = ksmbd_lookup_fd_slow(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + dir_fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!dir_fp) { rc = -EBADF; goto err_out2; @@ -4088,12 +4082,12 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess, * Windows can sometime send query file info request on * pipe without opening it, checking error condition here */ - id = le64_to_cpu(req->VolatileFileId); + id = req->VolatileFileId; if (!ksmbd_session_rpc_method(sess, id)) return -ENOENT; ksmbd_debug(SMB, "FileInfoClass %u, FileId 0x%llx\n", - req->FileInfoClass, le64_to_cpu(req->VolatileFileId)); + req->FileInfoClass, req->VolatileFileId); switch (req->FileInfoClass) { case FILE_STANDARD_INFORMATION: @@ -4738,7 +4732,7 @@ static int smb2_get_info_file(struct ksmbd_work *work, } if (work->next_smb2_rcv_hdr_off) { - if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + if (!has_file_id(req->VolatileFileId)) { ksmbd_debug(SMB, "Compound request set FID = %llu\n", work->compound_fid); id = work->compound_fid; @@ -4747,8 +4741,8 @@ static int smb2_get_info_file(struct ksmbd_work *work, } if (!has_file_id(id)) { - id = le64_to_cpu(req->VolatileFileId); - pid = le64_to_cpu(req->PersistentFileId); + id = req->VolatileFileId; + pid = req->PersistentFileId; } fp = ksmbd_lookup_fd_slow(work, id, pid); @@ -5113,7 +5107,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work, } if (work->next_smb2_rcv_hdr_off) { - if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + if (!has_file_id(req->VolatileFileId)) { ksmbd_debug(SMB, "Compound request set FID = %llu\n", work->compound_fid); id = work->compound_fid; @@ -5122,8 +5116,8 @@ static int smb2_get_info_sec(struct ksmbd_work *work, } if (!has_file_id(id)) { - id = le64_to_cpu(req->VolatileFileId); - pid = le64_to_cpu(req->PersistentFileId); + id = req->VolatileFileId; + pid = req->PersistentFileId; } fp = ksmbd_lookup_fd_slow(work, id, pid); @@ -5221,7 +5215,7 @@ static noinline int smb2_close_pipe(struct ksmbd_work *work) struct smb2_close_req *req = smb2_get_msg(work->request_buf); struct smb2_close_rsp *rsp = smb2_get_msg(work->response_buf); - id = le64_to_cpu(req->VolatileFileId); + id = req->VolatileFileId; ksmbd_session_rpc_close(work->sess, id); rsp->StructureSize = cpu_to_le16(60); @@ 
-5280,7 +5274,7 @@ int smb2_close(struct ksmbd_work *work) } if (work->next_smb2_rcv_hdr_off && - !has_file_id(le64_to_cpu(req->VolatileFileId))) { + !has_file_id(req->VolatileFileId)) { if (!has_file_id(work->compound_fid)) { /* file already closed, return FILE_CLOSED */ ksmbd_debug(SMB, "file already closed\n"); @@ -5299,7 +5293,7 @@ int smb2_close(struct ksmbd_work *work) work->compound_pfid = KSMBD_NO_FID; } } else { - volatile_id = le64_to_cpu(req->VolatileFileId); + volatile_id = req->VolatileFileId; } ksmbd_debug(SMB, "volatile_id = %llu\n", volatile_id); @@ -5988,7 +5982,7 @@ int smb2_set_info(struct ksmbd_work *work) if (work->next_smb2_rcv_hdr_off) { req = ksmbd_req_buf_next(work); rsp = ksmbd_resp_buf_next(work); - if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + if (!has_file_id(req->VolatileFileId)) { ksmbd_debug(SMB, "Compound request set FID = %llu\n", work->compound_fid); id = work->compound_fid; @@ -6000,8 +5994,8 @@ int smb2_set_info(struct ksmbd_work *work) } if (!has_file_id(id)) { - id = le64_to_cpu(req->VolatileFileId); - pid = le64_to_cpu(req->PersistentFileId); + id = req->VolatileFileId; + pid = req->PersistentFileId; } fp = ksmbd_lookup_fd_slow(work, id, pid); @@ -6079,7 +6073,7 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) struct smb2_read_req *req = smb2_get_msg(work->request_buf); struct smb2_read_rsp *rsp = smb2_get_msg(work->response_buf); - id = le64_to_cpu(req->VolatileFileId); + id = req->VolatileFileId; inc_rfc1001_len(work->response_buf, 16); rpc_resp = ksmbd_rpc_read(work->sess, id); @@ -6215,8 +6209,7 @@ int smb2_read(struct ksmbd_work *work) goto out; } - fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!fp) { err = -ENOENT; goto out; @@ -6335,7 +6328,7 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) size_t length; length = le32_to_cpu(req->Length); - id = le64_to_cpu(req->VolatileFileId); + id = req->VolatileFileId; if (le16_to_cpu(req->DataOffset) == offsetof(struct smb2_write_req, Buffer)) { @@ -6471,8 +6464,7 @@ int smb2_write(struct ksmbd_work *work) goto out; } - fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!fp) { err = -ENOENT; goto out; @@ -6584,12 +6576,9 @@ int smb2_flush(struct ksmbd_work *work) WORK_BUFFERS(work, req, rsp); - ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n", - le64_to_cpu(req->VolatileFileId)); + ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n", req->VolatileFileId); - err = ksmbd_vfs_fsync(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + err = ksmbd_vfs_fsync(work, req->VolatileFileId, req->PersistentFileId); if (err) goto out; @@ -6618,8 +6607,7 @@ int smb2_cancel(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; struct smb2_hdr *hdr = smb2_get_msg(work->request_buf); struct smb2_hdr *chdr; - struct ksmbd_work *cancel_work = NULL; - int canceled = 0; + struct ksmbd_work *cancel_work = NULL, *iter; struct list_head *command_list; ksmbd_debug(SMB, "smb2 cancel called on mid %llu, async flags 0x%x\n", @@ -6629,11 +6617,11 @@ int smb2_cancel(struct ksmbd_work *work) command_list = &conn->async_requests; spin_lock(&conn->request_lock); - list_for_each_entry(cancel_work, command_list, + list_for_each_entry(iter, command_list, async_request_entry) { - chdr = 
smb2_get_msg(cancel_work->request_buf); + chdr = smb2_get_msg(iter->request_buf); - if (cancel_work->async_id != + if (iter->async_id != le64_to_cpu(hdr->Id.AsyncId)) continue; @@ -6641,7 +6629,7 @@ int smb2_cancel(struct ksmbd_work *work) "smb2 with AsyncId %llu cancelled command = 0x%x\n", le64_to_cpu(hdr->Id.AsyncId), le16_to_cpu(chdr->Command)); - canceled = 1; + cancel_work = iter; break; } spin_unlock(&conn->request_lock); @@ -6649,24 +6637,24 @@ int smb2_cancel(struct ksmbd_work *work) command_list = &conn->requests; spin_lock(&conn->request_lock); - list_for_each_entry(cancel_work, command_list, request_entry) { - chdr = smb2_get_msg(cancel_work->request_buf); + list_for_each_entry(iter, command_list, request_entry) { + chdr = smb2_get_msg(iter->request_buf); if (chdr->MessageId != hdr->MessageId || - cancel_work == work) + iter == work) continue; ksmbd_debug(SMB, "smb2 with mid %llu cancelled command = 0x%x\n", le64_to_cpu(hdr->MessageId), le16_to_cpu(chdr->Command)); - canceled = 1; + cancel_work = iter; break; } spin_unlock(&conn->request_lock); } - if (canceled) { + if (cancel_work) { cancel_work->state = KSMBD_WORK_CANCELLED; if (cancel_work->cancel_fn) cancel_work->cancel_fn(cancel_work->cancel_argv); @@ -6804,12 +6792,9 @@ int smb2_lock(struct ksmbd_work *work) int prior_lock = 0; ksmbd_debug(SMB, "Received lock request\n"); - fp = ksmbd_lookup_fd_slow(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!fp) { - ksmbd_debug(SMB, "Invalid file id for lock : %llu\n", - le64_to_cpu(req->VolatileFileId)); + ksmbd_debug(SMB, "Invalid file id for lock : %llu\n", req->VolatileFileId); err = -ENOENT; goto out2; } @@ -7164,8 +7149,8 @@ static int fsctl_copychunk(struct ksmbd_work *work, ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0]; - rsp->VolatileFileId = cpu_to_le64(volatile_id); - rsp->PersistentFileId = cpu_to_le64(persistent_id); + rsp->VolatileFileId = volatile_id; + rsp->PersistentFileId = persistent_id; ci_rsp->ChunksWritten = cpu_to_le32(ksmbd_server_side_copy_max_chunk_count()); ci_rsp->ChunkBytesWritten = @@ -7379,8 +7364,8 @@ ipv6_retry: if (nii_rsp) nii_rsp->Next = 0; - rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID); - rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); + rsp->PersistentFileId = SMB2_NO_FID; + rsp->VolatileFileId = SMB2_NO_FID; return nbytes; } @@ -7547,9 +7532,7 @@ static int fsctl_request_resume_key(struct ksmbd_work *work, { struct ksmbd_file *fp; - fp = ksmbd_lookup_fd_slow(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!fp) return -ENOENT; @@ -7579,7 +7562,7 @@ int smb2_ioctl(struct ksmbd_work *work) if (work->next_smb2_rcv_hdr_off) { req = ksmbd_req_buf_next(work); rsp = ksmbd_resp_buf_next(work); - if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + if (!has_file_id(req->VolatileFileId)) { ksmbd_debug(SMB, "Compound request set FID = %llu\n", work->compound_fid); id = work->compound_fid; @@ -7590,14 +7573,14 @@ int smb2_ioctl(struct ksmbd_work *work) } if (!has_file_id(id)) - id = le64_to_cpu(req->VolatileFileId); + id = req->VolatileFileId; if (req->Flags != cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL)) { rsp->hdr.Status = STATUS_NOT_SUPPORTED; goto out; } - cnt_code = le32_to_cpu(req->CntCode); + cnt_code = le32_to_cpu(req->CtlCode); ret = smb2_calc_max_out_buf_len(work, 48, le32_to_cpu(req->MaxOutputResponse)); if (ret 
< 0) { @@ -7656,8 +7639,8 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; nbytes = sizeof(struct validate_negotiate_info_rsp); - rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID); - rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); + rsp->PersistentFileId = SMB2_NO_FID; + rsp->VolatileFileId = SMB2_NO_FID; break; case FSCTL_QUERY_NETWORK_INTERFACE_INFO: ret = fsctl_query_iface_info_ioctl(conn, rsp, out_buf_len); @@ -7703,10 +7686,10 @@ int smb2_ioctl(struct ksmbd_work *work) rsp->PersistentFileId = req->PersistentFileId; fsctl_copychunk(work, (struct copychunk_ioctl_req *)&req->Buffer[0], - le32_to_cpu(req->CntCode), + le32_to_cpu(req->CtlCode), le32_to_cpu(req->InputCount), - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId), + req->VolatileFileId, + req->PersistentFileId, rsp); break; case FSCTL_SET_SPARSE: @@ -7857,7 +7840,7 @@ dup_ext_out: goto out; } - rsp->CntCode = cpu_to_le32(cnt_code); + rsp->CtlCode = cpu_to_le32(cnt_code); rsp->InputCount = cpu_to_le32(0); rsp->InputOffset = cpu_to_le32(112); rsp->OutputOffset = cpu_to_le32(112); @@ -7903,8 +7886,8 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) char req_oplevel = 0, rsp_oplevel = 0; unsigned int oplock_change_type; - volatile_id = le64_to_cpu(req->VolatileFid); - persistent_id = le64_to_cpu(req->PersistentFid); + volatile_id = req->VolatileFid; + persistent_id = req->PersistentFid; req_oplevel = req->OplockLevel; ksmbd_debug(OPLOCK, "v_id %llu, p_id %llu request oplock level %d\n", volatile_id, persistent_id, req_oplevel); @@ -7999,8 +7982,8 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) rsp->OplockLevel = rsp_oplevel; rsp->Reserved = 0; rsp->Reserved2 = 0; - rsp->VolatileFid = cpu_to_le64(volatile_id); - rsp->PersistentFid = cpu_to_le64(persistent_id); + rsp->VolatileFid = volatile_id; + rsp->PersistentFid = persistent_id; inc_rfc1001_len(work->response_buf, 24); return; @@ -8500,7 +8483,7 @@ static void fill_transform_hdr(void *tr_buf, char *old_buf, __le16 cipher_type) struct smb2_hdr *hdr = smb2_get_msg(old_buf); unsigned int orig_len = get_rfc1002_len(old_buf); - memset(tr_buf, 0, sizeof(struct smb2_transform_hdr) + 4); + /* tr_buf must be cleared by the caller */ tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len); tr_hdr->Flags = cpu_to_le16(TRANSFORM_FLAG_ENCRYPTED); diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 725b800c29c8..af455278d005 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -16,42 +16,13 @@ #define FILE_CREATED 0x00000002 #define FILE_OVERWRITTEN 0x00000003 -/* - * Size of the session key (crypto key encrypted with the password - */ -#define SMB2_NTLMV2_SESSKEY_SIZE 16 -#define SMB2_SIGNATURE_SIZE 16 -#define SMB2_HMACSHA256_SIZE 32 -#define SMB2_CMACAES_SIZE 16 -#define SMB3_GCM128_CRYPTKEY_SIZE 16 -#define SMB3_GCM256_CRYPTKEY_SIZE 32 - -/* - * Size of the smb3 encryption/decryption keys - */ -#define SMB3_ENC_DEC_KEY_SIZE 32 - -/* - * Size of the smb3 signing key - */ -#define SMB3_SIGN_KEY_SIZE 16 - -#define CIFS_CLIENT_CHALLENGE_SIZE 8 -#define SMB_SERVER_CHALLENGE_SIZE 8 - /* SMB2 Max Credits */ #define SMB2_MAX_CREDITS 8192 -/* Maximum buffer size value we can send with 1 credit */ -#define SMB2_MAX_BUFFER_SIZE 65536 - -#define NUMBER_OF_SMB2_COMMANDS 0x0013 - /* BB FIXME - analyze following length BB */ #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ #define SMB21_DEFAULT_IOSIZE (1024 * 1024) -#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) 
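The ksmbd hunks above all make the same change: with the SMB2 FileId fields now declared __u64 instead of __le64, the server treats them as opaque 64-bit tokens and drops the le64_to_cpu()/cpu_to_le64() round trips at every use. A minimal sketch of the resulting convention; the structure is abbreviated and the helper name is invented:

struct smb2_fileid_pair {			/* abbreviated, for illustration */
	__u64 PersistentFileId;			/* kept exactly as it appears on the wire */
	__u64 VolatileFileId;
};

static void example_echo_fileids(const struct smb2_fileid_pair *req,
				 struct smb2_fileid_pair *rsp)
{
	/* The IDs are server-generated cookies, so they are passed through
	 * lookups and echoed back verbatim, with no endian conversion.
	 */
	rsp->PersistentFileId = req->PersistentFileId;
	rsp->VolatileFileId = req->VolatileFileId;
}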
#define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024) #define SMB3_MIN_IOSIZE (64 * 1024) #define SMB3_MAX_IOSIZE (8 * 1024 * 1024) @@ -65,18 +36,6 @@ * */ -#define SMB2_ERROR_STRUCTURE_SIZE2 9 -#define SMB2_ERROR_STRUCTURE_SIZE2_LE cpu_to_le16(SMB2_ERROR_STRUCTURE_SIZE2) - -struct smb2_err_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; - __u8 ErrorContextCount; - __u8 Reserved; - __le32 ByteCount; /* even if zero, at least one byte follows */ - __u8 ErrorData[1]; /* variable length */ -} __packed; - struct preauth_integrity_info { /* PreAuth integrity Hash ID */ __le16 Preauth_HashId; @@ -116,8 +75,8 @@ struct create_durable_reconn_req { union { __u8 Reserved[16]; struct { - __le64 PersistentFileId; - __le64 VolatileFileId; + __u64 PersistentFileId; + __u64 VolatileFileId; } Fid; } Data; } __packed; @@ -126,8 +85,8 @@ struct create_durable_reconn_v2_req { struct create_context ccontext; __u8 Name[8]; struct { - __le64 PersistentFileId; - __le64 VolatileFileId; + __u64 PersistentFileId; + __u64 VolatileFileId; } Fid; __u8 CreateGuid[16]; __le32 Flags; @@ -161,13 +120,6 @@ struct create_alloc_size_req { __le64 AllocationSize; } __packed; -struct create_posix { - struct create_context ccontext; - __u8 Name[16]; - __le32 Mode; - __u32 Reserved; -} __packed; - struct create_durable_rsp { struct create_context ccontext; __u8 Name[8]; @@ -209,45 +161,6 @@ struct create_posix_rsp { u8 SidBuffer[40]; } __packed; -#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00) -#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01) -#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02) -#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04) - -#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02) - -#define SMB2_LEASE_KEY_SIZE 16 - -struct lease_context { - __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; - __le32 LeaseState; - __le32 LeaseFlags; - __le64 LeaseDuration; -} __packed; - -struct lease_context_v2 { - __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; - __le32 LeaseState; - __le32 LeaseFlags; - __le64 LeaseDuration; - __u8 ParentLeaseKey[SMB2_LEASE_KEY_SIZE]; - __le16 Epoch; - __le16 Reserved; -} __packed; - -struct create_lease { - struct create_context ccontext; - __u8 Name[8]; - struct lease_context lcontext; -} __packed; - -struct create_lease_v2 { - struct create_context ccontext; - __u8 Name[8]; - struct lease_context_v2 lcontext; - __u8 Pad[4]; -} __packed; - struct smb2_buffer_desc_v1 { __le64 offset; __le32 token; @@ -256,63 +169,6 @@ struct smb2_buffer_desc_v1 { #define SMB2_0_IOCTL_IS_FSCTL 0x00000001 -struct duplicate_extents_to_file { - __u64 PersistentFileHandle; /* source file handle, opaque endianness */ - __u64 VolatileFileHandle; - __le64 SourceFileOffset; - __le64 TargetFileOffset; - __le64 ByteCount; /* Bytes to be copied */ -} __packed; - -struct smb2_ioctl_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 57 */ - __le16 Reserved; /* offset from start of SMB2 header to write data */ - __le32 CntCode; - __le64 PersistentFileId; - __le64 VolatileFileId; - __le32 InputOffset; /* Reserved MBZ */ - __le32 InputCount; - __le32 MaxInputResponse; - __le32 OutputOffset; - __le32 OutputCount; - __le32 MaxOutputResponse; - __le32 Flags; - __le32 Reserved2; - __u8 Buffer[1]; -} __packed; - -struct smb2_ioctl_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 49 */ - __le16 Reserved; /* offset from start of SMB2 header to write data */ - __le32 CntCode; - __le64 PersistentFileId; - __le64 VolatileFileId; - __le32 InputOffset; /* Reserved MBZ */ - __le32 InputCount; - __le32 OutputOffset; - 
__le32 OutputCount; - __le32 Flags; - __le32 Reserved2; - __u8 Buffer[1]; -} __packed; - -struct validate_negotiate_info_req { - __le32 Capabilities; - __u8 Guid[SMB2_CLIENT_GUID_SIZE]; - __le16 SecurityMode; - __le16 DialectCount; - __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */ -} __packed; - -struct validate_negotiate_info_rsp { - __le32 Capabilities; - __u8 Guid[SMB2_CLIENT_GUID_SIZE]; - __le16 SecurityMode; - __le16 Dialect; /* Dialect in use for the connection */ -} __packed; - struct smb_sockaddr_in { __be16 Port; __be32 IPv4address; @@ -357,7 +213,7 @@ struct file_object_buf_type1_ioctl_rsp { } __packed; struct resume_key_ioctl_rsp { - __le64 ResumeKey[3]; + __u64 ResumeKey[3]; __le32 ContextLength; __u8 Context[4]; /* ignored, Windows sets to 4 bytes of zero */ } __packed; @@ -386,167 +242,6 @@ struct file_sparse { __u8 SetSparse; } __packed; -struct file_zero_data_information { - __le64 FileOffset; - __le64 BeyondFinalZero; -} __packed; - -struct file_allocated_range_buffer { - __le64 file_offset; - __le64 length; -} __packed; - -struct reparse_data_buffer { - __le32 ReparseTag; - __le16 ReparseDataLength; - __u16 Reserved; - __u8 DataBuffer[]; /* Variable Length */ -} __packed; - -/* SMB2 Notify Action Flags */ -#define FILE_ACTION_ADDED 0x00000001 -#define FILE_ACTION_REMOVED 0x00000002 -#define FILE_ACTION_MODIFIED 0x00000003 -#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004 -#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005 -#define FILE_ACTION_ADDED_STREAM 0x00000006 -#define FILE_ACTION_REMOVED_STREAM 0x00000007 -#define FILE_ACTION_MODIFIED_STREAM 0x00000008 -#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009 - -#define SMB2_LOCKFLAG_SHARED 0x0001 -#define SMB2_LOCKFLAG_EXCLUSIVE 0x0002 -#define SMB2_LOCKFLAG_UNLOCK 0x0004 -#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010 -#define SMB2_LOCKFLAG_MASK 0x0007 - -struct smb2_lock_element { - __le64 Offset; - __le64 Length; - __le32 Flags; - __le32 Reserved; -} __packed; - -struct smb2_lock_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 48 */ - __le16 LockCount; - __le32 Reserved; - __le64 PersistentFileId; - __le64 VolatileFileId; - /* Followed by at least one */ - struct smb2_lock_element locks[1]; -} __packed; - -struct smb2_lock_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -struct smb2_echo_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __u16 Reserved; -} __packed; - -struct smb2_echo_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 4 */ - __u16 Reserved; -} __packed; - -/* search (query_directory) Flags field */ -#define SMB2_RESTART_SCANS 0x01 -#define SMB2_RETURN_SINGLE_ENTRY 0x02 -#define SMB2_INDEX_SPECIFIED 0x04 -#define SMB2_REOPEN 0x10 - -struct smb2_query_directory_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 33 */ - __u8 FileInformationClass; - __u8 Flags; - __le32 FileIndex; - __le64 PersistentFileId; - __le64 VolatileFileId; - __le16 FileNameOffset; - __le16 FileNameLength; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -struct smb2_query_directory_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -/* Possible InfoType values */ -#define SMB2_O_INFO_FILE 0x01 -#define SMB2_O_INFO_FILESYSTEM 0x02 -#define SMB2_O_INFO_SECURITY 0x03 -#define SMB2_O_INFO_QUOTA 0x04 - -/* Security info type additionalinfo flags. 
See MS-SMB2 (2.2.37) or MS-DTYP */ -#define OWNER_SECINFO 0x00000001 -#define GROUP_SECINFO 0x00000002 -#define DACL_SECINFO 0x00000004 -#define SACL_SECINFO 0x00000008 -#define LABEL_SECINFO 0x00000010 -#define ATTRIBUTE_SECINFO 0x00000020 -#define SCOPE_SECINFO 0x00000040 -#define BACKUP_SECINFO 0x00010000 -#define UNPROTECTED_SACL_SECINFO 0x10000000 -#define UNPROTECTED_DACL_SECINFO 0x20000000 -#define PROTECTED_SACL_SECINFO 0x40000000 -#define PROTECTED_DACL_SECINFO 0x80000000 - -struct smb2_query_info_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 41 */ - __u8 InfoType; - __u8 FileInfoClass; - __le32 OutputBufferLength; - __le16 InputBufferOffset; - __u16 Reserved; - __le32 InputBufferLength; - __le32 AdditionalInformation; - __le32 Flags; - __le64 PersistentFileId; - __le64 VolatileFileId; - __u8 Buffer[1]; -} __packed; - -struct smb2_query_info_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; -} __packed; - -struct smb2_set_info_req { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 33 */ - __u8 InfoType; - __u8 FileInfoClass; - __le32 BufferLength; - __le16 BufferOffset; - __u16 Reserved; - __le32 AdditionalInformation; - __le64 PersistentFileId; - __le64 VolatileFileId; - __u8 Buffer[1]; -} __packed; - -struct smb2_set_info_rsp { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 2 */ -} __packed; - /* FILE Info response size */ #define FILE_DIRECTORY_INFORMATION_SIZE 1 #define FILE_FULL_DIRECTORY_INFORMATION_SIZE 2 @@ -602,145 +297,11 @@ struct fs_type_info { long magic_number; } __packed; -struct smb2_oplock_break { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 24 */ - __u8 OplockLevel; - __u8 Reserved; - __le32 Reserved2; - __le64 PersistentFid; - __le64 VolatileFid; -} __packed; - -#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) - -struct smb2_lease_break { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 44 */ - __le16 Epoch; - __le32 Flags; - __u8 LeaseKey[16]; - __le32 CurrentLeaseState; - __le32 NewLeaseState; - __le32 BreakReason; - __le32 AccessMaskHint; - __le32 ShareMaskHint; -} __packed; - -struct smb2_lease_ack { - struct smb2_hdr hdr; - __le16 StructureSize; /* Must be 36 */ - __le16 Reserved; - __le32 Flags; - __u8 LeaseKey[16]; - __le32 LeaseState; - __le64 LeaseDuration; -} __packed; - /* - * PDU infolevel structure definitions + * PDU query infolevel structure definitions * BB consider moving to a different header */ -/* File System Information Classes */ -#define FS_VOLUME_INFORMATION 1 /* Query */ -#define FS_LABEL_INFORMATION 2 /* Set */ -#define FS_SIZE_INFORMATION 3 /* Query */ -#define FS_DEVICE_INFORMATION 4 /* Query */ -#define FS_ATTRIBUTE_INFORMATION 5 /* Query */ -#define FS_CONTROL_INFORMATION 6 /* Query, Set */ -#define FS_FULL_SIZE_INFORMATION 7 /* Query */ -#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ -#define FS_DRIVER_PATH_INFORMATION 9 /* Query */ -#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */ -#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. 
Query */ - -struct smb2_fs_full_size_info { - __le64 TotalAllocationUnits; - __le64 CallerAvailableAllocationUnits; - __le64 ActualAvailableAllocationUnits; - __le32 SectorsPerAllocationUnit; - __le32 BytesPerSector; -} __packed; - -#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001 -#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002 -#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004 -#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008 - -/* sector size info struct */ -struct smb3_fs_ss_info { - __le32 LogicalBytesPerSector; - __le32 PhysicalBytesPerSectorForAtomicity; - __le32 PhysicalBytesPerSectorForPerf; - __le32 FSEffPhysicalBytesPerSectorForAtomicity; - __le32 Flags; - __le32 ByteOffsetForSectorAlignment; - __le32 ByteOffsetForPartitionAlignment; -} __packed; - -/* File System Control Information */ -struct smb2_fs_control_info { - __le64 FreeSpaceStartFiltering; - __le64 FreeSpaceThreshold; - __le64 FreeSpaceStopFiltering; - __le64 DefaultQuotaThreshold; - __le64 DefaultQuotaLimit; - __le32 FileSystemControlFlags; - __le32 Padding; -} __packed; - -/* partial list of QUERY INFO levels */ -#define FILE_DIRECTORY_INFORMATION 1 -#define FILE_FULL_DIRECTORY_INFORMATION 2 -#define FILE_BOTH_DIRECTORY_INFORMATION 3 -#define FILE_BASIC_INFORMATION 4 -#define FILE_STANDARD_INFORMATION 5 -#define FILE_INTERNAL_INFORMATION 6 -#define FILE_EA_INFORMATION 7 -#define FILE_ACCESS_INFORMATION 8 -#define FILE_NAME_INFORMATION 9 -#define FILE_RENAME_INFORMATION 10 -#define FILE_LINK_INFORMATION 11 -#define FILE_NAMES_INFORMATION 12 -#define FILE_DISPOSITION_INFORMATION 13 -#define FILE_POSITION_INFORMATION 14 -#define FILE_FULL_EA_INFORMATION 15 -#define FILE_MODE_INFORMATION 16 -#define FILE_ALIGNMENT_INFORMATION 17 -#define FILE_ALL_INFORMATION 18 -#define FILE_ALLOCATION_INFORMATION 19 -#define FILE_END_OF_FILE_INFORMATION 20 -#define FILE_ALTERNATE_NAME_INFORMATION 21 -#define FILE_STREAM_INFORMATION 22 -#define FILE_PIPE_INFORMATION 23 -#define FILE_PIPE_LOCAL_INFORMATION 24 -#define FILE_PIPE_REMOTE_INFORMATION 25 -#define FILE_MAILSLOT_QUERY_INFORMATION 26 -#define FILE_MAILSLOT_SET_INFORMATION 27 -#define FILE_COMPRESSION_INFORMATION 28 -#define FILE_OBJECT_ID_INFORMATION 29 -/* Number 30 not defined in documents */ -#define FILE_MOVE_CLUSTER_INFORMATION 31 -#define FILE_QUOTA_INFORMATION 32 -#define FILE_REPARSE_POINT_INFORMATION 33 -#define FILE_NETWORK_OPEN_INFORMATION 34 -#define FILE_ATTRIBUTE_TAG_INFORMATION 35 -#define FILE_TRACKING_INFORMATION 36 -#define FILEID_BOTH_DIRECTORY_INFORMATION 37 -#define FILEID_FULL_DIRECTORY_INFORMATION 38 -#define FILE_VALID_DATA_LENGTH_INFORMATION 39 -#define FILE_SHORT_NAME_INFORMATION 40 -#define FILE_SFIO_RESERVE_INFORMATION 44 -#define FILE_SFIO_VOLUME_INFORMATION 45 -#define FILE_HARD_LINK_INFORMATION 46 -#define FILE_NORMALIZED_NAME_INFORMATION 48 -#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 -#define FILE_STANDARD_LINK_INFORMATION 54 - -#define OP_BREAK_STRUCT_SIZE_20 24 -#define OP_BREAK_STRUCT_SIZE_21 36 - struct smb2_file_access_info { __le32 AccessFlags; } __packed; @@ -749,56 +310,6 @@ struct smb2_file_alignment_info { __le32 AlignmentRequirement; } __packed; -struct smb2_file_internal_info { - __le64 IndexNumber; -} __packed; /* level 6 Query */ - -struct smb2_file_rename_info { /* encoding of request for level 10 */ - __u8 ReplaceIfExists; /* 1 = replace existing target with new */ - /* 0 = fail if target already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) 
*/ - __le32 FileNameLength; - char FileName[0]; /* New name to be assigned */ -} __packed; /* level 10 Set */ - -struct smb2_file_link_info { /* encoding of request for level 11 */ - __u8 ReplaceIfExists; /* 1 = replace existing link with new */ - /* 0 = fail if link already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; - char FileName[0]; /* Name to be assigned to new link */ -} __packed; /* level 11 Set */ - -/* - * This level 18, although with struct with same name is different from cifs - * level 0x107. Level 0x107 has an extra u64 between AccessFlags and - * CurrentByteOffset. - */ -struct smb2_file_all_info { /* data block encoding of response to level 18 */ - __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le32 Attributes; - __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */ - __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ - __le64 EndOfFile; /* size ie offset to first free byte in file */ - __le32 NumberOfLinks; /* hard links */ - __u8 DeletePending; - __u8 Directory; - __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */ - __le64 IndexNumber; - __le32 EASize; - __le32 AccessFlags; - __le64 CurrentByteOffset; - __le32 Mode; - __le32 AlignmentRequirement; - __le32 FileNameLength; - char FileName[1]; -} __packed; /* level 18 Query */ - struct smb2_file_basic_info { /* data block encoding of response to level 18 */ __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ __le64 LastAccessTime; @@ -810,7 +321,7 @@ struct smb2_file_basic_info { /* data block encoding of response to level 18 */ struct smb2_file_alt_name_info { __le32 FileNameLength; - char FileName[0]; + char FileName[]; } __packed; struct smb2_file_stream_info { @@ -818,13 +329,9 @@ struct smb2_file_stream_info { __le32 StreamNameLength; __le64 StreamSize; __le64 StreamAllocationSize; - char StreamName[0]; + char StreamName[]; } __packed; -struct smb2_file_eof_info { /* encoding of request for level 10 */ - __le64 EndOfFile; /* new end of file value */ -} __packed; /* level 20 Set */ - struct smb2_file_ntwrk_info { __le64 CreationTime; __le64 LastAccessTime; @@ -915,34 +422,6 @@ struct create_sd_buf_req { struct smb_ntsd ntsd; } __packed; -/* Find File infolevels */ -#define SMB_FIND_FILE_POSIX_INFO 0x064 - -/* Level 100 query info */ -struct smb311_posix_qinfo { - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 EndOfFile; - __le64 AllocationSize; - __le32 DosAttributes; - __le64 Inode; - __le32 DeviceId; - __le32 Zero; - /* beginning of POSIX Create Context Response */ - __le32 HardLinks; - __le32 ReparseTag; - __le32 Mode; - u8 Sids[]; - /* - * var sized owner SID - * var sized group SID - * le32 filenamelength - * u8 filename[] - */ -} __packed; - struct smb2_posix_info { __le32 NextEntryOffset; __u32 Ignored; diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index ba5a22bc2e6d..e646d79554b8 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -211,7 +211,7 @@ struct smb_direct_rdma_rw_msg { struct completion *completion; struct rdma_rw_ctx rw_ctx; struct sg_table sgt; - struct scatterlist sg_list[0]; + struct scatterlist sg_list[]; }; static inline int get_buf_page_count(void *buf, int size) diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 82a1429bbe12..8fef9de787d3 100644 --- 
a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -476,7 +476,7 @@ static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event, switch (event) { case NETDEV_UP: - if (netdev->priv_flags & IFF_BRIDGE_PORT) + if (netif_is_bridge_port(netdev)) return NOTIFY_OK; list_for_each_entry(iface, &iface_list, entry) { @@ -585,7 +585,7 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz) rtnl_lock(); for_each_netdev(&init_net, netdev) { - if (netdev->priv_flags & IFF_BRIDGE_PORT) + if (netif_is_bridge_port(netdev)) continue; if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) return -ENOMEM; diff --git a/fs/ksmbd/xattr.h b/fs/ksmbd/xattr.h index 8857c01093d9..16499ca5c82d 100644 --- a/fs/ksmbd/xattr.h +++ b/fs/ksmbd/xattr.h @@ -76,7 +76,7 @@ struct xattr_acl_entry { struct xattr_smb_acl { int count; int next; - struct xattr_acl_entry entries[0]; + struct xattr_acl_entry entries[]; }; /* 64bytes hash in xattr_ntacl is computed with sha256 */ diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 700228c7f38b..f1a6610e4ee6 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -448,7 +448,8 @@ static const struct address_space_operations minix_aops = { .writepage = minix_writepage, .write_begin = minix_write_begin, .write_end = generic_write_end, - .bmap = minix_bmap + .bmap = minix_bmap, + .direct_IO = noop_direct_IO }; static const struct inode_operations minix_symlink_inode_operations = { diff --git a/fs/mpage.c b/fs/mpage.c index 9ed1e58e8d70..1fe56f8c495f 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -148,13 +148,11 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) int op = REQ_OP_READ; unsigned nblocks; unsigned relative_block; - gfp_t gfp; + gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); if (args->is_readahead) { op |= REQ_RAHEAD; - gfp = readahead_gfp_mask(page->mapping); - } else { - gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + gfp |= __GFP_NORETRY | __GFP_NOWARN; } if (page_has_buffers(page)) @@ -588,7 +586,6 @@ alloc_new: GFP_NOFS); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); wbc_init_bio(wbc, bio); - bio->bi_write_hint = inode->i_write_hint; } /* diff --git a/fs/namespace.c b/fs/namespace.c index 0044feef59d0..a0a36bfa3aa0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -344,8 +344,24 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) - cpu_relax(); + might_lock(&mount_lock.lock); + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + cpu_relax(); + } else { + /* + * This prevents priority inversion, if the task + * setting MNT_WRITE_HOLD got preempted on a remote + * CPU, and it prevents life lock if the task setting + * MNT_WRITE_HOLD has a lower priority and is bound to + * the same CPU as the task that is spinning here. + */ + preempt_enable(); + lock_mount_hash(); + unlock_mount_hash(); + preempt_disable(); + } + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. 
So we must not load that until @@ -563,12 +579,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) lock_mount_hash(); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; - smp_mb(); - if (mnt_get_writers(mnt) > 0) { - err = -EBUSY; + err = mnt_hold_writers(mnt); + if (err) break; - } } } if (!err && atomic_long_read(&sb->s_remove_count)) @@ -2099,22 +2112,23 @@ static int invent_group_ids(struct mount *mnt, bool recurse) int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { unsigned int max = READ_ONCE(sysctl_mount_max); - unsigned int mounts = 0, old, pending, sum; + unsigned int mounts = 0; struct mount *p; + if (ns->mounts >= max) + return -ENOSPC; + max -= ns->mounts; + if (ns->pending_mounts >= max) + return -ENOSPC; + max -= ns->pending_mounts; + for (p = mnt; p; p = next_mnt(p, mnt)) mounts++; - old = ns->mounts; - pending = ns->pending_mounts; - sum = old + pending; - if ((old > sum) || - (pending > sum) || - (max < sum) || - (mounts > (max - sum))) + if (mounts > max) return -ENOSPC; - ns->pending_mounts = pending + mounts; + ns->pending_mounts += mounts; return 0; } @@ -2908,7 +2922,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) * add a mount into a namespace's mount tree */ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, - struct path *path, int mnt_flags) + const struct path *path, int mnt_flags) { struct mount *parent = real_mount(path->mnt); @@ -3031,7 +3045,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, return err; } -int finish_automount(struct vfsmount *m, struct path *path) +int finish_automount(struct vfsmount *m, const struct path *path) { struct dentry *dentry = path->dentry; struct mountpoint *mp; @@ -4000,46 +4014,57 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) return 0; } -static struct mount *mount_setattr_prepare(struct mount_kattr *kattr, - struct mount *mnt, int *err) +/** + * mnt_allow_writers() - check whether the attribute change allows writers + * @kattr: the new mount attributes + * @mnt: the mount to which @kattr will be applied + * + * Check whether thew new mount attributes in @kattr allow concurrent writers. + * + * Return: true if writers need to be held, false if not + */ +static inline bool mnt_allow_writers(const struct mount_kattr *kattr, + const struct mount *mnt) { - struct mount *m = mnt, *last = NULL; + return !(kattr->attr_set & MNT_READONLY) || + (mnt->mnt.mnt_flags & MNT_READONLY); +} - if (!is_mounted(&m->mnt)) { - *err = -EINVAL; - goto out; - } +static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) +{ + struct mount *m; + int err; - if (!(mnt_has_parent(m) ? 
check_mnt(m) : is_anon_ns(m->mnt_ns))) { - *err = -EINVAL; - goto out; - } + for (m = mnt; m; m = next_mnt(m, mnt)) { + if (!can_change_locked_flags(m, recalc_flags(kattr, m))) { + err = -EPERM; + break; + } - do { - unsigned int flags; + err = can_idmap_mount(kattr, m); + if (err) + break; - flags = recalc_flags(kattr, m); - if (!can_change_locked_flags(m, flags)) { - *err = -EPERM; - goto out; + if (!mnt_allow_writers(kattr, m)) { + err = mnt_hold_writers(m); + if (err) + break; } - *err = can_idmap_mount(kattr, m); - if (*err) - goto out; + if (!kattr->recurse) + return 0; + } - last = m; + if (err) { + struct mount *p; - if ((kattr->attr_set & MNT_READONLY) && - !(m->mnt.mnt_flags & MNT_READONLY)) { - *err = mnt_hold_writers(m); - if (*err) - goto out; + for (p = mnt; p != m; p = next_mnt(p, mnt)) { + /* If we had to hold writers unblock them. */ + if (p->mnt.mnt_flags & MNT_WRITE_HOLD) + mnt_unhold_writers(p); } - } while (kattr->recurse && (m = next_mnt(m, mnt))); - -out: - return last; + } + return err; } static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) @@ -4067,48 +4092,32 @@ static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) put_user_ns(old_mnt_userns); } -static void mount_setattr_commit(struct mount_kattr *kattr, - struct mount *mnt, struct mount *last, - int err) +static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) { - struct mount *m = mnt; + struct mount *m; - do { - if (!err) { - unsigned int flags; + for (m = mnt; m; m = next_mnt(m, mnt)) { + unsigned int flags; - do_idmap_mount(kattr, m); - flags = recalc_flags(kattr, m); - WRITE_ONCE(m->mnt.mnt_flags, flags); - } + do_idmap_mount(kattr, m); + flags = recalc_flags(kattr, m); + WRITE_ONCE(m->mnt.mnt_flags, flags); - /* - * We either set MNT_READONLY above so make it visible - * before ~MNT_WRITE_HOLD or we failed to recursively - * apply mount options. - */ - if ((kattr->attr_set & MNT_READONLY) && - (m->mnt.mnt_flags & MNT_WRITE_HOLD)) + /* If we had to hold writers unblock them. */ + if (m->mnt.mnt_flags & MNT_WRITE_HOLD) mnt_unhold_writers(m); - if (!err && kattr->propagation) + if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); - - /* - * On failure, only cleanup until we found the first mount - * we failed to handle. - */ - if (err && m == last) + if (!kattr->recurse) break; - } while (kattr->recurse && (m = next_mnt(m, mnt))); - - if (!err) - touch_mnt_namespace(mnt->mnt_ns); + } + touch_mnt_namespace(mnt->mnt_ns); } static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) { - struct mount *mnt = real_mount(path->mnt), *last = NULL; + struct mount *mnt = real_mount(path->mnt); int err = 0; if (path->dentry != mnt->mnt.mnt_root) @@ -4129,16 +4138,32 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) } } + err = -EINVAL; lock_mount_hash(); + /* Ensure that this isn't anything purely vfs internal. */ + if (!is_mounted(&mnt->mnt)) + goto out; + /* - * Get the mount tree in a shape where we can change mount - * properties without failure. + * If this is an attached mount make sure it's located in the callers + * mount namespace. If it's not don't let the caller interact with it. + * If this is a detached mount make sure it has an anonymous mount + * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE. */ - last = mount_setattr_prepare(kattr, mnt, &err); - if (last) /* Commit all changes or revert to the old state. 
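Both sb_prepare_remount_readonly() and the rewritten mount_setattr_prepare()/mount_setattr_commit() pair above route the read-only transition through the same hold/apply/unhold helpers. A compressed sketch of that pattern, assuming (as in those hunks) that the caller already holds the mount hash lock; the function name is invented:

static int example_set_readonly(struct mount *mnt)
{
	int err;

	err = mnt_hold_writers(mnt);	/* sets MNT_WRITE_HOLD; fails while writers are active */
	if (err)
		return err;

	/* New writers are now blocked, so the flag change cannot race them. */
	WRITE_ONCE(mnt->mnt.mnt_flags, mnt->mnt.mnt_flags | MNT_READONLY);

	mnt_unhold_writers(mnt);	/* drops MNT_WRITE_HOLD again */
	return 0;
}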
*/ - mount_setattr_commit(kattr, mnt, last, err); + if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns))) + goto out; + /* + * First, we get the mount tree in a shape where we can change mount + * properties without failure. If we succeeded to do so we commit all + * changes and if we failed we clean up. + */ + err = mount_setattr_prepare(kattr, mnt); + if (!err) + mount_setattr_commit(kattr, mnt); + +out: unlock_mount_hash(); if (kattr->propagation) { diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index c15bfc966d96..f684c0cd1ec5 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -1,5 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -netfs-y := read_helper.o stats.o +netfs-y := \ + buffered_read.o \ + io.o \ + main.o \ + objects.o + +netfs-$(CONFIG_NETFS_STATS) += stats.o obj-$(CONFIG_NETFS_SUPPORT) := netfs.o diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c new file mode 100644 index 000000000000..281a88a5b8dc --- /dev/null +++ b/fs/netfs/buffered_read.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Network filesystem high-level buffered read support. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/export.h> +#include <linux/task_io_accounting_ops.h> +#include "internal.h" + +/* + * Unlock the folios in a read operation. We need to set PG_fscache on any + * folios we're going to write back before we unlock them. + */ +void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + unsigned int iopos, account = 0; + pgoff_t start_page = rreq->start / PAGE_SIZE; + pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + bool subreq_failed = false; + + XA_STATE(xas, &rreq->mapping->i_pages, start_page); + + if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) { + __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); + } + } + + /* Walk through the pagecache and the I/O request lists simultaneously. + * We may have a mixture of cached and uncached sections and we only + * really want to write out the uncached sections. This is slightly + * complicated by the possibility that we might have huge pages with a + * mixture inside. 
+ */ + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + iopos = 0; + subreq_failed = (subreq->error < 0); + + trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); + + rcu_read_lock(); + xas_for_each(&xas, folio, last_page) { + unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; + unsigned int pgend = pgpos + folio_size(folio); + bool pg_failed = false; + + for (;;) { + if (!subreq) { + pg_failed = true; + break; + } + if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + folio_start_fscache(folio); + pg_failed |= subreq_failed; + if (pgend < iopos + subreq->len) + break; + + account += subreq->transferred; + iopos += subreq->len; + if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + subreq = list_next_entry(subreq, rreq_link); + subreq_failed = (subreq->error < 0); + } else { + subreq = NULL; + subreq_failed = false; + } + if (pgend == iopos) + break; + } + + if (!pg_failed) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio_index(folio) == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) + _debug("no unlock"); + else + folio_unlock(folio); + } + } + rcu_read_unlock(); + + task_io_account_read(account); + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); +} + +static void netfs_cache_expand_readahead(struct netfs_io_request *rreq, + loff_t *_start, size_t *_len, loff_t i_size) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + if (cres->ops && cres->ops->expand_readahead) + cres->ops->expand_readahead(cres, _start, _len, i_size); +} + +static void netfs_rreq_expand(struct netfs_io_request *rreq, + struct readahead_control *ractl) +{ + /* Give the cache a chance to change the request parameters. The + * resultant request must contain the original region. + */ + netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size); + + /* Give the netfs a chance to change the request parameters. The + * resultant request must contain the original region. + */ + if (rreq->netfs_ops->expand_readahead) + rreq->netfs_ops->expand_readahead(rreq); + + /* Expand the request if the cache wants it to start earlier. Note + * that the expansion may get further extended if the VM wishes to + * insert THPs and the preferred start and/or end wind up in the middle + * of THPs. + * + * If this is the case, however, the THP size should be an integer + * multiple of the cache granule size, so we get a whole number of + * granules to deal with. + */ + if (rreq->start != readahead_pos(ractl) || + rreq->len != readahead_length(ractl)) { + readahead_expand(ractl, rreq->start, rreq->len); + rreq->start = readahead_pos(ractl); + rreq->len = readahead_length(ractl); + + trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), + netfs_read_trace_expanded); + } +} + +/** + * netfs_readahead - Helper to manage a read request + * @ractl: The description of the readahead request + * + * Fulfil a readahead request by drawing data from the cache if possible, or + * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O + * requests from different sources will get munged together. If necessary, the + * readahead window can be expanded in either direction to a more convenient + * alighment for RPC efficiency or to make storage in the cache feasible. + * + * The calling netfs must initialise a netfs context contiguous to the vfs + * inode before calling this. 
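The requirement stated in the kernel-doc above, that a netfs context be kept contiguous with the VFS inode, comes down to the layout of the filesystem's own inode structure, since netfs_i_context() is assumed to locate the context directly after the struct inode it is given. A minimal sketch ("myfs" is hypothetical):

struct myfs_inode {
	struct inode		vfs_inode;	/* VFS inode first */
	struct netfs_i_context	netfs_ctx;	/* must immediately follow it */
	/* ... filesystem-private fields ... */
};

With that layout, a call such as netfs_i_context(ractl->mapping->host) in netfs_readahead() resolves to the filesystem's embedded context.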
+ * + * This is usable whether or not caching is enabled. + */ +void netfs_readahead(struct readahead_control *ractl) +{ + struct netfs_io_request *rreq; + struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host); + int ret; + + _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); + + if (readahead_count(ractl) == 0) + return; + + rreq = netfs_alloc_request(ractl->mapping, ractl->file, + readahead_pos(ractl), + readahead_length(ractl), + NETFS_READAHEAD); + if (IS_ERR(rreq)) + return; + + if (ctx->ops->begin_cache_operation) { + ret = ctx->ops->begin_cache_operation(rreq); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto cleanup_free; + } + + netfs_stat(&netfs_n_rh_readahead); + trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), + netfs_read_trace_readahead); + + netfs_rreq_expand(rreq, ractl); + + /* Drop the refs on the folios here rather than in the cache or + * filesystem. The locks will be dropped in netfs_rreq_unlock(). + */ + while (readahead_folio(ractl)) + ; + + netfs_begin_read(rreq, false); + return; + +cleanup_free: + netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); + return; +} +EXPORT_SYMBOL(netfs_readahead); + +/** + * netfs_readpage - Helper to manage a readpage request + * @file: The file to read from + * @subpage: A subpage of the folio to read + * + * Fulfil a readpage request by drawing data from the cache if possible, or the + * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests + * from different sources will get munged together. + * + * The calling netfs must initialise a netfs context contiguous to the vfs + * inode before calling this. + * + * This is usable whether or not caching is enabled. + */ +int netfs_readpage(struct file *file, struct page *subpage) +{ + struct folio *folio = page_folio(subpage); + struct address_space *mapping = folio_file_mapping(folio); + struct netfs_io_request *rreq; + struct netfs_i_context *ctx = netfs_i_context(mapping->host); + int ret; + + _enter("%lx", folio_index(folio)); + + rreq = netfs_alloc_request(mapping, file, + folio_file_pos(folio), folio_size(folio), + NETFS_READPAGE); + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); + goto alloc_error; + } + + if (ctx->ops->begin_cache_operation) { + ret = ctx->ops->begin_cache_operation(rreq); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto discard; + } + + netfs_stat(&netfs_n_rh_readpage); + trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); + return netfs_begin_read(rreq, true); + +discard: + netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); +alloc_error: + folio_unlock(folio); + return ret; +} +EXPORT_SYMBOL(netfs_readpage); + +/* + * Prepare a folio for writing without reading first + * @folio: The folio being prepared + * @pos: starting position for the write + * @len: length of write + * @always_fill: T if the folio should always be completely filled/cleared + * + * In some cases, write_begin doesn't need to read at all: + * - full folio write + * - write that lies in a folio that is completely beyond EOF + * - write that covers the folio from start to EOF or beyond it + * + * If any of these criteria are met, then zero out the unwritten parts + * of the folio and return true. Otherwise, return false. 
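To make those criteria concrete: assuming a 4096-byte folio at file position 0 and an i_size of 1000, a write at pos 0 with len 4096 is a full-folio write, a write into a folio that starts at position 8192 lies entirely beyond EOF, and a write at pos 0 with len 1200 covers the folio from its start to beyond EOF; in all three cases the unwritten parts are zeroed and no read is issued. A write at pos 512 with len 100, by contrast, matches none of the criteria, so the folio has to be read (or preloaded from the cache) first.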
+ */ +static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len, + bool always_fill) +{ + struct inode *inode = folio_inode(folio); + loff_t i_size = i_size_read(inode); + size_t offset = offset_in_folio(folio, pos); + size_t plen = folio_size(folio); + + if (unlikely(always_fill)) { + if (pos - offset + len <= i_size) + return false; /* Page entirely before EOF */ + zero_user_segment(&folio->page, 0, plen); + folio_mark_uptodate(folio); + return true; + } + + /* Full folio write */ + if (offset == 0 && len >= plen) + return true; + + /* Page entirely beyond the end of the file */ + if (pos - offset >= i_size) + goto zero_out; + + /* Write that covers from the start of the folio to EOF or beyond */ + if (offset == 0 && (pos + len) >= i_size) + goto zero_out; + + return false; +zero_out: + zero_user_segments(&folio->page, 0, offset, offset + len, plen); + return true; +} + +/** + * netfs_write_begin - Helper to prepare for writing + * @file: The file to read from + * @mapping: The mapping to read from + * @pos: File position at which the write will begin + * @len: The length of the write (may extend beyond the end of the folio chosen) + * @aop_flags: AOP_* flags + * @_folio: Where to put the resultant folio + * @_fsdata: Place for the netfs to store a cookie + * + * Pre-read data for a write-begin request by drawing data from the cache if + * possible, or the netfs if not. Space beyond the EOF is zero-filled. + * Multiple I/O requests from different sources will get munged together. If + * necessary, the readahead window can be expanded in either direction to a + * more convenient alighment for RPC efficiency or to make storage in the cache + * feasible. + * + * The calling netfs must provide a table of operations, only one of which, + * issue_op, is mandatory. + * + * The check_write_begin() operation can be provided to check for and flush + * conflicting writes once the folio is grabbed and locked. It is passed a + * pointer to the fsdata cookie that gets returned to the VM to be passed to + * write_end. It is permitted to sleep. It should return 0 if the request + * should go ahead; unlock the folio and return -EAGAIN to cause the folio to + * be regot; or return an error. + * + * The calling netfs must initialise a netfs context contiguous to the vfs + * inode before calling this. + * + * This is usable whether or not caching is enabled. + */ +int netfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int aop_flags, + struct folio **_folio, void **_fsdata) +{ + struct netfs_io_request *rreq; + struct netfs_i_context *ctx = netfs_i_context(file_inode(file )); + struct folio *folio; + unsigned int fgp_flags; + pgoff_t index = pos >> PAGE_SHIFT; + int ret; + + DEFINE_READAHEAD(ractl, file, NULL, mapping, index); + +retry: + fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; + if (aop_flags & AOP_FLAG_NOFS) + fgp_flags |= FGP_NOFS; + folio = __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); + if (!folio) + return -ENOMEM; + + if (ctx->ops->check_write_begin) { + /* Allow the netfs (eg. ceph) to flush conflicts. 
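The check_write_begin() contract described in the kernel-doc above is small enough to sketch. The hook below and the conflict test it calls are hypothetical, but the return convention follows the documentation: 0 to let the write proceed, -EAGAIN after unlocking to make netfs_write_begin() grab the folio again, or a negative error to fail.

static int myfs_check_write_begin(struct file *file, loff_t pos,
				  unsigned int len, struct folio *folio,
				  void **fsdata)
{
	if (myfs_folio_has_conflicting_write(folio)) {	/* hypothetical test */
		folio_unlock(folio);
		return -EAGAIN;		/* netfs_write_begin() will retry the folio */
	}
	return 0;			/* safe to go ahead with the write */
}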
*/ + ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata); + if (ret < 0) { + trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); + if (ret == -EAGAIN) + goto retry; + goto error; + } + } + + if (folio_test_uptodate(folio)) + goto have_folio; + + /* If the page is beyond the EOF, we want to clear it - unless it's + * within the cache granule containing the EOF, in which case we need + * to preload the granule. + */ + if (!netfs_is_cache_enabled(ctx) && + netfs_skip_folio_read(folio, pos, len, false)) { + netfs_stat(&netfs_n_rh_write_zskip); + goto have_folio_no_wait; + } + + rreq = netfs_alloc_request(mapping, file, + folio_file_pos(folio), folio_size(folio), + NETFS_READ_FOR_WRITE); + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); + goto error; + } + rreq->no_unlock_folio = folio_index(folio); + __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); + + if (ctx->ops->begin_cache_operation) { + ret = ctx->ops->begin_cache_operation(rreq); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto error_put; + } + + netfs_stat(&netfs_n_rh_write_begin); + trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); + + /* Expand the request to meet caching requirements and download + * preferences. + */ + ractl._nr_pages = folio_nr_pages(folio); + netfs_rreq_expand(rreq, &ractl); + + /* We hold the folio locks, so we can drop the references */ + folio_get(folio); + while (readahead_folio(&ractl)) + ; + + ret = netfs_begin_read(rreq, true); + if (ret < 0) + goto error; + +have_folio: + ret = folio_wait_fscache_killable(folio); + if (ret < 0) + goto error; +have_folio_no_wait: + *_folio = folio; + _leave(" = 0"); + return 0; + +error_put: + netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); +error: + folio_unlock(folio); + folio_put(folio); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(netfs_write_begin); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index b7f2c4459f33..b7b0e3d18d9e 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -5,6 +5,10 @@ * Written by David Howells (dhowells@redhat.com) */ +#include <linux/netfs.h> +#include <linux/fscache.h> +#include <trace/events/netfs.h> + #ifdef pr_fmt #undef pr_fmt #endif @@ -12,11 +16,40 @@ #define pr_fmt(fmt) "netfs: " fmt /* - * read_helper.c + * buffered_read.c + */ +void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); + +/* + * io.c + */ +int netfs_begin_read(struct netfs_io_request *rreq, bool sync); + +/* + * main.c */ extern unsigned int netfs_debug; /* + * objects.c + */ +struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, + struct file *file, + loff_t start, size_t len, + enum netfs_io_origin origin); +void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); +void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async); +void netfs_put_request(struct netfs_io_request *rreq, bool was_async, + enum netfs_rreq_ref_trace what); +struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq); + +static inline void netfs_see_request(struct netfs_io_request *rreq, + enum netfs_rreq_ref_trace what) +{ + trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what); +} + +/* * stats.c */ #ifdef CONFIG_NETFS_STATS @@ -55,6 +88,21 @@ static inline void netfs_stat_d(atomic_t *stat) #define netfs_stat_d(x) do {} while(0) #endif +/* + * Miscellaneous functions. 
+ */ +static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx) +{ +#if IS_ENABLED(CONFIG_FSCACHE) + struct fscache_cookie *cookie = ctx->cache; + + return fscache_cookie_valid(cookie) && cookie->cache_priv && + fscache_cookie_enabled(cookie); +#else + return false; +#endif +} + /*****************************************************************************/ /* * debug tracing diff --git a/fs/netfs/io.c b/fs/netfs/io.c new file mode 100644 index 000000000000..428925899282 --- /dev/null +++ b/fs/netfs/io.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Network filesystem high-level read support. + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/module.h> +#include <linux/export.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/uio.h> +#include <linux/sched/mm.h> +#include <linux/task_io_accounting_ops.h> +#include "internal.h" + +/* + * Clear the unread part of an I/O request. + */ +static void netfs_clear_unread(struct netfs_io_subrequest *subreq) +{ + struct iov_iter iter; + + iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); + iov_iter_zero(iov_iter_count(&iter), &iter); +} + +static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = priv; + + netfs_subreq_terminated(subreq, transferred_or_error, was_async); +} + +/* + * Issue a read against the cache. + * - Eats the caller's ref on subreq. + */ +static void netfs_read_from_cache(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq, + enum netfs_read_from_hole read_hole) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct iov_iter iter; + + netfs_stat(&netfs_n_rh_read); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, + subreq->start + subreq->transferred, + subreq->len - subreq->transferred); + + cres->ops->read(cres, subreq->start, &iter, read_hole, + netfs_cache_read_terminated, subreq); +} + +/* + * Fill a subrequest region with zeroes. + */ +static void netfs_fill_with_zeroes(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + netfs_stat(&netfs_n_rh_zero); + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_subreq_terminated(subreq, 0, false); +} + +/* + * Ask the netfs to issue a read request to the server for us. + * + * The netfs is expected to read from subreq->pos + subreq->transferred to + * subreq->pos + subreq->len - 1. It may not backtrack and write data into the + * buffer prior to the transferred point as it might clobber dirty data + * obtained from the cache. + * + * Alternatively, the netfs is allowed to indicate one of two things: + * + * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and + * make progress. + * + * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be + * cleared. + */ +static void netfs_read_from_server(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + netfs_stat(&netfs_n_rh_download); + rreq->netfs_ops->issue_read(subreq); +} + +/* + * Release those waiting. 
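The ->issue_read() contract spelled out a little further up (read from the transferred point onward, never write earlier into the buffer, then report the outcome) can be sketched for a hypothetical filesystem. The transport call is invented; the iterator setup and the completion call mirror the cache-read path in this file:

static void myfs_issue_read(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	struct iov_iter iter;
	ssize_t ret;

	/* Aim the iterator at the part of the buffer not yet filled in. */
	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
			subreq->start + subreq->transferred,
			subreq->len - subreq->transferred);

	ret = myfs_transport_read(subreq, &iter);	/* hypothetical RPC */

	/* Bytes transferred, 0 for a retryable empty read, or -errno. */
	netfs_subreq_terminated(subreq, ret, false);
}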
+ */ +static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async) +{ + trace_netfs_rreq(rreq, netfs_rreq_trace_done); + netfs_clear_subrequests(rreq, was_async); + netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete); +} + +/* + * Deal with the completion of writing the data to the cache. We have to clear + * the PG_fscache bits on the folios involved and release the caller's ref. + * + * May be called in softirq mode and we inherit a ref from the caller. + */ +static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, + bool was_async) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t unlocked = 0; + bool have_unlocked = false; + + rcu_read_lock(); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); + + xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + /* We might have multiple writes from the same huge + * folio, but we mustn't unlock a folio more than once. + */ + if (have_unlocked && folio_index(folio) <= unlocked) + continue; + unlocked = folio_index(folio); + folio_end_fscache(folio); + have_unlocked = true; + } + } + + rcu_read_unlock(); + netfs_rreq_completed(rreq, was_async); +} + +static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = priv; + struct netfs_io_request *rreq = subreq->rreq; + + if (IS_ERR_VALUE(transferred_or_error)) { + netfs_stat(&netfs_n_rh_write_failed); + trace_netfs_failure(rreq, subreq, transferred_or_error, + netfs_fail_copy_to_cache); + } else { + netfs_stat(&netfs_n_rh_write_done); + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); + + /* If we decrement nr_copy_ops to 0, the ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, was_async); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); +} + +/* + * Perform any outstanding writes to the cache. We inherit a ref from the + * caller. + */ +static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct netfs_io_subrequest *subreq, *next, *p; + struct iov_iter iter; + int ret; + + trace_netfs_rreq(rreq, netfs_rreq_trace_copy); + + /* We don't want terminating writes trying to wake us up whilst we're + * still going through the list. 
+ */ + atomic_inc(&rreq->nr_copy_ops); + + list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { + if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + list_del_init(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_no_copy); + } + } + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + /* Amalgamate adjacent writes */ + while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + next = list_next_entry(subreq, rreq_link); + if (next->start != subreq->start + subreq->len) + break; + subreq->len += next->len; + list_del_init(&next->rreq_link); + netfs_put_subrequest(next, false, + netfs_sreq_trace_put_merged); + } + + ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, + rreq->i_size, true); + if (ret < 0) { + trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); + trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); + continue; + } + + iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages, + subreq->start, subreq->len); + + atomic_inc(&rreq->nr_copy_ops); + netfs_stat(&netfs_n_rh_write); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache); + trace_netfs_sreq(subreq, netfs_sreq_trace_write); + cres->ops->write(cres, subreq->start, &iter, + netfs_rreq_copy_terminated, subreq); + } + + /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, false); +} + +static void netfs_rreq_write_to_cache_work(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + + netfs_rreq_do_write_to_cache(rreq); +} + +static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) +{ + rreq->work.func = netfs_rreq_write_to_cache_work; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); +} + +/* + * Handle a short read. + */ +static void netfs_rreq_short_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + __clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); + __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags); + + netfs_stat(&netfs_n_rh_short_read); + trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short); + + netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read); + atomic_inc(&rreq->nr_outstanding); + if (subreq->source == NETFS_READ_FROM_CACHE) + netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR); + else + netfs_read_from_server(rreq, subreq); +} + +/* + * Resubmit any short or failed operations. Returns true if we got the rreq + * ref back. + */ +static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + + WARN_ON(in_interrupt()); + + trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); + + /* We don't want terminating submissions trying to wake us up whilst + * we're still going through the list. 
+ */ + atomic_inc(&rreq->nr_outstanding); + + __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (subreq->error) { + if (subreq->source != NETFS_READ_FROM_CACHE) + break; + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->error = 0; + netfs_stat(&netfs_n_rh_download_instead); + trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + atomic_inc(&rreq->nr_outstanding); + netfs_read_from_server(rreq, subreq); + } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { + netfs_rreq_short_read(rreq, subreq); + } + } + + /* If we decrement nr_outstanding to 0, the usage ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_outstanding)) + return true; + + wake_up_var(&rreq->nr_outstanding); + return false; +} + +/* + * Check to see if the data read is still valid. + */ +static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + + if (!rreq->netfs_ops->is_still_valid || + rreq->netfs_ops->is_still_valid(rreq)) + return; + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (subreq->source == NETFS_READ_FROM_CACHE) { + subreq->error = -ESTALE; + __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + } + } +} + +/* + * Assess the state of a read request and decide what to do next. + * + * Note that we could be in an ordinary kernel thread, on a workqueue or in + * softirq context at this point. We inherit a ref from the caller. + */ +static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async) +{ + trace_netfs_rreq(rreq, netfs_rreq_trace_assess); + +again: + netfs_rreq_is_still_valid(rreq); + + if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) && + test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) { + if (netfs_rreq_perform_resubmissions(rreq)) + goto again; + return; + } + + netfs_rreq_unlock_folios(rreq); + + clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + + if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) + return netfs_rreq_write_to_cache(rreq); + + netfs_rreq_completed(rreq, was_async); +} + +static void netfs_rreq_work(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + netfs_rreq_assess(rreq, false); +} + +/* + * Handle the completion of all outstanding I/O operations on a read request. + * We inherit a ref from the caller. + */ +static void netfs_rreq_terminated(struct netfs_io_request *rreq, + bool was_async) +{ + if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) && + was_async) { + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_rreq_assess(rreq, was_async); + } +} + +/** + * netfs_subreq_terminated - Note the termination of an I/O operation. + * @subreq: The I/O request that has terminated. + * @transferred_or_error: The amount of data transferred or an error code. + * @was_async: The termination was asynchronous + * + * This tells the read helper that a contributory I/O operation has terminated, + * one way or another, and that it should integrate the results. + * + * The caller indicates in @transferred_or_error the outcome of the operation, + * supplying a positive value to indicate the number of bytes transferred, 0 to + * indicate a failure to transfer anything that should be retried or a negative + * error code. 
The helper will look after reissuing I/O operations as + * appropriate and writing downloaded data to the cache. + * + * If @was_async is true, the caller might be running in softirq or interrupt + * context and we can't sleep. + */ +void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, + ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_request *rreq = subreq->rreq; + int u; + + _enter("[%u]{%llx,%lx},%zd", + subreq->debug_index, subreq->start, subreq->flags, + transferred_or_error); + + switch (subreq->source) { + case NETFS_READ_FROM_CACHE: + netfs_stat(&netfs_n_rh_read_done); + break; + case NETFS_DOWNLOAD_FROM_SERVER: + netfs_stat(&netfs_n_rh_download_done); + break; + default: + break; + } + + if (IS_ERR_VALUE(transferred_or_error)) { + subreq->error = transferred_or_error; + trace_netfs_failure(rreq, subreq, transferred_or_error, + netfs_fail_read); + goto failed; + } + + if (WARN(transferred_or_error > subreq->len - subreq->transferred, + "Subreq overread: R%x[%x] %zd > %zu - %zu", + rreq->debug_id, subreq->debug_index, + transferred_or_error, subreq->len, subreq->transferred)) + transferred_or_error = subreq->len - subreq->transferred; + + subreq->error = 0; + subreq->transferred += transferred_or_error; + if (subreq->transferred < subreq->len) + goto incomplete; + +complete: + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); + +out: + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + /* If we decrement nr_outstanding to 0, the ref belongs to us. */ + u = atomic_dec_return(&rreq->nr_outstanding); + if (u == 0) + netfs_rreq_terminated(rreq, was_async); + else if (u == 1) + wake_up_var(&rreq->nr_outstanding); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + return; + +incomplete: + if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) { + netfs_clear_unread(subreq); + subreq->transferred = subreq->len; + goto complete; + } + + if (transferred_or_error == 0) { + if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + subreq->error = -ENODATA; + goto failed; + } + } else { + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } + + __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + goto out; + +failed: + if (subreq->source == NETFS_READ_FROM_CACHE) { + netfs_stat(&netfs_n_rh_read_failed); + set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); + } else { + netfs_stat(&netfs_n_rh_download_failed); + set_bit(NETFS_RREQ_FAILED, &rreq->flags); + rreq->error = subreq->error; + } + goto out; +} +EXPORT_SYMBOL(netfs_subreq_terminated); + +static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq, + loff_t i_size) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct netfs_cache_resources *cres = &rreq->cache_resources; + + if (cres->ops) + return cres->ops->prepare_read(subreq, i_size); + if (subreq->start >= rreq->i_size) + return NETFS_FILL_WITH_ZEROES; + return NETFS_DOWNLOAD_FROM_SERVER; +} + +/* + * Work out what sort of subrequest the next one will be. 
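/*
 * [Editorial illustration, not part of this patch] A minimal sketch of how a
 * filesystem's read completion might honour the netfs_subreq_terminated()
 * contract documented above: pass the byte count on success, a negative error
 * on failure, or 0 for a transfer that made no progress and should be retried.
 * The function name and the rpc_result/rpc_err parameters are assumptions
 * made for illustration only.
 */
static void example_read_done(struct netfs_io_subrequest *subreq,
			      ssize_t rpc_result, int rpc_err, bool was_async)
{
	/* Report either the error or the number of bytes transferred. */
	netfs_subreq_terminated(subreq, rpc_err < 0 ? rpc_err : rpc_result,
				was_async);
}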
+ */ +static enum netfs_io_source +netfs_rreq_prepare_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + enum netfs_io_source source; + + _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); + + source = netfs_cache_prepare_read(subreq, rreq->i_size); + if (source == NETFS_INVALID_READ) + goto out; + + if (source == NETFS_DOWNLOAD_FROM_SERVER) { + /* Call out to the netfs to let it shrink the request to fit + * its own I/O sizes and boundaries. If it shinks it here, it + * will be called again to make simultaneous calls; if it wants + * to make serial calls, it can indicate a short read and then + * we will call it again. + */ + if (subreq->len > rreq->i_size - subreq->start) + subreq->len = rreq->i_size - subreq->start; + + if (rreq->netfs_ops->clamp_length && + !rreq->netfs_ops->clamp_length(subreq)) { + source = NETFS_INVALID_READ; + goto out; + } + } + + if (WARN_ON(subreq->len == 0)) + source = NETFS_INVALID_READ; + +out: + subreq->source = source; + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + return source; +} + +/* + * Slice off a piece of a read request and submit an I/O request for it. + */ +static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, + unsigned int *_debug_index) +{ + struct netfs_io_subrequest *subreq; + enum netfs_io_source source; + + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) + return false; + + subreq->debug_index = (*_debug_index)++; + subreq->start = rreq->start + rreq->submitted; + subreq->len = rreq->len - rreq->submitted; + + _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + + /* Call out to the cache to find out what it can do with the remaining + * subset. It tells us in subreq->flags what it decided should be done + * and adjusts subreq->len down if the subset crosses a cache boundary. + * + * Then when we hand the subset, it can choose to take a subset of that + * (the starts must coincide), in which case, we go around the loop + * again and ask it to download the next piece. + */ + source = netfs_rreq_prepare_read(rreq, subreq); + if (source == NETFS_INVALID_READ) + goto subreq_failed; + + atomic_inc(&rreq->nr_outstanding); + + rreq->submitted += subreq->len; + + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + switch (source) { + case NETFS_FILL_WITH_ZEROES: + netfs_fill_with_zeroes(rreq, subreq); + break; + case NETFS_DOWNLOAD_FROM_SERVER: + netfs_read_from_server(rreq, subreq); + break; + case NETFS_READ_FROM_CACHE: + netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE); + break; + default: + BUG(); + } + + return true; + +subreq_failed: + rreq->error = subreq->error; + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed); + return false; +} + +/* + * Begin the process of reading in a chunk of data, where that data may be + * stitched together from multiple sources, including multiple servers and the + * local cache. 
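/*
 * [Editorial illustration, not part of this patch] One possible shape for the
 * ->clamp_length() hook called above: a filesystem that can only move a fixed
 * number of bytes per RPC trims each subrequest before it is submitted.  The
 * helper name and the example_rsize value are assumptions for illustration.
 */
static bool example_clamp_length(struct netfs_io_subrequest *subreq)
{
	const size_t example_rsize = 131072;	/* assumed per-RPC limit */

	if (subreq->len > example_rsize)
		subreq->len = example_rsize;
	return true;	/* returning false marks the read NETFS_INVALID_READ */
}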
+ */ +int netfs_begin_read(struct netfs_io_request *rreq, bool sync) +{ + unsigned int debug_index = 0; + int ret; + + _enter("R=%x %llx-%llx", + rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); + + if (rreq->len == 0) { + pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); + netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len); + return -EIO; + } + + INIT_WORK(&rreq->work, netfs_rreq_work); + + if (sync) + netfs_get_request(rreq, netfs_rreq_trace_get_hold); + + /* Chop the read into slices according to what the cache and the netfs + * want and submit each one. + */ + atomic_set(&rreq->nr_outstanding, 1); + do { + if (!netfs_rreq_submit_slice(rreq, &debug_index)) + break; + + } while (rreq->submitted < rreq->len); + + if (sync) { + /* Keep nr_outstanding incremented so that the ref always belongs to + * us, and the service code isn't punted off to a random thread pool to + * process. + */ + for (;;) { + wait_var_event(&rreq->nr_outstanding, + atomic_read(&rreq->nr_outstanding) == 1); + netfs_rreq_assess(rreq, false); + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) + break; + cond_resched(); + } + + ret = rreq->error; + if (ret == 0 && rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + netfs_put_request(rreq, false, netfs_rreq_trace_put_hold); + } else { + /* If we decrement nr_outstanding to 0, the ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_assess(rreq, false); + ret = 0; + } + return ret; +} diff --git a/fs/netfs/main.c b/fs/netfs/main.c new file mode 100644 index 000000000000..068568702957 --- /dev/null +++ b/fs/netfs/main.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Miscellaneous bits for the netfs support library. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/module.h> +#include <linux/export.h> +#include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/netfs.h> + +MODULE_DESCRIPTION("Network fs support"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +unsigned netfs_debug; +module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c new file mode 100644 index 000000000000..e86107b30ba4 --- /dev/null +++ b/fs/netfs/objects.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Object lifetime handling and tracing. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include "internal.h" + +/* + * Allocate an I/O request and initialise it. + */ +struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, + struct file *file, + loff_t start, size_t len, + enum netfs_io_origin origin) +{ + static atomic_t debug_ids; + struct inode *inode = file ? 
file_inode(file) : mapping->host; + struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_io_request *rreq; + int ret; + + rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL); + if (!rreq) + return ERR_PTR(-ENOMEM); + + rreq->start = start; + rreq->len = len; + rreq->origin = origin; + rreq->netfs_ops = ctx->ops; + rreq->mapping = mapping; + rreq->inode = inode; + rreq->i_size = i_size_read(inode); + rreq->debug_id = atomic_inc_return(&debug_ids); + INIT_LIST_HEAD(&rreq->subrequests); + refcount_set(&rreq->ref, 1); + __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + if (rreq->netfs_ops->init_request) { + ret = rreq->netfs_ops->init_request(rreq, file); + if (ret < 0) { + kfree(rreq); + return ERR_PTR(ret); + } + } + + netfs_stat(&netfs_n_rh_rreq); + return rreq; +} + +void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what) +{ + int r; + + __refcount_inc(&rreq->ref, &r); + trace_netfs_rreq_ref(rreq->debug_id, r + 1, what); +} + +void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) +{ + struct netfs_io_subrequest *subreq; + + while (!list_empty(&rreq->subrequests)) { + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, was_async, + netfs_sreq_trace_put_clear); + } +} + +static void netfs_free_request(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + + netfs_clear_subrequests(rreq, false); + if (rreq->netfs_priv) + rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv); + trace_netfs_rreq(rreq, netfs_rreq_trace_free); + if (rreq->cache_resources.ops) + rreq->cache_resources.ops->end_operation(&rreq->cache_resources); + kfree(rreq); + netfs_stat_d(&netfs_n_rh_rreq); +} + +void netfs_put_request(struct netfs_io_request *rreq, bool was_async, + enum netfs_rreq_ref_trace what) +{ + unsigned int debug_id = rreq->debug_id; + bool dead; + int r; + + dead = __refcount_dec_and_test(&rreq->ref, &r); + trace_netfs_rreq_ref(debug_id, r - 1, what); + if (dead) { + if (was_async) { + rreq->work.func = netfs_free_request; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); + } else { + netfs_free_request(&rreq->work); + } + } +} + +/* + * Allocate and partially initialise an I/O request structure. 
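/*
 * [Editorial illustration, not part of this patch] The get/put helpers above
 * pair every refcount change with a trace tag so that each reference can be
 * attributed via the netfs_rreq_ref tracepoint.  A caller pinning a request
 * across deferred work might use the hold/put pair seen in netfs_begin_read():
 */
static void example_hold_and_release(struct netfs_io_request *rreq)
{
	netfs_get_request(rreq, netfs_rreq_trace_get_hold);
	/* ... use rreq while the extra, traced ref keeps it alive ... */
	netfs_put_request(rreq, false, netfs_rreq_trace_put_hold);
}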
+ */ +struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + + subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL); + if (subreq) { + INIT_LIST_HEAD(&subreq->rreq_link); + refcount_set(&subreq->ref, 2); + subreq->rreq = rreq; + netfs_get_request(rreq, netfs_rreq_trace_get_subreq); + netfs_stat(&netfs_n_rh_sreq); + } + + return subreq; +} + +void netfs_get_subrequest(struct netfs_io_subrequest *subreq, + enum netfs_sreq_ref_trace what) +{ + int r; + + __refcount_inc(&subreq->ref, &r); + trace_netfs_sreq_ref(subreq->rreq->debug_id, subreq->debug_index, r + 1, + what); +} + +static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, + bool was_async) +{ + struct netfs_io_request *rreq = subreq->rreq; + + trace_netfs_sreq(subreq, netfs_sreq_trace_free); + kfree(subreq); + netfs_stat_d(&netfs_n_rh_sreq); + netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); +} + +void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, + enum netfs_sreq_ref_trace what) +{ + unsigned int debug_index = subreq->debug_index; + unsigned int debug_id = subreq->rreq->debug_id; + bool dead; + int r; + + dead = __refcount_dec_and_test(&subreq->ref, &r); + trace_netfs_sreq_ref(debug_id, debug_index, r - 1, what); + if (dead) + netfs_free_subrequest(subreq, was_async); +} diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c deleted file mode 100644 index 501da990c259..000000000000 --- a/fs/netfs/read_helper.c +++ /dev/null @@ -1,1205 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Network filesystem high-level read support. - * - * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/module.h> -#include <linux/export.h> -#include <linux/fs.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/uio.h> -#include <linux/sched/mm.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/netfs.h> -#include "internal.h" -#define CREATE_TRACE_POINTS -#include <trace/events/netfs.h> - -MODULE_DESCRIPTION("Network fs support"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - -unsigned netfs_debug; -module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); - -static void netfs_rreq_work(struct work_struct *); -static void __netfs_put_subrequest(struct netfs_read_subrequest *, bool); - -static void netfs_put_subrequest(struct netfs_read_subrequest *subreq, - bool was_async) -{ - if (refcount_dec_and_test(&subreq->usage)) - __netfs_put_subrequest(subreq, was_async); -} - -static struct netfs_read_request *netfs_alloc_read_request( - const struct netfs_read_request_ops *ops, void *netfs_priv, - struct file *file) -{ - static atomic_t debug_ids; - struct netfs_read_request *rreq; - - rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL); - if (rreq) { - rreq->netfs_ops = ops; - rreq->netfs_priv = netfs_priv; - rreq->inode = file_inode(file); - rreq->i_size = i_size_read(rreq->inode); - rreq->debug_id = atomic_inc_return(&debug_ids); - INIT_LIST_HEAD(&rreq->subrequests); - INIT_WORK(&rreq->work, netfs_rreq_work); - refcount_set(&rreq->usage, 1); - __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (ops->init_rreq) - ops->init_rreq(rreq, file); - netfs_stat(&netfs_n_rh_rreq); - } - - return rreq; -} - -static void netfs_get_read_request(struct netfs_read_request *rreq) -{ - 
refcount_inc(&rreq->usage); -} - -static void netfs_rreq_clear_subreqs(struct netfs_read_request *rreq, - bool was_async) -{ - struct netfs_read_subrequest *subreq; - - while (!list_empty(&rreq->subrequests)) { - subreq = list_first_entry(&rreq->subrequests, - struct netfs_read_subrequest, rreq_link); - list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, was_async); - } -} - -static void netfs_free_read_request(struct work_struct *work) -{ - struct netfs_read_request *rreq = - container_of(work, struct netfs_read_request, work); - netfs_rreq_clear_subreqs(rreq, false); - if (rreq->netfs_priv) - rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv); - trace_netfs_rreq(rreq, netfs_rreq_trace_free); - if (rreq->cache_resources.ops) - rreq->cache_resources.ops->end_operation(&rreq->cache_resources); - kfree(rreq); - netfs_stat_d(&netfs_n_rh_rreq); -} - -static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async) -{ - if (refcount_dec_and_test(&rreq->usage)) { - if (was_async) { - rreq->work.func = netfs_free_read_request; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_free_read_request(&rreq->work); - } - } -} - -/* - * Allocate and partially initialise an I/O request structure. - */ -static struct netfs_read_subrequest *netfs_alloc_subrequest( - struct netfs_read_request *rreq) -{ - struct netfs_read_subrequest *subreq; - - subreq = kzalloc(sizeof(struct netfs_read_subrequest), GFP_KERNEL); - if (subreq) { - INIT_LIST_HEAD(&subreq->rreq_link); - refcount_set(&subreq->usage, 2); - subreq->rreq = rreq; - netfs_get_read_request(rreq); - netfs_stat(&netfs_n_rh_sreq); - } - - return subreq; -} - -static void netfs_get_read_subrequest(struct netfs_read_subrequest *subreq) -{ - refcount_inc(&subreq->usage); -} - -static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq, - bool was_async) -{ - struct netfs_read_request *rreq = subreq->rreq; - - trace_netfs_sreq(subreq, netfs_sreq_trace_free); - kfree(subreq); - netfs_stat_d(&netfs_n_rh_sreq); - netfs_put_read_request(rreq, was_async); -} - -/* - * Clear the unread part of an I/O request. - */ -static void netfs_clear_unread(struct netfs_read_subrequest *subreq) -{ - struct iov_iter iter; - - iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - iov_iter_zero(iov_iter_count(&iter), &iter); -} - -static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_read_subrequest *subreq = priv; - - netfs_subreq_terminated(subreq, transferred_or_error, was_async); -} - -/* - * Issue a read against the cache. - * - Eats the caller's ref on subreq. - */ -static void netfs_read_from_cache(struct netfs_read_request *rreq, - struct netfs_read_subrequest *subreq, - enum netfs_read_from_hole read_hole) -{ - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct iov_iter iter; - - netfs_stat(&netfs_n_rh_read); - iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - - cres->ops->read(cres, subreq->start, &iter, read_hole, - netfs_cache_read_terminated, subreq); -} - -/* - * Fill a subrequest region with zeroes. 
- */ -static void netfs_fill_with_zeroes(struct netfs_read_request *rreq, - struct netfs_read_subrequest *subreq) -{ - netfs_stat(&netfs_n_rh_zero); - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_subreq_terminated(subreq, 0, false); -} - -/* - * Ask the netfs to issue a read request to the server for us. - * - * The netfs is expected to read from subreq->pos + subreq->transferred to - * subreq->pos + subreq->len - 1. It may not backtrack and write data into the - * buffer prior to the transferred point as it might clobber dirty data - * obtained from the cache. - * - * Alternatively, the netfs is allowed to indicate one of two things: - * - * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and - * make progress. - * - * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be - * cleared. - */ -static void netfs_read_from_server(struct netfs_read_request *rreq, - struct netfs_read_subrequest *subreq) -{ - netfs_stat(&netfs_n_rh_download); - rreq->netfs_ops->issue_op(subreq); -} - -/* - * Release those waiting. - */ -static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async) -{ - trace_netfs_rreq(rreq, netfs_rreq_trace_done); - netfs_rreq_clear_subreqs(rreq, was_async); - netfs_put_read_request(rreq, was_async); -} - -/* - * Deal with the completion of writing the data to the cache. We have to clear - * the PG_fscache bits on the folios involved and release the caller's ref. - * - * May be called in softirq mode and we inherit a ref from the caller. - */ -static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq, - bool was_async) -{ - struct netfs_read_subrequest *subreq; - struct folio *folio; - pgoff_t unlocked = 0; - bool have_unlocked = false; - - rcu_read_lock(); - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); - - xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { - /* We might have multiple writes from the same huge - * folio, but we mustn't unlock a folio more than once. - */ - if (have_unlocked && folio_index(folio) <= unlocked) - continue; - unlocked = folio_index(folio); - folio_end_fscache(folio); - have_unlocked = true; - } - } - - rcu_read_unlock(); - netfs_rreq_completed(rreq, was_async); -} - -static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_read_subrequest *subreq = priv; - struct netfs_read_request *rreq = subreq->rreq; - - if (IS_ERR_VALUE(transferred_or_error)) { - netfs_stat(&netfs_n_rh_write_failed); - trace_netfs_failure(rreq, subreq, transferred_or_error, - netfs_fail_copy_to_cache); - } else { - netfs_stat(&netfs_n_rh_write_done); - } - - trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); - - /* If we decrement nr_wr_ops to 0, the ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_wr_ops)) - netfs_rreq_unmark_after_write(rreq, was_async); - - netfs_put_subrequest(subreq, was_async); -} - -/* - * Perform any outstanding writes to the cache. We inherit a ref from the - * caller. - */ -static void netfs_rreq_do_write_to_cache(struct netfs_read_request *rreq) -{ - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct netfs_read_subrequest *subreq, *next, *p; - struct iov_iter iter; - int ret; - - trace_netfs_rreq(rreq, netfs_rreq_trace_write); - - /* We don't want terminating writes trying to wake us up whilst we're - * still going through the list. 
- */ - atomic_inc(&rreq->nr_wr_ops); - - list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { - if (!test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) { - list_del_init(&subreq->rreq_link); - netfs_put_subrequest(subreq, false); - } - } - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - /* Amalgamate adjacent writes */ - while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - next = list_next_entry(subreq, rreq_link); - if (next->start != subreq->start + subreq->len) - break; - subreq->len += next->len; - list_del_init(&next->rreq_link); - netfs_put_subrequest(next, false); - } - - ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, - rreq->i_size, true); - if (ret < 0) { - trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); - trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); - continue; - } - - iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages, - subreq->start, subreq->len); - - atomic_inc(&rreq->nr_wr_ops); - netfs_stat(&netfs_n_rh_write); - netfs_get_read_subrequest(subreq); - trace_netfs_sreq(subreq, netfs_sreq_trace_write); - cres->ops->write(cres, subreq->start, &iter, - netfs_rreq_copy_terminated, subreq); - } - - /* If we decrement nr_wr_ops to 0, the usage ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_wr_ops)) - netfs_rreq_unmark_after_write(rreq, false); -} - -static void netfs_rreq_write_to_cache_work(struct work_struct *work) -{ - struct netfs_read_request *rreq = - container_of(work, struct netfs_read_request, work); - - netfs_rreq_do_write_to_cache(rreq); -} - -static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq) -{ - rreq->work.func = netfs_rreq_write_to_cache_work; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); -} - -/* - * Unlock the folios in a read operation. We need to set PG_fscache on any - * folios we're going to write back before we unlock them. - */ -static void netfs_rreq_unlock(struct netfs_read_request *rreq) -{ - struct netfs_read_subrequest *subreq; - struct folio *folio; - unsigned int iopos, account = 0; - pgoff_t start_page = rreq->start / PAGE_SIZE; - pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; - bool subreq_failed = false; - - XA_STATE(xas, &rreq->mapping->i_pages, start_page); - - if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) { - __clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); - } - } - - /* Walk through the pagecache and the I/O request lists simultaneously. - * We may have a mixture of cached and uncached sections and we only - * really want to write out the uncached sections. This is slightly - * complicated by the possibility that we might have huge pages with a - * mixture inside. 
- */ - subreq = list_first_entry(&rreq->subrequests, - struct netfs_read_subrequest, rreq_link); - iopos = 0; - subreq_failed = (subreq->error < 0); - - trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); - - rcu_read_lock(); - xas_for_each(&xas, folio, last_page) { - unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; - unsigned int pgend = pgpos + folio_size(folio); - bool pg_failed = false; - - for (;;) { - if (!subreq) { - pg_failed = true; - break; - } - if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) - folio_start_fscache(folio); - pg_failed |= subreq_failed; - if (pgend < iopos + subreq->len) - break; - - account += subreq->transferred; - iopos += subreq->len; - if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - subreq = list_next_entry(subreq, rreq_link); - subreq_failed = (subreq->error < 0); - } else { - subreq = NULL; - subreq_failed = false; - } - if (pgend == iopos) - break; - } - - if (!pg_failed) { - flush_dcache_folio(folio); - folio_mark_uptodate(folio); - } - - if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio_index(folio) == rreq->no_unlock_folio && - test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) - _debug("no unlock"); - else - folio_unlock(folio); - } - } - rcu_read_unlock(); - - task_io_account_read(account); - if (rreq->netfs_ops->done) - rreq->netfs_ops->done(rreq); -} - -/* - * Handle a short read. - */ -static void netfs_rreq_short_read(struct netfs_read_request *rreq, - struct netfs_read_subrequest *subreq) -{ - __clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags); - __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags); - - netfs_stat(&netfs_n_rh_short_read); - trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short); - - netfs_get_read_subrequest(subreq); - atomic_inc(&rreq->nr_rd_ops); - if (subreq->source == NETFS_READ_FROM_CACHE) - netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR); - else - netfs_read_from_server(rreq, subreq); -} - -/* - * Resubmit any short or failed operations. Returns true if we got the rreq - * ref back. - */ -static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq) -{ - struct netfs_read_subrequest *subreq; - - WARN_ON(in_interrupt()); - - trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); - - /* We don't want terminating submissions trying to wake us up whilst - * we're still going through the list. - */ - atomic_inc(&rreq->nr_rd_ops); - - __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - if (subreq->error) { - if (subreq->source != NETFS_READ_FROM_CACHE) - break; - subreq->source = NETFS_DOWNLOAD_FROM_SERVER; - subreq->error = 0; - netfs_stat(&netfs_n_rh_download_instead); - trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); - netfs_get_read_subrequest(subreq); - atomic_inc(&rreq->nr_rd_ops); - netfs_read_from_server(rreq, subreq); - } else if (test_bit(NETFS_SREQ_SHORT_READ, &subreq->flags)) { - netfs_rreq_short_read(rreq, subreq); - } - } - - /* If we decrement nr_rd_ops to 0, the usage ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_rd_ops)) - return true; - - wake_up_var(&rreq->nr_rd_ops); - return false; -} - -/* - * Check to see if the data read is still valid. 
- */ -static void netfs_rreq_is_still_valid(struct netfs_read_request *rreq) -{ - struct netfs_read_subrequest *subreq; - - if (!rreq->netfs_ops->is_still_valid || - rreq->netfs_ops->is_still_valid(rreq)) - return; - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - if (subreq->source == NETFS_READ_FROM_CACHE) { - subreq->error = -ESTALE; - __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - } - } -} - -/* - * Assess the state of a read request and decide what to do next. - * - * Note that we could be in an ordinary kernel thread, on a workqueue or in - * softirq context at this point. We inherit a ref from the caller. - */ -static void netfs_rreq_assess(struct netfs_read_request *rreq, bool was_async) -{ - trace_netfs_rreq(rreq, netfs_rreq_trace_assess); - -again: - netfs_rreq_is_still_valid(rreq); - - if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) && - test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) { - if (netfs_rreq_perform_resubmissions(rreq)) - goto again; - return; - } - - netfs_rreq_unlock(rreq); - - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); - - if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags)) - return netfs_rreq_write_to_cache(rreq); - - netfs_rreq_completed(rreq, was_async); -} - -static void netfs_rreq_work(struct work_struct *work) -{ - struct netfs_read_request *rreq = - container_of(work, struct netfs_read_request, work); - netfs_rreq_assess(rreq, false); -} - -/* - * Handle the completion of all outstanding I/O operations on a read request. - * We inherit a ref from the caller. - */ -static void netfs_rreq_terminated(struct netfs_read_request *rreq, - bool was_async) -{ - if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) && - was_async) { - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_rreq_assess(rreq, was_async); - } -} - -/** - * netfs_subreq_terminated - Note the termination of an I/O operation. - * @subreq: The I/O request that has terminated. - * @transferred_or_error: The amount of data transferred or an error code. - * @was_async: The termination was asynchronous - * - * This tells the read helper that a contributory I/O operation has terminated, - * one way or another, and that it should integrate the results. - * - * The caller indicates in @transferred_or_error the outcome of the operation, - * supplying a positive value to indicate the number of bytes transferred, 0 to - * indicate a failure to transfer anything that should be retried or a negative - * error code. The helper will look after reissuing I/O operations as - * appropriate and writing downloaded data to the cache. - * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. 
- */ -void netfs_subreq_terminated(struct netfs_read_subrequest *subreq, - ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_read_request *rreq = subreq->rreq; - int u; - - _enter("[%u]{%llx,%lx},%zd", - subreq->debug_index, subreq->start, subreq->flags, - transferred_or_error); - - switch (subreq->source) { - case NETFS_READ_FROM_CACHE: - netfs_stat(&netfs_n_rh_read_done); - break; - case NETFS_DOWNLOAD_FROM_SERVER: - netfs_stat(&netfs_n_rh_download_done); - break; - default: - break; - } - - if (IS_ERR_VALUE(transferred_or_error)) { - subreq->error = transferred_or_error; - trace_netfs_failure(rreq, subreq, transferred_or_error, - netfs_fail_read); - goto failed; - } - - if (WARN(transferred_or_error > subreq->len - subreq->transferred, - "Subreq overread: R%x[%x] %zd > %zu - %zu", - rreq->debug_id, subreq->debug_index, - transferred_or_error, subreq->len, subreq->transferred)) - transferred_or_error = subreq->len - subreq->transferred; - - subreq->error = 0; - subreq->transferred += transferred_or_error; - if (subreq->transferred < subreq->len) - goto incomplete; - -complete: - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) - set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - -out: - trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - /* If we decrement nr_rd_ops to 0, the ref belongs to us. */ - u = atomic_dec_return(&rreq->nr_rd_ops); - if (u == 0) - netfs_rreq_terminated(rreq, was_async); - else if (u == 1) - wake_up_var(&rreq->nr_rd_ops); - - netfs_put_subrequest(subreq, was_async); - return; - -incomplete: - if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) { - netfs_clear_unread(subreq); - subreq->transferred = subreq->len; - goto complete; - } - - if (transferred_or_error == 0) { - if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { - subreq->error = -ENODATA; - goto failed; - } - } else { - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - } - - __set_bit(NETFS_SREQ_SHORT_READ, &subreq->flags); - set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - goto out; - -failed: - if (subreq->source == NETFS_READ_FROM_CACHE) { - netfs_stat(&netfs_n_rh_read_failed); - set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - } else { - netfs_stat(&netfs_n_rh_download_failed); - set_bit(NETFS_RREQ_FAILED, &rreq->flags); - rreq->error = subreq->error; - } - goto out; -} -EXPORT_SYMBOL(netfs_subreq_terminated); - -static enum netfs_read_source netfs_cache_prepare_read(struct netfs_read_subrequest *subreq, - loff_t i_size) -{ - struct netfs_read_request *rreq = subreq->rreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; - - if (cres->ops) - return cres->ops->prepare_read(subreq, i_size); - if (subreq->start >= rreq->i_size) - return NETFS_FILL_WITH_ZEROES; - return NETFS_DOWNLOAD_FROM_SERVER; -} - -/* - * Work out what sort of subrequest the next one will be. - */ -static enum netfs_read_source -netfs_rreq_prepare_read(struct netfs_read_request *rreq, - struct netfs_read_subrequest *subreq) -{ - enum netfs_read_source source; - - _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); - - source = netfs_cache_prepare_read(subreq, rreq->i_size); - if (source == NETFS_INVALID_READ) - goto out; - - if (source == NETFS_DOWNLOAD_FROM_SERVER) { - /* Call out to the netfs to let it shrink the request to fit - * its own I/O sizes and boundaries. 
If it shinks it here, it - * will be called again to make simultaneous calls; if it wants - * to make serial calls, it can indicate a short read and then - * we will call it again. - */ - if (subreq->len > rreq->i_size - subreq->start) - subreq->len = rreq->i_size - subreq->start; - - if (rreq->netfs_ops->clamp_length && - !rreq->netfs_ops->clamp_length(subreq)) { - source = NETFS_INVALID_READ; - goto out; - } - } - - if (WARN_ON(subreq->len == 0)) - source = NETFS_INVALID_READ; - -out: - subreq->source = source; - trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); - return source; -} - -/* - * Slice off a piece of a read request and submit an I/O request for it. - */ -static bool netfs_rreq_submit_slice(struct netfs_read_request *rreq, - unsigned int *_debug_index) -{ - struct netfs_read_subrequest *subreq; - enum netfs_read_source source; - - subreq = netfs_alloc_subrequest(rreq); - if (!subreq) - return false; - - subreq->debug_index = (*_debug_index)++; - subreq->start = rreq->start + rreq->submitted; - subreq->len = rreq->len - rreq->submitted; - - _debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - - /* Call out to the cache to find out what it can do with the remaining - * subset. It tells us in subreq->flags what it decided should be done - * and adjusts subreq->len down if the subset crosses a cache boundary. - * - * Then when we hand the subset, it can choose to take a subset of that - * (the starts must coincide), in which case, we go around the loop - * again and ask it to download the next piece. - */ - source = netfs_rreq_prepare_read(rreq, subreq); - if (source == NETFS_INVALID_READ) - goto subreq_failed; - - atomic_inc(&rreq->nr_rd_ops); - - rreq->submitted += subreq->len; - - trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - switch (source) { - case NETFS_FILL_WITH_ZEROES: - netfs_fill_with_zeroes(rreq, subreq); - break; - case NETFS_DOWNLOAD_FROM_SERVER: - netfs_read_from_server(rreq, subreq); - break; - case NETFS_READ_FROM_CACHE: - netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE); - break; - default: - BUG(); - } - - return true; - -subreq_failed: - rreq->error = subreq->error; - netfs_put_subrequest(subreq, false); - return false; -} - -static void netfs_cache_expand_readahead(struct netfs_read_request *rreq, - loff_t *_start, size_t *_len, loff_t i_size) -{ - struct netfs_cache_resources *cres = &rreq->cache_resources; - - if (cres->ops && cres->ops->expand_readahead) - cres->ops->expand_readahead(cres, _start, _len, i_size); -} - -static void netfs_rreq_expand(struct netfs_read_request *rreq, - struct readahead_control *ractl) -{ - /* Give the cache a chance to change the request parameters. The - * resultant request must contain the original region. - */ - netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size); - - /* Give the netfs a chance to change the request parameters. The - * resultant request must contain the original region. - */ - if (rreq->netfs_ops->expand_readahead) - rreq->netfs_ops->expand_readahead(rreq); - - /* Expand the request if the cache wants it to start earlier. Note - * that the expansion may get further extended if the VM wishes to - * insert THPs and the preferred start and/or end wind up in the middle - * of THPs. - * - * If this is the case, however, the THP size should be an integer - * multiple of the cache granule size, so we get a whole number of - * granules to deal with. 
- */ - if (rreq->start != readahead_pos(ractl) || - rreq->len != readahead_length(ractl)) { - readahead_expand(ractl, rreq->start, rreq->len); - rreq->start = readahead_pos(ractl); - rreq->len = readahead_length(ractl); - - trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), - netfs_read_trace_expanded); - } -} - -/** - * netfs_readahead - Helper to manage a read request - * @ractl: The description of the readahead request - * @ops: The network filesystem's operations for the helper to use - * @netfs_priv: Private netfs data to be retained in the request - * - * Fulfil a readahead request by drawing data from the cache if possible, or - * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O - * requests from different sources will get munged together. If necessary, the - * readahead window can be expanded in either direction to a more convenient - * alighment for RPC efficiency or to make storage in the cache feasible. - * - * The calling netfs must provide a table of operations, only one of which, - * issue_op, is mandatory. It may also be passed a private token, which will - * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup(). - * - * This is usable whether or not caching is enabled. - */ -void netfs_readahead(struct readahead_control *ractl, - const struct netfs_read_request_ops *ops, - void *netfs_priv) -{ - struct netfs_read_request *rreq; - unsigned int debug_index = 0; - int ret; - - _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); - - if (readahead_count(ractl) == 0) - goto cleanup; - - rreq = netfs_alloc_read_request(ops, netfs_priv, ractl->file); - if (!rreq) - goto cleanup; - rreq->mapping = ractl->mapping; - rreq->start = readahead_pos(ractl); - rreq->len = readahead_length(ractl); - - if (ops->begin_cache_operation) { - ret = ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto cleanup_free; - } - - netfs_stat(&netfs_n_rh_readahead); - trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl), - netfs_read_trace_readahead); - - netfs_rreq_expand(rreq, ractl); - - atomic_set(&rreq->nr_rd_ops, 1); - do { - if (!netfs_rreq_submit_slice(rreq, &debug_index)) - break; - - } while (rreq->submitted < rreq->len); - - /* Drop the refs on the folios here rather than in the cache or - * filesystem. The locks will be dropped in netfs_rreq_unlock(). - */ - while (readahead_folio(ractl)) - ; - - /* If we decrement nr_rd_ops to 0, the ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_rd_ops)) - netfs_rreq_assess(rreq, false); - return; - -cleanup_free: - netfs_put_read_request(rreq, false); - return; -cleanup: - if (netfs_priv) - ops->cleanup(ractl->mapping, netfs_priv); - return; -} -EXPORT_SYMBOL(netfs_readahead); - -/** - * netfs_readpage - Helper to manage a readpage request - * @file: The file to read from - * @folio: The folio to read - * @ops: The network filesystem's operations for the helper to use - * @netfs_priv: Private netfs data to be retained in the request - * - * Fulfil a readpage request by drawing data from the cache if possible, or the - * netfs if not. Space beyond the EOF is zero-filled. Multiple I/O requests - * from different sources will get munged together. - * - * The calling netfs must provide a table of operations, only one of which, - * issue_op, is mandatory. It may also be passed a private token, which will - * be retained in rreq->netfs_priv and will be cleaned up by ops->cleanup(). 
- * - * This is usable whether or not caching is enabled. - */ -int netfs_readpage(struct file *file, - struct folio *folio, - const struct netfs_read_request_ops *ops, - void *netfs_priv) -{ - struct netfs_read_request *rreq; - unsigned int debug_index = 0; - int ret; - - _enter("%lx", folio_index(folio)); - - rreq = netfs_alloc_read_request(ops, netfs_priv, file); - if (!rreq) { - if (netfs_priv) - ops->cleanup(folio_file_mapping(folio), netfs_priv); - folio_unlock(folio); - return -ENOMEM; - } - rreq->mapping = folio_file_mapping(folio); - rreq->start = folio_file_pos(folio); - rreq->len = folio_size(folio); - - if (ops->begin_cache_operation) { - ret = ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) { - folio_unlock(folio); - goto out; - } - } - - netfs_stat(&netfs_n_rh_readpage); - trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); - - netfs_get_read_request(rreq); - - atomic_set(&rreq->nr_rd_ops, 1); - do { - if (!netfs_rreq_submit_slice(rreq, &debug_index)) - break; - - } while (rreq->submitted < rreq->len); - - /* Keep nr_rd_ops incremented so that the ref always belongs to us, and - * the service code isn't punted off to a random thread pool to - * process. - */ - do { - wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1); - netfs_rreq_assess(rreq, false); - } while (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)); - - ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_readpage); - ret = -EIO; - } -out: - netfs_put_read_request(rreq, false); - return ret; -} -EXPORT_SYMBOL(netfs_readpage); - -/* - * Prepare a folio for writing without reading first - * @folio: The folio being prepared - * @pos: starting position for the write - * @len: length of write - * - * In some cases, write_begin doesn't need to read at all: - * - full folio write - * - write that lies in a folio that is completely beyond EOF - * - write that covers the folio from start to EOF or beyond it - * - * If any of these criteria are met, then zero out the unwritten parts - * of the folio and return true. Otherwise, return false. - */ -static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len) -{ - struct inode *inode = folio_inode(folio); - loff_t i_size = i_size_read(inode); - size_t offset = offset_in_folio(folio, pos); - - /* Full folio write */ - if (offset == 0 && len >= folio_size(folio)) - return true; - - /* pos beyond last folio in the file */ - if (pos - offset >= i_size) - goto zero_out; - - /* Write that covers from the start of the folio to EOF or beyond */ - if (offset == 0 && (pos + len) >= i_size) - goto zero_out; - - return false; -zero_out: - zero_user_segments(&folio->page, 0, offset, offset + len, folio_size(folio)); - return true; -} - -/** - * netfs_write_begin - Helper to prepare for writing - * @file: The file to read from - * @mapping: The mapping to read from - * @pos: File position at which the write will begin - * @len: The length of the write (may extend beyond the end of the folio chosen) - * @aop_flags: AOP_* flags - * @_folio: Where to put the resultant folio - * @_fsdata: Place for the netfs to store a cookie - * @ops: The network filesystem's operations for the helper to use - * @netfs_priv: Private netfs data to be retained in the request - * - * Pre-read data for a write-begin request by drawing data from the cache if - * possible, or the netfs if not. Space beyond the EOF is zero-filled. 
- * Multiple I/O requests from different sources will get munged together. If - * necessary, the readahead window can be expanded in either direction to a - * more convenient alighment for RPC efficiency or to make storage in the cache - * feasible. - * - * The calling netfs must provide a table of operations, only one of which, - * issue_op, is mandatory. - * - * The check_write_begin() operation can be provided to check for and flush - * conflicting writes once the folio is grabbed and locked. It is passed a - * pointer to the fsdata cookie that gets returned to the VM to be passed to - * write_end. It is permitted to sleep. It should return 0 if the request - * should go ahead; unlock the folio and return -EAGAIN to cause the folio to - * be regot; or return an error. - * - * This is usable whether or not caching is enabled. - */ -int netfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int aop_flags, - struct folio **_folio, void **_fsdata, - const struct netfs_read_request_ops *ops, - void *netfs_priv) -{ - struct netfs_read_request *rreq; - struct folio *folio; - struct inode *inode = file_inode(file); - unsigned int debug_index = 0, fgp_flags; - pgoff_t index = pos >> PAGE_SHIFT; - int ret; - - DEFINE_READAHEAD(ractl, file, NULL, mapping, index); - -retry: - fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - if (aop_flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; - folio = __filemap_get_folio(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; - - if (ops->check_write_begin) { - /* Allow the netfs (eg. ceph) to flush conflicts. */ - ret = ops->check_write_begin(file, pos, len, folio, _fsdata); - if (ret < 0) { - trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); - if (ret == -EAGAIN) - goto retry; - goto error; - } - } - - if (folio_test_uptodate(folio)) - goto have_folio; - - /* If the page is beyond the EOF, we want to clear it - unless it's - * within the cache granule containing the EOF, in which case we need - * to preload the granule. - */ - if (!ops->is_cache_enabled(inode) && - netfs_skip_folio_read(folio, pos, len)) { - netfs_stat(&netfs_n_rh_write_zskip); - goto have_folio_no_wait; - } - - ret = -ENOMEM; - rreq = netfs_alloc_read_request(ops, netfs_priv, file); - if (!rreq) - goto error; - rreq->mapping = folio_file_mapping(folio); - rreq->start = folio_file_pos(folio); - rreq->len = folio_size(folio); - rreq->no_unlock_folio = folio_index(folio); - __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); - netfs_priv = NULL; - - if (ops->begin_cache_operation) { - ret = ops->begin_cache_operation(rreq); - if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) - goto error_put; - } - - netfs_stat(&netfs_n_rh_write_begin); - trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); - - /* Expand the request to meet caching requirements and download - * preferences. - */ - ractl._nr_pages = folio_nr_pages(folio); - netfs_rreq_expand(rreq, &ractl); - netfs_get_read_request(rreq); - - /* We hold the folio locks, so we can drop the references */ - folio_get(folio); - while (readahead_folio(&ractl)) - ; - - atomic_set(&rreq->nr_rd_ops, 1); - do { - if (!netfs_rreq_submit_slice(rreq, &debug_index)) - break; - - } while (rreq->submitted < rreq->len); - - /* Keep nr_rd_ops incremented so that the ref always belongs to us, and - * the service code isn't punted off to a random thread pool to - * process. 
- */ - for (;;) { - wait_var_event(&rreq->nr_rd_ops, atomic_read(&rreq->nr_rd_ops) == 1); - netfs_rreq_assess(rreq, false); - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - break; - cond_resched(); - } - - ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_write_begin); - ret = -EIO; - } - netfs_put_read_request(rreq, false); - if (ret < 0) - goto error; - -have_folio: - ret = folio_wait_fscache_killable(folio); - if (ret < 0) - goto error; -have_folio_no_wait: - if (netfs_priv) - ops->cleanup(mapping, netfs_priv); - *_folio = folio; - _leave(" = 0"); - return 0; - -error_put: - netfs_put_read_request(rreq, false); -error: - folio_unlock(folio); - folio_put(folio); - if (netfs_priv) - ops->cleanup(mapping, netfs_priv); - _leave(" = %d", ret); - return ret; -} -EXPORT_SYMBOL(netfs_write_begin); diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 9ae538c85378..5510a7a14a40 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -7,7 +7,6 @@ #include <linux/export.h> #include <linux/seq_file.h> -#include <linux/netfs.h> #include "internal.h" atomic_t netfs_n_rh_readahead; diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 14a72224b657..47a53b3362b6 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -4,6 +4,10 @@ config NFS_FS depends on INET && FILE_LOCKING && MULTIUSER select LOCKD select SUNRPC + select CRYPTO + select CRYPTO_HASH + select XXHASH + select CRYPTO_XXHASH select NFS_ACL_SUPPORT if NFS_V3_ACL help Choose Y here if you want to access files residing on other diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c343666d9a42..c8520284dda7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -358,12 +358,11 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, struct cb_process_state *cps) { struct cb_devicenotifyargs *args = argp; + const struct pnfs_layoutdriver_type *ld = NULL; uint32_t i; __be32 res = 0; - struct nfs_client *clp = cps->clp; - struct nfs_server *server = NULL; - if (!clp) { + if (!cps->clp) { res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); goto out; } @@ -371,23 +370,15 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, for (i = 0; i < args->ndevs; i++) { struct cb_devicenotifyitem *dev = &args->devs[i]; - if (!server || - server->pnfs_curr_ld->id != dev->cbd_layout_type) { - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) - if (server->pnfs_curr_ld && - server->pnfs_curr_ld->id == dev->cbd_layout_type) { - rcu_read_unlock(); - goto found; - } - rcu_read_unlock(); - continue; + if (!ld || ld->id != dev->cbd_layout_type) { + pnfs_put_layoutdriver(ld); + ld = pnfs_find_layoutdriver(dev->cbd_layout_type); + if (!ld) + continue; } - - found: - nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); + nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id); } - + pnfs_put_layoutdriver(ld); out: kfree(args->devs); return res; @@ -710,7 +701,7 @@ __be32 nfs4_callback_offload(void *data, void *dummy, struct nfs4_copy_state *copy, *tmp_copy; bool found = false; - copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); if (!copy) return htonl(NFS4ERR_SERVERFAULT); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index f90de8043b0f..8dcb08e1a885 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -271,10 +271,6 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, n = ntohl(*p++); if (n == 0) goto out; - if (n 
> ULONG_MAX / sizeof(*args->devs)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL); if (!args->devs) { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d1f34229e11a..e828504cc396 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -857,7 +857,8 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str } if (clp->rpc_ops->discover_trunking != NULL && - (server->caps & NFS_CAP_FS_LOCATIONS)) { + (server->caps & NFS_CAP_FS_LOCATIONS && + (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) { error = clp->rpc_ops->discover_trunking(server, mntfh); if (error < 0) return error; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 7c9eb679dbdb..5c97cad741a7 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -439,7 +439,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, struct nfs_delegation *freeme = NULL; int status = 0; - delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + delegation = kmalloc(sizeof(*delegation), GFP_KERNEL_ACCOUNT); if (delegation == NULL) return -ENOMEM; nfs4_stateid_copy(&delegation->stateid, stateid); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 75cb1cbe4cde..bac4cf1a308e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -39,6 +39,7 @@ #include <linux/sched.h> #include <linux/kmemleak.h> #include <linux/xattr.h> +#include <linux/xxhash.h> #include "delegation.h" #include "iostat.h" @@ -69,26 +70,26 @@ const struct address_space_operations nfs_dir_aops = { .freepage = nfs_readdir_clear_array, }; -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir) +#define NFS_INIT_DTSIZE PAGE_SIZE + +static struct nfs_open_dir_context * +alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (ctx != NULL) { - ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; - ctx->dir_cookie = 0; - ctx->dup_cookie = 0; - ctx->page_index = 0; - ctx->eof = false; + ctx->dtsize = NFS_INIT_DTSIZE; spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) nfs_set_cache_invalid(dir, NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED); - list_add(&ctx->list, &nfsi->open_files); - clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags); + list_add_tail_rcu(&ctx->list, &nfsi->open_files); + memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf)); spin_unlock(&dir->i_lock); return ctx; } @@ -98,9 +99,9 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) { spin_lock(&dir->i_lock); - list_del(&ctx->list); + list_del_rcu(&ctx->list); spin_unlock(&dir->i_lock); - kfree(ctx); + kfree_rcu(ctx, rcu_head); } /* @@ -142,6 +143,7 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { + u64 change_attr; u64 last_cookie; unsigned int size; unsigned char page_full : 1, @@ -155,11 +157,10 @@ struct nfs_readdir_descriptor { struct page *page; struct dir_context *ctx; pgoff_t page_index; + pgoff_t page_index_max; u64 dir_cookie; u64 last_cookie; - u64 dup_cookie; loff_t current_index; - loff_t prev_index; __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; @@ -167,24 +168,47 @@ struct nfs_readdir_descriptor { unsigned long gencount; unsigned long attr_gencount; unsigned int cache_entry_index; - signed char duped; + 
unsigned int buffer_fills; + unsigned int dtsize; + bool clear_cache; bool plus; bool eob; bool eof; }; -static void nfs_readdir_array_init(struct nfs_cache_array *array) +static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz) +{ + struct nfs_server *server = NFS_SERVER(file_inode(desc->file)); + unsigned int maxsize = server->dtsize; + + if (sz > maxsize) + sz = maxsize; + if (sz < NFS_MIN_FILE_IO_SIZE) + sz = NFS_MIN_FILE_IO_SIZE; + desc->dtsize = sz; +} + +static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc) { - memset(array, 0, sizeof(struct nfs_cache_array)); + nfs_set_dtsize(desc, desc->dtsize >> 1); } -static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) +static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc) +{ + nfs_set_dtsize(desc, desc->dtsize << 1); +} + +static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie, + u64 change_attr) { struct nfs_cache_array *array; array = kmap_atomic(page); - nfs_readdir_array_init(array); + array->change_attr = change_attr; array->last_cookie = last_cookie; + array->size = 0; + array->page_full = 0; + array->page_is_eof = 0; array->cookies_are_ordered = 1; kunmap_atomic(array); } @@ -192,25 +216,31 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) /* * we are freeing strings created by nfs_add_to_readdir_array() */ -static -void nfs_readdir_clear_array(struct page *page) +static void nfs_readdir_clear_array(struct page *page) { struct nfs_cache_array *array; - int i; + unsigned int i; array = kmap_atomic(page); for (i = 0; i < array->size; i++) kfree(array->array[i].name); - nfs_readdir_array_init(array); + array->size = 0; kunmap_atomic(array); } +static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie, + u64 change_attr) +{ + nfs_readdir_clear_array(page); + nfs_readdir_page_init_array(page, last_cookie, change_attr); +} + static struct page * nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags) { struct page *page = alloc_page(gfp_flags); if (page) - nfs_readdir_page_init_array(page, last_cookie); + nfs_readdir_page_init_array(page, last_cookie, 0); return page; } @@ -222,6 +252,11 @@ static void nfs_readdir_page_array_free(struct page *page) } } +static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array) +{ + return array->size == 0 ? 
array->last_cookie : array->array[0].cookie; +} + static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) { array->page_is_eof = 1; @@ -251,36 +286,40 @@ static const char *nfs_readdir_copy_name(const char *name, unsigned int len) return ret; } +static size_t nfs_readdir_array_maxentries(void) +{ + return (PAGE_SIZE - sizeof(struct nfs_cache_array)) / + sizeof(struct nfs_cache_array_entry); +} + /* * Check that the next array entry lies entirely within the page bounds */ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) { - struct nfs_cache_array_entry *cache_entry; - if (array->page_full) return -ENOSPC; - cache_entry = &array->array[array->size + 1]; - if ((char *)cache_entry - (char *)array > PAGE_SIZE) { + if (array->size == nfs_readdir_array_maxentries()) { array->page_full = 1; return -ENOSPC; } return 0; } -static -int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) +static int nfs_readdir_page_array_append(struct page *page, + const struct nfs_entry *entry, + u64 *cookie) { struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; const char *name; - int ret; + int ret = -ENOMEM; name = nfs_readdir_copy_name(entry->name, entry->len); - if (!name) - return -ENOMEM; array = kmap_atomic(page); + if (!name) + goto out; ret = nfs_readdir_array_can_expand(array); if (ret) { kfree(name); @@ -288,7 +327,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) } cache_entry = &array->array[array->size]; - cache_entry->cookie = entry->prev_cookie; + cache_entry->cookie = array->last_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; cache_entry->name_len = entry->len; @@ -300,23 +339,75 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (entry->eof != 0) nfs_readdir_array_set_eof(array); out: + *cookie = array->last_cookie; + kunmap_atomic(array); + return ret; +} + +#define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14) +/* + * Hash algorithm allowing content addressible access to sequences + * of directory cookies. Content is addressed by the value of the + * cookie index of the first readdir entry in a page. + * + * The xxhash algorithm is chosen because it is fast, and is supposed + * to result in a decent flat distribution of hashes. + * + * We then select only the first 18 bits to avoid issues with excessive + * memory use for the page cache XArray. 18 bits should allow the caching + * of 262144 pages of sequences of readdir entries. Since each page holds + * 127 readdir entries for a typical 64-bit system, that works out to a + * cache of ~ 33 million entries per directory. 
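A minimal, stand-alone arithmetic check of the capacity estimate in the comment above. The constants below only restate what the comment says (an 18-bit hash selecting the page, roughly 127 cache entries per 4KiB page on 64-bit) and are not taken from kernel headers, so treat this as an illustration of the sizing, not as kernel code:

#include <stdio.h>

int main(void)
{
	/* U32_MAX >> 14 == (1 << 18) - 1, i.e. an 18-bit page index */
	unsigned long pages = 1UL << 18;
	/* entries that fit in one 4KiB nfs_cache_array page on 64-bit */
	unsigned long entries_per_page = 127;

	printf("addressable pages: %lu, cached dirents: ~%lu\n",
	       pages, pages * entries_per_page);	/* ~33.3 million */
	return 0;
}
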
+ */ +static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie) +{ + if (cookie == 0) + return 0; + return xxhash(&cookie, sizeof(cookie), 0) & NFS_READDIR_COOKIE_MASK; +} + +static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie, + u64 change_attr) +{ + struct nfs_cache_array *array = kmap_atomic(page); + int ret = true; + + if (array->change_attr != change_attr) + ret = false; + if (nfs_readdir_array_index_cookie(array) != last_cookie) + ret = false; kunmap_atomic(array); return ret; } +static void nfs_readdir_page_unlock_and_put(struct page *page) +{ + unlock_page(page); + put_page(page); +} + +static void nfs_readdir_page_init_and_validate(struct page *page, u64 cookie, + u64 change_attr) +{ + if (PageUptodate(page)) { + if (nfs_readdir_page_validate(page, cookie, change_attr)) + return; + nfs_readdir_clear_array(page); + } + nfs_readdir_page_init_array(page, cookie, change_attr); + SetPageUptodate(page); +} + static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, - pgoff_t index, u64 last_cookie) + u64 cookie, u64 change_attr) { + pgoff_t index = nfs_readdir_page_cookie_hash(cookie); struct page *page; page = grab_cache_page(mapping, index); - if (page && !PageUptodate(page)) { - nfs_readdir_page_init_array(page, last_cookie); - if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0) - nfs_zap_mapping(mapping->host, mapping); - SetPageUptodate(page); - } - + if (!page) + return NULL; + nfs_readdir_page_init_and_validate(page, cookie, change_attr); return page; } @@ -351,24 +442,19 @@ static void nfs_readdir_page_set_eof(struct page *page) kunmap_atomic(array); } -static void nfs_readdir_page_unlock_and_put(struct page *page) -{ - unlock_page(page); - put_page(page); -} - static struct page *nfs_readdir_page_get_next(struct address_space *mapping, - pgoff_t index, u64 cookie) + u64 cookie, u64 change_attr) { + pgoff_t index = nfs_readdir_page_cookie_hash(cookie); struct page *page; - page = nfs_readdir_page_get_locked(mapping, index, cookie); - if (page) { - if (nfs_readdir_page_last_cookie(page) == cookie) - return page; - nfs_readdir_page_unlock_and_put(page); - } - return NULL; + page = grab_cache_page_nowait(mapping, index); + if (!page) + return NULL; + nfs_readdir_page_init_and_validate(page, cookie, change_attr); + if (nfs_readdir_page_last_cookie(page) != cookie) + nfs_readdir_page_reinit_array(page, cookie, change_attr); + return page; } static inline @@ -390,6 +476,25 @@ bool nfs_readdir_use_cookie(const struct file *filp) return true; } +static void nfs_readdir_seek_next_array(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) +{ + if (array->page_full) { + desc->last_cookie = array->last_cookie; + desc->current_index += array->size; + desc->cache_entry_index = 0; + desc->page_index++; + } else + desc->last_cookie = nfs_readdir_array_index_cookie(array); +} + +static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc) +{ + desc->current_index = 0; + desc->last_cookie = 0; + desc->page_index = 0; +} + static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { @@ -401,6 +506,7 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, if (diff >= array->size) { if (array->page_is_eof) goto out_eof; + nfs_readdir_seek_next_array(array, desc); return -EAGAIN; } @@ -413,16 +519,6 @@ out_eof: return -EBADCOOKIE; } -static bool -nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) -{ - if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE | 
- NFS_INO_INVALID_DATA)) - return false; - smp_rmb(); - return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); -} - static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, u64 cookie) { @@ -439,8 +535,7 @@ static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { - int i; - loff_t new_pos; + unsigned int i; int status = -EAGAIN; if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) @@ -448,33 +543,10 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, for (i = 0; i < array->size; i++) { if (array->array[i].cookie == desc->dir_cookie) { - struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); - - new_pos = desc->current_index + i; - if (desc->attr_gencount != nfsi->attr_gencount || - !nfs_readdir_inode_mapping_valid(nfsi)) { - desc->duped = 0; - desc->attr_gencount = nfsi->attr_gencount; - } else if (new_pos < desc->prev_index) { - if (desc->duped > 0 - && desc->dup_cookie == desc->dir_cookie) { - if (printk_ratelimit()) { - pr_notice("NFS: directory %pD2 contains a readdir loop." - "Please contact your server vendor. " - "The file: %s has duplicate cookie %llu\n", - desc->file, array->array[i].name, desc->dir_cookie); - } - status = -ELOOP; - goto out; - } - desc->dup_cookie = desc->dir_cookie; - desc->duped = -1; - } if (nfs_readdir_use_cookie(desc->file)) desc->ctx->pos = desc->dir_cookie; else - desc->ctx->pos = new_pos; - desc->prev_index = new_pos; + desc->ctx->pos = desc->current_index + i; desc->cache_entry_index = i; return 0; } @@ -484,8 +556,8 @@ check_eof: status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) desc->eof = true; - } -out: + } else + nfs_readdir_seek_next_array(array, desc); return status; } @@ -501,11 +573,6 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) else status = nfs_readdir_search_for_cookie(array, desc); - if (status == -EAGAIN) { - desc->last_cookie = array->last_cookie; - desc->current_index += array->size; - desc->page_index++; - } kunmap_atomic(array); return status; } @@ -541,7 +608,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; - clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); desc->plus = arg.plus = false; goto again; } @@ -591,51 +657,68 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) return 1; } -static -bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) +#define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL) + +static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx, + unsigned int cache_hits, + unsigned int cache_misses) { if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; - if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) - return true; - if (ctx->pos == 0) + if (ctx->pos == 0 || + cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD) return true; return false; } /* - * This function is called by the lookup and getattr code to request the + * This function is called by the getattr code to request the * use of readdirplus to accelerate any future lookups in the same * directory. 
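A compressed user-space model of the heuristic above may make the counter flow easier to follow. The threshold and the sample-and-reset behaviour mirror the patch, but the struct, the function names and the omission of the RCU walk over nfsi->open_files are simplifications, not the kernel implementation:

#include <stdatomic.h>
#include <stdbool.h>

#define CACHE_USAGE_THRESHOLD 8u	/* mirrors NFS_READDIR_CACHE_USAGE_THRESHOLD */

struct open_dir_ctx {
	atomic_uint cache_hits;		/* getattr satisfied from cached attributes */
	atomic_uint cache_misses;	/* getattr had to go to the server */
};

/* lookup/getattr record one event per directory entry they touch... */
static void record_hit(struct open_dir_ctx *ctx)  { atomic_fetch_add(&ctx->cache_hits, 1); }
static void record_miss(struct open_dir_ctx *ctx) { atomic_fetch_add(&ctx->cache_misses, 1); }

/* ...and readdir samples-and-resets both counters, turning on READDIRPLUS
 * at the start of the directory or once enough getattr traffic suggests
 * the extra per-entry attributes will actually be used. */
static bool want_readdirplus(struct open_dir_ctx *ctx, long long pos)
{
	unsigned int hits   = atomic_exchange(&ctx->cache_hits, 0);
	unsigned int misses = atomic_exchange(&ctx->cache_misses, 0);

	return pos == 0 || hits + misses > CACHE_USAGE_THRESHOLD;
}
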
*/ -void nfs_advise_use_readdirplus(struct inode *dir) +void nfs_readdir_record_entry_cache_hit(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && - !list_empty(&nfsi->open_files)) - set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); + S_ISDIR(dir->i_mode)) { + rcu_read_lock(); + list_for_each_entry_rcu (ctx, &nfsi->open_files, list) + atomic_inc(&ctx->cache_hits); + rcu_read_unlock(); + } } /* * This function is mainly for use by nfs_getattr(). * * If this is an 'ls -l', we want to force use of readdirplus. - * Do this by checking if there is an active file descriptor - * and calling nfs_advise_use_readdirplus, then forcing a - * cache flush. */ -void nfs_force_use_readdirplus(struct inode *dir) +void nfs_readdir_record_entry_cache_miss(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); + struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && - !list_empty(&nfsi->open_files)) { - set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); - set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags); + S_ISDIR(dir->i_mode)) { + rcu_read_lock(); + list_for_each_entry_rcu (ctx, &nfsi->open_files, list) + atomic_inc(&ctx->cache_misses); + rcu_read_unlock(); } } +static void nfs_lookup_advise_force_readdirplus(struct inode *dir, + unsigned int flags) +{ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + return; + if (flags & (LOOKUP_EXCL | LOOKUP_PARENT | LOOKUP_REVAL)) + return; + nfs_readdir_record_entry_cache_miss(dir); +} + static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) @@ -686,8 +769,12 @@ again: status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) nfs_setsecurity(d_inode(dentry), entry->fattr); + trace_nfs_readdir_lookup_revalidate(d_inode(parent), + dentry, 0, status); goto out; } else { + trace_nfs_readdir_lookup_revalidate_failed( + d_inode(parent), dentry, 0); d_invalidate(dentry); dput(dentry); dentry = NULL; @@ -709,22 +796,38 @@ again: dentry = alias; } nfs_set_verifier(dentry, dir_verifier); + trace_nfs_readdir_lookup(d_inode(parent), dentry, 0); out: dput(dentry); } +static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc, + struct nfs_entry *entry, + struct xdr_stream *stream) +{ + int ret; + + if (entry->fattr->label) + entry->fattr->label->len = NFS4_MAXLABELLEN; + ret = xdr_decode(desc, entry, stream); + if (ret || !desc->plus) + return ret; + nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier); + return 0; +} + /* Perform conversion from xdr to cache array */ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, - struct page **xdr_pages, - unsigned int buflen, - struct page **arrays, - size_t narrays) + struct page **xdr_pages, unsigned int buflen, + struct page **arrays, size_t narrays, + u64 change_attr) { struct address_space *mapping = desc->file->f_mapping; struct xdr_stream stream; struct xdr_buf buf; struct page *scratch, *new, *page = *arrays; + u64 cookie; int status; scratch = alloc_page(GFP_KERNEL); @@ -735,54 +838,50 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, xdr_set_scratch_page(&stream, scratch); do { - if (entry->fattr->label) - entry->fattr->label->len = NFS4_MAXLABELLEN; - - status = xdr_decode(desc, entry, &stream); + status = nfs_readdir_entry_decode(desc, entry, &stream); if (status != 0) break; - if (desc->plus) - nfs_prime_dcache(file_dentry(desc->file), entry, - 
desc->dir_verifier); - - status = nfs_readdir_add_to_array(entry, page); + status = nfs_readdir_page_array_append(page, entry, &cookie); if (status != -ENOSPC) continue; if (page->mapping != mapping) { if (!--narrays) break; - new = nfs_readdir_page_array_alloc(entry->prev_cookie, - GFP_KERNEL); + new = nfs_readdir_page_array_alloc(cookie, GFP_KERNEL); if (!new) break; arrays++; *arrays = page = new; } else { - new = nfs_readdir_page_get_next(mapping, - page->index + 1, - entry->prev_cookie); + new = nfs_readdir_page_get_next(mapping, cookie, + change_attr); if (!new) break; if (page != *arrays) nfs_readdir_page_unlock_and_put(page); page = new; } - status = nfs_readdir_add_to_array(entry, page); + desc->page_index_max++; + status = nfs_readdir_page_array_append(page, entry, &cookie); } while (!status && !entry->eof); switch (status) { case -EBADCOOKIE: - if (entry->eof) { - nfs_readdir_page_set_eof(page); - status = 0; - } - break; - case -ENOSPC: + if (!entry->eof) + break; + nfs_readdir_page_set_eof(page); + fallthrough; case -EAGAIN: status = 0; break; + case -ENOSPC: + status = 0; + if (!desc->plus) + break; + while (!nfs_readdir_entry_decode(desc, entry, &stream)) + ; } if (page != *arrays) @@ -828,12 +927,14 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, __be32 *verf_arg, __be32 *verf_res, struct page **arrays, size_t narrays) { + u64 change_attr; struct page **pages; struct page *page = *arrays; struct nfs_entry *entry; size_t array_size; struct inode *inode = file_inode(desc->file); - size_t dtsize = NFS_SERVER(inode)->dtsize; + unsigned int dtsize = desc->dtsize; + unsigned int pglen; int status = -ENOMEM; entry = kzalloc(sizeof(*entry), GFP_KERNEL); @@ -851,27 +952,21 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, if (!pages) goto out; - do { - unsigned int pglen; - status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, - pages, dtsize, - verf_res); - if (status < 0) - break; - - pglen = status; - if (pglen == 0) { - nfs_readdir_page_set_eof(page); - break; - } - - verf_arg = verf_res; + change_attr = inode_peek_iversion_raw(inode); + status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages, + dtsize, verf_res); + if (status < 0) + goto free_pages; + pglen = status; + if (pglen != 0) status = nfs_readdir_page_filler(desc, entry, pages, pglen, - arrays, narrays); - } while (!status && nfs_readdir_page_needs_filling(page) && - page_mapping(page)); + arrays, narrays, change_attr); + else + nfs_readdir_page_set_eof(page); + desc->buffer_fills++; +free_pages: nfs_readdir_free_pages(pages, array_size); out: nfs_free_fattr(entry->fattr); @@ -896,9 +991,17 @@ nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) static struct page * nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) { - return nfs_readdir_page_get_locked(desc->file->f_mapping, - desc->page_index, - desc->last_cookie); + struct address_space *mapping = desc->file->f_mapping; + u64 change_attr = inode_peek_iversion_raw(mapping->host); + u64 cookie = desc->last_cookie; + struct page *page; + + page = nfs_readdir_page_get_locked(mapping, cookie, change_attr); + if (!page) + return NULL; + if (desc->clear_cache && !nfs_readdir_page_needs_filling(page)) + nfs_readdir_page_reinit_array(page, cookie, change_attr); + return page; } /* @@ -916,13 +1019,23 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) if (!desc->page) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { + /* 
Grow the dtsize if we had to go back for more pages */ + if (desc->page_index == desc->page_index_max) + nfs_grow_dtsize(desc); + desc->page_index_max = desc->page_index; + trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf, + desc->last_cookie, + desc->page->index, desc->dtsize); res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, &desc->page, 1); if (res < 0) { nfs_readdir_page_unlock_and_put_cached(desc); + trace_nfs_readdir_cache_fill_done(inode, res); if (res == -EBADCOOKIE || res == -ENOTSYNC) { invalidate_inode_pages2(desc->file->f_mapping); - desc->page_index = 0; + nfs_readdir_rewind_search(desc); + trace_nfs_readdir_invalidate_cache_range( + inode, 0, MAX_LFS_FILESIZE); return -EAGAIN; } return res; @@ -930,9 +1043,16 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) /* * Set the cookie verifier if the page cache was empty */ - if (desc->page_index == 0) + if (desc->last_cookie == 0 && + memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) { memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); + invalidate_inode_pages2_range(desc->file->f_mapping, 1, + -1); + trace_nfs_readdir_invalidate_cache_range( + inode, 1, MAX_LFS_FILESIZE); + } + desc->clear_cache = false; } res = nfs_readdir_search_array(desc); if (res == 0) @@ -941,34 +1061,12 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) return res; } -static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc) -{ - struct address_space *mapping = desc->file->f_mapping; - struct inode *dir = file_inode(desc->file); - unsigned int dtsize = NFS_SERVER(dir)->dtsize; - loff_t size = i_size_read(dir); - - /* - * Default to uncached readdir if the page cache is empty, and - * we're looking for a non-zero cookie in a large directory. 
- */ - return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize; -} - /* Search for desc->dir_cookie from the beginning of the page cache */ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; - if (nfs_readdir_dont_search_cache(desc)) - return -EBADCOOKIE; - do { - if (desc->page_index == 0) { - desc->current_index = 0; - desc->prev_index = 0; - desc->last_cookie = 0; - } res = find_and_lock_cache_page(desc); } while (res == -EAGAIN); return res; @@ -982,7 +1080,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, { struct file *file = desc->file; struct nfs_cache_array *array; - unsigned int i = 0; + unsigned int i; array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { @@ -995,16 +1093,17 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, break; } memcpy(desc->verf, verf, sizeof(desc->verf)); - if (i < (array->size-1)) - desc->dir_cookie = array->array[i+1].cookie; - else + if (i == array->size - 1) { desc->dir_cookie = array->last_cookie; + nfs_readdir_seek_next_array(array, desc); + } else { + desc->dir_cookie = array->array[i + 1].cookie; + desc->last_cookie = array->array[0].cookie; + } if (nfs_readdir_use_cookie(file)) desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (desc->duped != 0) - desc->duped = 1; } if (array->page_is_eof) desc->eof = !desc->eob; @@ -1046,9 +1145,16 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) desc->page_index = 0; desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; - desc->duped = 0; + desc->page_index_max = 0; + + trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie, + -1, desc->dtsize); status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); + if (status < 0) { + trace_nfs_readdir_uncached_done(file_inode(desc->file), status); + goto out_free; + } for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { desc->page = arrays[i]; @@ -1056,15 +1162,44 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) } desc->page = NULL; - + /* + * Grow the dtsize if we have to go back for more pages, + * or shrink it if we're reading too many. + */ + if (!desc->eof) { + if (!desc->eob) + nfs_grow_dtsize(desc); + else if (desc->buffer_fills == 1 && + i < (desc->page_index_max >> 1)) + nfs_shrink_dtsize(desc); + } +out_free: for (i = 0; i < sz && arrays[i]; i++) nfs_readdir_page_array_free(arrays[i]); out: + if (!nfs_readdir_use_cookie(desc->file)) + nfs_readdir_rewind_search(desc); + desc->page_index_max = -1; kfree(arrays); dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; } +#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) + +static bool nfs_readdir_handle_cache_misses(struct inode *inode, + struct nfs_readdir_descriptor *desc, + unsigned int cache_misses, + bool force_clear) +{ + if (desc->ctx->pos == 0 || !desc->plus) + return false; + if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear) + return false; + trace_nfs_readdir_force_readdirplus(inode); + return true; +} + /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. 
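The dtsize adaptation above is easiest to see in isolation. This sketch uses made-up minimum/maximum values in place of NFS_MIN_FILE_IO_SIZE and the server-advertised dtsize, so it only demonstrates the doubling/halving policy, not the kernel's actual bounds:

#include <stdio.h>

/* Stand-ins for NFS_MIN_FILE_IO_SIZE and the per-server dtsize cap;
 * the real limits come from the mount options and the server. */
#define MIN_DTSIZE	 4096u
#define MAX_DTSIZE	(1024u * 1024u)

static unsigned int set_dtsize(unsigned int sz)
{
	if (sz > MAX_DTSIZE)
		sz = MAX_DTSIZE;
	if (sz < MIN_DTSIZE)
		sz = MIN_DTSIZE;
	return sz;
}

int main(void)
{
	unsigned int dtsize = 4096;	/* NFS_INIT_DTSIZE: start at one page */

	/* A reply filled the cache page and the reader still wants more,
	 * so the next READDIR asks for twice as much... */
	dtsize = set_dtsize(dtsize << 1);
	/* ...while a pass that produced far more pages than the caller
	 * consumed halves the request size again. */
	dtsize = set_dtsize(dtsize >> 1);

	printf("dtsize is now %u bytes\n", dtsize);
	return 0;
}
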
@@ -1076,7 +1211,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; - pgoff_t page_index; + unsigned int cache_hits, cache_misses; + bool force_clear; int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", @@ -1089,11 +1225,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) * to either find the entry with the appropriate number or * revalidate the cookie. */ - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { - res = nfs_revalidate_mapping(inode, file->f_mapping); - if (res < 0) - goto out; - } + nfs_revalidate_mapping(inode, file->f_mapping); res = -ENOMEM; desc = kzalloc(sizeof(*desc), GFP_KERNEL); @@ -1101,16 +1233,19 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) goto out; desc->file = file; desc->ctx = ctx; - desc->plus = nfs_use_readdirplus(inode, ctx); + desc->page_index_max = -1; spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; - desc->dup_cookie = dir_ctx->dup_cookie; - desc->duped = dir_ctx->duped; - page_index = dir_ctx->page_index; + desc->page_index = dir_ctx->page_index; + desc->last_cookie = dir_ctx->last_cookie; desc->attr_gencount = dir_ctx->attr_gencount; desc->eof = dir_ctx->eof; + nfs_set_dtsize(desc, dir_ctx->dtsize); memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); + cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0); + cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0); + force_clear = dir_ctx->force_clear; spin_unlock(&file->f_lock); if (desc->eof) { @@ -1118,9 +1253,10 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) && - list_is_singular(&nfsi->open_files)) - invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1); + desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses); + force_clear = nfs_readdir_handle_cache_misses(inode, desc, cache_misses, + force_clear); + desc->clear_cache = force_clear; do { res = readdir_search_pagecache(desc); @@ -1139,9 +1275,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } if (res == -ETOOSMALL && desc->plus) { - clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); nfs_zap_caches(inode); - desc->page_index = 0; desc->plus = false; desc->eof = false; continue; @@ -1151,15 +1285,18 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_page_unlock_and_put_cached(desc); + if (desc->page_index == desc->page_index_max) + desc->clear_cache = force_clear; } while (!desc->eob && !desc->eof); spin_lock(&file->f_lock); dir_ctx->dir_cookie = desc->dir_cookie; - dir_ctx->dup_cookie = desc->dup_cookie; - dir_ctx->duped = desc->duped; + dir_ctx->last_cookie = desc->last_cookie; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->page_index; + dir_ctx->force_clear = force_clear; dir_ctx->eof = desc->eof; + dir_ctx->dtsize = desc->dtsize; memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); out_free: @@ -1197,13 +1334,14 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) } if (offset != filp->f_pos) { filp->f_pos = offset; - if (nfs_readdir_use_cookie(filp)) - dir_ctx->dir_cookie = offset; - else + dir_ctx->page_index = 0; + if (!nfs_readdir_use_cookie(filp)) { dir_ctx->dir_cookie = 0; - if (offset == 0) - 
memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); - dir_ctx->duped = 0; + dir_ctx->last_cookie = 0; + } else { + dir_ctx->dir_cookie = offset; + dir_ctx->last_cookie = offset; + } dir_ctx->eof = false; } spin_unlock(&filp->f_lock); @@ -1419,7 +1557,12 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) if (flags & LOOKUP_REVAL) goto out_force; out: - return (inode->i_nlink == 0) ? -ESTALE : 0; + if (inode->i_nlink > 0 || + (inode->i_nlink == 0 && + test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags))) + return 0; + else + return -ESTALE; out_force: if (flags & LOOKUP_RCU) return -ECHILD; @@ -1469,9 +1612,7 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, { switch (error) { case 1: - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", - __func__, dentry); - return 1; + break; case 0: /* * We can't d_drop the root of a disconnected tree: @@ -1480,13 +1621,10 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, * inodes on unmount and further oopses. */ if (inode && IS_ROOT(dentry)) - return 1; - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", - __func__, dentry); - return 0; + error = 1; + break; } - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", - __func__, dentry, error); + trace_nfs_lookup_revalidate_exit(dir, dentry, 0, error); return error; } @@ -1511,15 +1649,17 @@ nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, return nfs_lookup_revalidate_done(dir, dentry, inode, 1); } -static int -nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, - struct inode *inode) +static int nfs_lookup_revalidate_dentry(struct inode *dir, + struct dentry *dentry, + struct inode *inode, unsigned int flags) { struct nfs_fh *fhandle; struct nfs_fattr *fattr; unsigned long dir_verifier; int ret; + trace_nfs_lookup_revalidate_enter(dir, dentry, flags); + ret = -ENOMEM; fhandle = nfs_alloc_fhandle(); fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); @@ -1540,6 +1680,10 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, } goto out; } + + /* Request help from readdirplus */ + nfs_lookup_advise_force_readdirplus(dir, flags); + ret = 0; if (nfs_compare_fh(NFS_FH(inode), fhandle)) goto out; @@ -1549,8 +1693,6 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, nfs_setsecurity(inode, fattr); nfs_set_verifier(dentry, dir_verifier); - /* set a readdirplus hint that we had a cache miss */ - nfs_force_use_readdirplus(dir); ret = 1; out: nfs_free_fattr(fattr); @@ -1607,7 +1749,6 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, nfs_mark_dir_for_revalidate(dir); goto out_bad; } - nfs_advise_use_readdirplus(dir); goto out_valid; } @@ -1617,10 +1758,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, if (NFS_STALE(inode)) goto out_bad; - trace_nfs_lookup_revalidate_enter(dir, dentry, flags); - error = nfs_lookup_revalidate_dentry(dir, dentry, inode); - trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); - return error; + return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); out_valid: return nfs_lookup_revalidate_done(dir, dentry, inode, 1); out_bad: @@ -1814,7 +1952,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in goto out; /* Notify readdir to use READDIRPLUS */ - nfs_force_use_readdirplus(dir); + nfs_lookup_advise_force_readdirplus(dir, flags); no_entry: res = d_splice_alias(inode, dentry); @@ -2077,7 +2215,7 @@ nfs4_do_lookup_revalidate(struct inode 
*dir, struct dentry *dentry, reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_dentry(dir, dentry, inode); + return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags); full_reval: return nfs_do_lookup_revalidate(dir, dentry, flags); @@ -2330,7 +2468,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) trace_nfs_unlink_enter(dir, dentry); spin_lock(&dentry->d_lock); - if (d_count(dentry) > 1) { + if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED, + &NFS_I(d_inode(dentry))->flags)) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); @@ -2989,11 +3128,8 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) /* * Determine which access bits we want to ask for... */ - cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; - if (nfs_server_capable(inode, NFS_CAP_XATTR)) { - cache.mask |= NFS_ACCESS_XAREAD | NFS_ACCESS_XAWRITE | - NFS_ACCESS_XALIST; - } + cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND | + nfs_access_xattr_mask(NFS_SERVER(inode)); if (S_ISDIR(inode->i_mode)) cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; else diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index eabfdab543c8..11c566d8769f 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -173,8 +173,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter); - return nfs_file_direct_write(iocb, iter); + return nfs_file_direct_read(iocb, iter, true); + return nfs_file_direct_write(iocb, iter, true); } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -425,6 +425,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -440,7 +441,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. 
*/ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -482,12 +484,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (iter_is_iovec(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; - nfs_start_io_direct(inode); + if (!swap) + nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - nfs_end_io_direct(inode); + if (!swap) + nfs_end_io_direct(inode); if (requested > 0) { result = nfs_direct_wait(dreq); @@ -790,7 +794,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { */ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct iov_iter *iter, - loff_t pos) + loff_t pos, int ioflags) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; @@ -798,7 +802,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); - nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false, + nfs_pageio_init_write(&desc, inode, ioflags, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); @@ -876,6 +880,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -892,7 +897,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. 
*/ -ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { ssize_t result, requested; size_t count; @@ -906,7 +912,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - result = generic_write_checks(iocb, iter); + if (swap) + /* bypass generic checks */ + result = iov_iter_count(iter); + else + result = generic_write_checks(iocb, iter); if (result <= 0) return result; count = result; @@ -937,16 +947,22 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dreq->iocb = iocb; pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); - nfs_start_io_direct(inode); + if (swap) { + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_STABLE); + } else { + nfs_start_io_direct(inode); - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, + FLUSH_COND_STABLE); - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + } - nfs_end_io_direct(inode); + nfs_end_io_direct(inode); + } if (requested > 0) { result = nfs_direct_wait(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2df2a5392737..150b7fa8f0a7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -44,11 +44,6 @@ static const struct vm_operations_struct nfs_file_vm_ops; -/* Hack for future NFS swap support */ -#ifndef IS_SWAPFILE -# define IS_SWAPFILE(inode) (0) -#endif - int nfs_check_flags(int flags) { if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT)) @@ -162,7 +157,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to); + return nfs_file_direct_read(iocb, to, false); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, @@ -488,8 +483,9 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, { unsigned long blocks; long long isize; - struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); - struct inode *inode = file->f_mapping->host; + struct inode *inode = file_inode(file); + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; spin_lock(&inode->i_lock); blocks = inode->i_blocks; @@ -502,19 +498,27 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, *span = sis->pages; + + if (cl->rpc_ops->enable_swap) + cl->rpc_ops->enable_swap(inode); + return rpc_clnt_swap_activate(clnt); } static void nfs_swap_deactivate(struct file *file) { - struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + struct inode *inode = file_inode(file); + struct rpc_clnt *clnt = NFS_CLIENT(inode); + struct nfs_client *cl = NFS_SERVER(inode)->nfs_client; rpc_clnt_swap_deactivate(clnt); + if (cl->rpc_ops->disable_swap) + cl->rpc_ops->disable_swap(file_inode(file)); } const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, - .readpages = nfs_readpages, + .readahead = nfs_readahead, .dirty_folio = filemap_dirty_folio, .writepage = nfs_writepage, .writepages = nfs_writepages, @@ -619,7 +623,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_write(iocb, from); + 
return nfs_file_direct_write(iocb, from, false); dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, iov_iter_count(from), (long long) iocb->ki_pos); @@ -642,7 +646,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) result = generic_write_checks(iocb, from); if (result > 0) { current->backing_dev_info = inode_to_bdi(inode); - result = generic_perform_write(file, from, iocb->ki_pos); + result = generic_perform_write(iocb, from); current->backing_dev_info = NULL; } nfs_end_io_write(inode); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 9c96e3e5ed35..76deddab0a8f 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1075,7 +1075,7 @@ filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ? fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - new = pnfs_alloc_commit_array(size, GFP_NOIO); + new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask()); if (new) { spin_lock(&inode->i_lock); array = pnfs_add_commit_array(fl_cinfo, new, lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index a553d59afa8b..604be402ae13 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -663,7 +663,7 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode, spin_unlock(&mirror->lock); if (report) - pnfs_report_layoutstat(inode, GFP_KERNEL); + pnfs_report_layoutstat(inode, nfs_io_gfp_mask()); } static void @@ -694,7 +694,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode, spin_unlock(&mirror->lock); if (report) - pnfs_report_layoutstat(inode, GFP_NOIO); + pnfs_report_layoutstat(inode, nfs_io_gfp_mask()); } static void @@ -806,13 +806,10 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, bool strict_iomode) { pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - nfs_req_openctx(req), - req_offset(req), - req->wb_bytes, - IOMODE_READ, - strict_iomode, - GFP_KERNEL); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), req->wb_bytes, IOMODE_READ, + strict_iomode, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -894,13 +891,10 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, retry: ff_layout_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - nfs_req_openctx(req), - req_offset(req), - req->wb_bytes, - IOMODE_RW, - false, - GFP_NOFS); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), req->wb_bytes, + IOMODE_RW, false, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -953,13 +947,10 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - nfs_req_openctx(req), - req_offset(req), - req->wb_bytes, - IOMODE_RW, - false, - GFP_NOFS); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), req->wb_bytes, + IOMODE_RW, false, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -1258,7 +1249,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, mirror = FF_LAYOUT_COMP(lseg, idx); err = 
ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, - GFP_NOIO); + nfs_io_gfp_mask()); switch (status) { case NFS4ERR_DELAY: @@ -1973,7 +1964,8 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode = lseg->pls_layout->plh_inode; struct pnfs_commit_array *array, *new; - new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO); + new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, + nfs_io_gfp_mask()); if (new) { spin_lock(&inode->i_lock); array = pnfs_add_commit_array(fl_cinfo, new, lseg); @@ -2152,10 +2144,10 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) struct nfs4_flexfile_layoutreturn_args *ff_args; struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout); - ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL); + ff_args = kmalloc(sizeof(*ff_args), nfs_io_gfp_mask()); if (!ff_args) goto out_nomem; - ff_args->pages[0] = alloc_page(GFP_KERNEL); + ff_args->pages[0] = alloc_page(nfs_io_gfp_mask()); if (!ff_args->pages[0]) goto out_nomem_free; @@ -2192,8 +2184,8 @@ ff_layout_send_layouterror(struct pnfs_layout_segment *lseg) if (list_empty(&head)) return; - errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, - sizeof(*errors), GFP_NOFS); + errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, sizeof(*errors), + nfs_io_gfp_mask()); if (errors != NULL) { const struct nfs4_ff_layout_ds_err *pos; size_t n = 0; @@ -2444,7 +2436,8 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) const int dev_count = PNFS_LAYOUTSTATS_MAXDEV; /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ - args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO); + args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), + nfs_io_gfp_mask()); if (!args->devinfo) return -ENOMEM; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index ea17fa1f31ec..e2d59bb5e6bb 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -80,6 +80,7 @@ enum nfs_param { Opt_source, Opt_tcp, Opt_timeo, + Opt_trunkdiscovery, Opt_udp, Opt_v, Opt_vers, @@ -180,6 +181,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_string("source", Opt_source), fsparam_flag ("tcp", Opt_tcp), fsparam_u32 ("timeo", Opt_timeo), + fsparam_flag_no("trunkdiscovery", Opt_trunkdiscovery), fsparam_flag ("udp", Opt_udp), fsparam_flag ("v2", Opt_v), fsparam_flag ("v3", Opt_v), @@ -529,6 +531,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, else ctx->flags &= ~NFS_MOUNT_NOCTO; break; + case Opt_trunkdiscovery: + if (result.negated) + ctx->flags &= ~NFS_MOUNT_TRUNK_DISCOVERY; + else + ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY; + break; case Opt_ac: if (result.negated) ctx->flags |= NFS_MOUNT_NOAC; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index cfe901650ab0..f73c09a9cf0a 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -19,8 +19,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" - -#define NFSDBG_FACILITY NFSDBG_FSCACHE +#include "nfstrace.h" #define NFS_MAX_KEY_LEN 1000 @@ -128,8 +127,6 @@ int nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int u vcookie = fscache_acquire_volume(key, NULL, /* preferred_cache */ NULL, 0 /* coherency_data */); - dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", - nfss, vcookie); if (IS_ERR(vcookie)) { if (vcookie != ERR_PTR(-EBUSY)) { kfree(key); @@ -152,9 +149,6 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) { struct nfs_server *nfss = NFS_SB(sb); 
- dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n", - nfss, nfss->fscache); - fscache_relinquish_volume(nfss->fscache, NULL, false); nfss->fscache = NULL; kfree(nfss->fscache_uniq); @@ -173,7 +167,7 @@ void nfs_fscache_init_inode(struct inode *inode) if (!(nfss->fscache && S_ISREG(inode->i_mode))) return; - nfs_fscache_update_auxdata(&auxdata, nfsi); + nfs_fscache_update_auxdata(&auxdata, inode); nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, 0, @@ -181,7 +175,7 @@ void nfs_fscache_init_inode(struct inode *inode) nfsi->fh.size, &auxdata, /* aux_data */ sizeof(auxdata), - i_size_read(&nfsi->vfs_inode)); + i_size_read(inode)); } /* @@ -192,8 +186,6 @@ void nfs_fscache_clear_inode(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); - dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - fscache_relinquish_cookie(cookie, false); nfsi->fscache = NULL; } @@ -220,7 +212,6 @@ void nfs_fscache_clear_inode(struct inode *inode) void nfs_fscache_open_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; - struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); bool open_for_write = inode_is_open_for_write(inode); @@ -229,8 +220,7 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp) fscache_use_cookie(cookie, open_for_write); if (open_for_write) { - dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); - nfs_fscache_update_auxdata(&auxdata, nfsi); + nfs_fscache_update_auxdata(&auxdata, inode); fscache_invalidate(cookie, &auxdata, i_size_read(inode), FSCACHE_INVAL_DIO_WRITE); } @@ -240,23 +230,14 @@ EXPORT_SYMBOL_GPL(nfs_fscache_open_file); void nfs_fscache_release_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; - struct nfs_inode *nfsi = NFS_I(inode); struct fscache_cookie *cookie = nfs_i_fscache(inode); if (fscache_cookie_valid(cookie)) { - nfs_fscache_update_auxdata(&auxdata, nfsi); + nfs_fscache_update_auxdata(&auxdata, inode); fscache_unuse_cookie(cookie, &auxdata, NULL); } } -static inline void fscache_end_operation(struct netfs_cache_resources *cres) -{ - const struct netfs_cache_ops *ops = fscache_operation_valid(cres); - - if (ops) - ops->end_operation(cres); -} - /* * Fallback page reading interface. 
*/ @@ -319,58 +300,50 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, /* * Retrieve a page from fscache */ -int __nfs_readpage_from_fscache(struct inode *inode, struct page *page) +int __nfs_fscache_read_page(struct inode *inode, struct page *page) { int ret; - dfprintk(FSCACHE, - "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", - nfs_i_fscache(inode), page, page->index, page->flags, inode); - + trace_nfs_fscache_read_page(inode, page); if (PageChecked(page)) { - dfprintk(FSCACHE, "NFS: readpage_from_fscache: PageChecked\n"); ClearPageChecked(page); - return 1; + ret = 1; + goto out; } ret = fscache_fallback_read_page(inode, page); if (ret < 0) { nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); - dfprintk(FSCACHE, - "NFS: readpage_from_fscache failed %d\n", ret); SetPageChecked(page); - return ret; + goto out; } /* Read completed synchronously */ - dfprintk(FSCACHE, "NFS: readpage_from_fscache: read successful\n"); nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); SetPageUptodate(page); - return 0; + ret = 0; +out: + trace_nfs_fscache_read_page_exit(inode, page, ret); + return ret; } /* * Store a newly fetched page in fscache. We can be certain there's no page * stored in the cache as yet otherwise we would've read it from there. */ -void __nfs_readpage_to_fscache(struct inode *inode, struct page *page) +void __nfs_fscache_write_page(struct inode *inode, struct page *page) { int ret; - dfprintk(FSCACHE, - "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx))\n", - nfs_i_fscache(inode), page, page->index, page->flags); + trace_nfs_fscache_write_page(inode, page); ret = fscache_fallback_write_page(inode, page, true); - dfprintk(FSCACHE, - "NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n", - page, page->index, page->flags, ret); - if (ret != 0) { nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED); } else { nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK); } + trace_nfs_fscache_write_page_exit(inode, page, ret); } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 25a5c0f82392..4e980cc04779 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -45,10 +45,8 @@ extern void nfs_fscache_clear_inode(struct inode *); extern void nfs_fscache_open_file(struct inode *, struct file *); extern void nfs_fscache_release_file(struct inode *, struct file *); -extern int __nfs_readpage_from_fscache(struct inode *, struct page *); -extern void __nfs_read_completion_to_fscache(struct nfs_pgio_header *hdr, - unsigned long bytes); -extern void __nfs_readpage_to_fscache(struct inode *, struct page *); +extern int __nfs_fscache_read_page(struct inode *, struct page *); +extern void __nfs_fscache_write_page(struct inode *, struct page *); static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { @@ -66,11 +64,10 @@ static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) /* * Retrieve a page from an inode data storage object. 
*/ -static inline int nfs_readpage_from_fscache(struct inode *inode, - struct page *page) +static inline int nfs_fscache_read_page(struct inode *inode, struct page *page) { - if (NFS_I(inode)->fscache) - return __nfs_readpage_from_fscache(inode, page); + if (nfs_i_fscache(inode)) + return __nfs_fscache_read_page(inode, page); return -ENOBUFS; } @@ -78,24 +75,24 @@ static inline int nfs_readpage_from_fscache(struct inode *inode, * Store a page newly fetched from the server in an inode data storage object * in the cache. */ -static inline void nfs_readpage_to_fscache(struct inode *inode, +static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) { - if (NFS_I(inode)->fscache) - __nfs_readpage_to_fscache(inode, page); + if (nfs_i_fscache(inode)) + __nfs_fscache_write_page(inode, page); } static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, - struct nfs_inode *nfsi) + struct inode *inode) { memset(auxdata, 0, sizeof(*auxdata)); - auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; + auxdata->mtime_sec = inode->i_mtime.tv_sec; + auxdata->mtime_nsec = inode->i_mtime.tv_nsec; + auxdata->ctime_sec = inode->i_ctime.tv_sec; + auxdata->ctime_nsec = inode->i_ctime.tv_nsec; - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); + if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4) + auxdata->change_attr = inode_peek_iversion_raw(inode); } /* @@ -107,9 +104,9 @@ static inline void nfs_fscache_invalidate(struct inode *inode, int flags) struct nfs_inode *nfsi = NFS_I(inode); if (nfsi->fscache) { - nfs_fscache_update_auxdata(&auxdata, nfsi); + nfs_fscache_update_auxdata(&auxdata, inode); fscache_invalidate(nfsi->fscache, &auxdata, - i_size_read(&nfsi->vfs_inode), flags); + i_size_read(inode), flags); } } @@ -136,15 +133,11 @@ static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp) { return 1; /* True: may release page */ } -static inline int nfs_readpage_from_fscache(struct inode *inode, - struct page *page) +static inline int nfs_fscache_read_page(struct inode *inode, struct page *page) { return -ENOBUFS; } -static inline void nfs_readpage_to_fscache(struct inode *inode, - struct page *page) {} - - +static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {} static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {} static inline const char *nfs_server_fscache_state(struct nfs_server *server) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3351c2de3e08..7eb3b08d702f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -203,14 +203,13 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) NFS_INO_INVALID_OTHER | NFS_INO_INVALID_XATTR); flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); - } else if (flags & NFS_INO_REVAL_PAGECACHE) - flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE; + } if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode, 0); - flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); + flags &= ~NFS_INO_REVAL_FORCED; nfsi->cache_validity |= flags; @@ -236,19 +235,17 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - if 
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_XATTR - | NFS_INO_REVAL_PAGECACHE); - } else - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_XATTR - | NFS_INO_REVAL_PAGECACHE); + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | + NFS_INO_INVALID_DATA | + NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | + NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); nfs_zap_label_cache_locked(nfsi); } @@ -564,8 +561,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_gid = fattr->gid; else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); - if (nfs_server_capable(inode, NFS_CAP_XATTR)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED && @@ -785,26 +780,32 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); -static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) +/* + * Don't request help from readdirplus if the file is being written to, + * or if attribute caching is turned off + */ +static bool nfs_getattr_readdirplus_enable(const struct inode *inode) { - struct dentry *parent; + return nfs_server_capable(inode, NFS_CAP_READDIRPLUS) && + !nfs_have_writebacks(inode) && NFS_MAXATTRTIMEO(inode) > 5 * HZ; +} - if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) - return; - parent = dget_parent(dentry); - nfs_force_use_readdirplus(d_inode(parent)); - dput(parent); +static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) +{ + if (!IS_ROOT(dentry)) { + struct dentry *parent = dget_parent(dentry); + nfs_readdir_record_entry_cache_miss(d_inode(parent)); + dput(parent); + } } static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) { - struct dentry *parent; - - if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) - return; - parent = dget_parent(dentry); - nfs_advise_use_readdirplus(d_inode(parent)); - dput(parent); + if (!IS_ROOT(dentry)) { + struct dentry *parent = dget_parent(dentry); + nfs_readdir_record_entry_cache_hit(d_inode(parent)); + dput(parent); + } } static u32 nfs_get_valid_attrmask(struct inode *inode) @@ -840,6 +841,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, int err = 0; bool force_sync = query_flags & AT_STATX_FORCE_SYNC; bool do_update = false; + bool readdirplus_enabled = nfs_getattr_readdirplus_enable(inode); trace_nfs_getattr_enter(inode); @@ -848,7 +850,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, STATX_INO | STATX_SIZE | STATX_BLOCKS; if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { - nfs_readdirplus_parent_cache_hit(path->dentry); + if (readdirplus_enabled) + nfs_readdirplus_parent_cache_hit(path->dentry); goto out_no_revalidate; } @@ -898,15 +901,12 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; if (do_update) { - /* Update the attribute cache */ - if (!(server->flags & 
NFS_MOUNT_NOAC)) + if (readdirplus_enabled) nfs_readdirplus_parent_cache_miss(path->dentry); - else - nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); if (err) goto out; - } else + } else if (readdirplus_enabled) nfs_readdirplus_parent_cache_hit(path->dentry); out_no_revalidate: /* Only return attributes that were revalidated. */ @@ -952,7 +952,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) res = __nfs_find_lock_context(ctx); rcu_read_unlock(); if (res == NULL) { - new = kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); if (new == NULL) return ERR_PTR(-ENOMEM); nfs_init_lock_context(new); @@ -1030,7 +1030,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, { struct nfs_open_context *ctx; - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (!ctx) return ERR_PTR(-ENOMEM); nfs_sb_active(dentry->d_sb); @@ -1583,7 +1583,7 @@ struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; - fattr = kmalloc(sizeof(*fattr), GFP_NOFS); + fattr = kmalloc(sizeof(*fattr), GFP_KERNEL); if (fattr != NULL) { nfs_fattr_init(fattr); fattr->label = NULL; @@ -1599,7 +1599,7 @@ struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server) if (!fattr) return NULL; - fattr->label = nfs4_label_alloc(server, GFP_NOFS); + fattr->label = nfs4_label_alloc(server, GFP_KERNEL); if (IS_ERR(fattr->label)) { kfree(fattr); return NULL; @@ -1613,7 +1613,7 @@ struct nfs_fh *nfs_alloc_fhandle(void) { struct nfs_fh *fh; - fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); + fh = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); if (fh != NULL) fh->size = 0; return fh; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2de7c56a1fbe..57b0497105c8 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -366,8 +366,8 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ -extern void nfs_advise_use_readdirplus(struct inode *dir); -extern void nfs_force_use_readdirplus(struct inode *dir); +extern void nfs_readdir_record_entry_cache_hit(struct inode *dir); +extern void nfs_readdir_record_entry_cache_miss(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, @@ -388,6 +388,20 @@ int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t, int nfs_rename(struct user_namespace *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); +#ifdef CONFIG_NFS_V4_2 +static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) +{ + if (!(server->caps & NFS_CAP_XATTR)) + return 0; + return NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | NFS4_ACCESS_XALIST; +} +#else +static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) +{ + return 0; +} +#endif + /* file.c */ int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); @@ -573,6 +587,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf, !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); } +static inline gfp_t nfs_io_gfp_mask(void) +{ + if (current->flags & PF_WQ_WORKER) + return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + return GFP_KERNEL; +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff 
--git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 7fba7711e6b3..05c3b4b2b3dd 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -949,13 +949,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_filename_inline(xdr, &entry->name, &entry->len); if (unlikely(error)) - return error; + return -EAGAIN; /* * The type (size and byte order) of nfscookie isn't defined in * RFC 1094. This implementation assumes that it's an XDR uint32. */ - entry->prev_cookie = entry->cookie; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) return -EAGAIN; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 9274c9c5efea..3b0b650c9c5a 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1261,6 +1261,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, static void encode_readdirplus3args(struct xdr_stream *xdr, const struct nfs3_readdirargs *args) { + uint32_t dircount = args->count; + uint32_t maxcount = args->count; __be32 *p; encode_nfs_fh3(xdr, args->fh); @@ -1273,9 +1275,8 @@ static void encode_readdirplus3args(struct xdr_stream *xdr, * readdirplus: need dircount + buffer size. * We just make sure we make dircount big enough */ - *p++ = cpu_to_be32(args->count >> 3); - - *p = cpu_to_be32(args->count); + *p++ = cpu_to_be32(dircount); + *p = cpu_to_be32(maxcount); } static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, @@ -1967,7 +1968,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, bool plus) { struct user_namespace *userns = rpc_userns(entry->server->client); - struct nfs_entry old = *entry; __be32 *p; int error; u64 new_cookie; @@ -1987,15 +1987,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_fileid3(xdr, &entry->ino); if (unlikely(error)) - return error; + return -EAGAIN; error = decode_inline_filename3(xdr, &entry->name, &entry->len); if (unlikely(error)) - return error; + return -EAGAIN; error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) - return error; + return -EAGAIN; entry->d_type = DT_UNKNOWN; @@ -2003,7 +2003,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->fattr->valid = 0; error = decode_post_op_attr(xdr, entry->fattr, userns); if (unlikely(error)) - return error; + return -EAGAIN; if (entry->fattr->valid & NFS_ATTR_FATTR_V3) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); @@ -2018,24 +2018,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, return -EAGAIN; if (*p != xdr_zero) { error = decode_nfs_fh3(xdr, entry->fh); - if (unlikely(error)) { - if (error == -E2BIG) - goto out_truncated; - return error; - } + if (unlikely(error)) + return -EAGAIN; } else zero_nfs_fh3(entry->fh); } - entry->prev_cookie = entry->cookie; entry->cookie = new_cookie; return 0; - -out_truncated: - dprintk("NFS: directory entry contains invalid file handle\n"); - *entry = old; - return -EAGAIN; } /* @@ -2228,6 +2219,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, /* ignore properties */ result->lease_time = 0; result->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + result->xattr_support = 0; return 0; } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 32129446beca..068c45b3bc1a 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -175,28 +175,27 @@ static int handle_async_copy(struct nfs42_copy_res *res, nfs4_stateid *src_stateid, bool *restart) { - struct nfs4_copy_state *copy, *tmp_copy; + struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter; int status = NFS4_OK; - 
bool found_pending = false; struct nfs_open_context *dst_ctx = nfs_file_open_context(dst); struct nfs_open_context *src_ctx = nfs_file_open_context(src); - copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); + copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); if (!copy) return -ENOMEM; spin_lock(&dst_server->nfs_client->cl_lock); - list_for_each_entry(tmp_copy, + list_for_each_entry(iter, &dst_server->nfs_client->pending_cb_stateids, copies) { - if (memcmp(&res->write_res.stateid, &tmp_copy->stateid, + if (memcmp(&res->write_res.stateid, &iter->stateid, NFS4_STATEID_SIZE)) continue; - found_pending = true; - list_del(&tmp_copy->copies); + tmp_copy = iter; + list_del(&iter->copies); break; } - if (found_pending) { + if (tmp_copy) { spin_unlock(&dst_server->nfs_client->cl_lock); kfree(copy); copy = tmp_copy; @@ -254,7 +253,7 @@ static int process_copy_commit(struct file *dst, loff_t pos_dst, struct nfs_commitres cres; int status = -ENOMEM; - cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); + cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL); if (!cres.verf) goto out; @@ -357,7 +356,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, res->commit_res.verf = NULL; if (args->sync) { res->commit_res.verf = - kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); + kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL); if (!res->commit_res.verf) return -ENOMEM; } @@ -552,7 +551,7 @@ static int nfs42_do_offload_cancel_async(struct file *dst, if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL)) return -EOPNOTSUPP; - data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_NOFS); + data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -591,8 +590,10 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, ctx = get_nfs_open_context(nfs_file_open_context(src)); l_ctx = nfs_get_lock_context(ctx); - if (IS_ERR(l_ctx)) - return PTR_ERR(l_ctx); + if (IS_ERR(l_ctx)) { + status = PTR_ERR(l_ctx); + goto out; + } status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx, FMODE_READ); @@ -600,7 +601,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, if (status) { if (status == -EAGAIN) status = -NFS4ERR_BAD_STATEID; - return status; + goto out; } status = nfs4_call_sync(src_server->client, src_server, &msg, @@ -609,6 +610,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, if (status == -ENOTSUPP) src_server->caps &= ~NFS_CAP_COPY_NOTIFY; +out: put_nfs_open_context(nfs_file_open_context(src)); return status; } @@ -626,7 +628,7 @@ int nfs42_proc_copy_notify(struct file *src, struct file *dst, if (!(src_server->caps & NFS_CAP_COPY_NOTIFY)) return -EOPNOTSUPP; - args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_NOFS); + args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_KERNEL); if (args == NULL) return -ENOMEM; @@ -1014,7 +1016,7 @@ int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, return -EOPNOTSUPP; if (n > NFS42_LAYOUTERROR_MAX) return -EINVAL; - data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS); + data = nfs42_alloc_layouterror_data(lseg, nfs_io_gfp_mask()); if (!data) return -ENOMEM; for (i = 0; i < n; i++) { diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 1c4d2a05b401..ad3405c64b9e 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -199,7 +199,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value, flags = NFS4_XATTR_ENTRY_EXTVAL; } - buf = kmalloc(alloclen, GFP_KERNEL_ACCOUNT | GFP_NOFS); + 
buf = kmalloc(alloclen, GFP_KERNEL); if (buf == NULL) return NULL; entry = (struct nfs4_xattr_entry *)buf; @@ -213,7 +213,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value, if (flags & NFS4_XATTR_ENTRY_EXTVAL) { - valp = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS); + valp = kvmalloc(len, GFP_KERNEL); if (valp == NULL) { kfree(buf); return NULL; @@ -289,8 +289,7 @@ nfs4_xattr_alloc_cache(void) { struct nfs4_xattr_cache *cache; - cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, - GFP_KERNEL_ACCOUNT | GFP_NOFS); + cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, GFP_KERNEL); if (cache == NULL) return NULL; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 84f39b6f1b1e..79df6e83881b 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -42,6 +42,7 @@ enum nfs4_client_state { NFS4CLNT_LEASE_MOVED, NFS4CLNT_DELEGATION_EXPIRED, NFS4CLNT_RUN_MANAGER, + NFS4CLNT_MANAGER_AVAILABLE, NFS4CLNT_RECALL_RUNNING, NFS4CLNT_RECALL_ANY_LAYOUT_READ, NFS4CLNT_RECALL_ANY_LAYOUT_RW, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e79ae4cbc395..d258933cf8c8 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -165,7 +165,7 @@ retry: if (sync) return -EOPNOTSUPP; cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res), - GFP_NOFS); + GFP_KERNEL); if (unlikely(cn_resp == NULL)) return -ENOMEM; @@ -180,8 +180,8 @@ retry: ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count, nss, cnrs, sync); out: - if (!nfs42_files_from_same_server(file_in, file_out)) - kfree(cn_resp); + kfree(cn_resp); + if (ret == -EAGAIN) goto retry; return ret; @@ -339,7 +339,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, res = ERR_PTR(-ENOMEM); len = strlen(SSC_READ_NAME_BODY) + 16; - read_name = kzalloc(len, GFP_NOFS); + read_name = kzalloc(len, GFP_KERNEL); if (read_name == NULL) goto out; snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0e0db6c27619..e3f5b380cefe 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1392,13 +1392,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, case NFS4_OPEN_CLAIM_FH: p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE | - NFS4_ACCESS_EXECUTE; -#ifdef CONFIG_NFS_V4_2 - if (!(server->caps & NFS_CAP_XATTR)) - break; - p->o_arg.access |= NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | - NFS4_ACCESS_XALIST; -#endif + NFS4_ACCESS_EXECUTE | + nfs_access_xattr_mask(server); } p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); @@ -3050,6 +3045,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags); + if (opendata->o_res.rflags & NFS4_OPEN_RESULT_PRESERVE_UNLINKED) + set_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(state->inode)->flags); dentry = opendata->dentry; if (d_really_is_negative(dentry)) { @@ -5904,7 +5901,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu buflen = server->rsize; npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1; - pages = kmalloc_array(npages, sizeof(struct page *), GFP_NOFS); + pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -6609,7 +6606,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, }; int status = 0; - data = 
kzalloc(sizeof(*data), GFP_NOFS); + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -6797,7 +6794,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_state *state = lsp->ls_state; struct inode *inode = state->inode; - p = kzalloc(sizeof(*p), GFP_NOFS); + p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return NULL; p->arg.fh = NFS_FH(inode); @@ -7202,8 +7199,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f task_setup_data.flags |= RPC_TASK_MOVEABLE; data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), - fl->fl_u.nfs4_fl.owner, - recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS); + fl->fl_u.nfs4_fl.owner, GFP_KERNEL); if (data == NULL) return -ENOMEM; if (IS_SETLKW(cmd)) @@ -7626,7 +7622,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) if (server->nfs_client->cl_mvops->minor_version != 0) return; - data = kmalloc(sizeof(*data), GFP_NOFS); + data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return; data->lsp = lsp; @@ -8012,6 +8008,18 @@ static int _nfs41_proc_get_locations(struct nfs_server *server, .rpc_resp = &res, .rpc_cred = cred, }; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = server->nfs_client->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; int status; nfs_fattr_init(&locations->fattr); @@ -8019,8 +8027,7 @@ static int _nfs41_proc_get_locations(struct nfs_server *server, locations->nlocations = 0; nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1); - status = nfs4_call_sync_sequence(clnt, server, &msg, - &args.seq_args, &res.seq_res); + status = nfs4_call_sync_custom(&task_setup_data); if (status == NFS4_OK && res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED) status = -NFS4ERR_LEASE_MOVED; @@ -8333,6 +8340,7 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) case -NFS4ERR_DEADSESSION: nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + return; } if (args->dir == NFS4_CDFC4_FORE_OR_BOTH && res->dir != NFS4_CDFS4_BOTH) { @@ -9291,7 +9299,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, goto out_err; ret = ERR_PTR(-ENOMEM); - calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + calldata = kzalloc(sizeof(*calldata), GFP_KERNEL); if (calldata == NULL) goto out_put_clp; nfs4_init_sequence(&calldata->args, &calldata->res, 0, is_privileged); @@ -10222,7 +10230,7 @@ static int nfs41_free_stateid(struct nfs_server *server, &task_setup.rpc_client, &msg); dprintk("NFS call free_stateid %p\n", stateid); - data = kmalloc(sizeof(*data), GFP_NOFS); + data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->server = server; @@ -10461,6 +10469,24 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) return error + error2 + error3; } +static void nfs4_enable_swap(struct inode *inode) +{ + /* The state manager thread must always be running. + * It will notice the client is a swapper, and stay put. + */ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + + nfs4_schedule_state_manager(clp); +} + +static void nfs4_disable_swap(struct inode *inode) +{ + /* The state manager thread will now exit once it is + * woken. 
+ */ + wake_up_var(&NFS_SERVER(inode)->nfs_client->cl_state); +} + static const struct inode_operations nfs4_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -10538,6 +10564,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .create_server = nfs4_create_server, .clone_server = nfs_clone_server, .discover_trunking = nfs4_discover_trunking, + .enable_swap = nfs4_enable_swap, + .disable_swap = nfs4_disable_swap, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 02a899e4390f..9e1c987c81e7 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -49,6 +49,7 @@ #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/jiffies.h> +#include <linux/sched/mm.h> #include <linux/sunrpc/clnt.h> @@ -666,7 +667,7 @@ nfs4_alloc_open_state(void) { struct nfs4_state *state; - state = kzalloc(sizeof(*state), GFP_NOFS); + state = kzalloc(sizeof(*state), GFP_KERNEL_ACCOUNT); if (!state) return NULL; refcount_set(&state->count, 1); @@ -820,7 +821,7 @@ static void __nfs4_close(struct nfs4_state *state, void nfs4_close_state(struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(state, fmode, GFP_NOFS, 0); + __nfs4_close(state, fmode, GFP_KERNEL, 0); } void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) @@ -869,14 +870,15 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f struct nfs4_lock_state *lsp; struct nfs_server *server = state->owner->so_server; - lsp = kzalloc(sizeof(*lsp), GFP_NOFS); + lsp = kzalloc(sizeof(*lsp), GFP_KERNEL_ACCOUNT); if (lsp == NULL) return NULL; nfs4_init_seqid_counter(&lsp->ls_seqid); refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; lsp->ls_owner = fl_owner; - lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); + lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, + 0, 0, GFP_KERNEL_ACCOUNT); if (lsp->ls_seqid.owner_id < 0) goto out_free; INIT_LIST_HEAD(&lsp->ls_locks); @@ -1205,10 +1207,17 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *cl = clp->cl_rpcclient; + + while (cl != cl->cl_parent) + cl = cl->cl_parent; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); - if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { + wake_up_var(&clp->cl_state); return; + } + set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); __module_get(THIS_MODULE); refcount_inc(&clp->cl_count); @@ -1224,6 +1233,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) printk(KERN_ERR "%s: kthread_run: %ld\n", __func__, PTR_ERR(task)); nfs4_clear_state_manager_bit(clp); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -2560,9 +2570,17 @@ static void nfs4_layoutreturn_any_run(struct nfs_client *clp) static void nfs4_state_manager(struct nfs_client *clp) { + unsigned int memflags; int status = 0; const char *section = "", *section_sep = ""; + /* + * State recovery can deadlock if the direct reclaim code tries + * start NFS writeback. So ensure memory allocations are all + * GFP_NOFS. 
+ */ + memflags = memalloc_nofs_save(); + /* Ensure exclusive access to NFSv4 state */ do { trace_nfs4_state_mgr(clp); @@ -2657,6 +2675,7 @@ static void nfs4_state_manager(struct nfs_client *clp) clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state); } + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); @@ -2669,11 +2688,8 @@ static void nfs4_state_manager(struct nfs_client *clp) clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state); } - /* Did we race with an attempt to give us more work? */ - if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) - return; - if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) - return; + return; + } while (refcount_read(&clp->cl_count) > 1 && !signalled()); goto out_drain; @@ -2686,6 +2702,7 @@ out_error: clp->cl_hostname, -status); ssleep(1); out_drain: + memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); } @@ -2693,9 +2710,31 @@ out_drain: static int nfs4_run_state_manager(void *ptr) { struct nfs_client *clp = ptr; + struct rpc_clnt *cl = clp->cl_rpcclient; + + while (cl != cl->cl_parent) + cl = cl->cl_parent; allow_signal(SIGKILL); +again: + set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); nfs4_state_manager(clp); + if (atomic_read(&cl->cl_swapper)) { + wait_var_event_interruptible(&clp->cl_state, + test_bit(NFS4CLNT_RUN_MANAGER, + &clp->cl_state)); + if (atomic_read(&cl->cl_swapper) && + test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + goto again; + /* Either no longer a swapper, or were signalled */ + } + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) + goto again; + nfs_put_client(clp); return 0; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 8e70b92df4cc..86a5f6516928 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1605,7 +1605,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg FATTR4_WORD0_RDATTR_ERROR, FATTR4_WORD1_MOUNTED_ON_FILEID, }; - uint32_t dircount = readdir->count >> 1; + uint32_t dircount = readdir->count; + uint32_t maxcount = readdir->count; __be32 *p, verf[2]; uint32_t attrlen = 0; unsigned int i; @@ -1618,7 +1619,6 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS| FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; attrs[2] |= FATTR4_WORD2_SECURITY_LABEL; - dircount >>= 1; } /* Use mounted_on_fileid only if the server supports it */ if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) @@ -1634,7 +1634,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg encode_nfs4_verifier(xdr, &readdir->verifier); p = reserve_space(xdr, 12 + (attrlen << 2)); *p++ = cpu_to_be32(dircount); - *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(maxcount); *p++ = cpu_to_be32(attrlen); for (i = 0; i < attrlen; i++) *p++ = cpu_to_be32(attrs[i]); @@ -7508,7 +7508,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); - entry->prev_cookie = entry->cookie; entry->cookie = new_cookie; return 0; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 317ce27bdc4b..012bd7339862 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -21,7 +21,6 @@ { 
NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \ { NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \ { NFS_INO_INVALID_ACL, "INVALID_ACL" }, \ - { NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \ { NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \ { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \ { NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \ @@ -37,7 +36,6 @@ #define nfs_show_nfsi_flags(v) \ __print_flags(v, "|", \ - { BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \ { BIT(NFS_INO_STALE), "STALE" }, \ { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \ { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \ @@ -162,6 +160,9 @@ DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); DEFINE_NFS_INODE_EVENT(nfs_access_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid); +DEFINE_NFS_INODE_EVENT(nfs_readdir_force_readdirplus); +DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_cache_fill_done); +DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_uncached_done); TRACE_EVENT(nfs_access_exit, TP_PROTO( @@ -273,6 +274,122 @@ DEFINE_NFS_UPDATE_SIZE_EVENT(wcc); DEFINE_NFS_UPDATE_SIZE_EVENT(update); DEFINE_NFS_UPDATE_SIZE_EVENT(grow); +DECLARE_EVENT_CLASS(nfs_inode_range_event, + TP_PROTO( + const struct inode *inode, + loff_t range_start, + loff_t range_end + ), + + TP_ARGS(inode, range_start, range_end), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, range_start) + __field(loff_t, range_end) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->fileid = nfsi->fileid; + __entry->version = inode_peek_iversion_raw(inode); + __entry->range_start = range_start; + __entry->range_end = range_end; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "range=[%lld, %lld]", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->range_start, __entry->range_end + ) +); + +#define DEFINE_NFS_INODE_RANGE_EVENT(name) \ + DEFINE_EVENT(nfs_inode_range_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + loff_t range_start, \ + loff_t range_end \ + ), \ + TP_ARGS(inode, range_start, range_end)) + +DEFINE_NFS_INODE_RANGE_EVENT(nfs_readdir_invalidate_cache_range); + +DECLARE_EVENT_CLASS(nfs_readdir_event, + TP_PROTO( + const struct file *file, + const __be32 *verifier, + u64 cookie, + pgoff_t page_index, + unsigned int dtsize + ), + + TP_ARGS(file, verifier, cookie, page_index, dtsize), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __array(char, verifier, NFS4_VERIFIER_SIZE) + __field(u64, cookie) + __field(pgoff_t, index) + __field(unsigned int, dtsize) + ), + + TP_fast_assign( + const struct inode *dir = file_inode(file); + const struct nfs_inode *nfsi = NFS_I(dir); + + __entry->dev = dir->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(dir); + if (cookie != 0) + memcpy(__entry->verifier, verifier, + NFS4_VERIFIER_SIZE); + else + memset(__entry->verifier, 0, + NFS4_VERIFIER_SIZE); + __entry->cookie = cookie; + __entry->index = page_index; + __entry->dtsize = dtsize; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu " + "cookie=%s:0x%llx cache_index=%lu dtsize=%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long 
long)__entry->fileid, __entry->fhandle, + __entry->version, show_nfs4_verifier(__entry->verifier), + (unsigned long long)__entry->cookie, __entry->index, + __entry->dtsize + ) +); + +#define DEFINE_NFS_READDIR_EVENT(name) \ + DEFINE_EVENT(nfs_readdir_event, name, \ + TP_PROTO( \ + const struct file *file, \ + const __be32 *verifier, \ + u64 cookie, \ + pgoff_t page_index, \ + unsigned int dtsize \ + ), \ + TP_ARGS(file, verifier, cookie, page_index, dtsize)) + +DEFINE_NFS_READDIR_EVENT(nfs_readdir_cache_fill); +DEFINE_NFS_READDIR_EVENT(nfs_readdir_uncached); + DECLARE_EVENT_CLASS(nfs_lookup_event, TP_PROTO( const struct inode *dir, @@ -366,6 +483,9 @@ DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter); DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit); DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter); DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit); +DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup); +DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup_revalidate_failed); +DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_readdir_lookup_revalidate); TRACE_EVENT(nfs_atomic_open_enter, TP_PROTO( @@ -889,11 +1009,11 @@ TRACE_EVENT(nfs_aop_readpage_done, TRACE_EVENT(nfs_aop_readahead, TP_PROTO( const struct inode *inode, - struct page *page, + loff_t pos, unsigned int nr_pages ), - TP_ARGS(inode, page, nr_pages), + TP_ARGS(inode, pos, nr_pages), TP_STRUCT__entry( __field(dev_t, dev) @@ -911,7 +1031,7 @@ TRACE_EVENT(nfs_aop_readahead, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = page_index(page) << PAGE_SHIFT; + __entry->offset = pos; __entry->nr_pages = nr_pages; ), @@ -1095,6 +1215,97 @@ TRACE_EVENT(nfs_readpage_short, ) ); +DECLARE_EVENT_CLASS(nfs_fscache_page_event, + TP_PROTO( + const struct inode *inode, + struct page *page + ), + + TP_ARGS(inode, page), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->offset = page_index(page) << PAGE_SHIFT; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset + ) +); +DECLARE_EVENT_CLASS(nfs_fscache_page_event_done, + TP_PROTO( + const struct inode *inode, + struct page *page, + int error + ), + + TP_ARGS(inode, page, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + ), + + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->offset = page_index(page) << PAGE_SHIFT; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->error = error; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld error=%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + (long long)__entry->offset, __entry->error + ) +); +#define DEFINE_NFS_FSCACHE_PAGE_EVENT(name) \ + DEFINE_EVENT(nfs_fscache_page_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + struct page *page \ + ), \ + TP_ARGS(inode, page)) +#define 
DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(name) \ + DEFINE_EVENT(nfs_fscache_page_event_done, name, \ + TP_PROTO( \ + const struct inode *inode, \ + struct page *page, \ + int error \ + ), \ + TP_ARGS(inode, page, error)) +DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_read_page); +DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_read_page_exit); +DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_write_page); +DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_write_page_exit); + TRACE_EVENT(nfs_pgio_error, TP_PROTO( const struct nfs_pgio_header *hdr, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ad7f83dc9a2d..9157dd19b8b4 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -90,10 +90,10 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) } } -static inline struct nfs_page * -nfs_page_alloc(void) +static inline struct nfs_page *nfs_page_alloc(void) { - struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); + struct nfs_page *p = + kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask()); if (p) INIT_LIST_HEAD(&p->wb_list); return p; } @@ -892,7 +892,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_commit_info cinfo; struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; - gfp_t gfp_flags = GFP_KERNEL; + gfp_t gfp_flags = nfs_io_gfp_mask(); pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); pg_array->npages = pagecount; @@ -979,7 +979,7 @@ nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc, desc->pg_mirrors_dynamic = NULL; if (mirror_count == 1) return desc->pg_mirrors_static; - ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL); + ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask()); if (ret != NULL) { for (i = 0; i < mirror_count; i++) nfs_pageio_mirror_init(&ret[i], desc->pg_bsize); @@ -1218,6 +1218,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) do { list_splice_init(&mirror->pg_list, &head); + mirror->pg_recoalesce = 0; while (!list_empty(&head)) { struct nfs_page *req; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7c9090a28e5c..856c962273c7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -92,6 +92,17 @@ find_pnfs_driver(u32 id) return local; } +const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id) +{ + return find_pnfs_driver(id); +} + +void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld) +{ + if (ld) + module_put(ld->owner); +} + void unset_pnfs_layoutdriver(struct nfs_server *nfss) { @@ -1233,7 +1244,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, int status = 0; *pcred = NULL; - lrp = kzalloc(sizeof(*lrp), GFP_NOFS); + lrp = kzalloc(sizeof(*lrp), nfs_io_gfp_mask()); if (unlikely(lrp == NULL)) { status = -ENOMEM; spin_lock(&ino->i_lock); @@ -2206,7 +2217,7 @@ _pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx) struct pnfs_layout_hdr *lo; spin_lock(&ino->i_lock); - lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL); + lo = pnfs_find_alloc_layout(ino, ctx, nfs_io_gfp_mask()); if (!lo) goto out_unlock; if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) @@ -2249,8 +2260,8 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data, lo = _pnfs_grab_empty_layout(ino, ctx); if (!lo) return; - lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, - &rng, GFP_KERNEL); + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng, + nfs_io_gfp_mask()); if (!lgp) { pnfs_clear_first_layoutget(lo); nfs_layoutget_end(lo); @@ -2275,8 +2286,8 @@ static void
_lgopen_prepare_floating(struct nfs4_opendata *data, }; struct nfs4_layoutget *lgp; - lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, - &rng, GFP_KERNEL); + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng, + nfs_io_gfp_mask()); if (!lgp) return; data->lgp = lgp; @@ -2691,13 +2702,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r else rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - nfs_req_openctx(req), - req_offset(req), - rd_size, - IOMODE_READ, - false, - GFP_KERNEL); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), rd_size, + IOMODE_READ, false, + nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -2718,13 +2727,10 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, pnfs_generic_pg_check_layout(pgio); pnfs_generic_pg_check_range(pgio, req); if (pgio->pg_lseg == NULL) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - nfs_req_openctx(req), - req_offset(req), - wb_size, - IOMODE_RW, - false, - GFP_KERNEL); + pgio->pg_lseg = + pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), + req_offset(req), wb_size, IOMODE_RW, + false, nfs_io_gfp_mask()); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); pgio->pg_lseg = NULL; @@ -3183,7 +3189,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = -ENOMEM; /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ - data = kzalloc(sizeof(*data), GFP_NOFS); + data = kzalloc(sizeof(*data), nfs_io_gfp_mask()); if (!data) goto clear_layoutcommitting; @@ -3250,7 +3256,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { struct nfs4_threshold *thp; - thp = kzalloc(sizeof(*thp), GFP_NOFS); + thp = kzalloc(sizeof(*thp), nfs_io_gfp_mask()); if (!thp) { dprintk("%s mdsthreshold allocation failed\n", __func__); return NULL; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f4d7548d67b2..07f11489e4e9 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -234,6 +234,8 @@ struct pnfs_devicelist { extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); +extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id); +extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld); /* nfs4proc.c */ extern size_t max_response_pages(struct nfs_server *server); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 316f68f96e57..657c242a18ff 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -419,7 +419,7 @@ static struct nfs_commit_data * pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo) { - struct nfs_commit_data *data = nfs_commitdata_alloc(false); + struct nfs_commit_data *data = nfs_commitdata_alloc(); if (!data) return NULL; @@ -515,7 +515,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(mds_pages, NULL, cinfo, -1); + return -ENOMEM; + } data->ds_commit_index = -1; list_splice_init(mds_pages, &data->pages); list_add_tail(&data->list, &list); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 73dcaa99fa9b..e3570c656b0f 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -92,6 +92,7 @@ nfs_proc_get_root(struct nfs_server *server,
struct nfs_fh *fhandle, info->maxfilesize = 0x7FFFFFFF; info->lease_time = 0; info->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + info->xattr_support = 0; return 0; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index eb00229c1a50..5e7657374bc3 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -123,7 +123,7 @@ static void nfs_readpage_release(struct nfs_page *req, int error) struct address_space *mapping = page_file_mapping(page); if (PageUptodate(page)) - nfs_readpage_to_fscache(inode, page); + nfs_fscache_write_page(inode, page); else if (!PageError(page) && !PagePrivate(page)) generic_error_remove_page(mapping, page); unlock_page(page); @@ -194,10 +194,6 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, const struct nfs_rpc_ops *rpc_ops, struct rpc_task_setup *task_setup_data, int how) { - struct inode *inode = hdr->inode; - int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; - - task_setup_data->flags |= swap_flags; rpc_ops->read_setup(hdr, msg); trace_nfs_initiate_read(hdr); } @@ -290,9 +286,8 @@ static void nfs_readpage_result(struct rpc_task *task, } static int -readpage_async_filler(void *data, struct page *page) +readpage_async_filler(struct nfs_readdesc *desc, struct page *page) { - struct nfs_readdesc *desc = data; struct inode *inode = page_file_mapping(page)->host; unsigned int rsize = NFS_SERVER(inode)->rsize; struct nfs_page *new; @@ -306,7 +301,7 @@ readpage_async_filler(void *data, struct page *page) aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE); if (!IS_SYNC(page->mapping->host)) { - error = nfs_readpage_from_fscache(page->mapping->host, page); + error = nfs_fscache_read_page(page->mapping->host, page); if (error == 0) goto out_unlock; } @@ -397,14 +392,16 @@ out_unlock: return ret; } -int nfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +void nfs_readahead(struct readahead_control *ractl) { + unsigned int nr_pages = readahead_count(ractl); + struct file *file = ractl->file; struct nfs_readdesc desc; - struct inode *inode = mapping->host; + struct inode *inode = ractl->mapping->host; + struct page *page; int ret; - trace_nfs_aop_readahead(inode, lru_to_page(pages), nr_pages); + trace_nfs_aop_readahead(inode, readahead_pos(ractl), nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); ret = -ESTALE; @@ -422,14 +419,18 @@ int nfs_readpages(struct file *file, struct address_space *mapping, nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); - ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + while ((page = readahead_page(ractl)) != NULL) { + ret = readpage_async_filler(&desc, page); + put_page(page); + if (ret) + break; + } nfs_pageio_complete_read(&desc.pgio); put_nfs_open_context(desc.ctx); out: trace_nfs_aop_readahead_done(inode, nr_pages, ret); - return ret; } int __init nfs_init_readpagecache(void) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 614e2809032e..f00d45cf80ef 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) +struct nfs_commit_data *nfs_commitdata_alloc(void) { struct nfs_commit_data *p; - if (never_fail) - p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); - else { - /* It is OK to do some reclaim, not no safe to wait - * for anything to be returned to the pool. 
- * mempool_alloc() cannot handle that particular combination, - * so we need two separate attempts. - */ + p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask()); + if (!p) { p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); if (!p) - p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | - __GFP_NOWARN | __GFP_NORETRY); - if (!p) return NULL; + memset(p, 0, sizeof(*p)); } - - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); return p; } @@ -104,9 +94,15 @@ EXPORT_SYMBOL_GPL(nfs_commit_free); static struct nfs_pgio_header *nfs_writehdr_alloc(void) { - struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL); + struct nfs_pgio_header *p; - memset(p, 0, sizeof(*p)); + p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask()); + if (!p) { + p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT); + if (!p) + return NULL; + memset(p, 0, sizeof(*p)); + } p->rw_mode = FMODE_WRITE; return p; } @@ -306,7 +302,7 @@ static void nfs_set_pageerror(struct address_space *mapping) /* Force file size revalidation */ spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED | - NFS_INO_REVAL_PAGECACHE | + NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); spin_unlock(&inode->i_lock); } @@ -316,7 +312,10 @@ static void nfs_mapping_set_error(struct page *page, int error) struct address_space *mapping = page_file_mapping(page); SetPageError(page); - mapping_set_error(mapping, error); + filemap_set_wb_err(mapping, error); + if (mapping->host) + errseq_set(&mapping->host->i_sb->s_wb_err, + error == -ENOSPC ? -ENOSPC : -EIO); nfs_set_pageerror(mapping); } @@ -1417,6 +1416,8 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, { int priority = flush_task_priority(how); + if (IS_SWAPFILE(hdr->inode)) + task_setup_data->flags |= RPC_TASK_SWAPPER; task_setup_data->priority = priority; rpc_ops->write_setup(hdr, msg, &task_setup_data->rpc_client); trace_nfs_initiate_write(hdr); @@ -1829,7 +1830,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(head, NULL, cinfo, -1); + return -ENOMEM; + } /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 66bdaa2cf496..ca611ac09f7c 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -20,6 +20,23 @@ #include "page.h" #include "btnode.h" + +/** + * nilfs_init_btnc_inode - initialize B-tree node cache inode + * @btnc_inode: inode to be initialized + * + * nilfs_init_btnc_inode() sets up an inode for B-tree node cache. 
+ */ +void nilfs_init_btnc_inode(struct inode *btnc_inode) +{ + struct nilfs_inode_info *ii = NILFS_I(btnc_inode); + + btnc_inode->i_mode = S_IFREG; + ii->i_flags = 0; + memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap)); + mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS); +} + void nilfs_btnode_cache_clear(struct address_space *btnc) { invalidate_mapping_pages(btnc, 0, -1); @@ -29,7 +46,7 @@ void nilfs_btnode_cache_clear(struct address_space *btnc) struct buffer_head * nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) { - struct inode *inode = NILFS_BTNC_I(btnc); + struct inode *inode = btnc->host; struct buffer_head *bh; bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); @@ -57,7 +74,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; - struct inode *inode = NILFS_BTNC_I(btnc); + struct inode *inode = btnc->host; struct page *page; int err; @@ -157,7 +174,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, struct nilfs_btnode_chkey_ctxt *ctxt) { struct buffer_head *obh, *nbh; - struct inode *inode = NILFS_BTNC_I(btnc); + struct inode *inode = btnc->host; __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; int err; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 11663650add7..bd5544e63a01 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -30,6 +30,7 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; }; +void nilfs_init_btnc_inode(struct inode *btnc_inode); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 3594eabe1419..f544c22fff78 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -58,7 +58,8 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path) static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp) { - struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; + struct address_space *btnc = btnc_inode->i_mapping; struct buffer_head *bh; bh = nilfs_btnode_create_block(btnc, ptr); @@ -470,7 +471,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp, const struct nilfs_btree_readahead_info *ra) { - struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; + struct address_space *btnc = btnc_inode->i_mapping; struct buffer_head *bh, *ra_bh; sector_t submit_ptr = 0; int ret; @@ -1741,6 +1743,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key, dat = nilfs_bmap_get_dat(btree); } + ret = nilfs_attach_btree_node_cache(&NILFS_BMAP_I(btree)->vfs_inode); + if (ret < 0) + return ret; + ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat); if (ret < 0) return ret; @@ -1913,7 +1919,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree, path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; path[level].bp_ctxt.bh = path[level].bp_bh; ret = nilfs_btnode_prepare_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); if (ret < 0) { nilfs_dat_abort_update(dat, @@ -1939,7 +1945,7 @@ static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree, if 
(buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); path[level].bp_bh = path[level].bp_ctxt.bh; } @@ -1958,7 +1964,7 @@ static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree, &path[level].bp_newreq.bpr_req); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); } @@ -2134,7 +2140,8 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct list_head *listp) { - struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; + struct address_space *btcache = btnc_inode->i_mapping; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; struct pagevec pvec; struct buffer_head *bh, *head; @@ -2188,12 +2195,12 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree, path[level].bp_ctxt.newkey = blocknr; path[level].bp_ctxt.bh = *bh; ret = nilfs_btnode_prepare_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); if (ret < 0) return ret; nilfs_btnode_commit_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); *bh = path[level].bp_ctxt.bh; } @@ -2398,6 +2405,10 @@ int nilfs_btree_init(struct nilfs_bmap *bmap) if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode)) ret = -EIO; + else + ret = nilfs_attach_btree_node_cache( + &NILFS_BMAP_I(bmap)->vfs_inode); + return ret; } diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index dc51d3b7a7bf..3b55e239705f 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -497,7 +497,9 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, di = NILFS_DAT_I(dat); lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); nilfs_palloc_setup_cache(dat, &di->palloc_cache); - nilfs_mdt_setup_shadow_map(dat, &di->shadow); + err = nilfs_mdt_setup_shadow_map(dat, &di->shadow); + if (err) + goto failed; err = nilfs_read_inode_common(dat, raw_inode); if (err) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index a8f5315f01e3..04fdd420eae7 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -126,9 +126,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, __u64 vbn, struct buffer_head **out_bh) { + struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode; int ret; - ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + ret = nilfs_btnode_submit_block(btnc_inode->i_mapping, vbn ? 
: pbn, pbn, REQ_OP_READ, 0, out_bh, &pbn); if (ret == -EEXIST) /* internal code (cache hit) */ @@ -170,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode) ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); - return 0; + return nilfs_attach_btree_node_cache(inode); } /** @@ -185,7 +186,7 @@ void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs) ii = list_first_entry(head, struct nilfs_inode_info, i_dirty); list_del_init(&ii->i_dirty); truncate_inode_pages(&ii->vfs_inode.i_data, 0); - nilfs_btnode_cache_clear(&ii->i_btnode_cache); + nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping); iput(&ii->vfs_inode); } } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 476a4a649f38..6045cea21f52 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -29,12 +29,16 @@ * @cno: checkpoint number * @root: pointer on NILFS root object (mounted checkpoint) * @for_gc: inode for GC flag + * @for_btnc: inode for B-tree node cache flag + * @for_shadow: inode for shadowed page cache flag */ struct nilfs_iget_args { u64 ino; __u64 cno; struct nilfs_root *root; - int for_gc; + bool for_gc; + bool for_btnc; + bool for_shadow; }; static int nilfs_iget_test(struct inode *inode, void *opaque); @@ -312,7 +316,8 @@ static int nilfs_insert_inode_locked(struct inode *inode, unsigned long ino) { struct nilfs_iget_args args = { - .ino = ino, .root = root, .cno = 0, .for_gc = 0 + .ino = ino, .root = root, .cno = 0, .for_gc = false, + .for_btnc = false, .for_shadow = false }; return insert_inode_locked4(inode, ino, nilfs_iget_test, &args); @@ -525,6 +530,19 @@ static int nilfs_iget_test(struct inode *inode, void *opaque) return 0; ii = NILFS_I(inode); + if (test_bit(NILFS_I_BTNC, &ii->i_state)) { + if (!args->for_btnc) + return 0; + } else if (args->for_btnc) { + return 0; + } + if (test_bit(NILFS_I_SHADOW, &ii->i_state)) { + if (!args->for_shadow) + return 0; + } else if (args->for_shadow) { + return 0; + } + if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) return !args->for_gc; @@ -536,15 +554,17 @@ static int nilfs_iget_set(struct inode *inode, void *opaque) struct nilfs_iget_args *args = opaque; inode->i_ino = args->ino; - if (args->for_gc) { + NILFS_I(inode)->i_cno = args->cno; + NILFS_I(inode)->i_root = args->root; + if (args->root && args->ino == NILFS_ROOT_INO) + nilfs_get_root(args->root); + + if (args->for_gc) NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE); - NILFS_I(inode)->i_cno = args->cno; - NILFS_I(inode)->i_root = NULL; - } else { - if (args->root && args->ino == NILFS_ROOT_INO) - nilfs_get_root(args->root); - NILFS_I(inode)->i_root = args->root; - } + if (args->for_btnc) + NILFS_I(inode)->i_state |= BIT(NILFS_I_BTNC); + if (args->for_shadow) + NILFS_I(inode)->i_state |= BIT(NILFS_I_SHADOW); return 0; } @@ -552,7 +572,8 @@ struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, unsigned long ino) { struct nilfs_iget_args args = { - .ino = ino, .root = root, .cno = 0, .for_gc = 0 + .ino = ino, .root = root, .cno = 0, .for_gc = false, + .for_btnc = false, .for_shadow = false }; return ilookup5(sb, ino, nilfs_iget_test, &args); @@ -562,7 +583,8 @@ struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, unsigned long ino) { struct nilfs_iget_args args = { - .ino = ino, .root = root, .cno = 0, .for_gc = 0 + .ino = ino, .root = root, .cno = 0, .for_gc = false, + .for_btnc = false, .for_shadow = false }; return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); @@ -593,7 +615,8 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned 
long ino, __u64 cno) { struct nilfs_iget_args args = { - .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 + .ino = ino, .root = NULL, .cno = cno, .for_gc = true, + .for_btnc = false, .for_shadow = false }; struct inode *inode; int err; @@ -613,6 +636,113 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, return inode; } +/** + * nilfs_attach_btree_node_cache - attach a B-tree node cache to the inode + * @inode: inode object + * + * nilfs_attach_btree_node_cache() attaches a B-tree node cache to @inode, + * or does nothing if the inode already has it. This function allocates + * an additional inode to maintain page cache of B-tree nodes one-on-one. + * + * Return Value: On success, 0 is returned. On errors, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_attach_btree_node_cache(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct inode *btnc_inode; + struct nilfs_iget_args args; + + if (ii->i_assoc_inode) + return 0; + + args.ino = inode->i_ino; + args.root = ii->i_root; + args.cno = ii->i_cno; + args.for_gc = test_bit(NILFS_I_GCINODE, &ii->i_state) != 0; + args.for_btnc = true; + args.for_shadow = test_bit(NILFS_I_SHADOW, &ii->i_state) != 0; + + btnc_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test, + nilfs_iget_set, &args); + if (unlikely(!btnc_inode)) + return -ENOMEM; + if (btnc_inode->i_state & I_NEW) { + nilfs_init_btnc_inode(btnc_inode); + unlock_new_inode(btnc_inode); + } + NILFS_I(btnc_inode)->i_assoc_inode = inode; + NILFS_I(btnc_inode)->i_bmap = ii->i_bmap; + ii->i_assoc_inode = btnc_inode; + + return 0; +} + +/** + * nilfs_detach_btree_node_cache - detach the B-tree node cache from the inode + * @inode: inode object + * + * nilfs_detach_btree_node_cache() detaches the B-tree node cache and its + * holder inode bound to @inode, or does nothing if @inode doesn't have it. + */ +void nilfs_detach_btree_node_cache(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct inode *btnc_inode = ii->i_assoc_inode; + + if (btnc_inode) { + NILFS_I(btnc_inode)->i_assoc_inode = NULL; + ii->i_assoc_inode = NULL; + iput(btnc_inode); + } +} + +/** + * nilfs_iget_for_shadow - obtain inode for shadow mapping + * @inode: inode object that uses shadow mapping + * + * nilfs_iget_for_shadow() allocates a pair of inodes that holds page + * caches for shadow mapping. The page cache for data pages is set up + * in one inode and the one for b-tree node pages is set up in the + * other inode, which is attached to the former inode. + * + * Return Value: On success, a pointer to the inode for data pages is + * returned. On errors, one of the following negative error code is returned + * in a pointer type. + * + * %-ENOMEM - Insufficient memory available. 
+ */ +struct inode *nilfs_iget_for_shadow(struct inode *inode) +{ + struct nilfs_iget_args args = { + .ino = inode->i_ino, .root = NULL, .cno = 0, .for_gc = false, + .for_btnc = false, .for_shadow = true + }; + struct inode *s_inode; + int err; + + s_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test, + nilfs_iget_set, &args); + if (unlikely(!s_inode)) + return ERR_PTR(-ENOMEM); + if (!(s_inode->i_state & I_NEW)) + return inode; + + NILFS_I(s_inode)->i_flags = 0; + memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap)); + mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS); + + err = nilfs_attach_btree_node_cache(s_inode); + if (unlikely(err)) { + iget_failed(s_inode); + return ERR_PTR(err); + } + unlock_new_inode(s_inode); + return s_inode; +} + void nilfs_write_inode_common(struct inode *inode, struct nilfs_inode *raw_inode, int has_bmap) { @@ -760,7 +890,8 @@ static void nilfs_clear_inode(struct inode *inode) if (test_bit(NILFS_I_BMAP, &ii->i_state)) nilfs_bmap_clear(ii->i_bmap); - nilfs_btnode_cache_clear(&ii->i_btnode_cache); + if (!test_bit(NILFS_I_BTNC, &ii->i_state)) + nilfs_detach_btree_node_cache(inode); if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) nilfs_put_root(ii->i_root); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 78db33decd72..d29a0f2b9c16 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -471,9 +471,18 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) void nilfs_mdt_clear(struct inode *inode) { struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + struct nilfs_shadow_map *shadow = mdi->mi_shadow; if (mdi->mi_palloc_cache) nilfs_palloc_destroy_cache(inode); + + if (shadow) { + struct inode *s_inode = shadow->inode; + + shadow->inode = NULL; + iput(s_inode); + mdi->mi_shadow = NULL; + } } /** @@ -507,12 +516,15 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct inode *s_inode; INIT_LIST_HEAD(&shadow->frozen_buffers); - address_space_init_once(&shadow->frozen_data); - nilfs_mapping_init(&shadow->frozen_data, inode); - address_space_init_once(&shadow->frozen_btnodes); - nilfs_mapping_init(&shadow->frozen_btnodes, inode); + + s_inode = nilfs_iget_for_shadow(inode); + if (IS_ERR(s_inode)) + return PTR_ERR(s_inode); + + shadow->inode = s_inode; mi->mi_shadow = shadow; return 0; } @@ -526,14 +538,15 @@ int nilfs_mdt_save_to_shadow_map(struct inode *inode) struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_inode_info *ii = NILFS_I(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; + struct inode *s_inode = shadow->inode; int ret; - ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping); + ret = nilfs_copy_dirty_pages(s_inode->i_mapping, inode->i_mapping); if (ret) goto out; - ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes, - &ii->i_btnode_cache); + ret = nilfs_copy_dirty_pages(NILFS_I(s_inode)->i_assoc_inode->i_mapping, + ii->i_assoc_inode->i_mapping); if (ret) goto out; @@ -549,7 +562,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int blkbits = inode->i_blkbits; - page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); + page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index); if (!page) return -ENOMEM; @@ -581,7 +594,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int n; - page = find_lock_page(&shadow->frozen_data, bh->b_page->index); + page = 
find_lock_page(shadow->inode->i_mapping, bh->b_page->index); if (page) { if (page_has_buffers(page)) { n = bh_offset(bh) >> inode->i_blkbits; @@ -622,10 +635,11 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode) nilfs_palloc_clear_cache(inode); nilfs_clear_dirty_pages(inode->i_mapping, true); - nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data); + nilfs_copy_back_pages(inode->i_mapping, shadow->inode->i_mapping); - nilfs_clear_dirty_pages(&ii->i_btnode_cache, true); - nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes); + nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping, true); + nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping, + NILFS_I(shadow->inode)->i_assoc_inode->i_mapping); nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store); @@ -640,10 +654,11 @@ void nilfs_mdt_clear_shadow_map(struct inode *inode) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; + struct inode *shadow_btnc_inode = NILFS_I(shadow->inode)->i_assoc_inode; down_write(&mi->mi_sem); nilfs_release_frozen_buffers(shadow); - truncate_inode_pages(&shadow->frozen_data, 0); - truncate_inode_pages(&shadow->frozen_btnodes, 0); + truncate_inode_pages(shadow->inode->i_mapping, 0); + truncate_inode_pages(shadow_btnc_inode->i_mapping, 0); up_write(&mi->mi_sem); } diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index 8f86080a436d..9e23bab3ff12 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -18,14 +18,12 @@ /** * struct nilfs_shadow_map - shadow mapping of meta data file * @bmap_store: shadow copy of bmap state - * @frozen_data: shadowed dirty data pages - * @frozen_btnodes: shadowed dirty b-tree nodes' pages + * @inode: holder of page caches used in shadow mapping * @frozen_buffers: list of frozen buffers */ struct nilfs_shadow_map { struct nilfs_bmap_store bmap_store; - struct address_space frozen_data; - struct address_space frozen_btnodes; + struct inode *inode; struct list_head frozen_buffers; }; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index a7b81755c350..1344f7d475d3 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -28,7 +28,7 @@ * @i_xattr: <TODO> * @i_dir_start_lookup: page index of last successful search * @i_cno: checkpoint number for GC inode - * @i_btnode_cache: cached pages of b-tree nodes + * @i_assoc_inode: associated inode (B-tree node cache holder or back pointer) * @i_dirty: list for connecting dirty files * @xattr_sem: semaphore for extended attributes processing * @i_bh: buffer contains disk inode @@ -43,7 +43,7 @@ struct nilfs_inode_info { __u64 i_xattr; /* sector_t ??? 
*/ __u32 i_dir_start_lookup; __u64 i_cno; /* check point number for GC inode */ - struct address_space i_btnode_cache; + struct inode *i_assoc_inode; struct list_head i_dirty; /* List for connecting dirty files */ #ifdef CONFIG_NILFS_XATTR @@ -75,13 +75,6 @@ NILFS_BMAP_I(const struct nilfs_bmap *bmap) return container_of(bmap, struct nilfs_inode_info, i_bmap_data); } -static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) -{ - struct nilfs_inode_info *ii = - container_of(btnc, struct nilfs_inode_info, i_btnode_cache); - return &ii->vfs_inode; -} - /* * Dynamic state flags of NILFS on-memory inode (i_state) */ @@ -98,6 +91,8 @@ enum { NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */ NILFS_I_BMAP, /* has bmap and btnode_cache */ NILFS_I_GCINODE, /* inode for GC, on memory only */ + NILFS_I_BTNC, /* inode for btree node cache */ + NILFS_I_SHADOW, /* inode for shadowed page cache */ }; /* @@ -267,6 +262,9 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, unsigned long ino); extern struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, __u64 cno); +int nilfs_attach_btree_node_cache(struct inode *inode); +void nilfs_detach_btree_node_cache(struct inode *inode); +struct inode *nilfs_iget_for_shadow(struct inode *inode); extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 063dd16d75b5..a8e88cc38e16 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -436,22 +436,12 @@ unsigned int nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode) -{ - mapping->host = inode; - mapping->flags = 0; - mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; - mapping->a_ops = &empty_aops; -} - /* * NILFS2 needs clear_page_dirty() in the following two cases: * - * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears - * page dirty flags when it copies back pages from the shadow cache - * (gcdat->{i_mapping,i_btnode_cache}) to its original cache - * (dat->{i_mapping,i_btnode_cache}). + * 1) For B-tree node pages and data pages of DAT file, NILFS2 clears dirty + * flag of pages when it copies back pages from shadow cache to the + * original cache. * * 2) Some B-tree operations like insertion or deletion may dispose buffers * in dirty state, and this needs to cancel the dirty state of their pages. 
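[Editor's note -- illustration only, not part of the patch above.] The nilfs2 hunks above replace the address_space that used to be embedded in struct nilfs_inode_info (i_btnode_cache) with a separate "btnc" inode reached through the new i_assoc_inode pointer, so callers now go through that inode's page cache. A minimal sketch of the resulting access pattern, assuming a nilfs_inode_info whose B-tree node cache has been attached with nilfs_attach_btree_node_cache(); the helper name nilfs_btnc_mapping is hypothetical:

	/* Illustrative sketch only -- not taken from the diff. */
	static struct address_space *nilfs_btnc_mapping(struct nilfs_inode_info *ii)
	{
		/*
		 * Before this series the cache was the embedded mapping,
		 * i.e. &ii->i_btnode_cache.  After it, the cache is the
		 * page cache of the associated btnc inode, which may be
		 * absent until nilfs_attach_btree_node_cache() succeeds.
		 */
		return ii->i_assoc_inode ? ii->i_assoc_inode->i_mapping : NULL;
	}

Callers such as nilfs_lookup_dirty_node_buffers() above follow exactly this shape: bail out when i_assoc_inode is NULL, otherwise operate on i_assoc_inode->i_mapping.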
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 569263b23c0c..21ddcdd4d63e 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -43,7 +43,6 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_page(struct page *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int, unsigned int); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 85a853334771..0afe0832c754 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -733,15 +733,18 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, struct list_head *listp) { struct nilfs_inode_info *ii = NILFS_I(inode); - struct address_space *mapping = &ii->i_btnode_cache; + struct inode *btnc_inode = ii->i_assoc_inode; struct pagevec pvec; struct buffer_head *bh, *head; unsigned int i; pgoff_t index = 0; + if (!btnc_inode) + return; + pagevec_init(&pvec); - while (pagevec_lookup_tag(&pvec, mapping, &index, + while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index, PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); @@ -2410,7 +2413,7 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) continue; list_del_init(&ii->i_dirty); truncate_inode_pages(&ii->vfs_inode.i_data, 0); - nilfs_btnode_cache_clear(&ii->i_btnode_cache); + nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping); iput(&ii->vfs_inode); } } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3e05c98631ec..ba108f915391 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -157,7 +157,8 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_bh = NULL; ii->i_state = 0; ii->i_cno = 0; - nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); + ii->i_assoc_inode = NULL; + ii->i_bmap = &ii->i_bmap_data; return &ii->vfs_inode; } @@ -1377,8 +1378,6 @@ static void nilfs_inode_init_once(void *obj) #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif - address_space_init_once(&ii->i_btnode_cache); - ii->i_bmap = &ii->i_bmap_data; inode_init_once(&ii->vfs_inode); } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 2ff6bd85ba8f..9b32b76a9c30 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1003,17 +1003,18 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, __u32 mask, unsigned int flags, __u32 umask, int *destroy) { - __u32 oldmask = 0; + __u32 oldmask, newmask; /* umask bits cannot be removed by user */ mask &= ~umask; spin_lock(&fsn_mark->lock); + oldmask = fsnotify_calc_mask(fsn_mark); if (!(flags & FAN_MARK_IGNORED_MASK)) { - oldmask = fsn_mark->mask; fsn_mark->mask &= ~mask; } else { fsn_mark->ignored_mask &= ~mask; } + newmask = fsnotify_calc_mask(fsn_mark); /* * We need to keep the mark around even if remaining mask cannot * result in any events (e.g. 
mask == FAN_ONDIR) to support incremenal @@ -1023,7 +1024,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask); spin_unlock(&fsn_mark->lock); - return mask & oldmask; + return oldmask & ~newmask; } static int fanotify_remove_mark(struct fsnotify_group *group, @@ -1080,24 +1081,42 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, flags, umask); } +static void fanotify_mark_add_ignored_mask(struct fsnotify_mark *fsn_mark, + __u32 mask, unsigned int flags, + __u32 *removed) +{ + fsn_mark->ignored_mask |= mask; + + /* + * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to + * the removal of the FS_MODIFY bit in calculated mask if it was set + * because of an ignored mask that is now going to survive FS_MODIFY. + */ + if ((flags & FAN_MARK_IGNORED_SURV_MODIFY) && + !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { + fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; + if (!(fsn_mark->mask & FS_MODIFY)) + *removed = FS_MODIFY; + } +} + static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, - __u32 mask, - unsigned int flags) + __u32 mask, unsigned int flags, + __u32 *removed) { - __u32 oldmask = -1; + __u32 oldmask, newmask; spin_lock(&fsn_mark->lock); + oldmask = fsnotify_calc_mask(fsn_mark); if (!(flags & FAN_MARK_IGNORED_MASK)) { - oldmask = fsn_mark->mask; fsn_mark->mask |= mask; } else { - fsn_mark->ignored_mask |= mask; - if (flags & FAN_MARK_IGNORED_SURV_MODIFY) - fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; + fanotify_mark_add_ignored_mask(fsn_mark, mask, flags, removed); } + newmask = fsnotify_calc_mask(fsn_mark); spin_unlock(&fsn_mark->lock); - return mask & ~oldmask; + return newmask & ~oldmask; } static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, @@ -1155,7 +1174,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, __kernel_fsid_t *fsid) { struct fsnotify_mark *fsn_mark; - __u32 added; + __u32 added, removed = 0; int ret = 0; mutex_lock(&group->mark_mutex); @@ -1178,8 +1197,8 @@ static int fanotify_add_mark(struct fsnotify_group *group, goto out; } - added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - if (added & ~fsnotify_conn_mask(fsn_mark->connector)) + added = fanotify_mark_add_to_mask(fsn_mark, mask, flags, &removed); + if (removed || (added & ~fsnotify_conn_mask(fsn_mark->connector))) fsnotify_recalc_mask(fsn_mark->connector); out: diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ab81a0776ece..70a8516b78bc 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -70,8 +70,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); - if (iput_inode) - iput(iput_inode); + iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify_inode(inode, FS_UNMOUNT); @@ -85,8 +84,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) } spin_unlock(&sb->s_inode_list_lock); - if (iput_inode) - iput(iput_inode); + iput(iput_inode); } void fsnotify_sb_delete(struct super_block *sb) @@ -531,11 +529,13 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, /* - * if this is a modify event we may need to clear the ignored masks - * otherwise return if none of the marks care about this type of event. + * If this is a modify event we may need to clear some ignored masks. 
+ * In that case, the object with ignored masks will have the FS_MODIFY + * event in its mask. + * Otherwise, return if none of the marks care about this type of event. */ test_mask = (mask & ALL_FSNOTIFY_EVENTS); - if (!(mask & FS_MODIFY) && !(test_mask & marks_mask)) + if (!(test_mask & marks_mask)) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 9007d6affff3..4853184f7dde 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -127,7 +127,7 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) return; hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) - new_mask |= mark->mask; + new_mask |= fsnotify_calc_mask(mark); } *fsnotify_conn_mask_p(conn) = new_mask; } @@ -692,7 +692,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, if (ret) goto err; - if (mark->mask) + if (mark->mask || mark->ignored_mask) fsnotify_recalc_mask(mark->connector); return ret; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index d154dcfe06af..90e3dad8ee45 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1746,7 +1746,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { set_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); spin_unlock(&mapping->private_lock); - block_dirty_folio(mapping, page_folio(page)); + filemap_dirty_folio(mapping, page_folio(page)); if (unlikely(buffers_to_free)) { do { bh = buffers_to_free->b_this_page; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 273f65e0aaba..0b6f551a342a 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -337,7 +337,6 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) /* Read information header from global quota file */ int ocfs2_global_read_info(struct super_block *sb, int type) { - struct inode *gqinode = NULL; unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, GROUP_QUOTA_SYSTEM_INODE }; struct ocfs2_global_disk_dqinfo dinfo; @@ -346,29 +345,31 @@ int ocfs2_global_read_info(struct super_block *sb, int type) u64 pcount; int status; + oinfo->dqi_gi.dqi_sb = sb; + oinfo->dqi_gi.dqi_type = type; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); + oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); + oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; + oinfo->dqi_gqi_bh = NULL; + oinfo->dqi_gqi_count = 0; + /* Read global header */ - gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + oinfo->dqi_gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], OCFS2_INVALID_SLOT); - if (!gqinode) { + if (!oinfo->dqi_gqinode) { mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", type); status = -EINVAL; goto out_err; } - oinfo->dqi_gi.dqi_sb = sb; - oinfo->dqi_gi.dqi_type = type; - oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); - oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; - oinfo->dqi_gqi_bh = NULL; - oinfo->dqi_gqi_count = 0; - oinfo->dqi_gqinode = gqinode; + status = ocfs2_lock_global_qf(oinfo, 0); if (status < 0) { mlog_errno(status); goto out_err; } - status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk, + status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk, &pcount, NULL); if (status < 0) goto out_unlock; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 0e4b16d4c037..b1a8b046f4c2 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -702,8 +702,6 @@ static int 
ocfs2_local_read_info(struct super_block *sb, int type) info->dqi_priv = oinfo; oinfo->dqi_type = type; INIT_LIST_HEAD(&oinfo->dqi_chunk); - oinfo->dqi_gqinode = NULL; - ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); oinfo->dqi_rec = NULL; oinfo->dqi_lqi_bh = NULL; oinfo->dqi_libh = NULL; diff --git a/fs/open.c b/fs/open.c index 9ff2f621b760..1315253e0247 100644 --- a/fs/open.c +++ b/fs/open.c @@ -835,7 +835,6 @@ static int do_dentry_open(struct file *f, likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; - f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); diff --git a/fs/pipe.c b/fs/pipe.c index 2667db9506e2..9648ac15164a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -607,7 +607,7 @@ out: static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; - int count, head, tail, mask; + unsigned int count, head, tail, mask; switch (cmd) { case FIONREAD: @@ -804,7 +804,7 @@ struct pipe_inode_info *alloc_pipe_info(void) if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; - pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), + pipe->bufs = kvcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { @@ -829,7 +829,7 @@ out_free_uid: void free_pipe_info(struct pipe_inode_info *pipe) { - int i; + unsigned int i; #ifdef CONFIG_WATCH_QUEUE if (pipe->watch_queue) @@ -849,7 +849,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) #endif if (pipe->tmp_page) __free_page(pipe->tmp_page); - kfree(pipe->bufs); + kvfree(pipe->bufs); kfree(pipe); } @@ -1264,8 +1264,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) if (nr_slots < n) return -EBUSY; - bufs = kcalloc(nr_slots, sizeof(*bufs), - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + bufs = kvcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT); if (unlikely(!bufs)) return -ENOMEM; @@ -1292,7 +1291,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) head = n; tail = 0; - kfree(pipe->bufs); + kvfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; if (pipe->max_usage > nr_slots) diff --git a/fs/proc/array.c b/fs/proc/array.c index fd8b0c12b2cb..eb815759842c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -88,7 +88,6 @@ #include <linux/pid_namespace.h> #include <linux/prctl.h> #include <linux/ptrace.h> -#include <linux/tracehook.h> #include <linux/string_helpers.h> #include <linux/user_namespace.h> #include <linux/fs_struct.h> diff --git a/fs/proc/base.c b/fs/proc/base.c index d654ce7150fd..c1031843cc6a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -74,7 +74,6 @@ #include <linux/mount.h> #include <linux/security.h> #include <linux/ptrace.h> -#include <linux/tracehook.h> #include <linux/printk.h> #include <linux/cache.h> #include <linux/cgroup.h> @@ -1764,25 +1763,25 @@ out: static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) { - char *tmp = (char *)__get_free_page(GFP_KERNEL); + char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; if (!tmp) return -ENOMEM; - pathname = d_path(path, tmp, PAGE_SIZE); + pathname = d_path(path, tmp, PATH_MAX); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; - len = tmp + PAGE_SIZE - 1 - pathname; + len = tmp + PATH_MAX - 1 - pathname; if (len > buflen) len = buflen; if (copy_to_user(buffer, pathname, len)) len = -EFAULT; out: - 
free_page((unsigned long)tmp); + kfree(tmp); return len; } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 702754dd1daf..6f1b8ddc6f7a 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -62,7 +62,8 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -static DECLARE_RWSEM(vmcore_cb_rwsem); +static DEFINE_SPINLOCK(vmcore_cb_lock); +DEFINE_STATIC_SRCU(vmcore_cb_srcu); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); /* Whether the vmcore has been opened once. */ @@ -70,8 +71,8 @@ static bool vmcore_opened; void register_vmcore_cb(struct vmcore_cb *cb) { - down_write(&vmcore_cb_rwsem); INIT_LIST_HEAD(&cb->next); + spin_lock(&vmcore_cb_lock); list_add_tail(&cb->next, &vmcore_cb_list); /* * Registering a vmcore callback after the vmcore was opened is @@ -79,14 +80,14 @@ void register_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback registration\n"); - up_write(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); } EXPORT_SYMBOL_GPL(register_vmcore_cb); void unregister_vmcore_cb(struct vmcore_cb *cb) { - down_write(&vmcore_cb_rwsem); - list_del(&cb->next); + spin_lock(&vmcore_cb_lock); + list_del_rcu(&cb->next); /* * Unregistering a vmcore callback after the vmcore was opened is * very unusual (e.g., forced driver removal), but we cannot stop @@ -94,7 +95,9 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - up_write(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); + + synchronize_srcu(&vmcore_cb_srcu); } EXPORT_SYMBOL_GPL(unregister_vmcore_cb); @@ -103,9 +106,8 @@ static bool pfn_is_ram(unsigned long pfn) struct vmcore_cb *cb; bool ret = true; - lockdep_assert_held_read(&vmcore_cb_rwsem); - - list_for_each_entry(cb, &vmcore_cb_list, next) { + list_for_each_entry_srcu(cb, &vmcore_cb_list, next, + srcu_read_lock_held(&vmcore_cb_srcu)) { if (unlikely(!cb->pfn_is_ram)) continue; ret = cb->pfn_is_ram(cb, pfn); @@ -118,9 +120,9 @@ static bool pfn_is_ram(unsigned long pfn) static int open_vmcore(struct inode *inode, struct file *file) { - down_read(&vmcore_cb_rwsem); + spin_lock(&vmcore_cb_lock); vmcore_opened = true; - up_read(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); return 0; } @@ -133,6 +135,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, unsigned long pfn, offset; size_t nr_bytes; ssize_t read = 0, tmp; + int idx; if (!count) return 0; @@ -140,7 +143,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); - down_read(&vmcore_cb_rwsem); + idx = srcu_read_lock(&vmcore_cb_srcu); do { if (count > (PAGE_SIZE - offset)) nr_bytes = PAGE_SIZE - offset; @@ -165,7 +168,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset, userbuf); } if (tmp < 0) { - up_read(&vmcore_cb_rwsem); + srcu_read_unlock(&vmcore_cb_srcu, idx); return tmp; } @@ -176,8 +179,8 @@ ssize_t read_from_oldmem(char *buf, size_t count, ++pfn; offset = 0; } while (count); + srcu_read_unlock(&vmcore_cb_srcu, idx); - up_read(&vmcore_cb_rwsem); return read; } @@ -477,7 +480,7 @@ static const struct vm_operations_struct vmcore_mmap_ops = { /** * vmcore_alloc_buf - allocate buffer in vmalloc memory - * @sizez: size of buffer + * @size: size of buffer * * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap * the buffer to user-space by means of remap_vmalloc_range(). 
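[Editor's note -- illustration only, not part of the patch above.] The /proc/vmcore hunks convert the callback list from an rwsem to the usual SRCU pattern: readers take srcu_read_lock(), walk the list with list_for_each_entry_srcu(), then unlock; writers mutate the list under a plain spinlock and call synchronize_srcu() before the removed entry may go away. A generic, self-contained sketch of that pattern is below; my_cb, my_list, my_lock, my_srcu, my_walk and my_unregister are hypothetical names, not symbols from the diff:

	#include <linux/list.h>
	#include <linux/spinlock.h>
	#include <linux/srcu.h>

	struct my_cb { struct list_head next; };

	static DEFINE_SPINLOCK(my_lock);
	DEFINE_STATIC_SRCU(my_srcu);
	static LIST_HEAD(my_list);

	static void my_walk(void)
	{
		struct my_cb *cb;
		int idx;

		idx = srcu_read_lock(&my_srcu);		/* reader side */
		list_for_each_entry_srcu(cb, &my_list, next,
					 srcu_read_lock_held(&my_srcu)) {
			/* use cb; entries stay valid until readers drain */
		}
		srcu_read_unlock(&my_srcu, idx);
	}

	static void my_unregister(struct my_cb *cb)
	{
		spin_lock(&my_lock);			/* writer side */
		list_del_rcu(&cb->next);
		spin_unlock(&my_lock);
		synchronize_srcu(&my_srcu);		/* wait out readers */
	}

The srcu_read_lock()/srcu_read_unlock() pair is sleepable, which is what lets pfn_is_ram() and read_from_oldmem() above keep their existing blocking behaviour while dropping the rwsem.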
@@ -568,18 +571,18 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { - int ret; + int ret, idx; /* - * Check if oldmem_pfn_is_ram was registered to avoid - * looping over all pages without a reason. + * Check if a callback was registered to avoid looping over all + * pages without a reason. */ - down_read(&vmcore_cb_rwsem); + idx = srcu_read_lock(&vmcore_cb_srcu); if (!list_empty(&vmcore_cb_list)) ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); - up_read(&vmcore_cb_rwsem); + srcu_read_unlock(&vmcore_cb_srcu, idx); return ret; } diff --git a/fs/read_write.c b/fs/read_write.c index dc5000173b80..e643aec2b0ef 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1630,7 +1630,6 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) if (!*count) return 0; - /* FIXME: this is for backwards compatibility with 2.4 */ if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 8fd54ed8f844..33c8b0dd07a2 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,10 +1,14 @@ # SPDX-License-Identifier: GPL-2.0-only config REISERFS_FS - tristate "Reiserfs support" + tristate "Reiserfs support (deprecated)" select CRC32 help - Stores not just filenames but the files themselves in a balanced - tree. Uses journalling. + Reiserfs is deprecated and scheduled to be removed from the kernel + in 2025. If you are still using it, please migrate to another + filesystem or tell us your usecase for reiserfs. + + Reiserfs stores not just filenames but the files themselves in a + balanced tree. Uses journalling. Balanced trees are more efficient than traditional file system architectural foundations. diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index e4221fa85ea2..36c59b25486c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2763,13 +2763,6 @@ static int reiserfs_write_begin(struct file *file, int old_ref = 0; inode = mapping->host; - *fsdata = NULL; - if (flags & AOP_FLAG_CONT_EXPAND && - (pos & (inode->i_sb->s_blocksize - 1)) == 0) { - pos ++; - *fsdata = (void *)(unsigned long)flags; - } - index = pos >> PAGE_SHIFT; page = grab_cache_page_write_begin(mapping, index, flags); if (!page) @@ -2896,9 +2889,6 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, unsigned start; bool locked = false; - if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) - pos ++; - reiserfs_wait_on_write_block(inode->i_sb); if (reiserfs_transaction_running(inode->i_sb)) th = current->journal_info; @@ -3316,7 +3306,11 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, /* fill in hole pointers in the expanding truncate case. 
*/ if (attr->ia_size > inode->i_size) { - error = generic_cont_expand_simple(inode, attr->ia_size); + loff_t pos = attr->ia_size; + + if ((pos & (inode->i_sb->s_blocksize - 1)) == 0) + pos++; + error = generic_cont_expand_simple(inode, pos); if (REISERFS_I(inode)->i_prealloc_count > 0) { int err; struct reiserfs_transaction_handle th; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 756c4916c7ae..cfb7c44c7366 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1652,6 +1652,8 @@ static int read_super_block(struct super_block *s, int offset) return 1; } + reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and " + "scheduled to be removed from the kernel in 2025"); SB_BUFFER_WITH_SB(s) = bh; SB_DISK_SUPER_BLOCK(s) = rs; diff --git a/fs/seq_file.c b/fs/seq_file.c index f8e1f4ee87ff..7ab8a58c29b6 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -554,9 +554,9 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) } EXPORT_SYMBOL(seq_dentry); -static void *single_start(struct seq_file *p, loff_t *pos) +void *single_start(struct seq_file *p, loff_t *pos) { - return NULL + (*pos == 0); + return *pos ? NULL : SEQ_START_TOKEN; } static void *single_next(struct seq_file *p, void *v, loff_t *pos) diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 38b8fc514860..0507aecfc669 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -61,6 +61,40 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 /* + * Size of the session key (crypto key encrypted with the password + */ +#define SMB2_NTLMV2_SESSKEY_SIZE 16 +#define SMB2_SIGNATURE_SIZE 16 +#define SMB2_HMACSHA256_SIZE 32 +#define SMB2_CMACAES_SIZE 16 +#define SMB3_GCM128_CRYPTKEY_SIZE 16 +#define SMB3_GCM256_CRYPTKEY_SIZE 32 + +/* + * Size of the smb3 encryption/decryption keys + * This size is big enough to store any cipher key types. + */ +#define SMB3_ENC_DEC_KEY_SIZE 32 + +/* + * Size of the smb3 signing key + */ +#define SMB3_SIGN_KEY_SIZE 16 + +#define CIFS_CLIENT_CHALLENGE_SIZE 8 + +/* Maximum buffer size value we can send with 1 credit */ +#define SMB2_MAX_BUFFER_SIZE 65536 + +/* + * The default wsize is 1M for SMB2 (and for some CIFS cases). + * find_get_pages seems to return a maximum of 256 + * pages in a single call. With PAGE_SIZE == 4k, this means we can + * fill a single wsize request with a single call. + */ +#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) + +/* * SMB2 Header Definition * * "MBZ" : Must be Zero @@ -88,6 +122,15 @@ #define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) #define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */ +/* + * Definitions for SMB2 Protocol Data Units (network frames) + * + * See MS-SMB2.PDF specification for protocol details. + * The Naming convention is the lower case version of the SMB2 + * command code name for the struct. Note that structures must be packed. 
+ * + */ + /* See MS-SMB2 section 2.2.1 */ struct smb2_hdr { __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ @@ -115,6 +158,18 @@ struct smb2_pdu { __le16 StructureSize2; /* size of wct area (varies, request specific) */ } __packed; +#define SMB2_ERROR_STRUCTURE_SIZE2 9 +#define SMB2_ERROR_STRUCTURE_SIZE2_LE cpu_to_le16(SMB2_ERROR_STRUCTURE_SIZE2) + +struct smb2_err_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; + __u8 ErrorContextCount; + __u8 Reserved; + __le32 ByteCount; /* even if zero, at least one byte follows */ + __u8 ErrorData[1]; /* variable length */ +} __packed; + #define SMB3_AES_CCM_NONCE 11 #define SMB3_AES_GCM_NONCE 12 @@ -608,8 +663,8 @@ struct smb2_close_req { __le16 StructureSize; /* Must be 24 */ __le16 Flags; __le32 Reserved; - __le64 PersistentFileId; /* opaque endianness */ - __le64 VolatileFileId; /* opaque endianness */ + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ } __packed; /* @@ -653,8 +708,8 @@ struct smb2_read_req { __u8 Flags; /* MBZ unless SMB3.02 or later */ __le32 Length; __le64 Offset; - __le64 PersistentFileId; - __le64 VolatileFileId; + __u64 PersistentFileId; + __u64 VolatileFileId; __le32 MinimumCount; __le32 Channel; /* MBZ except for SMB3 or later */ __le32 RemainingBytes; @@ -692,8 +747,8 @@ struct smb2_write_req { __le16 DataOffset; /* offset from start of SMB2 header to write data */ __le32 Length; __le64 Offset; - __le64 PersistentFileId; /* opaque endianness */ - __le64 VolatileFileId; /* opaque endianness */ + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ __le32 Channel; /* MBZ unless SMB3.02 or later */ __le32 RemainingBytes; __le16 WriteChannelInfoOffset; @@ -722,8 +777,8 @@ struct smb2_flush_req { __le16 StructureSize; /* Must be 24 */ __le16 Reserved1; __le32 Reserved2; - __le64 PersistentFileId; - __le64 VolatileFileId; + __u64 PersistentFileId; + __u64 VolatileFileId; } __packed; struct smb2_flush_rsp { @@ -732,6 +787,123 @@ struct smb2_flush_rsp { __le16 Reserved; } __packed; +#define SMB2_LOCKFLAG_SHARED 0x0001 +#define SMB2_LOCKFLAG_EXCLUSIVE 0x0002 +#define SMB2_LOCKFLAG_UNLOCK 0x0004 +#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010 +#define SMB2_LOCKFLAG_MASK 0x0007 + +struct smb2_lock_element { + __le64 Offset; + __le64 Length; + __le32 Flags; + __le32 Reserved; +} __packed; + +struct smb2_lock_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 48 */ + __le16 LockCount; + /* + * The least significant four bits are the index, the other 28 bits are + * the lock sequence number (0 to 64). 
See MS-SMB2 2.2.26 + */ + __le32 LockSequenceNumber; + __u64 PersistentFileId; + __u64 VolatileFileId; + /* Followed by at least one */ + struct smb2_lock_element locks[1]; +} __packed; + +struct smb2_lock_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_echo_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __u16 Reserved; +} __packed; + +struct smb2_echo_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __u16 Reserved; +} __packed; + +/* + * Valid FileInformation classes for query directory + * + * Note that these are a subset of the (file) QUERY_INFO levels defined + * later in this file (but since QUERY_DIRECTORY uses equivalent numbers + * we do not redefine them here) + * + * FileDirectoryInfomation 0x01 + * FileFullDirectoryInformation 0x02 + * FileIdFullDirectoryInformation 0x26 + * FileBothDirectoryInformation 0x03 + * FileIdBothDirectoryInformation 0x25 + * FileNamesInformation 0x0C + * FileIdExtdDirectoryInformation 0x3C + */ + +/* search (query_directory) Flags field */ +#define SMB2_RESTART_SCANS 0x01 +#define SMB2_RETURN_SINGLE_ENTRY 0x02 +#define SMB2_INDEX_SPECIFIED 0x04 +#define SMB2_REOPEN 0x10 + +struct smb2_query_directory_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 33 */ + __u8 FileInformationClass; + __u8 Flags; + __le32 FileIndex; + __u64 PersistentFileId; + __u64 VolatileFileId; + __le16 FileNameOffset; + __le16 FileNameLength; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +struct smb2_query_directory_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +/* + * Maximum number of iovs we need for a set-info request. 
+ * The largest one is rename/hardlink + * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info + * [1] : path + * [2] : compound padding + */ +#define SMB2_SET_INFO_IOV_SIZE 3 + +struct smb2_set_info_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 33 */ + __u8 InfoType; + __u8 FileInfoClass; + __le32 BufferLength; + __le16 BufferOffset; + __u16 Reserved; + __le32 AdditionalInformation; + __u64 PersistentFileId; + __u64 VolatileFileId; + __u8 Buffer[1]; +} __packed; + +struct smb2_set_info_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 2 */ +} __packed; /* * SMB2_NOTIFY See MS-SMB2 section 2.2.35 @@ -769,8 +941,8 @@ struct smb2_change_notify_req { __le16 StructureSize; __le16 Flags; __le32 OutputBufferLength; - __le64 PersistentFileId; /* opaque endianness */ - __le64 VolatileFileId; /* opaque endianness */ + __u64 PersistentFileId; /* opaque endianness */ + __u64 VolatileFileId; /* opaque endianness */ __le32 CompletionFilter; __u32 Reserved; } __packed; @@ -978,12 +1150,455 @@ struct smb2_create_rsp { __le64 EndofFile; __le32 FileAttributes; __le32 Reserved2; - __le64 PersistentFileId; - __le64 VolatileFileId; + __u64 PersistentFileId; + __u64 VolatileFileId; __le32 CreateContextsOffset; __le32 CreateContextsLength; __u8 Buffer[1]; } __packed; +struct create_posix { + struct create_context ccontext; + __u8 Name[16]; + __le32 Mode; + __u32 Reserved; +} __packed; + +#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00) +#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01) +#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02) +#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04) + +#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02) + +#define SMB2_LEASE_KEY_SIZE 16 + +struct lease_context { + __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; +} __packed; + +struct lease_context_v2 { + __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; + __u8 ParentLeaseKey[SMB2_LEASE_KEY_SIZE]; + __le16 Epoch; + __le16 Reserved; +} __packed; + +struct create_lease { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context lcontext; +} __packed; + +struct create_lease_v2 { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context_v2 lcontext; + __u8 Pad[4]; +} __packed; + +/* See MS-SMB2 2.2.31 and 2.2.32 */ +struct smb2_ioctl_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __le16 Reserved; /* offset from start of SMB2 header to write data */ + __le32 CtlCode; + __u64 PersistentFileId; + __u64 VolatileFileId; + __le32 InputOffset; /* Reserved MBZ */ + __le32 InputCount; + __le32 MaxInputResponse; + __le32 OutputOffset; + __le32 OutputCount; + __le32 MaxOutputResponse; + __le32 Flags; + __le32 Reserved2; + __u8 Buffer[]; +} __packed; + +struct smb2_ioctl_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __le16 Reserved; + __le32 CtlCode; + __u64 PersistentFileId; + __u64 VolatileFileId; + __le32 InputOffset; /* Reserved MBZ */ + __le32 InputCount; + __le32 OutputOffset; + __le32 OutputCount; + __le32 Flags; + __le32 Reserved2; + __u8 Buffer[]; +} __packed; + +/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */ +struct file_zero_data_information { + __le64 FileOffset; + __le64 BeyondFinalZero; +} __packed; + +/* Reparse structures - see MS-FSCC 2.1.2 */ + +/* struct fsctl_reparse_info_req is empty, only response structs (see below) */ +struct reparse_data_buffer { + 
__le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __u8 DataBuffer[]; /* Variable Length */ +} __packed; + +struct reparse_guid_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __u8 ReparseGuid[16]; + __u8 DataBuffer[]; /* Variable Length */ +} __packed; + +struct reparse_mount_point_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __u8 PathBuffer[]; /* Variable Length */ +} __packed; + +#define SYMLINK_FLAG_RELATIVE 0x00000001 + +struct reparse_symlink_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __le32 Flags; + __u8 PathBuffer[]; /* Variable Length */ +} __packed; + +/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */ + +struct validate_negotiate_info_req { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 DialectCount; + __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */ +} __packed; + +struct validate_negotiate_info_rsp { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 Dialect; /* Dialect in use for the connection */ +} __packed; + +struct duplicate_extents_to_file { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ + __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ +} __packed; + +/* Possible InfoType values */ +#define SMB2_O_INFO_FILE 0x01 +#define SMB2_O_INFO_FILESYSTEM 0x02 +#define SMB2_O_INFO_SECURITY 0x03 +#define SMB2_O_INFO_QUOTA 0x04 + +/* SMB2 Query Info see MS-SMB2 (2.2.37) or MS-DTYP */ + +/* List of QUERY INFO levels (those also valid for QUERY_DIR are noted below */ +#define FILE_DIRECTORY_INFORMATION 1 /* also for QUERY_DIR */ +#define FILE_FULL_DIRECTORY_INFORMATION 2 /* also for QUERY_DIR */ +#define FILE_BOTH_DIRECTORY_INFORMATION 3 /* also for QUERY_DIR */ +#define FILE_BASIC_INFORMATION 4 +#define FILE_STANDARD_INFORMATION 5 +#define FILE_INTERNAL_INFORMATION 6 +#define FILE_EA_INFORMATION 7 +#define FILE_ACCESS_INFORMATION 8 +#define FILE_NAME_INFORMATION 9 +#define FILE_RENAME_INFORMATION 10 +#define FILE_LINK_INFORMATION 11 +#define FILE_NAMES_INFORMATION 12 /* also for QUERY_DIR */ +#define FILE_DISPOSITION_INFORMATION 13 +#define FILE_POSITION_INFORMATION 14 +#define FILE_FULL_EA_INFORMATION 15 +#define FILE_MODE_INFORMATION 16 +#define FILE_ALIGNMENT_INFORMATION 17 +#define FILE_ALL_INFORMATION 18 +#define FILE_ALLOCATION_INFORMATION 19 +#define FILE_END_OF_FILE_INFORMATION 20 +#define FILE_ALTERNATE_NAME_INFORMATION 21 +#define FILE_STREAM_INFORMATION 22 +#define FILE_PIPE_INFORMATION 23 +#define FILE_PIPE_LOCAL_INFORMATION 24 +#define FILE_PIPE_REMOTE_INFORMATION 25 +#define FILE_MAILSLOT_QUERY_INFORMATION 26 +#define FILE_MAILSLOT_SET_INFORMATION 27 +#define FILE_COMPRESSION_INFORMATION 28 +#define FILE_OBJECT_ID_INFORMATION 29 +/* Number 30 not defined in documents */ +#define FILE_MOVE_CLUSTER_INFORMATION 31 +#define FILE_QUOTA_INFORMATION 32 +#define FILE_REPARSE_POINT_INFORMATION 33 +#define FILE_NETWORK_OPEN_INFORMATION 34 +#define FILE_ATTRIBUTE_TAG_INFORMATION 35 +#define FILE_TRACKING_INFORMATION 36 +#define FILEID_BOTH_DIRECTORY_INFORMATION 37 /* also for QUERY_DIR */ +#define 
FILEID_FULL_DIRECTORY_INFORMATION 38 /* also for QUERY_DIR */ +#define FILE_VALID_DATA_LENGTH_INFORMATION 39 +#define FILE_SHORT_NAME_INFORMATION 40 +#define FILE_SFIO_RESERVE_INFORMATION 44 +#define FILE_SFIO_VOLUME_INFORMATION 45 +#define FILE_HARD_LINK_INFORMATION 46 +#define FILE_NORMALIZED_NAME_INFORMATION 48 +#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 +#define FILE_STANDARD_LINK_INFORMATION 54 +#define FILE_ID_INFORMATION 59 +#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 /* also for QUERY_DIR */ +/* Used for Query Info and Find File POSIX Info for SMB3.1.1 and SMB1 */ +#define SMB_FIND_FILE_POSIX_INFO 0x064 + +/* Security info type additionalinfo flags. */ +#define OWNER_SECINFO 0x00000001 +#define GROUP_SECINFO 0x00000002 +#define DACL_SECINFO 0x00000004 +#define SACL_SECINFO 0x00000008 +#define LABEL_SECINFO 0x00000010 +#define ATTRIBUTE_SECINFO 0x00000020 +#define SCOPE_SECINFO 0x00000040 +#define BACKUP_SECINFO 0x00010000 +#define UNPROTECTED_SACL_SECINFO 0x10000000 +#define UNPROTECTED_DACL_SECINFO 0x20000000 +#define PROTECTED_SACL_SECINFO 0x40000000 +#define PROTECTED_DACL_SECINFO 0x80000000 + +/* Flags used for FileFullEAinfo */ +#define SL_RESTART_SCAN 0x00000001 +#define SL_RETURN_SINGLE_ENTRY 0x00000002 +#define SL_INDEX_SPECIFIED 0x00000004 + +struct smb2_query_info_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 41 */ + __u8 InfoType; + __u8 FileInfoClass; + __le32 OutputBufferLength; + __le16 InputBufferOffset; + __u16 Reserved; + __le32 InputBufferLength; + __le32 AdditionalInformation; + __le32 Flags; + __u64 PersistentFileId; + __u64 VolatileFileId; + __u8 Buffer[1]; +} __packed; + +struct smb2_query_info_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +/* + * PDU query infolevel structure definitions + */ + +struct file_allocated_range_buffer { + __le64 file_offset; + __le64 length; +} __packed; + +struct smb2_file_internal_info { + __le64 IndexNumber; +} __packed; /* level 6 Query */ + +struct smb2_file_rename_info { /* encoding of request for level 10 */ + __u8 ReplaceIfExists; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + char FileName[]; /* New name to be assigned */ + /* padding - overall struct size must be >= 24 so filename + pad >= 6 */ +} __packed; /* level 10 Set */ + +struct smb2_file_link_info { /* encoding of request for level 11 */ + __u8 ReplaceIfExists; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + char FileName[]; /* Name to be assigned to new link */ +} __packed; /* level 11 Set */ + +/* + * This level 18, although with struct with same name is different from cifs + * level 0x107. Level 0x107 has an extra u64 between AccessFlags and + * CurrentByteOffset. 
+ */ +struct smb2_file_all_info { /* data block encoding of response to level 18 */ + __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */ + __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */ + __le64 IndexNumber; + __le32 EASize; + __le32 AccessFlags; + __le64 CurrentByteOffset; + __le32 Mode; + __le32 AlignmentRequirement; + __le32 FileNameLength; + char FileName[1]; +} __packed; /* level 18 Query */ + +struct smb2_file_eof_info { /* encoding of request for level 10 */ + __le64 EndOfFile; /* new end of file value */ +} __packed; /* level 20 Set */ + +/* Level 100 query info */ +struct smb311_posix_qinfo { + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 DosAttributes; + __le64 Inode; + __le32 DeviceId; + __le32 Zero; + /* beginning of POSIX Create Context Response */ + __le32 HardLinks; + __le32 ReparseTag; + __le32 Mode; + u8 Sids[]; + /* + * var sized owner SID + * var sized group SID + * le32 filenamelength + * u8 filename[] + */ +} __packed; + +/* File System Information Classes */ +#define FS_VOLUME_INFORMATION 1 /* Query */ +#define FS_LABEL_INFORMATION 2 /* Set */ +#define FS_SIZE_INFORMATION 3 /* Query */ +#define FS_DEVICE_INFORMATION 4 /* Query */ +#define FS_ATTRIBUTE_INFORMATION 5 /* Query */ +#define FS_CONTROL_INFORMATION 6 /* Query, Set */ +#define FS_FULL_SIZE_INFORMATION 7 /* Query */ +#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ +#define FS_DRIVER_PATH_INFORMATION 9 /* Query */ +#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */ +#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. 
Query */ + +struct smb2_fs_full_size_info { + __le64 TotalAllocationUnits; + __le64 CallerAvailableAllocationUnits; + __le64 ActualAvailableAllocationUnits; + __le32 SectorsPerAllocationUnit; + __le32 BytesPerSector; +} __packed; + +#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001 +#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002 +#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004 +#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008 + +/* sector size info struct */ +struct smb3_fs_ss_info { + __le32 LogicalBytesPerSector; + __le32 PhysicalBytesPerSectorForAtomicity; + __le32 PhysicalBytesPerSectorForPerf; + __le32 FSEffPhysicalBytesPerSectorForAtomicity; + __le32 Flags; + __le32 ByteOffsetForSectorAlignment; + __le32 ByteOffsetForPartitionAlignment; +} __packed; + +/* File System Control Information */ +struct smb2_fs_control_info { + __le64 FreeSpaceStartFiltering; + __le64 FreeSpaceThreshold; + __le64 FreeSpaceStopFiltering; + __le64 DefaultQuotaThreshold; + __le64 DefaultQuotaLimit; + __le32 FileSystemControlFlags; + __le32 Padding; +} __packed; + +/* volume info struct - see MS-FSCC 2.5.9 */ +#define MAX_VOL_LABEL_LEN 32 +struct smb3_fs_vol_info { + __le64 VolumeCreationTime; + __u32 VolumeSerialNumber; + __le32 VolumeLabelLength; /* includes trailing null */ + __u8 SupportsObjects; /* True if eg like NTFS, supports objects */ + __u8 Reserved; + __u8 VolumeLabel[]; /* variable len */ +} __packed; + +/* See MS-SMB2 2.2.23 through 2.2.25 */ +struct smb2_oplock_break { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __u8 OplockLevel; + __u8 Reserved; + __le32 Reserved2; + __u64 PersistentFid; + __u64 VolatileFid; +} __packed; + +#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) + +struct smb2_lease_break { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 44 */ + __le16 Epoch; + __le32 Flags; + __u8 LeaseKey[16]; + __le32 CurrentLeaseState; + __le32 NewLeaseState; + __le32 BreakReason; + __le32 AccessMaskHint; + __le32 ShareMaskHint; +} __packed; + +struct smb2_lease_ack { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 36 */ + __le16 Reserved; + __le32 Flags; + __u8 LeaseKey[16]; + __le32 LeaseState; + __le64 LeaseDuration; +} __packed; +#define OP_BREAK_STRUCT_SIZE_20 24 +#define OP_BREAK_STRUCT_SIZE_21 36 #endif /* _COMMON_SMB2PDU_H */ diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index e747c135c1d1..98467bb76737 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -103,7 +103,7 @@ int __init sysfs_init(void) if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); - sysfs_root_kn = sysfs_root->kn; + sysfs_root_kn = kernfs_root_to_node(sysfs_root); err = register_filesystem(&sysfs_fs_type); if (err) { diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index dbe72f664abf..86151889548e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -349,20 +349,97 @@ out_budg: return err; } -static int do_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode, struct inode **whiteout) +static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err; + umode_t mode = S_IFCHR | WHITEOUT_MODE; + struct inode *inode; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm; + + /* + * Create an inode('nlink = 1') for whiteout without updating journal, + * let ubifs_jnl_rename() store it on flash to complete rename whiteout + * atomically. 
+ */ + + dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dentry, mode, dir->i_ino); + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + return ERR_PTR(err); + + inode = ubifs_new_inode(c, dir, mode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_free; + } + + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); + + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + + /* The dir size is updated by do_rename. */ + insert_inode_hash(inode); + + return inode; + +out_inode: + make_bad_inode(inode); + iput(inode); +out_free: + fscrypt_free_filename(&nm); + ubifs_err(c, "cannot create whiteout file, error %d", err); + return ERR_PTR(err); +} + +/** + * lock_2_inodes - a wrapper for locking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + * + * We do not implement any tricks to guarantee strict lock ordering, because + * VFS has already done it for us on the @i_mutex. So this is just a simple + * wrapper function. + */ +static void lock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); +} + +/** + * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. + * @inode1: first inode + * @inode2: second inode + */ +static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) +{ + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + +static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1}; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; - struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); + struct ubifs_inode *ui; int err, instantiated = 0; struct fscrypt_name nm; /* - * Budget request settings: new dirty inode, new direntry, - * budget for dirtied inode will be released via writeback. + * Budget request settings: new inode, new direntry, changing the + * parent directory inode. + * Allocate budget separately for new dirtied inode, the budget will + * be released via writeback. 
*/ dbg_gen("dent '%pd', mode %#hx in dir ino %lu", @@ -392,42 +469,30 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, } ui = ubifs_inode(inode); - if (whiteout) { - init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); - ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations); - } - err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) goto out_inode; mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); - - if (whiteout) { - mark_inode_dirty(inode); - drop_nlink(inode); - *whiteout = inode; - } else { - d_tmpfile(dentry, inode); - } + d_tmpfile(dentry, inode); ubifs_assert(c, ui->dirty); instantiated = 1; mutex_unlock(&ui->ui_mutex); - mutex_lock(&dir_ui->ui_mutex); + lock_2_inodes(dir, inode); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); return 0; out_cancel: - mutex_unlock(&dir_ui->ui_mutex); + unlock_2_inodes(dir, inode); out_inode: make_bad_inode(inode); if (!instantiated) @@ -441,12 +506,6 @@ out_budg: return err; } -static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) -{ - return do_tmpfile(dir, dentry, mode, NULL); -} - /** * vfs_dent_type - get VFS directory entry type. * @type: UBIFS directory entry type @@ -660,32 +719,6 @@ static int ubifs_dir_release(struct inode *dir, struct file *file) return 0; } -/** - * lock_2_inodes - a wrapper for locking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - * - * We do not implement any tricks to guarantee strict lock ordering, because - * VFS has already done it for us on the @i_mutex. So this is just a simple - * wrapper function. - */ -static void lock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); - mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); -} - -/** - * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes. - * @inode1: first inode - * @inode2: second inode - */ -static void unlock_2_inodes(struct inode *inode1, struct inode *inode2) -{ - mutex_unlock(&ubifs_inode(inode2)->ui_mutex); - mutex_unlock(&ubifs_inode(inode1)->ui_mutex); -} - static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { @@ -949,7 +982,8 @@ static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, sz_change; - struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, + .dirtied_ino = 1}; struct fscrypt_name nm; /* @@ -1264,17 +1298,19 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct ubifs_budget_req wht_req; struct timespec64 time; unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; /* - * Budget request settings: deletion direntry, new direntry, removing - * the old inode, and changing old and new parent directory inodes. + * Budget request settings: + * req: deletion direntry, new direntry, removing the old inode, + * and changing old and new parent directory inodes. + * + * wht_req: new whiteout inode for RENAME_WHITEOUT. 
* - * However, this operation also marks the target inode as dirty and - * does not write it, so we allocate budget for the target inode - * separately. + * ino_req: marks the target inode as dirty and does not write it. */ dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x", @@ -1331,20 +1367,44 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_release; } - err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout); - if (err) { + /* + * The whiteout inode without dentry is pinned in memory, + * umount won't happen during rename process because we + * got parent dentry. + */ + whiteout = create_whiteout(old_dir, old_dentry); + if (IS_ERR(whiteout)) { + err = PTR_ERR(whiteout); kfree(dev); goto out_release; } - spin_lock(&whiteout->i_lock); - whiteout->i_state |= I_LINKABLE; - spin_unlock(&whiteout->i_lock); - whiteout_ui = ubifs_inode(whiteout); whiteout_ui->data = dev; whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0)); ubifs_assert(c, !whiteout_ui->dirty); + + memset(&wht_req, 0, sizeof(struct ubifs_budget_req)); + wht_req.new_ino = 1; + wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8); + /* + * To avoid deadlock between space budget (holds ui_mutex and + * waits wb work) and writeback work(waits ui_mutex), do space + * budget before ubifs inodes locked. + */ + err = ubifs_budget_space(c, &wht_req); + if (err) { + /* + * Whiteout inode can not be written on flash by + * ubifs_jnl_write_inode(), because it's neither + * dirty nor zero-nlink. + */ + iput(whiteout); + goto out_release; + } + + /* Add the old_dentry size to the old_dir size. */ + old_sz -= CALC_DENT_SIZE(fname_len(&old_nm)); } lock_4_inodes(old_dir, new_dir, new_inode, whiteout); @@ -1416,29 +1476,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); if (unlink && IS_SYNC(new_inode)) sync = 1; - } - - if (whiteout) { - struct ubifs_budget_req wht_req = { .dirtied_ino = 1, - .dirtied_ino_d = \ - ALIGN(ubifs_inode(whiteout)->data_len, 8) }; - - err = ubifs_budget_space(c, &wht_req); - if (err) { - kfree(whiteout_ui->data); - whiteout_ui->data_len = 0; - iput(whiteout); - goto out_release; - } - - inc_nlink(whiteout); - mark_inode_dirty(whiteout); - - spin_lock(&whiteout->i_lock); - whiteout->i_state &= ~I_LINKABLE; - spin_unlock(&whiteout->i_lock); - - iput(whiteout); + /* + * S_SYNC flag of whiteout inherits from the old_dir, and we + * have already checked the old dir inode. So there is no need + * to check whiteout. + */ } err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, @@ -1449,6 +1491,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &req); + if (whiteout) { + ubifs_release_budget(c, &wht_req); + iput(whiteout); + } + mutex_lock(&old_inode_ui->ui_mutex); release = old_inode_ui->dirty; mark_inode_dirty_sync(old_inode); @@ -1457,11 +1504,16 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (release) ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) - err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + /* + * Rename finished here. Although old inode cannot be updated + * on flash, old ctime is not a big problem, don't return err + * code to userspace. 
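The RENAME_WHITEOUT hunks above reorder the work so the whiteout inode is budgeted before any ui_mutex is taken: ubifs_budget_space() can wait for writeback, and writeback itself needs ui_mutex, so budgeting under the lock could deadlock. A condensed sketch of the resulting ordering, using the names from this diff with error handling trimmed:

	whiteout = create_whiteout(old_dir, old_dentry);  /* pinned in memory, no dentry */
	whiteout_ui->data = dev;
	whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0));

	memset(&wht_req, 0, sizeof(wht_req));
	wht_req.new_ino = 1;
	wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8);
	err = ubifs_budget_space(c, &wht_req);            /* no ui_mutex held yet */

	lock_4_inodes(old_dir, new_dir, new_inode, whiteout);
	err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir,
			       new_inode, &new_nm, whiteout, sync);
	unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);

	ubifs_release_budget(c, &wht_req);                /* after the journal update */
	iput(whiteout);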
+ */ + old_inode->i_sb->s_op->write_inode(old_inode, NULL); fscrypt_free_filename(&old_nm); fscrypt_free_filename(&new_nm); - return err; + return 0; out_cancel: if (unlink) { @@ -1482,11 +1534,11 @@ out_cancel: inc_nlink(old_dir); } } + unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); if (whiteout) { - drop_nlink(whiteout); + ubifs_release_budget(c, &wht_req); iput(whiteout); } - unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); out_release: ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 8a9ffc2d4167..0383fbdc95ff 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } if (!PagePrivate(page)) { - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } @@ -947,7 +947,7 @@ static int do_writepage(struct page *page, int len) release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); kunmap(page); @@ -1304,7 +1304,7 @@ static void ubifs_invalidate_folio(struct folio *folio, size_t offset, release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - folio_clear_private(folio); + folio_detach_private(folio); folio_clear_checked(folio); } @@ -1471,8 +1471,8 @@ static int ubifs_migrate_page(struct address_space *mapping, return rc; if (PagePrivate(page)) { - ClearPagePrivate(page); - SetPagePrivate(newpage); + detach_page_private(page); + attach_page_private(newpage, (void *)1); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -1496,7 +1496,7 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) return 0; ubifs_assert(c, PagePrivate(page)); ubifs_assert(c, 0); - ClearPagePrivate(page); + detach_page_private(page); ClearPageChecked(page); return 1; } @@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) else { if (!PageChecked(page)) ubifs_convert_page_budget(c); - SetPagePrivate(page); + attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); __set_page_dirty_nobuffers(page); } diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 789a7813f3fa..1607a3c76681 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -854,16 +854,42 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ n = aligned_len >> c->max_write_shift; if (n) { - n <<= c->max_write_shift; + int m = n - 1; + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, wbuf->offs); - err = ubifs_leb_write(c, wbuf->lnum, buf + written, - wbuf->offs, n); + + if (m) { + /* '(n-1)<<c->max_write_shift < len' is always true. */ + m <<= c->max_write_shift; + err = ubifs_leb_write(c, wbuf->lnum, buf + written, + wbuf->offs, m); + if (err) + goto out; + wbuf->offs += m; + aligned_len -= m; + len -= m; + written += m; + } + + /* + * The non-written len of buf may be less than 'n' because + * parameter 'len' is not 8 bytes aligned, so here we read + * min(len, n) bytes from buf. 
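The ubifs_wbuf_write_nolock() hunk around this point splits a multi-unit write: (n - 1) full max_write units go straight from the caller's buffer to the LEB, and the final unit is staged in wbuf->buf and padded when 'len' is not 8-byte aligned. A small user-space model of that arithmetic (split_write() is a hypothetical stand-in, not a kernel function):

	#include <stdio.h>

	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	/* Model of the max_write split; not the kernel function itself. */
	static void split_write(int len, int aligned_len, int max_write_shift)
	{
		int n = aligned_len >> max_write_shift;
		int m = n - 1;
		int direct = 0;

		if (m > 0) {
			direct = m << max_write_shift;	/* written straight from buf */
			len -= direct;
		}
		n = 1 << max_write_shift;		/* final unit, staged in wbuf->buf */
		printf("direct %d, staged %d, pad %d\n",
		       direct, MIN(len, n), n > len ? n - len : 0);
	}

	int main(void)
	{
		split_write(20, 24, 3);	/* 8-byte units: 16 direct, 4 staged, 4 pad */
		return 0;
	}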
+ */ + n = 1 << c->max_write_shift; + memcpy(wbuf->buf, buf + written, min(len, n)); + if (n > len) { + ubifs_assert(c, n - len < 8); + ubifs_pad(c, wbuf->buf + len, n - len); + } + + err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n); if (err) goto out; wbuf->offs += n; aligned_len -= n; - len -= n; + len -= min(len, n); written += n; } diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index c6a863487780..71bcebe45f9c 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -108,7 +108,7 @@ static int setflags(struct inode *inode, int flags) struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, - .dirtied_ino_d = ui->data_len }; + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; err = ubifs_budget_space(c, &req); if (err) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 8ea680dba61e..75dab0ae3939 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1207,9 +1207,9 @@ out_free: * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the re-name operation which may involve writing up - * to 4 inodes and 2 directory entries. It marks the written inodes as clean - * and returns zero on success. In case of failure, a negative error code is - * returned. + * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes) + * and 2 directory entries. It marks the written inodes as clean and returns + * zero on success. In case of failure, a negative error code is returned. */ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *old_inode, @@ -1222,14 +1222,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, void *p; union ubifs_key key; struct ubifs_dent_node *dent, *dent2; - int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0; + int err, dlen1, dlen2, ilen, wlen, lnum, offs, len, orphan_added = 0; int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); - struct ubifs_inode *new_ui; + struct ubifs_inode *new_ui, *whiteout_ui; u8 hash_old_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_dir[UBIFS_HASH_ARR_SZ]; u8 hash_new_inode[UBIFS_HASH_ARR_SZ]; + u8 hash_whiteout_inode[UBIFS_HASH_ARR_SZ]; u8 hash_dent1[UBIFS_HASH_ARR_SZ]; u8 hash_dent2[UBIFS_HASH_ARR_SZ]; @@ -1249,9 +1250,20 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, } else ilen = 0; + if (whiteout) { + whiteout_ui = ubifs_inode(whiteout); + ubifs_assert(c, mutex_is_locked(&whiteout_ui->ui_mutex)); + ubifs_assert(c, whiteout->i_nlink == 1); + ubifs_assert(c, !whiteout_ui->dirty); + wlen = UBIFS_INO_NODE_SZ; + wlen += whiteout_ui->data_len; + } else + wlen = 0; + aligned_dlen1 = ALIGN(dlen1, 8); aligned_dlen2 = ALIGN(dlen2, 8); - len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8); + len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + + ALIGN(wlen, 8) + ALIGN(plen, 8); if (move) len += plen; @@ -1313,6 +1325,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, p += ALIGN(ilen, 8); } + if (whiteout) { + pack_inode(c, p, whiteout, 0); + err = ubifs_node_calc_hash(c, p, hash_whiteout_inode); + if (err) + goto out_release; + + p += ALIGN(wlen, 8); + } + if (!move) { pack_inode(c, p, old_dir, 1); err = ubifs_node_calc_hash(c, p, hash_old_dir); @@ -1352,6 +1373,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (new_inode) 
ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, new_inode->i_ino); + if (whiteout) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + whiteout->i_ino); } release_head(c, BASEHD); @@ -1368,8 +1392,6 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm); if (err) goto out_ro; - - ubifs_delete_orphan(c, whiteout->i_ino); } else { err = ubifs_add_dirt(c, lnum, dlen2); if (err) @@ -1390,6 +1412,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, offs += ALIGN(ilen, 8); } + if (whiteout) { + ino_key_init(c, &key, whiteout->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, wlen, + hash_whiteout_inode); + if (err) + goto out_ro; + offs += ALIGN(wlen, 8); + } + ino_key_init(c, &key, old_dir->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir); if (err) @@ -1410,6 +1441,11 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, new_ui->synced_i_size = new_ui->ui_size; spin_unlock(&new_ui->ui_lock); } + /* + * No need to mark whiteout inode clean. + * Whiteout doesn't have non-zero size, no need to update + * synced_i_size for whiteout_ui. + */ mark_inode_clean(c, ubifs_inode(old_dir)); if (move) mark_inode_clean(c, ubifs_inode(new_dir)); diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c index 7acc5a74e5fa..06ad8fa1fcfb 100644 --- a/fs/ubifs/sysfs.c +++ b/fs/ubifs/sysfs.c @@ -42,6 +42,7 @@ static struct attribute *ubifs_attrs[] = { ATTR_LIST(errors_crc), NULL, }; +ATTRIBUTE_GROUPS(ubifs); static ssize_t ubifs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -74,7 +75,7 @@ static const struct sysfs_ops ubifs_attr_ops = { }; static struct kobj_type ubifs_sb_ktype = { - .default_attrs = ubifs_attrs, + .default_groups = ubifs_groups, .sysfs_ops = &ubifs_attr_ops, .release = ubifs_sb_release, }; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index f55828c0a300..008fa46ef61e 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -381,7 +381,7 @@ struct ubifs_gced_idx_leb { * @ui_mutex exists for two main reasons. At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS * operation. This way UBIFS makes sure the inode fields are consistent. For - * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * example, in 'ubifs_rename()' we change 4 inodes simultaneously, and * write-back must not write any of them before we have finished. * * The second reason is budgeting - UBIFS has to budget all operations. 
If an diff --git a/fs/udf/super.c b/fs/udf/super.c index 48871615e489..4042d9739fb7 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -2474,7 +2474,6 @@ static unsigned int udf_count_free_table(struct super_block *sb, unsigned int accum = 0; uint32_t elen; struct kernel_lb_addr eloc; - int8_t etype; struct extent_position epos; mutex_lock(&UDF_SB(sb)->s_alloc_mutex); @@ -2482,7 +2481,7 @@ static unsigned int udf_count_free_table(struct super_block *sb, epos.offset = sizeof(struct unallocSpaceEntry); epos.bh = NULL; - while ((etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) + while (udf_next_aext(table, &epos, &eloc, &elen, 1) != -1) accum += (elen >> table->i_sb->s_blocksize_bits); brelse(epos.bh); diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index 0cc87423de82..0e51c0025a16 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -33,7 +33,7 @@ $(obj)/utf8data.c: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE else $(obj)/utf8data.c: $(src)/utf8data.c_shipped FORCE - $(call if_changed,shipped) + $(call if_changed,copy) endif diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 0adb970f4e73..14e2fb49cff5 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Data verification functions, i.e. hooks for ->readpages() + * Data verification functions, i.e. hooks for ->readahead() * * Copyright 2019 Google LLC */ @@ -214,7 +214,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); * that fail verification are set to the Error state. Verification is skipped * for pages already in the Error state, e.g. due to fscrypt decryption failure. * - * This is a helper function for use by the ->readpages() method of filesystems + * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. Filesystems that * populate the page cache without issuing bios (e.g. non block-based * filesystems) must instead call fsverity_verify_page() directly on each page. diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 353e53b892e6..b52ed339727f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -82,6 +82,24 @@ xfs_prealloc_blocks( } /* + * The number of blocks per AG that we withhold from xfs_mod_fdblocks to + * guarantee that we can refill the AGFL prior to allocating space in a nearly + * full AG. Although the the space described by the free space btrees, the + * blocks used by the freesp btrees themselves, and the blocks owned by the + * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk + * free space in the AG drop so low that the free space btrees cannot refill an + * empty AGFL up to the minimum level. Rather than grind through empty AGs + * until the fs goes down, we subtract this many AG blocks from the incore + * fdblocks to ensure user allocation does not overcommit the space the + * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to + * withhold space from xfs_mod_fdblocks, so we do not account for that here. + */ +#define XFS_ALLOCBT_AGFL_RESERVE 4 + +/* + * Compute the number of blocks that we set aside to guarantee the ability to + * refill the AGFL and handle a full bmap btree split. 
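As a worked instance of the set-aside described above (and implemented by xfs_alloc_set_aside() in the hunk that follows), each AG withholds XFS_ALLOCBT_AGFL_RESERVE blocks for AGFL refills plus 4 for a potential bmap btree split, so the filesystem-wide total scales with the AG count. A minimal user-space sketch:

	#include <stdio.h>

	#define XFS_ALLOCBT_AGFL_RESERVE	4

	/* Mirrors the xfs_alloc_set_aside() formula from the hunk below. */
	static unsigned int alloc_set_aside(unsigned int agcount)
	{
		return agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
	}

	int main(void)
	{
		printf("%u blocks withheld\n", alloc_set_aside(4));	/* 32 for 4 AGs */
		return 0;
	}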
+ * * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of * AGF buffer (PV 947395), we place constraints on the relationship among * actual allocations for data blocks, freelist blocks, and potential file data @@ -93,14 +111,14 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. * - * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a - * potential split of the file's bmap btree. + * For each AG, we need to reserve enough blocks to replenish a totally empty + * AGFL and 4 more to handle a potential split of the file's bmap btree. */ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); + return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4); } /* @@ -124,12 +142,12 @@ xfs_alloc_ag_max_usable( unsigned int blocks; blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ - blocks += XFS_ALLOC_AGFL_RESERVE; + blocks += XFS_ALLOCBT_AGFL_RESERVE; blocks += 3; /* AGF, AGI btree root blocks */ if (xfs_has_finobt(mp)) blocks++; /* finobt root block */ if (xfs_has_rmapbt(mp)) - blocks++; /* rmap root block */ + blocks++; /* rmap root block */ if (xfs_has_reflink(mp)) blocks++; /* refcount root block */ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 1c14a0b1abea..d4c057b764f9 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -88,7 +88,6 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ /* freespace limit calculations */ -#define XFS_ALLOC_AGFL_RESERVE 4 unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 50546eadaae2..5f1e4799e8fa 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -19,7 +19,11 @@ #include "xfs_error.h" #include "xfs_trace.h" -struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; +const struct xfs_name xfs_name_dotdot = { + .name = (const unsigned char *)"..", + .len = 2, + .type = XFS_DIR3_FT_DIR, +}; /* * Convert inode mode to directory entry filetype @@ -54,10 +58,10 @@ xfs_mode_to_ftype( */ xfs_dahash_t xfs_ascii_ci_hashname( - struct xfs_name *name) + const struct xfs_name *name) { - xfs_dahash_t hash; - int i; + xfs_dahash_t hash; + int i; for (i = 0, hash = 0; i < name->len; i++) hash = tolower(name->name[i]) ^ rol32(hash, 7); @@ -243,7 +247,7 @@ int xfs_dir_createname( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, + const struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_extlen_t total) /* bmap's total block count */ { @@ -337,16 +341,16 @@ xfs_dir_cilookup_result( int xfs_dir_lookup( - xfs_trans_t *tp, - xfs_inode_t *dp, - struct xfs_name *name, - xfs_ino_t *inum, /* out: inode number */ - struct xfs_name *ci_name) /* out: actual name if CI match */ + struct xfs_trans *tp, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ { - struct xfs_da_args *args; - int rval; - int v; /* type-checking value */ - int lock_mode; + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); @@ -475,7 +479,7 
@@ int xfs_dir_replace( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, /* name of entry to replace */ + const struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_extlen_t total) /* bmap's total block count */ { @@ -728,7 +732,7 @@ xfs_dir2_namecheck( xfs_dahash_t xfs_dir2_hashname( struct xfs_mount *mp, - struct xfs_name *name) + const struct xfs_name *name) { if (unlikely(xfs_has_asciici(mp))) return xfs_ascii_ci_hashname(name); diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index d03e6098ded9..b6df3c34b26a 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -21,7 +21,7 @@ struct xfs_dir2_data_unused; struct xfs_dir3_icfree_hdr; struct xfs_dir3_icleaf_hdr; -extern struct xfs_name xfs_name_dotdot; +extern const struct xfs_name xfs_name_dotdot; /* * Convert inode mode to directory entry filetype @@ -39,16 +39,16 @@ extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t inum, + const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t *inum, + const struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t inum, + const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name); diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 711709a2aa53..7404a9ff1a92 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -40,7 +40,7 @@ struct xfs_dir3_icfree_hdr { }; /* xfs_dir2.c */ -xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name); +xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name); enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args, const unsigned char *name, int len); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, @@ -201,7 +201,8 @@ xfs_dir2_data_entsize( return round_up(len, XFS_DIR2_DATA_ALIGN); } -xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, + const struct xfs_name *name); enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, const unsigned char *name, int len); diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 1719e1c4da59..3590e10e3e62 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -24,7 +24,7 @@ struct xchk_xattr_buf { * space bitmap follows immediately after; and we have a third buffer * for storing intermediate bitmap results. */ - uint8_t buf[0]; + uint8_t buf[]; }; /* A place to store attribute values. */ diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index 32fa02945f73..ae4345b37621 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -9,39 +9,6 @@ static inline unsigned int bio_max_vecs(unsigned int count) return bio_max_segs(howmany(count, PAGE_SIZE)); } -static void -xfs_flush_bdev_async_endio( - struct bio *bio) -{ - complete(bio->bi_private); -} - -/* - * Submit a request for an async cache flush to run. 
If the request queue does - * not require flush operations, just skip it altogether. If the caller needs - * to wait for the flush completion at a later point in time, they must supply a - * valid completion. This will be signalled when the flush completes. The - * caller never sees the bio that is issued here. - */ -void -xfs_flush_bdev_async( - struct bio *bio, - struct block_device *bdev, - struct completion *done) -{ - struct request_queue *q = bdev->bd_disk->queue; - - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - complete(done); - return; - } - - bio_init(bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); - bio->bi_private = done; - bio->bi_end_io = xfs_flush_bdev_async_endio; - - submit_bio(bio); -} int xfs_rw_bdev( struct block_device *bdev, diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e1f4d7d5a011..761dde155099 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -463,7 +463,7 @@ xfs_bui_item_recover( struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *bmap; struct xfs_bud_log_item *budp; xfs_filblks_t count; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 44795018df1d..e1afb9e503e1 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -14,6 +14,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_log_recover.h" +#include "xfs_log_priv.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_errortag.h" @@ -813,7 +814,15 @@ xfs_buf_read_map( * buffer. */ if (error) { - if (!xfs_is_shutdown(target->bt_mount)) + /* + * Check against log shutdown for error reporting because + * metadata writeback may require a read first and we need to + * report errors in metadata writeback until the log is shut + * down. High level transaction read functions already check + * against mount shutdown, anyway, so we only need to be + * concerned about low level IO interactions here. + */ + if (!xlog_is_shutdown(target->bt_mount->m_log)) xfs_buf_ioerror_alert(bp, fa); bp->b_flags &= ~XBF_DONE; @@ -1174,10 +1183,10 @@ xfs_buf_ioend_handle_error( struct xfs_error_cfg *cfg; /* - * If we've already decided to shutdown the filesystem because of I/O - * errors, there's no point in giving this a retry. + * If we've already shutdown the journal because of I/O errors, there's + * no point in giving this a retry. */ - if (xfs_is_shutdown(mp)) + if (xlog_is_shutdown(mp->m_log)) goto out_stale; xfs_buf_ioerror_alert_ratelimited(bp); @@ -1588,8 +1597,23 @@ __xfs_buf_submit( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - /* on shutdown we stale and complete the buffer immediately */ - if (xfs_is_shutdown(bp->b_mount)) { + /* + * On log shutdown we stale and complete the buffer immediately. We can + * be called to read the superblock before the log has been set up, so + * be careful checking the log state. + * + * Checking the mount shutdown state here can result in the log tail + * moving inappropriately on disk as the log may not yet be shut down. + * i.e. failing this buffer on mount shutdown can remove it from the AIL + * and move the tail of the log forwards without having written this + * buffer to disk. This corrupts the log tail state in memory, and + * because the log may not be shut down yet, it can then be propagated + * to disk before the log is shutdown. Hence we check log shutdown + * state here rather than mount state to avoid corrupting the log tail + * on shutdown. 
+ */ + if (bp->b_mount->m_log && + xlog_is_shutdown(bp->b_mount->m_log)) { xfs_buf_ioend_fail(bp); return -EIO; } @@ -1803,10 +1827,10 @@ xfs_buftarg_drain( * If one or more failed buffers were freed, that means dirty metadata * was thrown away. This should only ever happen after I/O completion * handling has elevated I/O error(s) to permanent failures and shuts - * down the fs. + * down the journal. */ if (write_fail) { - ASSERT(xfs_is_shutdown(btp->bt_mount)); + ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); xfs_alert(btp->bt_mount, "Please run xfs_repair to determine the extent of the problem."); } @@ -2089,12 +2113,13 @@ xfs_buf_delwri_submit_buffers( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { if (!wait_list) { + if (!xfs_buf_trylock(bp)) + continue; if (xfs_buf_ispinned(bp)) { + xfs_buf_unlock(bp); pinned++; continue; } - if (!xfs_buf_trylock(bp)) - continue; } else { xfs_buf_lock(bp); } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a7a8e4528881..522d450a94b1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -21,6 +21,7 @@ #include "xfs_dquot.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_log_priv.h" struct kmem_cache *xfs_buf_item_cache; @@ -428,7 +429,7 @@ xfs_buf_item_format( * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_has_v3inodes(lip->li_mountp) || + if (xfs_has_v3inodes(lip->li_log->l_mp) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; @@ -616,7 +617,7 @@ xfs_buf_item_put( * that case, the bli is freed on buffer writeback completion. */ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || - xfs_is_shutdown(lip->li_mountp); + xlog_is_shutdown(lip->li_log); dirty = bip->bli_flags & XFS_BLI_DIRTY; if (dirty && !aborted) return false; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 47ef9c9c5c17..0e50f2c9348e 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -604,7 +604,7 @@ xfs_efi_item_recover( struct list_head *capture_list) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; struct xfs_extent *extp; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 48287caad28b..10e1cb71439e 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -864,8 +864,8 @@ xfs_getfsmap( !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; - use_rmap = capable(CAP_SYS_ADMIN) && - xfs_has_rmapbt(mp); + use_rmap = xfs_has_rmapbt(mp) && + has_capability_noaudit(current, CAP_SYS_ADMIN); head->fmh_entries = 0; /* Set up our device handlers. 
*/ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 33e26690a8c4..68f74549fa22 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -17,6 +17,7 @@ #include "xfs_fsops.h" #include "xfs_trans_space.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" @@ -347,7 +348,7 @@ xfs_fs_counts( cnt->allocino = percpu_counter_read_positive(&mp->m_icount); cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - mp->m_alloc_set_aside; + xfs_fdblocks_unavailable(mp); spin_lock(&mp->m_sb_lock); cnt->freertx = mp->m_sb.sb_frextents; @@ -430,46 +431,36 @@ xfs_reserve_blocks( * If the request is larger than the current reservation, reserve the * blocks before we update the reserve counters. Sample m_fdblocks and * perform a partial reservation if the request exceeds free space. + * + * The code below estimates how many blocks it can request from + * fdblocks to stash in the reserve pool. This is a classic TOCTOU + * race since fdblocks updates are not always coordinated via + * m_sb_lock. Set the reserve size even if there's not enough free + * space to fill it because mod_fdblocks will refill an undersized + * reserve when it can. */ - error = -ENOSPC; - do { - free = percpu_counter_sum(&mp->m_fdblocks) - - mp->m_alloc_set_aside; - if (free <= 0) - break; - - delta = request - mp->m_resblks; - lcounter = free - delta; - if (lcounter < 0) - /* We can't satisfy the request, just get what we can */ - fdblks_delta = free; - else - fdblks_delta = delta; - + free = percpu_counter_sum(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp); + delta = request - mp->m_resblks; + mp->m_resblks = request; + if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block - * count or we'll get an ENOSPC. If we get a ENOSPC, it means - * things changed while we were calculating fdblks_delta and so - * we should try again to see if there is anything left to - * reserve. + * count or we'll get an ENOSPC. Don't set the reserved flag + * here - we don't want to reserve the extra reserve blocks + * from the reserve. * - * Don't set the reserved flag here - we don't want to reserve - * the extra reserve blocks from the reserve..... + * The desired reserve size can change after we drop the lock. + * Use mod_fdblocks to put the space into the reserve or into + * fdblocks as appropriate. */ + fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + if (!error) + xfs_mod_fdblocks(mp, fdblks_delta, 0); spin_lock(&mp->m_sb_lock); - } while (error == -ENOSPC); - - /* - * Update the reserve counters if blocks have been successfully - * allocated. 
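The reworked xfs_reserve_blocks() above sets the reserve target unconditionally and then moves at most min(free, delta) blocks from fdblocks into the pool, accepting the TOCTOU race because xfs_mod_fdblocks() will top up an undersized reserve later. A user-space model of that arithmetic (hypothetical helper name, not kernel code):

	#include <stdio.h>

	static long long reserve_fill(long long free_blocks, long long resblks,
				      long long request)
	{
		long long delta = request - resblks;

		if (delta <= 0 || free_blocks <= 0)
			return 0;
		return delta < free_blocks ? delta : free_blocks;	/* min(free, delta) */
	}

	int main(void)
	{
		/* pool holds 200, target raised to 500, only 100 blocks free */
		printf("%lld blocks moved into the reserve\n",
		       reserve_fill(100, 200, 500));	/* prints 100 */
		return 0;
	}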
- */ - if (!error && fdblks_delta) { - mp->m_resblks += fdblks_delta; - mp->m_resblks_avail += fdblks_delta; } - out: if (outval) { outval->resblks = mp->m_resblks; @@ -528,8 +519,11 @@ xfs_do_force_shutdown( int tag; const char *why; - if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) + + if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) { + xlog_shutdown_wait(mp->m_log); return; + } if (mp->m_sb_bp) mp->m_sb_bp->b_flags |= XBF_DONE; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9e434cb41e48..bffd6eb0b298 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -23,6 +23,7 @@ #include "xfs_reflink.h" #include "xfs_ialloc.h" #include "xfs_ag.h" +#include "xfs_log_priv.h" #include <linux/iversion.h> @@ -873,9 +874,16 @@ xfs_reclaim_inode( if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; - if (xfs_is_shutdown(ip->i_mount)) { + /* + * Check for log shutdown because aborting the inode can move the log + * tail and corrupt in memory state. This is fine if the log is shut + * down, but if the log is still active and only the mount is shut down + * then the in-memory log tail movement caused by the abort can be + * incorrectly propagated to disk. + */ + if (xlog_is_shutdown(ip->i_mount->m_log)) { xfs_iunpin_wait(ip); - xfs_iflush_abort(ip); + xfs_iflush_shutdown_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 04bf467b1090..9de6205fe134 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -35,6 +35,7 @@ #include "xfs_bmap_btree.h" #include "xfs_reflink.h" #include "xfs_ag.h" +#include "xfs_log_priv.h" struct kmem_cache *xfs_inode_cache; @@ -658,9 +659,9 @@ xfs_ip2xflags( */ int xfs_lookup( - xfs_inode_t *dp, - struct xfs_name *name, - xfs_inode_t **ipp, + struct xfs_inode *dp, + const struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name) { xfs_ino_t inum; @@ -1217,7 +1218,7 @@ xfs_link( { xfs_mount_t *mp = tdp->i_mount; xfs_trans_t *tp; - int error; + int error, nospace_error = 0; int resblks; trace_xfs_link(tdp, target_name); @@ -1236,19 +1237,11 @@ xfs_link( goto std_return; resblks = XFS_LINK_SPACE_RES(mp, target_name->len); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp); - } + error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks, + &tp, &nospace_error); if (error) goto std_return; - xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); - error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK, XFS_IEXT_DIR_MANIP_CNT(mp)); if (error) @@ -1306,6 +1299,8 @@ xfs_link( error_return: xfs_trans_cancel(tp); std_return: + if (error == -ENOSPC && nospace_error) + error = nospace_error; return error; } @@ -2755,6 +2750,7 @@ xfs_remove( xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp = NULL; int is_dir = S_ISDIR(VFS_I(ip)->i_mode); + int dontcare; int error = 0; uint resblks; @@ -2772,31 +2768,24 @@ xfs_remove( goto std_return; /* - * We try to get the real space reservation first, - * allowing for directory btree deletion(s) implying - * possible bmap insert(s). If we can't get the space - * reservation then we use 0 instead, and avoid the bmap - * btree insert(s) in the directory code by, if the bmap - * insert tries to happen, instead trimming the LAST - * block from the directory. 
+ * We try to get the real space reservation first, allowing for + * directory btree deletion(s) implying possible bmap insert(s). If we + * can't get the space reservation then we use 0 instead, and avoid the + * bmap btree insert(s) in the directory code by, if the bmap insert + * tries to happen, instead trimming the LAST block from the directory. + * + * Ignore EDQUOT and ENOSPC being returned via nospace_error because + * the directory code can handle a reservationless update and we don't + * want to prevent a user from trying to free space by deleting things. */ resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0, - &tp); - } + error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks, + &tp, &dontcare); if (error) { ASSERT(error != -ENOSPC); goto std_return; } - xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - /* * If we're removing a directory perform some additional validation. */ @@ -3109,7 +3098,8 @@ xfs_rename( bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); int spaceres; - int error; + bool retried = false; + int error, nospace_error = 0; trace_xfs_rename(src_dp, target_dp, src_name, target_name); @@ -3133,9 +3123,12 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); +retry: + nospace_error = 0; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { + nospace_error = error; spaceres = 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, &tp); @@ -3190,6 +3183,31 @@ xfs_rename( spaceres); /* + * Try to reserve quota to handle an expansion of the target directory. + * We'll allow the rename to continue in reservationless mode if we hit + * a space usage constraint. If we trigger reservationless mode, save + * the errno if there isn't any free space in the target directory. + */ + if (spaceres != 0) { + error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres, + 0, false); + if (error == -EDQUOT || error == -ENOSPC) { + if (!retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_quota(target_dp, 0); + retried = true; + goto retry; + } + + nospace_error = error; + spaceres = 0; + error = 0; + } + if (error) + goto out_trans_cancel; + } + + /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. * @@ -3435,6 +3453,8 @@ out_trans_cancel: out_release_wip: if (wip) xfs_irele(wip); + if (error == -ENOSPC && nospace_error) + error = nospace_error; return error; } @@ -3611,7 +3631,7 @@ xfs_iflush_cluster( /* * We must use the safe variant here as on shutdown xfs_iflush_abort() - * can remove itself from the list. + * will remove itself from the list. */ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { iip = (struct xfs_inode_log_item *)lip; @@ -3659,7 +3679,7 @@ xfs_iflush_cluster( * AIL, leaving a dirty/unpinned inode attached to the buffer * that otherwise looks like it should be flushed. 
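The xfs_link() and xfs_remove() hunks above replace the open-coded transaction allocation, inode locking and joining with xfs_trans_alloc_dir(). That helper is not part of this excerpt; judging purely from the call sites, it presumably does roughly the following (a hedged reconstruction, details such as the quota retry and exact types will differ):

	static int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv,
				       struct xfs_inode *ip, unsigned int *dblocks,
				       struct xfs_trans **tpp, int *nospace_error)
	{
		struct xfs_mount	*mp = ip->i_mount;
		struct xfs_trans	*tp;
		unsigned int		resblks = *dblocks;
		int			error;

		error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
		if (error == -ENOSPC) {
			/* fall back to a reservationless update, remember why */
			*nospace_error = error;
			resblks = 0;
			error = xfs_trans_alloc(mp, resv, 0, 0, 0, &tp);
		}
		if (error)
			return error;

		xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

		*dblocks = resblks;
		*tpp = tp;
		return 0;
	}

The callers then report the saved -ENOSPC/-EDQUOT only when the operation ultimately fails with -ENOSPC, as the std_return paths above show.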
*/ - if (xfs_is_shutdown(mp)) { + if (xlog_is_shutdown(mp->m_log)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -3685,9 +3705,19 @@ xfs_iflush_cluster( } if (error) { + /* + * Shutdown first so we kill the log before we release this + * buffer. If it is an INODE_ALLOC buffer and pins the tail + * of the log, failing it before the _log_ is shut down can + * result in the log tail being moved forward in the journal + * on disk because log writes can still be taking place. Hence + * unpinning the tail will allow the ICREATE intent to be + * removed from the log an recovery will fail with uninitialised + * inode cluster buffers. + */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b7e8f14d9fca..740ab13d1aa2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -402,7 +402,7 @@ enum layout_break_reason { int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); -int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, +int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct user_namespace *mnt_userns, struct xfs_inode *dp, struct xfs_name *name, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 90d8e591baf8..9e6ef55cf29e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -17,6 +17,7 @@ #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_error.h" #include <linux/iversion.h> @@ -543,10 +544,17 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error; - ASSERT(iip->ili_item.li_buf); + if (!bp || (ip->i_flags & XFS_ISTALE)) { + /* + * Inode item/buffer is being being aborted due to cluster + * buffer deletion. Trigger a log force to have that operation + * completed and items removed from the AIL before the next push + * attempt. + */ + return XFS_ITEM_PINNED; + } - if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || - (ip->i_flags & XFS_ISTALE)) + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (xfs_iflags_test(ip, XFS_IFLUSHING)) @@ -720,6 +728,17 @@ xfs_iflush_ail_updates( if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn) continue; + /* + * dgc: Not sure how this happens, but it happens very + * occassionaly via generic/388. xfs_iflush_abort() also + * silently handles this same "under writeback but not in AIL at + * shutdown" condition via xfs_trans_ail_delete(). + */ + if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { + ASSERT(xlog_is_shutdown(lip->li_log)); + continue; + } + lsn = xfs_ail_delete_one(ailp, lip); if (!tail_lsn && lsn) tail_lsn = lsn; @@ -822,46 +841,143 @@ xfs_buf_inode_io_fail( } /* - * This is the inode flushing abort routine. It is called when - * the filesystem is shutting down to clean up the inode state. It is - * responsible for removing the inode item from the AIL if it has not been - * re-logged and clearing the inode's flush state. + * Clear the inode logging fields so no more flushes are attempted. If we are + * on a buffer list, it is now safe to remove it because the buffer is + * guaranteed to be locked. The caller will drop the reference to the buffer + * the log item held. 
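With the split introduced below, callers pick an abort variant by whether they already hold the inode cluster buffer lock. A short summary of the intended call sites, based only on the hunks in this diff:

	/* Cluster buffer already locked (writeback / IO completion, XFS_ISTALE): */
	xfs_iflush_abort(ip);

	/* No cluster buffer lock held, shutdown paths such as xfs_reclaim_inode()
	 * earlier in this diff: the helper grabs and locks the buffer itself. */
	xfs_iflush_shutdown_abort(ip);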
+ */ +static void +xfs_iflush_abort_clean( + struct xfs_inode_log_item *iip) +{ + iip->ili_last_fields = 0; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + iip->ili_flush_lsn = 0; + iip->ili_item.li_buf = NULL; + list_del_init(&iip->ili_item.li_bio_list); +} + +/* + * Abort flushing the inode from a context holding the cluster buffer locked. + * + * This is the normal runtime method of aborting writeback of an inode that is + * attached to a cluster buffer. It occurs when the inode and the backing + * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster + * flushing or buffer IO completion encounters a log shutdown situation. + * + * If we need to abort inode writeback and we don't already hold the buffer + * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be + * necessary in a shutdown situation. */ void xfs_iflush_abort( struct xfs_inode *ip) { struct xfs_inode_log_item *iip = ip->i_itemp; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp; - if (iip) { - /* - * Clear the failed bit before removing the item from the AIL so - * xfs_trans_ail_delete() doesn't try to clear and release the - * buffer attached to the log item before we are done with it. - */ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); - xfs_trans_ail_delete(&iip->ili_item, 0); + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + /* + * Remove the inode item from the AIL before we clear its internal + * state. Whilst the inode is in the AIL, it should have a valid buffer + * pointer for push operations to access - it is only safe to remove the + * inode from the buffer once it has been removed from the AIL. + * + * We also clear the failed bit before removing the item from the AIL + * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer + * references the inode item owns and needs to hold until we've fully + * aborted the inode log item and detached it from the buffer. + */ + clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); + xfs_trans_ail_delete(&iip->ili_item, 0); + + /* + * Grab the inode buffer so can we release the reference the inode log + * item holds on it. + */ + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + xfs_iflush_abort_clean(iip); + spin_unlock(&iip->ili_lock); + + xfs_iflags_clear(ip, XFS_IFLUSHING); + if (bp) + xfs_buf_rele(bp); +} +/* + * Abort an inode flush in the case of a shutdown filesystem. This can be called + * from anywhere with just an inode reference and does not require holding the + * inode cluster buffer locked. If the inode is attached to a cluster buffer, + * it will grab and lock it safely, then abort the inode flush. + */ +void +xfs_iflush_shutdown_abort( + struct xfs_inode *ip) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_buf *bp; + + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + if (!bp) { + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + return; + } + + /* + * We have to take a reference to the buffer so that it doesn't get + * freed when we drop the ili_lock and then wait to lock the buffer. + * We'll clean up the extra reference after we pick up the ili_lock + * again. + */ + xfs_buf_hold(bp); + spin_unlock(&iip->ili_lock); + xfs_buf_lock(bp); + + spin_lock(&iip->ili_lock); + if (!iip->ili_item.li_buf) { /* - * Clear the inode logging fields so no more flushes are - * attempted. 
+ * Raced with another removal, hold the only reference + * to bp now. Inode should not be in the AIL now, so just clean + * up and return; */ - spin_lock(&iip->ili_lock); - iip->ili_last_fields = 0; - iip->ili_fields = 0; - iip->ili_fsync_fields = 0; - iip->ili_flush_lsn = 0; - bp = iip->ili_item.li_buf; - iip->ili_item.li_buf = NULL; - list_del_init(&iip->ili_item.li_bio_list); + ASSERT(list_empty(&iip->ili_item.li_bio_list)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)); + xfs_iflush_abort_clean(iip); spin_unlock(&iip->ili_lock); + xfs_iflags_clear(ip, XFS_IFLUSHING); + xfs_buf_relse(bp); + return; } - xfs_iflags_clear(ip, XFS_IFLUSHING); - if (bp) - xfs_buf_rele(bp); + + /* + * Got two references to bp. The first will get dropped by + * xfs_iflush_abort() when the item is removed from the buffer list, but + * we can't drop our reference until _abort() returns because we have to + * unlock the buffer as well. Hence we abort and then unlock and release + * our reference to the buffer. + */ + ASSERT(iip->ili_item.li_buf == bp); + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + xfs_buf_relse(bp); } + /* * convert an xfs_inode_log_format struct from the old 32 bit version * (which can have different field alignments) to the native 64 bit version diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 1a302000d604..bbd836a44ff0 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -44,6 +44,7 @@ static inline int xfs_inode_clean(struct xfs_inode *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_shutdown_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2515fe8299e1..83481005317a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1189,7 +1189,7 @@ xfs_ioctl_setattr_get_trans( goto out_error; error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp, - capable(CAP_FOWNER), &tp); + has_capability_noaudit(current, CAP_FOWNER), &tp); if (error) goto out_error; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 004ed2a251e8..ca25ed89b706 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -217,7 +217,7 @@ xfs_compat_ioc_fsbulkstat( inumbers_fmt_pf inumbers_func = xfs_fsinumbers_fmt_compat; bulkstat_one_fmt_pf bs_one_func = xfs_fsbulkstat_one_fmt_compat; -#ifdef CONFIG_X86_X32 +#ifdef CONFIG_X86_X32_ABI if (in_x32_syscall()) { /* * ... 
but on x32 the input xfs_fsop_bulkreq has pointers diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b79b3846e71b..b34e8e4344a8 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -613,37 +613,6 @@ xfs_vn_getattr( return 0; } -static void -xfs_setattr_mode( - struct xfs_inode *ip, - struct iattr *iattr) -{ - struct inode *inode = VFS_I(ip); - umode_t mode = iattr->ia_mode; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; -} - -void -xfs_setattr_time( - struct xfs_inode *ip, - struct iattr *iattr) -{ - struct inode *inode = VFS_I(ip); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - if (iattr->ia_valid & ATTR_ATIME) - inode->i_atime = iattr->ia_atime; - if (iattr->ia_valid & ATTR_CTIME) - inode->i_ctime = iattr->ia_ctime; - if (iattr->ia_valid & ATTR_MTIME) - inode->i_mtime = iattr->ia_mtime; -} - static int xfs_vn_change_ok( struct user_namespace *mnt_userns, @@ -678,10 +647,10 @@ xfs_setattr_nonsize( int mask = iattr->ia_valid; xfs_trans_t *tp; int error; - kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID; - kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID; + kuid_t uid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID; struct xfs_dquot *udqp = NULL, *gdqp = NULL; - struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; + struct xfs_dquot *old_udqp = NULL, *old_gdqp = NULL; ASSERT((mask & ATTR_SIZE) == 0); @@ -723,66 +692,30 @@ xfs_setattr_nonsize( } error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL, - capable(CAP_FOWNER), &tp); + has_capability_noaudit(current, CAP_FOWNER), &tp); if (error) goto out_dqrele; /* - * Change file ownership. Must be the owner or privileged. + * Register quota modifications in the transaction. Must be the owner + * or privileged. These IDs could have changed since we last looked at + * them. But, we're assured that if the ownership did change while we + * didn't have the inode locked, inode's dquot(s) would have changed + * also. */ - if (mask & (ATTR_UID|ATTR_GID)) { - /* - * These IDs could have changed since we last looked at them. - * But, we're assured that if the ownership did change - * while we didn't have the inode locked, inode's dquot(s) - * would have changed also. - */ - iuid = inode->i_uid; - igid = inode->i_gid; - gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; - uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; - - /* - * CAP_FSETID overrides the following restrictions: - * - * The set-user-ID and set-group-ID bits of a file will be - * cleared upon successful return from chown() - */ - if ((inode->i_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) - inode->i_mode &= ~(S_ISUID|S_ISGID); - - /* - * Change the ownerships and register quota modifications - * in the transaction. 
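The xfs_iops.c changes around this point drop the open-coded xfs_setattr_mode()/xfs_setattr_time() helpers in favour of the generic setattr_copy(). Hedged and simplified (setattr_copy() is a VFS helper not shown in this diff, and it additionally handles idmapped mounts and clears setgid when the caller lacks CAP_FSETID), it performs the equivalent of what the removed helpers did:

	if (iattr->ia_valid & ATTR_UID)
		inode->i_uid = iattr->ia_uid;
	if (iattr->ia_valid & ATTR_GID)
		inode->i_gid = iattr->ia_gid;
	if (iattr->ia_valid & ATTR_ATIME)
		inode->i_atime = iattr->ia_atime;
	if (iattr->ia_valid & ATTR_CTIME)
		inode->i_ctime = iattr->ia_ctime;
	if (iattr->ia_valid & ATTR_MTIME)
		inode->i_mtime = iattr->ia_mtime;
	if (iattr->ia_valid & ATTR_MODE)
		inode->i_mode = (inode->i_mode & S_IFMT) | (iattr->ia_mode & ~S_IFMT);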
- */ - if (!uid_eq(iuid, uid)) { - if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(mask & ATTR_UID); - ASSERT(udqp); - olddquot1 = xfs_qm_vop_chown(tp, ip, - &ip->i_udquot, udqp); - } - inode->i_uid = uid; - } - if (!gid_eq(igid, gid)) { - if (XFS_IS_GQUOTA_ON(mp)) { - ASSERT(xfs_has_pquotino(mp) || - !XFS_IS_PQUOTA_ON(mp)); - ASSERT(mask & ATTR_GID); - ASSERT(gdqp); - olddquot2 = xfs_qm_vop_chown(tp, ip, - &ip->i_gdquot, gdqp); - } - inode->i_gid = gid; - } + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) && + !uid_eq(inode->i_uid, iattr->ia_uid)) { + ASSERT(udqp); + old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); + } + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) && + !gid_eq(inode->i_gid, iattr->ia_gid)) { + ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); + ASSERT(gdqp); + old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - if (mask & ATTR_MODE) - xfs_setattr_mode(ip, iattr); - if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_setattr_time(ip, iattr); - + setattr_copy(mnt_userns, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -794,8 +727,8 @@ xfs_setattr_nonsize( /* * Release any dquot(s) the inode had kept before chown. */ - xfs_qm_dqrele(olddquot1); - xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(old_udqp); + xfs_qm_dqrele(old_gdqp); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -1006,11 +939,8 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - if (iattr->ia_valid & ATTR_MODE) - xfs_setattr_mode(ip, iattr); - if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_setattr_time(ip, iattr); - + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(mnt_userns, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 09a8fba84ff9..cb9105d667db 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -197,8 +197,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, unsigned int op); -void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev, - struct completion *done); #define ASSERT_ALWAYS(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 16f9edbda4eb..499e15b24215 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -487,7 +487,10 @@ out_error: * Run all the pending iclog callbacks and wake log force waiters and iclog * space waiters so they can process the newly set shutdown state. We really * don't care what order we process callbacks here because the log is shut down - * and so state cannot change on disk anymore. + * and so state cannot change on disk anymore. However, we cannot wake waiters + * until the callbacks have been processed because we may be in unmount and + * we must ensure that all AIL operations the callbacks perform have completed + * before we tear down the AIL. * * We avoid processing actively referenced iclogs so that we don't run callbacks * while the iclog owner might still be preparing the iclog for IO submssion. 
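The xlog_state_shutdown_callbacks() hunks around this point make the function run with l_icloglock held by the caller and drop it only around xlog_cil_process_committed(), so callbacks never run under the spinlock. A minimal user-space model of that splice-then-process-outside-the-lock pattern (hypothetical types, not kernel code):

	#include <pthread.h>
	#include <stdlib.h>

	struct cb {
		struct cb	*next;
		void		(*fn)(void *arg);
		void		*arg;
	};

	static pthread_mutex_t cb_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct cb *cb_pending;

	/* Detach the list under the lock, run the callbacks without it (they may
	 * sleep or take other locks), then retake the lock to wake any waiters. */
	static void run_shutdown_callbacks(void)
	{
		pthread_mutex_lock(&cb_lock);
		struct cb *list = cb_pending;
		cb_pending = NULL;
		pthread_mutex_unlock(&cb_lock);

		while (list) {
			struct cb *c = list;

			list = c->next;
			c->fn(c->arg);
			free(c);
		}

		pthread_mutex_lock(&cb_lock);
		/* wake_up_all()-style notification would go here */
		pthread_mutex_unlock(&cb_lock);
	}

	int main(void)
	{
		run_shutdown_callbacks();	/* nothing queued: a no-op */
		return 0;
	}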
@@ -501,7 +504,6 @@ xlog_state_shutdown_callbacks( struct xlog_in_core *iclog; LIST_HEAD(cb_list); - spin_lock(&log->l_icloglock); iclog = log->l_iclog; do { if (atomic_read(&iclog->ic_refcnt)) { @@ -509,26 +511,22 @@ xlog_state_shutdown_callbacks( continue; } list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); + + xlog_cil_process_committed(&cb_list); + + spin_lock(&log->l_icloglock); wake_up_all(&iclog->ic_write_wait); wake_up_all(&iclog->ic_force_wait); } while ((iclog = iclog->ic_next) != log->l_iclog); wake_up_all(&log->l_flush_wait); - spin_unlock(&log->l_icloglock); - - xlog_cil_process_committed(&cb_list); } /* * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. * - * If the caller passes in a non-zero @old_tail_lsn and the current log tail - * does not match, there may be metadata on disk that must be persisted before - * this iclog is written. To satisfy that requirement, set the - * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new - * log tail value. - * * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the * log tail is updated correctly. NEED_FUA indicates that the iclog will be * written to stable storage, and implies that a commit record is contained @@ -545,12 +543,10 @@ xlog_state_shutdown_callbacks( * always capture the tail lsn on the iclog on the first NEED_FUA release * regardless of the number of active reference counts on this iclog. */ - int xlog_state_release_iclog( struct xlog *log, - struct xlog_in_core *iclog, - xfs_lsn_t old_tail_lsn) + struct xlog_in_core *iclog) { xfs_lsn_t tail_lsn; bool last_ref; @@ -561,18 +557,14 @@ xlog_state_release_iclog( /* * Grabbing the current log tail needs to be atomic w.r.t. the writing * of the tail LSN into the iclog so we guarantee that the log tail does - * not move between deciding if a cache flush is required and writing - * the LSN into the iclog below. + * not move between the first time we know that the iclog needs to be + * made stable and when we eventually submit it. */ - if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) { + if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || + (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && + !iclog->ic_header.h_tail_lsn) { tail_lsn = xlog_assign_tail_lsn(log->l_mp); - - if (old_tail_lsn && tail_lsn != old_tail_lsn) - iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; - - if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) && - !iclog->ic_header.h_tail_lsn) - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); } last_ref = atomic_dec_and_test(&iclog->ic_refcnt); @@ -583,11 +575,8 @@ xlog_state_release_iclog( * pending iclog callbacks that were waiting on the release of * this iclog. */ - if (last_ref) { - spin_unlock(&log->l_icloglock); + if (last_ref) xlog_state_shutdown_callbacks(log); - spin_lock(&log->l_icloglock); - } return -EIO; } @@ -600,8 +589,6 @@ xlog_state_release_iclog( } iclog->ic_state = XLOG_STATE_SYNCING; - if (!iclog->ic_header.h_tail_lsn) - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); xlog_verify_tail_lsn(log, iclog); trace_xlog_iclog_syncing(iclog, _RET_IP_); @@ -812,10 +799,9 @@ xfs_log_mount_finish( * mount failure occurs. 
*/ mp->m_super->s_flags |= SB_ACTIVE; + xfs_log_work_queue(mp); if (xlog_recovery_needed(log)) error = xlog_recover_finish(log); - if (!error) - xfs_log_work_queue(mp); mp->m_super->s_flags &= ~SB_ACTIVE; evict_inodes(mp->m_super); @@ -874,7 +860,7 @@ xlog_force_iclog( iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; if (iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); - return xlog_state_release_iclog(iclog->ic_log, iclog, 0); + return xlog_state_release_iclog(iclog->ic_log, iclog); } /* @@ -1102,7 +1088,7 @@ xfs_log_item_init( int type, const struct xfs_item_ops *ops) { - item->li_mountp = mp; + item->li_log = mp->m_log; item->li_ailp = mp->m_ail; item->li_type = type; item->li_ops = ops; @@ -1374,7 +1360,7 @@ xlog_ioend_work( */ if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } xlog_state_done_syncing(iclog); @@ -1913,7 +1899,7 @@ xlog_write_iclog( iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return; } if (is_vmalloc_addr(iclog->ic_data)) @@ -2412,7 +2398,7 @@ xlog_write_copy_finish( ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || xlog_is_shutdown(log)); release_iclog: - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); return error; } @@ -2488,7 +2474,7 @@ xlog_write( xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); xlog_print_tic_res(log->l_mp, ticket); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } len = xlog_write_calc_vec_length(ticket, log_vector, optype); @@ -2629,7 +2615,7 @@ next_lv: spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); return error; @@ -3053,7 +3039,7 @@ restart: * reference to the iclog. */ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); if (error) return error; @@ -3822,9 +3808,10 @@ xlog_verify_iclog( #endif /* - * Perform a forced shutdown on the log. This should be called once and once - * only by the high level filesystem shutdown code to shut the log subsystem - * down cleanly. + * Perform a forced shutdown on the log. + * + * This can be called from low level log code to trigger a shutdown, or from the + * high level mount shutdown code when the mount shuts down. * * Our main objectives here are to make sure that: * a. if the shutdown was not due to a log IO error, flush the logs to @@ -3833,6 +3820,8 @@ xlog_verify_iclog( * parties to find out. Nothing new gets queued after this is done. * c. Tasks sleeping on log reservations, pinned objects and * other resources get woken up. + * d. The mount is also marked as shut down so that log triggered shutdowns + * still behave the same as if they called xfs_forced_shutdown(). * * Return true if the shutdown cause was a log IO error and we actually shut the * log down. 
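The rewritten xlog_force_shutdown() in the hunk that follows relies on test_and_set_bit() for idempotence: only the caller that first sets XLOG_IO_ERROR performs the shutdown, and only the caller that first sets XFS_OPSTATE_SHUTDOWN prints the alert, so log-triggered shutdowns behave like xfs_forced_shutdown() without duplicating messages. A small self-contained C11 sketch of that "first caller wins" idiom; opstate, SHUTDOWN_BIT and mark_shutdown() are illustrative names, not kernel interfaces:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define SHUTDOWN_BIT	(1u << 0)

static _Atomic unsigned int opstate;

/*
 * Returns true only for the caller that actually transitions the state,
 * mirroring the !test_and_set_bit() checks in xlog_force_shutdown().
 */
static bool mark_shutdown(unsigned int why)
{
	unsigned int old = atomic_fetch_or(&opstate, SHUTDOWN_BIT);

	if (old & SHUTDOWN_BIT)
		return false;		/* someone else got here first */

	fprintf(stderr, "shutting down (0x%x)\n", why);
	return true;
}

int main(void)
{
	/* Only the first call reports; the second is a silent no-op. */
	mark_shutdown(0x2);
	mark_shutdown(0x2);
	return 0;
}

atomic_fetch_or() returns the previous value, so checking the old bit gives the same single-winner semantics that the patch gets from test_and_set_bit().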
@@ -3844,25 +3833,25 @@ xlog_force_shutdown( { bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); - /* - * If this happens during log recovery then we aren't using the runtime - * log mechanisms yet so there's nothing to shut down. - */ - if (!log || xlog_in_recovery(log)) + if (!log) return false; - ASSERT(!xlog_is_shutdown(log)); - /* * Flush all the completed transactions to disk before marking the log * being shut down. We need to do this first as shutting down the log * before the force will prevent the log force from flushing the iclogs * to disk. * - * Re-entry due to a log IO error shutdown during the log force is - * prevented by the atomicity of higher level shutdown code. + * When we are in recovery, there are no transactions to flush, and + * we don't want to touch the log because we don't want to perturb the + * current head/tail for future recovery attempts. Hence we need to + * avoid a log force in this case. + * + * If we are shutting down due to a log IO error, then we must avoid + * trying to write the log as that may just result in more IO errors and + * an endless shutdown/force loop. */ - if (!log_error) + if (!log_error && !xlog_in_recovery(log)) xfs_log_force(log->l_mp, XFS_LOG_SYNC); /* @@ -3879,12 +3868,25 @@ xlog_force_shutdown( spin_lock(&log->l_icloglock); if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { spin_unlock(&log->l_icloglock); - ASSERT(0); return false; } spin_unlock(&log->l_icloglock); /* + * If this log shutdown also sets the mount shutdown state, issue a + * shutdown warning message. + */ + if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR, +"Filesystem has been shut down due to log error (0x%x).", + shutdown_flags); + xfs_alert(log->l_mp, +"Please unmount the filesystem and rectify the problem(s)."); + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); + } + + /* * We don't want anybody waiting for log reservations after this. That * means we have to wake up everybody queued up on reserveq as well as * writeq. In addition, we make sure in xlog_{re}grant_log_space that @@ -3904,8 +3906,12 @@ xlog_force_shutdown( wake_up_all(&log->l_cilp->xc_start_wait); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); + + spin_lock(&log->l_icloglock); xlog_state_shutdown_callbacks(log); + spin_unlock(&log->l_icloglock); + wake_up_var(&log->l_opstate); return log_error; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 83a039762b81..ba57323bfdce 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -540,7 +540,7 @@ xlog_cil_insert_items( spin_unlock(&cil->xc_cil_lock); if (tp->t_ticket->t_curr_res < 0) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } static void @@ -705,11 +705,21 @@ xlog_cil_set_ctx_write_state( * The LSN we need to pass to the log items on transaction * commit is the LSN reported by the first log vector write, not * the commit lsn. If we use the commit record lsn then we can - * move the tail beyond the grant write head. + * move the grant write head beyond the tail LSN and overwrite + * it. */ ctx->start_lsn = lsn; wake_up_all(&cil->xc_start_wait); spin_unlock(&cil->xc_push_lock); + + /* + * Make sure the metadata we are about to overwrite in the log + * has been flushed to stable storage before this iclog is + * issued. 
+ */ + spin_lock(&cil->xc_log->l_icloglock); + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; + spin_unlock(&cil->xc_log->l_icloglock); return; } @@ -854,7 +864,7 @@ xlog_cil_write_commit_record( error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); if (error) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -888,10 +898,7 @@ xlog_cil_push_work( struct xfs_trans_header thdr; struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; - xfs_lsn_t preflush_tail_lsn; xfs_csn_t push_seq; - struct bio bio; - DECLARE_COMPLETION_ONSTACK(bdev_flush); bool push_commit_stable; new_ctx = xlog_cil_ctx_alloc(); @@ -962,23 +969,6 @@ xlog_cil_push_work( spin_unlock(&cil->xc_push_lock); /* - * The CIL is stable at this point - nothing new will be added to it - * because we hold the flush lock exclusively. Hence we can now issue - * a cache flush to ensure all the completed metadata in the journal we - * are about to overwrite is on stable storage. - * - * Because we are issuing this cache flush before we've written the - * tail lsn to the iclog, we can have metadata IO completions move the - * tail forwards between the completion of this flush and the iclog - * being written. In this case, we need to re-issue the cache flush - * before the iclog write. To detect whether the log tail moves, sample - * the tail LSN *before* we issue the flush. - */ - preflush_tail_lsn = atomic64_read(&log->l_tail_lsn); - xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, - &bdev_flush); - - /* * Pull all the log vectors off the items in the CIL, and remove the * items from the CIL. We don't need the CIL lock here because it's only * needed on the transaction commit side which is currently locked out @@ -1054,12 +1044,6 @@ xlog_cil_push_work( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; - /* - * Before we format and submit the first iclog, we have to ensure that - * the metadata writeback ordering cache flush is complete. - */ - wait_for_completion(&bdev_flush); - error = xlog_cil_write_chain(ctx, &lvhdr); if (error) goto out_abort_free_ticket; @@ -1118,7 +1102,7 @@ xlog_cil_push_work( if (push_commit_stable && ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); - xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn); + xlog_state_release_iclog(log, ctx->commit_iclog); /* Not safe to reference ctx now! */ @@ -1139,7 +1123,7 @@ out_abort_free_ticket: return; } spin_lock(&log->l_icloglock); - xlog_state_release_iclog(log, ctx->commit_iclog, 0); + xlog_state_release_iclog(log, ctx->commit_iclog); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); } @@ -1243,18 +1227,27 @@ xlog_cil_push_now( if (!async) flush_workqueue(cil->xc_push_wq); + spin_lock(&cil->xc_push_lock); + + /* + * If this is an async flush request, we always need to set the + * xc_push_commit_stable flag even if something else has already queued + * a push. The flush caller is asking for the CIL to be on stable + * storage when the next push completes, so regardless of who has queued + * the push, the flush requires stable semantics from it. + */ + cil->xc_push_commit_stable = async; + /* * If the CIL is empty or we've already pushed the sequence then - * there's no work we need to do. + * there's no more work that we need to do. 
*/ - spin_lock(&cil->xc_push_lock); if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { spin_unlock(&cil->xc_push_lock); return; } cil->xc_push_seq = push_seq; - cil->xc_push_commit_stable = async; queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); spin_unlock(&cil->xc_push_lock); } @@ -1352,6 +1345,13 @@ xlog_cil_flush( trace_xfs_log_force(log->l_mp, seq, _RET_IP_); xlog_cil_push_now(log, seq, true); + + /* + * If the CIL is empty, make sure that any previous checkpoint that may + * still be in an active iclog is pushed to stable storage. + */ + if (list_empty(&log->l_cilp->xc_cil)) + xfs_log_force(log->l_mp, 0); } /* @@ -1468,7 +1468,7 @@ bool xfs_log_item_in_current_chkpt( struct xfs_log_item *lip) { - struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp; + struct xfs_cil *cil = lip->li_log->l_cilp; if (list_empty(&lip->li_cil)) return false; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 23103d68423c..401cdc400980 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -484,6 +484,17 @@ xlog_is_shutdown(struct xlog *log) return test_bit(XLOG_IO_ERROR, &log->l_opstate); } +/* + * Wait until the xlog_force_shutdown() has marked the log as shut down + * so xlog_is_shutdown() will always return true. + */ +static inline void +xlog_shutdown_wait( + struct xlog *log) +{ + wait_var_event(&log->l_opstate, xlog_is_shutdown(log)); +} + /* common routines */ extern int xlog_recover( @@ -524,8 +535,7 @@ void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, int eventual_size); -int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, - xfs_lsn_t log_tail_lsn); +int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); /* * When we crack an atomic LSN, we sample it first so that the value will not diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 96c997ed2ec8..c4ad4296c540 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2485,7 +2485,7 @@ xlog_finish_defer_ops( error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); if (error) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -2519,21 +2519,22 @@ xlog_abort_defer_ops( xfs_defer_ops_capture_free(mp, dfc); } } + /* * When this is called, all of the log intent items which did not have - * corresponding log done items should be in the AIL. What we do now - * is update the data structures associated with each one. + * corresponding log done items should be in the AIL. What we do now is update + * the data structures associated with each one. * - * Since we process the log intent items in normal transactions, they - * will be removed at some point after the commit. This prevents us - * from just walking down the list processing each one. We'll use a - * flag in the intent item to skip those that we've already processed - * and use the AIL iteration mechanism's generation count to try to - * speed this up at least a bit. + * Since we process the log intent items in normal transactions, they will be + * removed at some point after the commit. This prevents us from just walking + * down the list processing each one. We'll use a flag in the intent item to + * skip those that we've already processed and use the AIL iteration mechanism's + * generation count to try to speed this up at least a bit. 
* - * When we start, we know that the intents are the only things in the - * AIL. As we process them, however, other items are added to the - * AIL. + * When we start, we know that the intents are the only things in the AIL. As we + * process them, however, other items are added to the AIL. Hence we know we + * have started recovery on all the pending intents when we find an non-intent + * item in the AIL. */ STATIC int xlog_recover_process_intents( @@ -2556,17 +2557,8 @@ xlog_recover_process_intents( for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - /* - * We're done when we see something other than an intent. - * There should be no intents left in the AIL now. - */ - if (!xlog_item_is_intent(lip)) { -#ifdef DEBUG - for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(!xlog_item_is_intent(lip)); -#endif + if (!xlog_item_is_intent(lip)) break; - } /* * We should never see a redo item with a LSN higher than @@ -2607,8 +2599,9 @@ err: } /* - * A cancel occurs when the mount has failed and we're bailing out. - * Release all pending log intent items so they don't pin the AIL. + * A cancel occurs when the mount has failed and we're bailing out. Release all + * pending log intent items that we haven't started recovery on so they don't + * pin the AIL. */ STATIC void xlog_recover_cancel_intents( @@ -2622,17 +2615,8 @@ xlog_recover_cancel_intents( spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { - /* - * We're done when we see something other than an intent. - * There should be no intents left in the AIL now. - */ - if (!xlog_item_is_intent(lip)) { -#ifdef DEBUG - for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(!xlog_item_is_intent(lip)); -#endif + if (!xlog_item_is_intent(lip)) break; - } spin_unlock(&ailp->ail_lock); lip->li_ops->iop_release(lip); @@ -3470,7 +3454,7 @@ xlog_recover_finish( */ xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -3517,7 +3501,7 @@ xlog_recover_finish( * end of intents processing can be pushed through the CIL * and AIL. */ - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } return 0; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bed73e8002a5..c5f153c3693f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -21,6 +21,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" @@ -1146,7 +1147,7 @@ xfs_mod_fdblocks( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); + set_aside = xfs_fdblocks_unavailable(mp); percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 00720a02e761..f6dc19de8322 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -479,6 +479,21 @@ extern void xfs_unmountfs(xfs_mount_t *); */ #define XFS_FDBLOCKS_BATCH 1024 +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. 
This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +static inline uint64_t +xfs_fdblocks_unavailable( + struct xfs_mount *mp) +{ + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 4abe17312c2b..37a24f0f7cd4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -319,7 +319,8 @@ xfs_fs_commit_blocks( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_setattr_time(ip, iattr); + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(&init_user_ns, inode, iattr); if (update_isize) { i_size_write(inode, iattr->ia_size); ip->i_disk_size = iattr->ia_size; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 32ac8d9c8940..f165d1a3de1d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -25,6 +25,7 @@ #include "xfs_error.h" #include "xfs_ag.h" #include "xfs_ialloc.h" +#include "xfs_log_priv.h" /* * The global quota manager. There is only one of these for the entire @@ -121,8 +122,7 @@ xfs_qm_dqpurge( struct xfs_dquot *dqp, void *data) { - struct xfs_mount *mp = dqp->q_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; int error = -EAGAIN; xfs_dqlock(dqp); @@ -157,7 +157,7 @@ xfs_qm_dqpurge( } ASSERT(atomic_read(&dqp->q_pincount) == 0); - ASSERT(xfs_is_shutdown(mp) || + ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); @@ -172,7 +172,7 @@ xfs_qm_dqpurge( */ ASSERT(!list_empty(&dqp->q_lru)); list_lru_del(&qi->qi_lru, &dqp->q_lru); - XFS_STATS_DEC(mp, xs_qm_dquot_unused); + XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); return 0; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index d3da67772d57..0d868c93144d 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -457,7 +457,7 @@ xfs_cui_item_recover( struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; unsigned int refc_type; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index db70060e7bf6..54e68e5693fd 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -425,7 +425,10 @@ convert: if (!convert_now || cmap->br_state == XFS_EXT_NORM) return 0; trace_xfs_reflink_convert_cow(ip, cmap); - return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + if (!error) + cmap->br_state = XFS_EXT_NORM; + return error; out_trans_cancel: xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index c3966b4c58ef..a22b2d19ef91 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -510,7 +510,7 @@ xfs_rui_item_recover( struct xfs_rud_log_item *rudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; enum xfs_rmap_intent_type type; xfs_exntst_t state; int i; diff --git a/fs/xfs/xfs_super.c 
b/fs/xfs/xfs_super.c index d84714e4e46a..54be9d64093e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -815,7 +815,8 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); /* make sure statp->f_bfree does not underflow */ - statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0); + statp->f_bfree = max_t(int64_t, 0, + fdblocks - xfs_fdblocks_unavailable(mp)); statp->f_bavail = statp->f_bfree; fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4a8076ef8cb4..b141ef78c755 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -933,7 +933,7 @@ DEFINE_IREF_EVENT(xfs_inode_unpin); DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); DECLARE_EVENT_CLASS(xfs_namespace_class, - TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), TP_ARGS(dp, name), TP_STRUCT__entry( __field(dev_t, dev) @@ -956,7 +956,7 @@ DECLARE_EVENT_CLASS(xfs_namespace_class, #define DEFINE_NAMESPACE_EVENT(name) \ DEFINE_EVENT(xfs_namespace_class, name, \ - TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), \ TP_ARGS(dp, name)) DEFINE_NAMESPACE_EVENT(xfs_remove); DEFINE_NAMESPACE_EVENT(xfs_link); @@ -1308,7 +1308,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __field(xfs_lsn_t, lsn) ), TP_fast_assign( - __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->dev = lip->li_log->l_mp->m_super->s_dev; __entry->lip = lip; __entry->type = lip->li_type; __entry->flags = lip->li_flags; @@ -1361,7 +1361,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class, __field(xfs_lsn_t, new_lsn) ), TP_fast_assign( - __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->dev = lip->li_log->l_mp->m_super->s_dev; __entry->lip = lip; __entry->type = lip->li_type; __entry->flags = lip->li_flags; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 59e2f9031b9f..0ac717aad380 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -648,7 +648,7 @@ xfs_trans_add_item( struct xfs_trans *tp, struct xfs_log_item *lip) { - ASSERT(lip->li_mountp == tp->t_mountp); + ASSERT(lip->li_log == tp->t_mountp->m_log); ASSERT(lip->li_ailp == tp->t_mountp->m_ail); ASSERT(list_empty(&lip->li_trans)); ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags)); @@ -775,7 +775,7 @@ xfs_trans_committed_bulk( * object into the AIL as we are in a shutdown situation. */ if (aborted) { - ASSERT(xfs_is_shutdown(ailp->ail_mount)); + ASSERT(xlog_is_shutdown(ailp->ail_log)); if (lip->li_ops->iop_unpin) lip->li_ops->iop_unpin(lip, 1); continue; @@ -836,6 +836,7 @@ __xfs_trans_commit( bool regrant) { struct xfs_mount *mp = tp->t_mountp; + struct xlog *log = mp->m_log; xfs_csn_t commit_seq = 0; int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; @@ -864,7 +865,13 @@ __xfs_trans_commit( if (!(tp->t_flags & XFS_TRANS_DIRTY)) goto out_unreserve; - if (xfs_is_shutdown(mp)) { + /* + * We must check against log shutdown here because we cannot abort log + * items and leave them dirty, inconsistent and unpinned in memory while + * the log is active. This leaves them open to being written back to + * disk, and that will lead to on-disk corruption. 
+ */ + if (xlog_is_shutdown(log)) { error = -EIO; goto out_unreserve; } @@ -878,7 +885,7 @@ __xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant); + xlog_cil_commit(log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -905,10 +912,10 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - if (regrant && !xlog_is_shutdown(mp->m_log)) - xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); + if (regrant && !xlog_is_shutdown(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); else - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; } xfs_trans_free_items(tp, !!error); @@ -926,18 +933,27 @@ xfs_trans_commit( } /* - * Unlock all of the transaction's items and free the transaction. - * The transaction must not have modified any of its items, because - * there is no way to restore them to their previous state. + * Unlock all of the transaction's items and free the transaction. If the + * transaction is dirty, we must shut down the filesystem because there is no + * way to restore them to their previous state. * - * If the transaction has made a log reservation, make sure to release - * it as well. + * If the transaction has made a log reservation, make sure to release it as + * well. + * + * This is a high level function (equivalent to xfs_trans_commit()) and so can + * be called after the transaction has effectively been aborted due to the mount + * being shut down. However, if the mount has not been shut down and the + * transaction is dirty we will shut the mount down and, in doing so, that + * guarantees that the log is shut down, too. Hence we don't need to be as + * careful with shutdown state and dirty items here as we need to be in + * xfs_trans_commit(). */ void xfs_trans_cancel( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; + struct xlog *log = mp->m_log; bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); trace_xfs_trans_cancel(tp, _RET_IP_); @@ -955,16 +971,18 @@ xfs_trans_cancel( } /* - * See if the caller is relying on us to shut down the - * filesystem. This happens in paths where we detect - * corruption and decide to give up. + * See if the caller is relying on us to shut down the filesystem. We + * only want an error report if there isn't already a shutdown in + * progress, so we only need to check against the mount shutdown state + * here. */ if (dirty && !xfs_is_shutdown(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!dirty && !xfs_is_shutdown(mp)) { + /* Log items need to be consistent until the log is shut down. */ + if (!dirty && !xlog_is_shutdown(log)) { struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) @@ -975,7 +993,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; } @@ -1210,3 +1228,89 @@ out_cancel: xfs_trans_cancel(tp); return error; } + +/* + * Allocate an transaction, lock and join the directory and child inodes to it, + * and reserve quota for a directory update. If there isn't sufficient space, + * @dblocks will be set to zero for a reservationless directory update and + * @nospace_error will be set to a negative errno describing the space + * constraint we hit. 
+ * + * The caller must ensure that the on-disk dquots attached to this inode have + * already been allocated and initialized. The ILOCKs will be dropped when the + * transaction is committed or cancelled. + */ +int +xfs_trans_alloc_dir( + struct xfs_inode *dp, + struct xfs_trans_res *resv, + struct xfs_inode *ip, + unsigned int *dblocks, + struct xfs_trans **tpp, + int *nospace_error) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = ip->i_mount; + unsigned int resblks; + bool retried = false; + int error; + +retry: + *nospace_error = 0; + resblks = *dblocks; + error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp); + if (error == -ENOSPC) { + *nospace_error = error; + resblks = 0; + error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp); + } + if (error) + return error; + + xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_qm_dqattach_locked(dp, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + error = xfs_qm_dqattach_locked(ip, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + if (resblks == 0) + goto done; + + error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false); + if (error == -EDQUOT || error == -ENOSPC) { + if (!retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_quota(dp, 0); + retried = true; + goto retry; + } + + *nospace_error = error; + resblks = 0; + error = 0; + } + if (error) + goto out_cancel; + +done: + *tpp = tp; + *dblocks = resblks; + return 0; + +out_cancel: + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a487b264a9eb..de177842b951 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -8,6 +8,7 @@ /* kernel only transaction subsystem defines */ +struct xlog; struct xfs_buf; struct xfs_buftarg; struct xfs_efd_log_item; @@ -31,7 +32,7 @@ struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ struct list_head li_trans; /* transaction list */ xfs_lsn_t li_lsn; /* last on-disk lsn */ - struct xfs_mount *li_mountp; /* ptr to fs mount */ + struct xlog *li_log; struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ unsigned long li_flags; /* misc flags */ @@ -259,6 +260,9 @@ int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force, struct xfs_trans **tpp); +int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv, + struct xfs_inode *ip, unsigned int *dblocks, + struct xfs_trans **tpp, int *nospace_error); static inline void xfs_trans_set_context( diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 2a8c8dc54c95..d3a97a028560 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -398,7 +398,7 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. 
*/ - if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; /* @@ -418,7 +418,7 @@ static long xfsaild_push( struct xfs_ail *ailp) { - xfs_mount_t *mp = ailp->ail_mount; + struct xfs_mount *mp = ailp->ail_log->l_mp; struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; @@ -443,15 +443,27 @@ xfsaild_push( ailp->ail_log_flush = 0; XFS_STATS_INC(mp, xs_push_ail_flush); - xlog_cil_flush(mp->m_log); + xlog_cil_flush(ailp->ail_log); } spin_lock(&ailp->ail_lock); - /* barrier matches the ail_target update in xfs_ail_push() */ - smp_rmb(); - target = ailp->ail_target; - ailp->ail_target_prev = target; + /* + * If we have a sync push waiter, we always have to push till the AIL is + * empty. Update the target to point to the end of the AIL so that + * capture updates that occur after the sync push waiter has gone to + * sleep. + */ + if (waitqueue_active(&ailp->ail_empty)) { + lip = xfs_ail_max(ailp); + if (lip) + target = lip->li_lsn; + } else { + /* barrier matches the ail_target update in xfs_ail_push() */ + smp_rmb(); + target = ailp->ail_target; + ailp->ail_target_prev = target; + } /* we're done if the AIL is empty or our push has reached the end */ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); @@ -620,7 +632,7 @@ xfsaild( * opportunity to release such buffers from the queue. */ ASSERT(list_empty(&ailp->ail_buf_list) || - xfs_is_shutdown(ailp->ail_mount)); + xlog_is_shutdown(ailp->ail_log)); xfs_buf_delwri_cancel(&ailp->ail_buf_list); break; } @@ -683,7 +695,7 @@ xfs_ail_push( struct xfs_log_item *lip; lip = xfs_ail_min(ailp); - if (!lip || xfs_is_shutdown(ailp->ail_mount) || + if (!lip || xlog_is_shutdown(ailp->ail_log) || XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0) return; @@ -724,7 +736,6 @@ xfs_ail_push_all_sync( spin_lock(&ailp->ail_lock); while ((lip = xfs_ail_max(ailp)) != NULL) { prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE); - ailp->ail_target = lip->li_lsn; wake_up_process(ailp->ail_task); spin_unlock(&ailp->ail_lock); schedule(); @@ -740,7 +751,7 @@ xfs_ail_update_finish( struct xfs_ail *ailp, xfs_lsn_t old_lsn) __releases(ailp->ail_lock) { - struct xfs_mount *mp = ailp->ail_mount; + struct xlog *log = ailp->ail_log; /* if the tail lsn hasn't changed, don't do updates or wakeups. 
*/ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { @@ -748,13 +759,13 @@ xfs_ail_update_finish( return; } - if (!xfs_is_shutdown(mp)) - xlog_assign_tail_lsn_locked(mp); + if (!xlog_is_shutdown(log)) + xlog_assign_tail_lsn_locked(log->l_mp); if (list_empty(&ailp->ail_head)) wake_up_all(&ailp->ail_empty); spin_unlock(&ailp->ail_lock); - xfs_log_space_wake(mp); + xfs_log_space_wake(log->l_mp); } /* @@ -862,17 +873,17 @@ xfs_trans_ail_delete( int shutdown_type) { struct xfs_ail *ailp = lip->li_ailp; - struct xfs_mount *mp = ailp->ail_mount; + struct xlog *log = ailp->ail_log; xfs_lsn_t tail_lsn; spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (shutdown_type && !xfs_is_shutdown(mp)) { - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + if (shutdown_type && !xlog_is_shutdown(log)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); - xfs_force_shutdown(mp, shutdown_type); + xlog_force_shutdown(log, shutdown_type); } return; } @@ -893,7 +904,7 @@ xfs_trans_ail_init( if (!ailp) return -ENOMEM; - ailp->ail_mount = mp; + ailp->ail_log = mp->m_log; INIT_LIST_HEAD(&ailp->ail_head); INIT_LIST_HEAD(&ailp->ail_cursors); spin_lock_init(&ailp->ail_lock); @@ -901,7 +912,7 @@ xfs_trans_ail_init( init_waitqueue_head(&ailp->ail_empty); ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s", - ailp->ail_mount->m_super->s_id); + mp->m_super->s_id); if (IS_ERR(ailp->ail_task)) goto out_free_ailp; diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 3004aeac9110..f0d79a9050ba 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -6,6 +6,7 @@ #ifndef __XFS_TRANS_PRIV_H__ #define __XFS_TRANS_PRIV_H__ +struct xlog; struct xfs_log_item; struct xfs_mount; struct xfs_trans; @@ -50,7 +51,7 @@ struct xfs_ail_cursor { * Eventually we need to drive the locking in here as well. */ struct xfs_ail { - struct xfs_mount *ail_mount; + struct xlog *ail_log; struct task_struct *ail_task; struct list_head ail_head; xfs_lsn_t ail_target; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 804b6e265685..3614c7834007 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -695,7 +695,6 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) bio = bio_alloc(bdev, nr_pages, REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); bio->bi_iter.bi_sector = zi->i_zsector; - bio->bi_write_hint = iocb->ki_hint; bio->bi_ioprio = iocb->ki_ioprio; if (iocb->ki_flags & IOCB_DSYNC) bio->bi_opf |= REQ_FUA; |
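One more pattern worth calling out: the xfs_trans_alloc_dir() helper added above first asks for a full block reservation and, if that fails with -ENOSPC, falls back to a reservationless transaction while recording the original error in *nospace_error so the caller can report it if the directory update still cannot proceed (the quota reservation step degrades the same way after one blockgc retry). A minimal sketch of that fallback, with a hypothetical reserve() standing in for xfs_trans_alloc() and the quota calls:

#include <errno.h>
#include <stdio.h>

/*
 * Hypothetical reservation routine: fails with -ENOSPC when 'avail' is
 * smaller than the request. Stands in for xfs_trans_alloc() here.
 */
static int reserve(unsigned int blocks, unsigned int avail)
{
	return blocks <= avail ? 0 : -ENOSPC;
}

/*
 * Try a fully reserved update first; if space is short, retry with no
 * reservation and stash the original error for the caller to report.
 */
static int alloc_dir_update(unsigned int *blocks, unsigned int avail,
			    int *nospace_error)
{
	int error;

	*nospace_error = 0;
	error = reserve(*blocks, avail);
	if (error == -ENOSPC) {
		*nospace_error = error;
		*blocks = 0;			/* reservationless update */
		error = reserve(0, avail);
	}
	return error;
}

int main(void)
{
	unsigned int blocks = 16;
	int nospace = 0;

	if (!alloc_dir_update(&blocks, 4, &nospace))
		printf("proceeding with %u blocks (nospace_error=%d)\n",
		       blocks, nospace);
	return 0;
}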