diff options
Diffstat (limited to 'fs')
456 files changed, 15720 insertions, 13919 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index d7bc93447c85..0c63df574ee7 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config 9P_FS tristate "Plan 9 Resource Sharing Support (9P2000)" - depends on INET && NET_9P + depends on NET_9P select NETFS_SUPPORT help If you say Y here, you will get experimental support for diff --git a/fs/9p/cache.h b/fs/9p/cache.h index 1923affcdc62..ee1b6b06a2fd 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -8,9 +8,8 @@ #ifndef _9P_CACHE_H #define _9P_CACHE_H -#include <linux/fscache.h> - #ifdef CONFIG_9P_FSCACHE +#include <linux/fscache.h> extern int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses, const char *dev_name); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 805151114e96..de009a33e0e2 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -41,14 +41,24 @@ void v9fs_fid_add(struct dentry *dentry, struct p9_fid **pfid) *pfid = NULL; } +static bool v9fs_is_writeable(int mode) +{ + if (mode & (P9_OWRITE|P9_ORDWR)) + return true; + else + return false; +} + /** * v9fs_fid_find_inode - search for an open fid off of the inode list * @inode: return a fid pointing to a specific inode + * @want_writeable: only consider fids which are writeable * @uid: return a fid belonging to the specified user + * @any: ignore uid as a selection criteria * */ - -static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid) +struct p9_fid *v9fs_fid_find_inode(struct inode *inode, bool want_writeable, + kuid_t uid, bool any) { struct hlist_head *h; struct p9_fid *fid, *ret = NULL; @@ -58,7 +68,12 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid) spin_lock(&inode->i_lock); h = (struct hlist_head *)&inode->i_private; hlist_for_each_entry(fid, h, ilist) { - if (uid_eq(fid->uid, uid)) { + if (any || uid_eq(fid->uid, uid)) { + if (want_writeable && !v9fs_is_writeable(fid->mode)) { + p9_debug(P9_DEBUG_VFS, " mode: %x not writeable?\n", + fid->mode); + continue; + } p9_fid_get(fid); ret = fid; break; @@ -118,7 +133,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) spin_unlock(&dentry->d_lock); } else { if (dentry->d_inode) - ret = v9fs_fid_find_inode(dentry->d_inode, uid); + ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any); } return ret; @@ -299,28 +314,3 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) return v9fs_fid_lookup_with_uid(dentry, uid, any); } -struct p9_fid *v9fs_writeback_fid(struct dentry *dentry) -{ - int err; - struct p9_fid *fid, *ofid; - - ofid = v9fs_fid_lookup_with_uid(dentry, GLOBAL_ROOT_UID, 0); - fid = clone_fid(ofid); - if (IS_ERR(fid)) - goto error_out; - p9_fid_put(ofid); - /* - * writeback fid will only be used to write back the - * dirty pages. We always request for the open fid in read-write - * mode so that a partial page write which result in page - * read can work. - */ - err = p9_client_open(fid, O_RDWR); - if (err < 0) { - p9_fid_put(fid); - fid = ERR_PTR(err); - goto error_out; - } -error_out: - return fid; -} diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 8a4e8cd12ca2..0c51889a60b3 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -7,14 +7,16 @@ #ifndef FS_9P_FID_H #define FS_9P_FID_H #include <linux/list.h> +#include "v9fs.h" +struct p9_fid *v9fs_fid_find_inode(struct inode *inode, bool want_writeable, + kuid_t uid, bool any); struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry) { return v9fs_fid_lookup(dentry->d_parent); } void v9fs_fid_add(struct dentry *dentry, struct p9_fid **fid); -struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); void v9fs_open_fid_add(struct inode *inode, struct p9_fid **fid); static inline struct p9_fid *clone_fid(struct p9_fid *fid) { @@ -32,4 +34,31 @@ static inline struct p9_fid *v9fs_fid_clone(struct dentry *dentry) p9_fid_put(fid); return nfid; } +/** + * v9fs_fid_addmodes - add cache flags to fid mode (for client use only) + * @fid: fid to augment + * @s_flags: session info mount flags + * @s_cache: session info cache flags + * @f_flags: unix open flags + * + * make sure mode reflects flags of underlying mounts + * also qid.version == 0 reflects a synthetic or legacy file system + * NOTE: these are set after open so only reflect 9p client not + * underlying file system on server. + */ +static inline void v9fs_fid_add_modes(struct p9_fid *fid, int s_flags, + int s_cache, unsigned int f_flags) +{ + if (fid->qid.type != P9_QTFILE) + return; + + if ((!s_cache) || + ((fid->qid.version == 0) && !(s_flags & V9FS_IGNORE_QV)) || + (s_flags & V9FS_DIRECT_IO) || (f_flags & O_DIRECT)) { + fid->mode |= P9L_DIRECT; /* no read or write cache */ + } else if ((!(s_cache & CACHE_WRITEBACK)) || + (f_flags & O_DSYNC) | (s_flags & V9FS_SYNC)) { + fid->mode |= P9L_NOWRITECACHE; + } +} #endif diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 61a51b90600d..c7f774fe398f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -38,9 +38,7 @@ enum { /* String options */ Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag, /* Options that take no arguments */ - Opt_nodevmap, - /* Cache options */ - Opt_cache_loose, Opt_fscache, Opt_mmap, + Opt_nodevmap, Opt_noxattr, Opt_directio, Opt_ignoreqv, /* Access options */ Opt_access, Opt_posixacl, /* Lock timeout option */ @@ -57,10 +55,10 @@ static const match_table_t tokens = { {Opt_uname, "uname=%s"}, {Opt_remotename, "aname=%s"}, {Opt_nodevmap, "nodevmap"}, + {Opt_noxattr, "noxattr"}, + {Opt_directio, "directio"}, + {Opt_ignoreqv, "ignoreqv"}, {Opt_cache, "cache=%s"}, - {Opt_cache_loose, "loose"}, - {Opt_fscache, "fscache"}, - {Opt_mmap, "mmap"}, {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, @@ -68,32 +66,30 @@ static const match_table_t tokens = { {Opt_err, NULL} }; -static const char *const v9fs_cache_modes[nr__p9_cache_modes] = { - [CACHE_NONE] = "none", - [CACHE_MMAP] = "mmap", - [CACHE_LOOSE] = "loose", - [CACHE_FSCACHE] = "fscache", -}; - /* Interpret mount options for cache mode */ static int get_cache_mode(char *s) { int version = -EINVAL; if (!strcmp(s, "loose")) { - version = CACHE_LOOSE; + version = CACHE_SC_LOOSE; p9_debug(P9_DEBUG_9P, "Cache mode: loose\n"); } else if (!strcmp(s, "fscache")) { - version = CACHE_FSCACHE; + version = CACHE_SC_FSCACHE; p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); } else if (!strcmp(s, "mmap")) { - version = CACHE_MMAP; + version = CACHE_SC_MMAP; p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n"); + } else if (!strcmp(s, "readahead")) { + version = CACHE_SC_READAHEAD; + p9_debug(P9_DEBUG_9P, "Cache mode: readahead\n"); } else if (!strcmp(s, "none")) { - version = CACHE_NONE; + version = CACHE_SC_NONE; p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); - } else - pr_info("Unknown Cache mode %s\n", s); + } else if (kstrtoint(s, 0, &version) != 0) { + version = -EINVAL; + pr_info("Unknown Cache mode or invalid value %s\n", s); + } return version; } @@ -121,9 +117,9 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root) if (v9ses->nodev) seq_puts(m, ",nodevmap"); if (v9ses->cache) - seq_printf(m, ",%s", v9fs_cache_modes[v9ses->cache]); + seq_printf(m, ",cache=%x", v9ses->cache); #ifdef CONFIG_9P_FSCACHE - if (v9ses->cachetag && v9ses->cache == CACHE_FSCACHE) + if (v9ses->cachetag && (v9ses->cache & CACHE_FSCACHE)) seq_printf(m, ",cachetag=%s", v9ses->cachetag); #endif @@ -143,9 +139,16 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root) break; } + if (v9ses->flags & V9FS_IGNORE_QV) + seq_puts(m, ",ignoreqv"); + if (v9ses->flags & V9FS_DIRECT_IO) + seq_puts(m, ",directio"); if (v9ses->flags & V9FS_POSIX_ACL) seq_puts(m, ",posixacl"); + if (v9ses->flags & V9FS_NO_XATTR) + seq_puts(m, ",noxattr"); + return p9_show_client_options(m, v9ses->clnt); } @@ -266,14 +269,14 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_nodevmap: v9ses->nodev = 1; break; - case Opt_cache_loose: - v9ses->cache = CACHE_LOOSE; + case Opt_noxattr: + v9ses->flags |= V9FS_NO_XATTR; break; - case Opt_fscache: - v9ses->cache = CACHE_FSCACHE; + case Opt_directio: + v9ses->flags |= V9FS_DIRECT_IO; break; - case Opt_mmap: - v9ses->cache = CACHE_MMAP; + case Opt_ignoreqv: + v9ses->flags |= V9FS_IGNORE_QV; break; case Opt_cachetag: #ifdef CONFIG_9P_FSCACHE @@ -468,7 +471,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ - if (v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache & CACHE_FSCACHE) { rc = v9fs_cache_session_get_cookie(v9ses, dev_name); if (rc < 0) goto err_clnt; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index f3f74d197b5d..06a2514f0d88 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -31,29 +31,54 @@ #define V9FS_ACL_MASK V9FS_POSIX_ACL enum p9_session_flags { - V9FS_PROTO_2000U = 0x01, - V9FS_PROTO_2000L = 0x02, - V9FS_ACCESS_SINGLE = 0x04, - V9FS_ACCESS_USER = 0x08, - V9FS_ACCESS_CLIENT = 0x10, - V9FS_POSIX_ACL = 0x20 + V9FS_PROTO_2000U = 0x01, + V9FS_PROTO_2000L = 0x02, + V9FS_ACCESS_SINGLE = 0x04, + V9FS_ACCESS_USER = 0x08, + V9FS_ACCESS_CLIENT = 0x10, + V9FS_POSIX_ACL = 0x20, + V9FS_NO_XATTR = 0x40, + V9FS_IGNORE_QV = 0x80, /* ignore qid.version for cache hints */ + V9FS_DIRECT_IO = 0x100, + V9FS_SYNC = 0x200 }; -/* possible values of ->cache */ /** - * enum p9_cache_modes - user specified cache preferences - * @CACHE_NONE: do not cache data, dentries, or directory contents (default) - * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency + * enum p9_cache_shortcuts - human readable cache preferences + * @CACHE_SC_NONE: disable all caches + * @CACHE_SC_READAHEAD: only provide caching for readahead + * @CACHE_SC_MMAP: provide caching to enable mmap + * @CACHE_SC_LOOSE: non-coherent caching for files and meta data + * @CACHE_SC_FSCACHE: persistent non-coherent caching for files and meta-data * - * eventually support loose, tight, time, session, default always none */ -enum p9_cache_modes { - CACHE_NONE, - CACHE_MMAP, - CACHE_LOOSE, - CACHE_FSCACHE, - nr__p9_cache_modes +enum p9_cache_shortcuts { + CACHE_SC_NONE = 0b00000000, + CACHE_SC_READAHEAD = 0b00000001, + CACHE_SC_MMAP = 0b00000101, + CACHE_SC_LOOSE = 0b00001111, + CACHE_SC_FSCACHE = 0b10001111, +}; + +/** + * enum p9_cache_bits - possible values of ->cache + * @CACHE_NONE: caches disabled + * @CACHE_FILE: file caching (open to close) + * @CACHE_META: meta-data and directory caching + * @CACHE_WRITEBACK: write-back caching for files + * @CACHE_LOOSE: don't check cache consistency + * @CACHE_FSCACHE: local persistent caches + * + */ + +enum p9_cache_bits { + CACHE_NONE = 0b00000000, + CACHE_FILE = 0b00000001, + CACHE_META = 0b00000010, + CACHE_WRITEBACK = 0b00000100, + CACHE_LOOSE = 0b00001000, + CACHE_FSCACHE = 0b10000000, }; /** @@ -62,7 +87,7 @@ enum p9_cache_modes { * @nodev: set to 1 to disable device mapping * @debug: debug level * @afid: authentication handle - * @cache: cache mode of type &p9_cache_modes + * @cache: cache mode of type &p9_cache_bits * @cachetag: the tag of the cache associated with this session * @fscache: session cookie associated with FS-Cache * @uname: string user name to mount hierarchy as @@ -112,7 +137,6 @@ struct v9fs_inode { struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct p9_qid qid; unsigned int cache_validity; - struct p9_fid *writeback_fid; struct mutex v_mutex; }; diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 75106b9f293d..cdf441f22e07 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -36,10 +36,6 @@ extern const struct file_operations v9fs_dir_operations; extern const struct file_operations v9fs_dir_operations_dotl; extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; -extern const struct file_operations v9fs_cached_file_operations; -extern const struct file_operations v9fs_cached_file_operations_dotl; -extern const struct file_operations v9fs_mmap_file_operations; -extern const struct file_operations v9fs_mmap_file_operations_dotl; extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 6f46d7e4c750..8a635999a7d6 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -12,7 +12,6 @@ #include <linux/file.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/inet.h> #include <linux/pagemap.h> #include <linux/sched.h> #include <linux/swap.h> @@ -57,8 +56,6 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) */ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { - struct inode *inode = file_inode(file); - struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid = file->private_data; BUG_ON(!fid); @@ -66,11 +63,8 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) /* we might need to read from a fid that was opened write-only * for read-modify-write of page cache, use the writeback fid * for that */ - if (rreq->origin == NETFS_READ_FOR_WRITE && - (fid->mode & O_ACCMODE) == O_WRONLY) { - fid = v9inode->writeback_fid; - BUG_ON(!fid); - } + WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && + !(fid->mode & P9_ORDWR)); p9_fid_get(fid); rreq->netfs_priv = fid; @@ -120,8 +114,6 @@ const struct netfs_request_ops v9fs_req_ops = { static bool v9fs_release_folio(struct folio *folio, gfp_t gfp) { - struct inode *inode = folio_inode(folio); - if (folio_test_private(folio)) return false; #ifdef CONFIG_9P_FSCACHE @@ -130,8 +122,8 @@ static bool v9fs_release_folio(struct folio *folio, gfp_t gfp) return false; folio_wait_fscache(folio); } + fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio)))); #endif - fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode))); return true; } @@ -141,6 +133,7 @@ static void v9fs_invalidate_folio(struct folio *folio, size_t offset, folio_wait_fscache(folio); } +#ifdef CONFIG_9P_FSCACHE static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, bool was_async) { @@ -154,17 +147,19 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, i_size_read(&v9inode->netfs.inode), 0); } } +#endif static int v9fs_vfs_write_folio_locked(struct folio *folio) { struct inode *inode = folio_inode(folio); - struct v9fs_inode *v9inode = V9FS_I(inode); - struct fscache_cookie *cookie = v9fs_inode_cookie(v9inode); loff_t start = folio_pos(folio); loff_t i_size = i_size_read(inode); struct iov_iter from; size_t len = folio_size(folio); + struct p9_fid *writeback_fid; int err; + struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); + struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode); if (start >= i_size) return 0; /* Simultaneous truncation occurred */ @@ -173,25 +168,33 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio) iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len); - /* We should have writeback_fid always set */ - BUG_ON(!v9inode->writeback_fid); + writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true); + if (!writeback_fid) { + WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n", + inode->i_private); + return -EINVAL; + } folio_wait_fscache(folio); folio_start_writeback(folio); - p9_client_write(v9inode->writeback_fid, start, &from, &err); + p9_client_write(writeback_fid, start, &from, &err); +#ifdef CONFIG_9P_FSCACHE if (err == 0 && - fscache_cookie_enabled(cookie) && - test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { + fscache_cookie_enabled(cookie) && + test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { folio_start_fscache(folio); fscache_write_to_cache(v9fs_inode_cookie(v9inode), - folio_mapping(folio), start, len, i_size, - v9fs_write_to_cache_done, v9inode, - true); + folio_mapping(folio), start, len, i_size, + v9fs_write_to_cache_done, v9inode, + true); } +#endif folio_end_writeback(folio); + p9_fid_put(writeback_fid); + return err; } @@ -298,7 +301,6 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, loff_t last_pos = pos + copied; struct folio *folio = page_folio(subpage); struct inode *inode = mapping->host; - struct v9fs_inode *v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); @@ -318,7 +320,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, if (last_pos > inode->i_size) { inode_add_bytes(inode, last_pos - inode->i_size); i_size_write(inode, last_pos); - fscache_update_cookie(v9fs_inode_cookie(v9inode), NULL, &last_pos); +#ifdef CONFIG_9P_FSCACHE + fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL, + &last_pos); +#endif } folio_mark_dirty(folio); out: diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 65fa2df5e49b..f16f73581634 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -13,7 +13,6 @@ #include <linux/pagemap.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/inet.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/slab.h> diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 3d74b04fe0de..45b684b7d8d7 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -13,7 +13,6 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/sched.h> -#include <linux/inet.h> #include <linux/slab.h> #include <linux/uio.h> #include <linux/fscache.h> @@ -197,9 +196,9 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) /** - * v9fs_dir_release - called on a close of a file or directory - * @inode: inode of the directory - * @filp: file pointer to a directory + * v9fs_dir_release - close a directory or a file + * @inode: inode of the directory or file + * @filp: file pointer to a directory or file * */ @@ -214,7 +213,11 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) fid = filp->private_data; p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", inode, filp, fid ? fid->fid : -1); + if (fid) { + if ((S_ISREG(inode->i_mode)) && (filp->f_mode & FMODE_WRITE)) + retval = filemap_fdatawrite(inode->i_mapping); + spin_lock(&inode->i_lock); hlist_del(&fid->ilist); spin_unlock(&inode->i_lock); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 44c15eb2b908..6c31b8c8112d 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -14,7 +14,6 @@ #include <linux/file.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/inet.h> #include <linux/list.h> #include <linux/pagemap.h> #include <linux/utsname.h> @@ -29,7 +28,6 @@ #include "fid.h" #include "cache.h" -static const struct vm_operations_struct v9fs_file_vm_ops; static const struct vm_operations_struct v9fs_mmap_file_vm_ops; /** @@ -42,13 +40,11 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops; int v9fs_file_open(struct inode *inode, struct file *file) { int err; - struct v9fs_inode *v9inode; struct v9fs_session_info *v9ses; - struct p9_fid *fid, *writeback_fid; + struct p9_fid *fid; int omode; p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); - v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) omode = v9fs_open_to_dotl_flags(file->f_flags); @@ -61,7 +57,19 @@ int v9fs_file_open(struct inode *inode, struct file *file) if (IS_ERR(fid)) return PTR_ERR(fid); - err = p9_client_open(fid, omode); + if ((v9ses->cache & CACHE_WRITEBACK) && (omode & P9_OWRITE)) { + int writeback_omode = (omode & ~P9_OWRITE) | P9_ORDWR; + + p9_debug(P9_DEBUG_CACHE, "write-only file with writeback enabled, try opening O_RDWR\n"); + err = p9_client_open(fid, writeback_omode); + if (err < 0) { + p9_debug(P9_DEBUG_CACHE, "could not open O_RDWR, disabling caches\n"); + err = p9_client_open(fid, omode); + fid->mode |= P9L_DIRECT; + } + } else { + err = p9_client_open(fid, omode); + } if (err < 0) { p9_fid_put(fid); return err; @@ -73,36 +81,14 @@ int v9fs_file_open(struct inode *inode, struct file *file) file->private_data = fid; } - mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache) && !v9inode->writeback_fid && - ((file->f_flags & O_ACCMODE) != O_RDONLY)) { - /* - * clone a fid and add it to writeback_fid - * we do it during open time instead of - * page dirty time via write_begin/page_mkwrite - * because we want write after unlink usecase - * to work. - */ - writeback_fid = v9fs_writeback_fid(file_dentry(file)); - if (IS_ERR(writeback_fid)) { - err = PTR_ERR(writeback_fid); - mutex_unlock(&v9inode->v_mutex); - goto out_error; - } - v9inode->writeback_fid = (void *) writeback_fid; - } - mutex_unlock(&v9inode->v_mutex); #ifdef CONFIG_9P_FSCACHE - if (v9ses->cache == CACHE_FSCACHE) - fscache_use_cookie(v9fs_inode_cookie(v9inode), + if (v9ses->cache & CACHE_FSCACHE) + fscache_use_cookie(v9fs_inode_cookie(V9FS_I(inode)), file->f_mode & FMODE_WRITE); #endif + v9fs_fid_add_modes(fid, v9ses->flags, v9ses->cache, file->f_flags); v9fs_open_fid_add(inode, &fid); return 0; -out_error: - p9_fid_put(file->private_data); - file->private_data = NULL; - return err; } /** @@ -369,8 +355,13 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct p9_fid *fid = iocb->ki_filp->private_data; int ret, err = 0; - p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", - iov_iter_count(to), iocb->ki_pos); + p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", + fid->fid, iov_iter_count(to), iocb->ki_pos); + + if (!(fid->mode & P9L_DIRECT)) { + p9_debug(P9_DEBUG_VFS, "(cached)\n"); + return generic_file_read_iter(iocb, to); + } if (iocb->ki_filp->f_flags & O_NONBLOCK) ret = p9_client_read_once(fid, iocb->ki_pos, to, &err); @@ -393,10 +384,18 @@ static ssize_t v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; + struct p9_fid *fid = file->private_data; ssize_t retval; loff_t origin; int err = 0; + p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid); + + if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) { + p9_debug(P9_DEBUG_CACHE, "(cached)\n"); + return generic_file_write_iter(iocb, from); + } + retval = generic_write_checks(iocb, from); if (retval <= 0) return retval; @@ -478,45 +477,18 @@ static int v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) { int retval; + struct inode *inode = file_inode(filp); + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); + p9_debug(P9_DEBUG_MMAP, "filp :%p\n", filp); - retval = generic_file_mmap(filp, vma); - if (!retval) - vma->vm_ops = &v9fs_file_vm_ops; - - return retval; -} - -static int -v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) -{ - int retval; - struct inode *inode; - struct v9fs_inode *v9inode; - struct p9_fid *fid; - - inode = file_inode(filp); - v9inode = V9FS_I(inode); - mutex_lock(&v9inode->v_mutex); - if (!v9inode->writeback_fid && - (vma->vm_flags & VM_SHARED) && - (vma->vm_flags & VM_WRITE)) { - /* - * clone a fid and add it to writeback_fid - * we do it during mmap instead of - * page dirty time via write_begin/page_mkwrite - * because we want write after unlink usecase - * to work. - */ - fid = v9fs_writeback_fid(file_dentry(filp)); - if (IS_ERR(fid)) { - retval = PTR_ERR(fid); - mutex_unlock(&v9inode->v_mutex); - return retval; - } - v9inode->writeback_fid = (void *) fid; + if (!(v9ses->cache & CACHE_WRITEBACK)) { + p9_debug(P9_DEBUG_CACHE, "(no mmap mode)"); + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + invalidate_inode_pages2(filp->f_mapping); + return generic_file_readonly_mmap(filp, vma); } - mutex_unlock(&v9inode->v_mutex); retval = generic_file_mmap(filp, vma); if (!retval) @@ -528,7 +500,6 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) static vm_fault_t v9fs_vm_page_mkwrite(struct vm_fault *vmf) { - struct v9fs_inode *v9inode; struct folio *folio = page_folio(vmf->page); struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); @@ -537,8 +508,6 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n", folio, (unsigned long)filp->private_data); - v9inode = V9FS_I(inode); - /* Wait for the page to be written to the cache before we allow it to * be modified. We then assume the entire page will need writing back. */ @@ -551,7 +520,6 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) /* Update file times before taking page lock */ file_update_time(filp); - BUG_ON(!v9inode->writeback_fid); if (folio_lock_killable(folio) < 0) return VM_FAULT_RETRY; if (folio_mapping(folio) != inode->i_mapping) @@ -564,35 +532,6 @@ out_unlock: return VM_FAULT_NOPAGE; } -/** - * v9fs_mmap_file_read_iter - read from a file - * @iocb: The operation parameters - * @to: The buffer to read into - * - */ -static ssize_t -v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - /* TODO: Check if there are dirty pages */ - return v9fs_file_read_iter(iocb, to); -} - -/** - * v9fs_mmap_file_write_iter - write to a file - * @iocb: The operation parameters - * @from: The data to write - * - */ -static ssize_t -v9fs_mmap_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - /* - * TODO: invalidate mmaps on filp's inode between - * offset and offset+count - */ - return v9fs_file_write_iter(iocb, from); -} - static void v9fs_mmap_vm_close(struct vm_area_struct *vma) { struct inode *inode; @@ -615,13 +554,6 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) filemap_fdatawrite_wbc(inode->i_mapping, &wbc); } - -static const struct vm_operations_struct v9fs_file_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = v9fs_vm_page_mkwrite, -}; - static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .close = v9fs_mmap_vm_close, .fault = filemap_fault, @@ -629,34 +561,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .page_mkwrite = v9fs_vm_page_mkwrite, }; - -const struct file_operations v9fs_cached_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .open = v9fs_file_open, - .release = v9fs_dir_release, - .lock = v9fs_file_lock, - .mmap = v9fs_file_mmap, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fsync = v9fs_file_fsync, -}; - -const struct file_operations v9fs_cached_file_operations_dotl = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .open = v9fs_file_open, - .release = v9fs_dir_release, - .lock = v9fs_file_lock_dotl, - .flock = v9fs_file_flock_dotl, - .mmap = v9fs_file_mmap, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fsync = v9fs_file_fsync_dotl, -}; - const struct file_operations v9fs_file_operations = { .llseek = generic_file_llseek, .read_iter = v9fs_file_read_iter, @@ -678,34 +582,7 @@ const struct file_operations v9fs_file_operations_dotl = { .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, - .mmap = generic_file_readonly_mmap, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fsync = v9fs_file_fsync_dotl, -}; - -const struct file_operations v9fs_mmap_file_operations = { - .llseek = generic_file_llseek, - .read_iter = v9fs_mmap_file_read_iter, - .write_iter = v9fs_mmap_file_write_iter, - .open = v9fs_file_open, - .release = v9fs_dir_release, - .lock = v9fs_file_lock, - .mmap = v9fs_mmap_file_mmap, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fsync = v9fs_file_fsync, -}; - -const struct file_operations v9fs_mmap_file_operations_dotl = { - .llseek = generic_file_llseek, - .read_iter = v9fs_mmap_file_read_iter, - .write_iter = v9fs_mmap_file_write_iter, - .open = v9fs_file_open, - .release = v9fs_dir_release, - .lock = v9fs_file_lock_dotl, - .flock = v9fs_file_flock_dotl, - .mmap = v9fs_mmap_file_mmap, + .mmap = v9fs_file_mmap, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 1d523bec0a94..36b466e35887 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -15,7 +15,6 @@ #include <linux/pagemap.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/inet.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/slab.h> @@ -230,7 +229,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL); if (!v9inode) return NULL; - v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; mutex_init(&v9inode->v_mutex); return &v9inode->netfs.inode; @@ -287,24 +285,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFREG: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - if (v9ses->cache == CACHE_LOOSE || - v9ses->cache == CACHE_FSCACHE) - inode->i_fop = - &v9fs_cached_file_operations_dotl; - else if (v9ses->cache == CACHE_MMAP) - inode->i_fop = &v9fs_mmap_file_operations_dotl; - else - inode->i_fop = &v9fs_file_operations_dotl; + inode->i_fop = &v9fs_file_operations_dotl; } else { inode->i_op = &v9fs_file_inode_operations; - if (v9ses->cache == CACHE_LOOSE || - v9ses->cache == CACHE_FSCACHE) - inode->i_fop = - &v9fs_cached_file_operations; - else if (v9ses->cache == CACHE_MMAP) - inode->i_fop = &v9fs_mmap_file_operations; - else - inode->i_fop = &v9fs_file_operations; + inode->i_fop = &v9fs_file_operations; } break; @@ -386,20 +370,23 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) */ void v9fs_evict_inode(struct inode *inode) { - struct v9fs_inode *v9inode = V9FS_I(inode); - __le32 version; + struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); + __le32 __maybe_unused version; truncate_inode_pages_final(&inode->i_data); + +#ifdef CONFIG_9P_FSCACHE version = cpu_to_le32(v9inode->qid.version); fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, &version); +#endif + clear_inode(inode); filemap_fdatawrite(&inode->i_data); +#ifdef CONFIG_9P_FSCACHE fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); - /* clunk the fid stashed in writeback_fid */ - p9_fid_put(v9inode->writeback_fid); - v9inode->writeback_fid = NULL; +#endif } static int v9fs_test_inode(struct inode *inode, void *data) @@ -779,7 +766,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, inode = NULL; else if (IS_ERR(fid)) inode = ERR_CAST(fid); - else if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + else if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); @@ -808,11 +795,12 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, { int err; u32 perm; - struct v9fs_inode *v9inode; + struct v9fs_inode __maybe_unused *v9inode; struct v9fs_session_info *v9ses; - struct p9_fid *fid, *inode_fid; + struct p9_fid *fid; struct dentry *res = NULL; struct inode *inode; + int p9_omode; if (d_in_lookup(dentry)) { res = v9fs_vfs_lookup(dir, dentry, 0); @@ -831,9 +819,14 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode); - fid = v9fs_create(v9ses, dir, dentry, NULL, perm, - v9fs_uflags2omode(flags, - v9fs_proto_dotu(v9ses))); + p9_omode = v9fs_uflags2omode(flags, v9fs_proto_dotu(v9ses)); + + if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) { + p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR; + p9_debug(P9_DEBUG_CACHE, + "write-only file with writeback enabled, creating w/ O_RDWR\n"); + } + fid = v9fs_create(v9ses, dir, dentry, NULL, perm, p9_omode); if (IS_ERR(fid)) { err = PTR_ERR(fid); goto error; @@ -842,33 +835,18 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9fs_invalidate_inode_attr(dir); inode = d_inode(dentry); v9inode = V9FS_I(inode); - mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache) && !v9inode->writeback_fid && - ((flags & O_ACCMODE) != O_RDONLY)) { - /* - * clone a fid and add it to writeback_fid - * we do it during open time instead of - * page dirty time via write_begin/page_mkwrite - * because we want write after unlink usecase - * to work. - */ - inode_fid = v9fs_writeback_fid(dentry); - if (IS_ERR(inode_fid)) { - err = PTR_ERR(inode_fid); - mutex_unlock(&v9inode->v_mutex); - goto error; - } - v9inode->writeback_fid = (void *) inode_fid; - } - mutex_unlock(&v9inode->v_mutex); err = finish_open(file, dentry, generic_file_open); if (err) goto error; file->private_data = fid; - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache & CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); +#endif + + v9fs_fid_add_modes(fid, v9ses->flags, v9ses->cache, file->f_flags); v9fs_open_fid_add(inode, &fid); file->f_mode |= FMODE_CREATED; @@ -1030,15 +1008,24 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_wstat *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; + } else if (v9ses->cache & CACHE_WRITEBACK) { + if (S_ISREG(inode->i_mode)) { + int retval = filemap_fdatawrite(inode->i_mapping); + + if (retval) + p9_debug(P9_DEBUG_ERROR, + "flushing writeback during getattr returned %d\n", retval); + } } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) @@ -1070,7 +1057,6 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, { int retval, use_dentry = 0; struct inode *inode = d_inode(dentry); - struct v9fs_inode *v9inode = V9FS_I(inode); struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL; struct p9_wstat wstat; @@ -1115,8 +1101,12 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, } /* Write all dirty data */ - if (d_is_reg(dentry)) - filemap_write_and_wait(inode->i_mapping); + if (d_is_reg(dentry)) { + retval = filemap_fdatawrite(inode->i_mapping); + if (retval) + p9_debug(P9_DEBUG_ERROR, + "flushing writeback during setattr returned %d\n", retval); + } retval = p9_client_wstat(fid, &wstat); @@ -1127,9 +1117,17 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap, return retval; if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { + iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); - fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size); + truncate_pagecache(inode, iattr->ia_size); + +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache & CACHE_FSCACHE) { + struct v9fs_inode *v9inode = V9FS_I(inode); + + fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size); + } +#endif } v9fs_invalidate_inode_attr(inode); @@ -1413,7 +1411,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) * We don't want to refresh inode->i_size, * because we may have cached data */ - flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + flags = (v9ses->cache & CACHE_LOOSE) ? V9FS_STAT2INODE_KEEP_ISIZE : 0; v9fs_stat2inode(st, inode, inode->i_sb, flags); out: diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 331ed60d8fcb..5361cd2d7996 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -13,7 +13,6 @@ #include <linux/pagemap.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/inet.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/slab.h> @@ -232,12 +231,12 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, int err = 0; kgid_t gid; umode_t mode; + int p9_omode = v9fs_open_to_dotl_flags(flags); const unsigned char *name = NULL; struct p9_qid qid; struct inode *inode; struct p9_fid *fid = NULL; - struct v9fs_inode *v9inode; - struct p9_fid *dfid = NULL, *ofid = NULL, *inode_fid = NULL; + struct p9_fid *dfid = NULL, *ofid = NULL; struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; struct dentry *res = NULL; @@ -282,14 +281,19 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { - p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", + p9_debug(P9_DEBUG_VFS, "Failed to get acl values in create %d\n", err); goto out; } - err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), - mode, gid, &qid); + + if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) { + p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR; + p9_debug(P9_DEBUG_CACHE, + "write-only file with writeback enabled, creating w/ O_RDWR\n"); + } + err = p9_client_create_dotl(ofid, name, p9_omode, mode, gid, &qid); if (err < 0) { - p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", + p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in create %d\n", err); goto out; } @@ -314,36 +318,19 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); - v9inode = V9FS_I(inode); - mutex_lock(&v9inode->v_mutex); - if ((v9ses->cache) && !v9inode->writeback_fid && - ((flags & O_ACCMODE) != O_RDONLY)) { - /* - * clone a fid and add it to writeback_fid - * we do it during open time instead of - * page dirty time via write_begin/page_mkwrite - * because we want write after unlink usecase - * to work. - */ - inode_fid = v9fs_writeback_fid(dentry); - if (IS_ERR(inode_fid)) { - err = PTR_ERR(inode_fid); - mutex_unlock(&v9inode->v_mutex); - goto out; - } - v9inode->writeback_fid = (void *) inode_fid; - } - mutex_unlock(&v9inode->v_mutex); /* Since we are opening a file, assign the open fid to the file */ err = finish_open(file, dentry, generic_file_open); if (err) goto out; file->private_data = ofid; #ifdef CONFIG_9P_FSCACHE - if (v9ses->cache == CACHE_FSCACHE) + if (v9ses->cache & CACHE_FSCACHE) { + struct v9fs_inode *v9inode = V9FS_I(inode); fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); + } #endif + v9fs_fid_add_modes(ofid, v9ses->flags, v9ses->cache, flags); v9fs_open_fid_add(inode, &ofid); file->f_mode |= FMODE_CREATED; out: @@ -415,7 +402,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, } /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -458,13 +445,22 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; struct p9_fid *fid; + struct inode *inode = d_inode(dentry); struct p9_stat_dotl *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { + generic_fillattr(&nop_mnt_idmap, inode, stat); return 0; + } else if (v9ses->cache) { + if (S_ISREG(inode->i_mode)) { + int retval = filemap_fdatawrite(inode->i_mapping); + + if (retval) + p9_debug(P9_DEBUG_ERROR, + "flushing writeback during getattr returned %d\n", retval); + } } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) @@ -540,12 +536,13 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; + struct inode *inode = d_inode(dentry); + struct v9fs_session_info __maybe_unused *v9ses; struct p9_fid *fid = NULL; struct p9_iattr_dotl p9attr = { .uid = INVALID_UID, .gid = INVALID_GID, }; - struct inode *inode = d_inode(dentry); p9_debug(P9_DEBUG_VFS, "\n"); @@ -553,6 +550,8 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, if (retval) return retval; + v9ses = v9fs_dentry2v9ses(dentry); + p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); if (iattr->ia_valid & ATTR_MODE) p9attr.mode = iattr->ia_mode; @@ -583,8 +582,12 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, return PTR_ERR(fid); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) - filemap_write_and_wait(inode->i_mapping); + if (S_ISREG(inode->i_mode)) { + retval = filemap_fdatawrite(inode->i_mapping); + if (retval < 0) + p9_debug(P9_DEBUG_ERROR, + "Flushing file prior to setattr failed: %d\n", retval); + } retval = p9_client_setattr(fid, &p9attr); if (retval < 0) { @@ -593,9 +596,17 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, return retval; } - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) + if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != + i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); + truncate_pagecache(inode, iattr->ia_size); + +#ifdef CONFIG_9P_FSCACHE + if (v9ses->cache & CACHE_FSCACHE) + fscache_resize_cookie(v9fs_inode_cookie(V9FS_I(inode)), + iattr->ia_size); +#endif + } v9fs_invalidate_inode_attr(inode); setattr_copy(&nop_mnt_idmap, inode, iattr); @@ -722,7 +733,7 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, } v9fs_invalidate_inode_attr(dir); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { /* Now walk from the parent so we can get an unopened fid. */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { @@ -799,7 +810,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, } v9fs_invalidate_inode_attr(dir); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { /* Get the latest stat info from server. */ struct p9_fid *fid; @@ -876,7 +887,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, } /* instantiate inode and assign the unopened fid to the dentry */ - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -961,7 +972,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) * We don't want to refresh inode->i_size, * because we may have cached data */ - flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + flags = (v9ses->cache & CACHE_LOOSE) ? V9FS_STAT2INODE_KEEP_ISIZE : 0; v9fs_stat2inode_dotl(st, inode, flags); out: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 266c4693e20c..73db55c050bf 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -12,7 +12,6 @@ #include <linux/file.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/inet.h> #include <linux/pagemap.h> #include <linux/mount.h> #include <linux/sched.h> @@ -64,7 +63,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_magic = V9FS_MAGIC; if (v9fs_proto_dotl(v9ses)) { sb->s_op = &v9fs_super_ops_dotl; - sb->s_xattr = v9fs_xattr_handlers; + if (!(v9ses->flags & V9FS_NO_XATTR)) + sb->s_xattr = v9fs_xattr_handlers; } else { sb->s_op = &v9fs_super_ops; sb->s_time_max = U32_MAX; @@ -84,9 +84,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT; } - sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; - if (!v9ses->cache) - sb->s_flags |= SB_SYNCHRONOUS; + sb->s_flags |= SB_ACTIVE; #ifdef CONFIG_9P_FS_POSIX_ACL if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) @@ -137,7 +135,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, if (retval) goto release_sb; - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) sb->s_d_op = &v9fs_cached_dentry_operations; else sb->s_d_op = &v9fs_dentry_operations; @@ -278,7 +276,7 @@ static int v9fs_drop_inode(struct inode *inode) struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) + if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) return generic_drop_inode(inode); /* * in case of non cached mode always drop the @@ -291,49 +289,30 @@ static int v9fs_drop_inode(struct inode *inode) static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { - int ret; - struct p9_wstat wstat; struct v9fs_inode *v9inode; + /* * send an fsync request to server irrespective of * wbc->sync_mode. */ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); - v9inode = V9FS_I(inode); - if (!v9inode->writeback_fid) - return 0; - v9fs_blank_wstat(&wstat); - ret = p9_client_wstat(v9inode->writeback_fid, &wstat); - if (ret < 0) { - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; - } + v9inode = V9FS_I(inode); fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); + return 0; } static int v9fs_write_inode_dotl(struct inode *inode, struct writeback_control *wbc) { - int ret; struct v9fs_inode *v9inode; - /* - * send an fsync request to server irrespective of - * wbc->sync_mode. - */ + v9inode = V9FS_I(inode); - p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n", - __func__, inode, v9inode->writeback_fid); - if (!v9inode->writeback_fid) - return 0; - - ret = p9_client_fsync(v9inode->writeback_fid, 0); - if (ret < 0) { - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; - } + p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); + fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); + return 0; } diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 1974a38bce20..e00cf8109b3f 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -185,10 +185,6 @@ static struct xattr_handler v9fs_xattr_security_handler = { const struct xattr_handler *v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, &v9fs_xattr_trusted_handler, -#ifdef CONFIG_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif #ifdef CONFIG_9P_FS_SECURITY &v9fs_xattr_security_handler, #endif diff --git a/fs/Kconfig b/fs/Kconfig index e99830c65033..cc07a0cd3172 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -250,16 +250,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS -# -# Select this config option from the architecture Kconfig, if it is preferred -# to enable the feature of HugeTLB Vmemmap Optimization (HVO). -# -config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP - bool - config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + depends on ARCH_WANT_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON diff --git a/fs/Makefile b/fs/Makefile index 05f89b5c962f..834f1c3dba46 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -6,7 +6,6 @@ # Rewritten to use lists instead of if-statements. # -obj-$(CONFIG_SYSCTL) += sysctls.o obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ @@ -50,7 +49,7 @@ obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_COREDUMP) += coredump.o -obj-$(CONFIG_SYSCTL) += drop_caches.o +obj-$(CONFIG_SYSCTL) += drop_caches.o sysctls.o obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += iomap/ @@ -124,7 +123,7 @@ obj-$(CONFIG_9P_FS) += 9p/ obj-$(CONFIG_AFS_FS) += afs/ obj-$(CONFIG_NILFS2_FS) += nilfs2/ obj-$(CONFIG_BEFS_FS) += befs/ -obj-$(CONFIG_HOSTFS) += hostfs/ +obj-y += hostfs/ obj-$(CONFIG_CACHEFILES) += cachefiles/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_TRACING) += tracefs/ diff --git a/fs/afs/afs.h b/fs/afs/afs.h index 432cb4b23961..81815724db6c 100644 --- a/fs/afs/afs.h +++ b/fs/afs/afs.h @@ -19,8 +19,8 @@ #define AFSPATHMAX 1024 /* Maximum length of a pathname plus NUL */ #define AFSOPAQUEMAX 1024 /* Maximum length of an opaque field */ -#define AFS_VL_MAX_LIFESPAN (120 * HZ) -#define AFS_PROBE_MAX_LIFESPAN (30 * HZ) +#define AFS_VL_MAX_LIFESPAN 120 +#define AFS_PROBE_MAX_LIFESPAN 30 typedef u64 afs_volid_t; typedef u64 afs_vnodeid_t; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 82690d1dd49a..4dd97afa536c 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -275,6 +275,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) loff_t i_size; int nr_pages, i; int ret; + loff_t remote_size = 0; _enter(""); @@ -289,6 +290,8 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) expand: i_size = i_size_read(&dvnode->netfs.inode); + if (i_size < remote_size) + i_size = remote_size; if (i_size < 2048) { ret = afs_bad(dvnode, afs_file_error_dir_small); goto error; @@ -319,16 +322,16 @@ expand: struct folio *folio; folio = filemap_get_folio(mapping, i); - if (!folio) { + if (IS_ERR(folio)) { if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) afs_stat_v(dvnode, n_inval); - - ret = -ENOMEM; folio = __filemap_get_folio(mapping, i, FGP_LOCK | FGP_CREAT, mapping->gfp_mask); - if (!folio) + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto error; + } folio_attach_private(folio, (void *)1); folio_unlock(folio); } @@ -364,6 +367,7 @@ expand: * buffer. */ up_write(&dvnode->validate_lock); + remote_size = req->file_size; goto expand; } @@ -524,7 +528,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, */ folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE, FGP_ACCESSED, 0); - if (!folio) { + if (IS_ERR(folio)) { ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 0ab7752d1b75..e2fa577b66fe 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -115,11 +115,12 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping->gfp_mask); - if (!folio) + if (IS_ERR(folio)) { clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); - else if (folio && !folio_test_private(folio)) + return NULL; + } + if (!folio_test_private(folio)) folio_attach_private(folio, (void *)1); - return folio; } diff --git a/fs/afs/file.c b/fs/afs/file.c index 68d6d5dc608d..719b31374879 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -569,20 +569,10 @@ static void afs_vm_close(struct vm_area_struct *vma) static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file)); - struct afs_file *af = vmf->vma->vm_file->private_data; - switch (afs_validate(vnode, af->key)) { - case 0: + if (afs_pagecache_valid(vnode)) return filemap_map_pages(vmf, start_pgoff, end_pgoff); - case -ENOMEM: - return VM_FAULT_OOM; - case -EINTR: - case -ERESTARTSYS: - return VM_FAULT_RETRY; - case -ESTALE: - default: - return VM_FAULT_SIGBUS; - } + return 0; } static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 0167e96e5198..866bab860a88 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -230,6 +230,7 @@ static void afs_apply_status(struct afs_operation *op, set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); } change_size = true; + data_changed = true; } else if (vnode->status.type == AFS_FTYPE_DIR) { /* Expected directory change is handled elsewhere so * that we can locally edit the directory and save on a @@ -449,7 +450,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) 0 : FSCACHE_ADV_SINGLE_CHUNK, &key, sizeof(key), &aux, sizeof(aux), - vnode->status.size)); + i_size_read(&vnode->netfs.inode))); #endif } @@ -668,6 +669,24 @@ bool afs_check_validity(struct afs_vnode *vnode) } /* + * Returns true if the pagecache is still valid. Does not sleep. + */ +bool afs_pagecache_valid(struct afs_vnode *vnode) +{ + if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { + if (vnode->netfs.inode.i_nlink) + clear_nlink(&vnode->netfs.inode); + return true; + } + + if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && + afs_check_validity(vnode)) + return true; + + return false; +} + +/* * validate a vnode/inode * - there are several things we need to check * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, @@ -684,14 +703,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) vnode->fid.vid, vnode->fid.vnode, vnode->flags, key_serial(key)); - if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { - if (vnode->netfs.inode.i_nlink) - clear_nlink(&vnode->netfs.inode); - goto valid; - } - - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && - afs_check_validity(vnode)) + if (afs_pagecache_valid(vnode)) goto valid; down_write(&vnode->validate_lock); @@ -765,6 +777,13 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; + + /* Lie about the size of directories. We maintain a locally + * edited copy and may make different allocation decisions on + * it, but we need to give userspace the server's size. + */ + if (S_ISDIR(inode->i_mode)) + stat->size = vnode->netfs.remote_i_size; } while (need_seqretry(&vnode->cb_lock, seq)); done_seqretry(&vnode->cb_lock, seq); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index ad8523d0d038..9d3d64921106 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -128,7 +128,7 @@ struct afs_call { spinlock_t state_lock; int error; /* error code */ u32 abort_code; /* Remote abort ID or 0 */ - unsigned int max_lifespan; /* Maximum lifespan to set if not 0 */ + unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */ unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned count2; /* count used in unmarshalling */ @@ -1171,6 +1171,7 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern bool afs_check_validity(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); +bool afs_pagecache_valid(struct afs_vnode *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, struct kstat *, u32, unsigned int); extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index e08b850c3e6d..ed1644e7683f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -335,7 +335,9 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) /* create a call */ rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, (unsigned long)call, - tx_total_len, gfp, + tx_total_len, + call->max_lifespan, + gfp, (call->async ? afs_wake_up_async_call : afs_wake_up_call_waiter), @@ -350,10 +352,6 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) } call->rxcall = rxcall; - - if (call->max_lifespan) - rxrpc_kernel_set_max_life(call->net->socket, rxcall, - call->max_lifespan); call->issue_time = ktime_get_real(); /* send the request */ diff --git a/fs/afs/write.c b/fs/afs/write.c index 571f3b9a417e..c822d6006033 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -232,7 +232,7 @@ static void afs_kill_pages(struct address_space *mapping, _debug("kill %lx (to %lx)", index, last); folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { next = index + 1; continue; } @@ -270,7 +270,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, _debug("redirty %llx @%llx", len, start); folio = filemap_get_folio(mapping, index); - if (!folio) { + if (IS_ERR(folio)) { next = index + 1; continue; } diff --git a/fs/attr.c b/fs/attr.c index aca9ff7aed33..d60dc1edb526 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -47,6 +47,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, return ATTR_KILL_SGID; return 0; } +EXPORT_SYMBOL(setattr_should_drop_sgid); /** * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8a884e795f6a..1033fbdfdbec 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2058,7 +2058,7 @@ static int elf_core_dump(struct coredump_params *cprm) has_dumped = 1; - offset += sizeof(elf); /* Elf header */ + offset += sizeof(elf); /* ELF header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ /* Write notes phdr entry */ @@ -2174,7 +2174,6 @@ static void __exit exit_elf_binfmt(void) core_initcall(init_elf_binfmt); module_exit(exit_elf_binfmt); -MODULE_LICENSE("GPL"); #ifdef CONFIG_BINFMT_ELF_KUNIT_TEST #include "binfmt_elf_test.c" diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index a05eafcacfb2..05a1471d5283 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1540,7 +1540,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); thread_status_size += notesize(&auxv_note); - offset = sizeof(*elf); /* Elf header */ + offset = sizeof(*elf); /* ELF header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ /* Write notes phdr entry */ diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 37b6bab90c83..66fa9ab2c046 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -2,6 +2,7 @@ config BTRFS_FS tristate "Btrfs filesystem support" + select BLK_CGROUP_PUNT_BIO select CRYPTO select CRYPTO_CRC32C select LIBCRC32C diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e54f0884802a..79336fa853db 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -45,7 +45,8 @@ static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, int root_count; bool cached; - if (!btrfs_file_extent_compression(eb, fi) && + if (!ctx->ignore_extent_item_pos && + !btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; @@ -552,7 +553,7 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, count++; else goto next; - if (!ctx->ignore_extent_item_pos) { + if (!ctx->skip_inode_ref_list) { ret = check_extent_in_eb(ctx, &key, eb, fi, &eie); if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) @@ -564,7 +565,7 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; - if (!ret && !ctx->ignore_extent_item_pos) { + if (!ret && !ctx->skip_inode_ref_list) { while (old->next) old = old->next; old->next = eie; @@ -1606,7 +1607,7 @@ again: goto out; } if (ref->count && ref->parent) { - if (!ctx->ignore_extent_item_pos && !ref->inode_list && + if (!ctx->skip_inode_ref_list && !ref->inode_list && ref->level == 0) { struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; @@ -1647,7 +1648,7 @@ again: (void **)&eie, GFP_NOFS); if (ret < 0) goto out; - if (!ret && !ctx->ignore_extent_item_pos) { + if (!ret && !ctx->skip_inode_ref_list) { /* * We've recorded that parent, so we must extend * its inode list here. @@ -1743,7 +1744,7 @@ int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { const u64 orig_bytenr = ctx->bytenr; - const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos; + const bool orig_skip_inode_ref_list = ctx->skip_inode_ref_list; bool roots_ulist_allocated = false; struct ulist_iterator uiter; int ret = 0; @@ -1764,7 +1765,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) roots_ulist_allocated = true; } - ctx->ignore_extent_item_pos = true; + ctx->skip_inode_ref_list = true; ULIST_ITER_INIT(&uiter); while (1) { @@ -1789,7 +1790,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) ulist_free(ctx->refs); ctx->refs = NULL; ctx->bytenr = orig_bytenr; - ctx->ignore_extent_item_pos = orig_ignore_extent_item_pos; + ctx->skip_inode_ref_list = orig_skip_inode_ref_list; return ret; } @@ -1912,7 +1913,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, goto out_trans; } - walk_ctx.ignore_extent_item_pos = true; + walk_ctx.skip_inode_ref_list = true; walk_ctx.trans = trans; walk_ctx.fs_info = fs_info; walk_ctx.refs = &ctx->refs; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ef6bbea3f456..1616e3e3f1e4 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -60,6 +60,12 @@ struct btrfs_backref_walk_ctx { * @extent_item_pos is ignored. */ bool ignore_extent_item_pos; + /* + * If true and bytenr corresponds to a data extent, then the inode list + * (each member describing inode number, file offset and root) is not + * added to each reference added to the @refs ulist. + */ + bool skip_inode_ref_list; /* A valid transaction handle or NULL. */ struct btrfs_trans_handle *trans; /* diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 726592868e9c..5379c4714905 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -31,11 +31,11 @@ struct btrfs_failed_bio { * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. */ -void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, btrfs_bio_end_io_t end_io, void *private) { memset(bbio, 0, offsetof(struct btrfs_bio, bio)); - bbio->inode = inode; + bbio->fs_info = fs_info; bbio->end_io = end_io; bbio->private = private; atomic_set(&bbio->pending_ios, 1); @@ -48,41 +48,58 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, * Just like the underlying bio_alloc_bioset it will not fail as it is backed by * a mempool. */ -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - struct btrfs_inode *inode, - btrfs_bio_end_io_t end_io, void *private) +struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private) { + struct btrfs_bio *bbio; struct bio *bio; bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); - return bio; + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, fs_info, end_io, private); + return bbio; } -static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, - struct bio *orig, u64 map_length, - bool use_append) +static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio) { - struct btrfs_bio *orig_bbio = btrfs_bio(orig); + struct btrfs_ordered_extent *ordered; + int ret; + + ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); + if (WARN_ON_ONCE(!ordered)) + return BLK_STS_IOERR; + ret = btrfs_extract_ordered_extent(bbio, ordered); + btrfs_put_ordered_extent(ordered); + + return errno_to_blk_status(ret); +} + +static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, + struct btrfs_bio *orig_bbio, + u64 map_length, bool use_append) +{ + struct btrfs_bio *bbio; struct bio *bio; if (use_append) { unsigned int nr_segs; - bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, + bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, &btrfs_clone_bioset, map_length); } else { - bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, - &btrfs_clone_bioset); + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, + GFP_NOFS, &btrfs_clone_bioset); } - btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); - - btrfs_bio(bio)->file_offset = orig_bbio->file_offset; - if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); + bbio->inode = orig_bbio->inode; + bbio->file_offset = orig_bbio->file_offset; + if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED)) orig_bbio->file_offset += map_length; atomic_inc(&orig_bbio->pending_ios); - return bio; + return bbio; } static void btrfs_orig_write_end_io(struct bio *bio); @@ -164,7 +181,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, goto done; } - btrfs_submit_bio(&repair_bbio->bio, mirror); + btrfs_submit_bio(repair_bbio, mirror); return; } @@ -224,15 +241,16 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &btrfs_repair_bioset); repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; - bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); repair_bbio = btrfs_bio(repair_bio); - btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); + btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); + repair_bbio->inode = failed_bbio->inode; repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); - btrfs_submit_bio(repair_bio, mirror); + btrfs_submit_bio(repair_bbio, mirror); return fbio; } @@ -246,6 +264,9 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de struct btrfs_failed_bio *fbio = NULL; u32 offset = 0; + /* Read-repair requires the inode field to be set by the submitter. */ + ASSERT(inode); + /* * Hand off repair bios to the repair code as there is no upper level * submitter for them. @@ -306,17 +327,17 @@ static void btrfs_end_bio_work(struct work_struct *work) struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); /* Metadata reads are checked and repaired by the submitter. */ - if (bbio->bio.bi_opf & REQ_META) - bbio->end_io(bbio); - else + if (bbio->inode && !(bbio->bio.bi_opf & REQ_META)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); + else + bbio->end_io(bbio); } static void btrfs_simple_end_io(struct bio *bio) { struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_device *dev = bio->bi_private; - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct btrfs_fs_info *fs_info = bbio->fs_info; btrfs_bio_counter_dec(fs_info); @@ -340,7 +361,8 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) + if (bio_op(bio) == REQ_OP_READ && bbio->inode && + !(bbio->bio.bi_opf & REQ_META)) btrfs_check_read_bio(bbio, NULL); else btrfs_orig_bbio_end_io(bbio); @@ -418,7 +440,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) dev->devid, bio->bi_iter.bi_size); btrfsic_check_bio(bio); - submit_bio(bio); + + if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) + blkcg_punt_bio_submit(bio); + else + submit_bio(bio); } static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) @@ -534,10 +560,10 @@ static void run_one_async_done(struct btrfs_work *work) /* * All of the bios that pass through here are from async helpers. - * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. - * This changes nothing when cgroups aren't in use. + * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's + * context. This changes nothing when cgroups aren't in use. */ - bio->bi_opf |= REQ_CGROUP_PUNT; + bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } @@ -562,7 +588,7 @@ static bool should_async_write(struct btrfs_bio *bbio) * in order. */ if (bbio->bio.bi_opf & REQ_META) { - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct btrfs_fs_info *fs_info = bbio->fs_info; if (btrfs_is_zoned(fs_info)) return false; @@ -582,7 +608,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, int mirror_num) { - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + struct btrfs_fs_info *fs_info = bbio->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); @@ -603,12 +629,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, return true; } -static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) +static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { - struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_fs_info *fs_info = bbio->fs_info; struct btrfs_bio *orig_bbio = bbio; + struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << 9; u64 length = bio->bi_iter.bi_size; u64 map_length = length; @@ -631,15 +657,15 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) map_length = min(map_length, fs_info->max_zone_append_size); if (map_length < length) { - bio = btrfs_split_bio(fs_info, bio, map_length, use_append); - bbio = btrfs_bio(bio); + bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); + bio = &bbio->bio; } /* * Save the iter for the end_io handler and preload the checksums for * data reads. */ - if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { + if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) { bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); if (ret) @@ -650,7 +676,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) if (use_append) { bio->bi_opf &= ~REQ_OP_WRITE; bio->bi_opf |= REQ_OP_ZONE_APPEND; - ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); + ret = btrfs_bio_extract_ordered_extent(bbio); if (ret) goto fail_put_bio; } @@ -659,7 +685,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) * Csum items for reloc roots have already been cloned at this * point, so they are handled as part of the no-checksum case. */ - if (!(inode->flags & BTRFS_INODE_NODATASUM) && + if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(inode->root)) { if (should_async_write(bbio) && @@ -686,9 +712,12 @@ fail: return true; } -void btrfs_submit_bio(struct bio *bio, int mirror_num) +void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) { - while (!btrfs_submit_chunk(bio, mirror_num)) + /* If bbio->inode is not populated, its file_offset must be 0. */ + ASSERT(bbio->inode || bbio->file_offset == 0); + + while (!btrfs_submit_chunk(bbio, mirror_num)) ; } @@ -706,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num) { - struct btrfs_device *dev; + struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; struct bio bio; - u64 map_length = 0; - u64 sector; - struct btrfs_io_context *bioc = NULL; int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); @@ -720,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_repair_one_zone(fs_info, logical)) return 0; - map_length = length; - /* * Avoid races with device replace and make sure our bioc has devices * associated to its stripes that don't go away while we are doing the * read repair operation. */ btrfs_bio_counter_inc_blocked(fs_info); - if (btrfs_is_parity_mirror(fs_info, logical, length)) { - /* - * Note that we don't use BTRFS_MAP_WRITE because it's supposed - * to update all raid stripes, but here we just want to correct - * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad - * stripe's dev and sector. - */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bioc, 0); - if (ret) - goto out_counter_dec; - ASSERT(bioc->mirror_num == 1); - } else { - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bioc, mirror_num); - if (ret) - goto out_counter_dec; - /* - * This happens when dev-replace is also running, and the - * mirror_num indicates the dev-replace target. - * - * In this case, we don't need to do anything, as the read - * error just means the replace progress hasn't reached our - * read range, and later replace routine would handle it well. - */ - if (mirror_num != bioc->mirror_num) - goto out_counter_dec; - } - - sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - dev = bioc->stripes[bioc->mirror_num - 1].dev; - btrfs_put_bioc(bioc); + ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); + if (ret < 0) + goto out_counter_dec; - if (!dev || !dev->bdev || - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { + if (!smap.dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) { ret = -EIO; goto out_counter_dec; } - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = sector; + bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; __bio_add_page(&bio, page, length, pg_offset); btrfsic_check_bio(&bio); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? */ - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); goto out_bio_uninit; } btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, btrfs_dev_name(dev), sector); + ino, start, btrfs_dev_name(smap.dev), + smap.physical >> SECTOR_SHIFT); ret = 0; out_bio_uninit: @@ -791,6 +787,45 @@ out_counter_dec: return ret; } +/* + * Submit a btrfs_bio based repair write. + * + * If @dev_replace is true, the write would be submitted to dev-replace target. + */ +void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace) +{ + struct btrfs_fs_info *fs_info = bbio->fs_info; + u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 length = bbio->bio.bi_iter.bi_size; + struct btrfs_io_stripe smap = { 0 }; + int ret; + + ASSERT(fs_info); + ASSERT(mirror_num > 0); + ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); + ASSERT(!bbio->inode); + + btrfs_bio_counter_inc_blocked(fs_info); + ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); + if (ret < 0) + goto fail; + + if (dev_replace) { + if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) { + bbio->bio.bi_opf &= ~REQ_OP_WRITE; + bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND; + } + ASSERT(smap.dev == fs_info->dev_replace.srcdev); + smap.dev = fs_info->dev_replace.tgtdev; + } + __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); + return; + +fail: + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); +} + int __init btrfs_bioset_init(void) { if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 873ff85817f0..a8eca3a65673 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -30,7 +30,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); * passed to btrfs_submit_bio for mapping to the physical devices. */ struct btrfs_bio { - /* Inode and offset into it that this I/O operates on. */ + /* + * Inode and offset into it that this I/O operates on. + * Only set for data I/O. + */ struct btrfs_inode *inode; u64 file_offset; @@ -58,6 +61,9 @@ struct btrfs_bio { atomic_t pending_ios; struct work_struct end_io_work; + /* File system that this I/O operates on. */ + struct btrfs_fs_info *fs_info; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. @@ -73,11 +79,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) int __init btrfs_bioset_init(void); void __cold btrfs_bioset_exit(void); -void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - struct btrfs_inode *inode, - btrfs_bio_end_io_t end_io, void *private); +struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private); static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { @@ -88,7 +94,11 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) /* Bio only refers to one ordered extent. */ #define REQ_BTRFS_ONE_ORDERED REQ_DRV -void btrfs_submit_bio(struct bio *bio, int mirror_num); +/* Submit using blkcg_punt_bio_submit. */ +#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE + +void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5fc670c27f86..957ad1c31c4f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -160,15 +160,6 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, cache); - /* - * If not empty, someone is still holding mutex of - * full_stripe_lock, which can only be released by caller. - * And it will definitely cause use-after-free when caller - * tries to release full stripe lock. - * - * No better way to resolve, but only to warn. - */ - WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl); kfree(cache->physical_map); kfree(cache); @@ -1977,12 +1968,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, map = em->map_lookup; data_stripe_length = em->orig_block_len; - io_stripe_size = map->stripe_len; + io_stripe_size = BTRFS_STRIPE_LEN; chunk_start = em->start; /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - io_stripe_size = map->stripe_len * nr_data_stripes(map); + io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); if (!buf) { @@ -1992,28 +1983,28 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, for (i = 0; i < map->num_stripes; i++) { bool already_inserted = false; - u64 stripe_nr; - u64 offset; + u32 stripe_nr; + u32 offset; int j; if (!in_range(physical, map->stripes[i].physical, data_stripe_length)) continue; - stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); + stripe_nr = (physical - map->stripes[i].physical) >> + BTRFS_STRIPE_LEN_SHIFT; + offset = (physical - map->stripes[i].physical) & + BTRFS_STRIPE_LEN_MASK; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID10)) { - stripe_nr = stripe_nr * map->num_stripes + i; - stripe_nr = div_u64(stripe_nr, map->sub_stripes); - } + BTRFS_BLOCK_GROUP_RAID10)) + stripe_nr = div_u64(stripe_nr * map->num_stripes + i, + map->sub_stripes); /* * The remaining case would be for RAID56, multiply by * nr_data_stripes(). Alternatively, just use rmap_len below * instead of map->stripe_len */ - bytenr = chunk_start + stripe_nr * io_stripe_size + offset; /* Ensure we don't add duplicate addresses */ @@ -2124,8 +2115,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); - cache->full_stripe_locks_root.root = RB_ROOT; - mutex_init(&cache->full_stripe_locks_root.lock); return cache; } @@ -2672,7 +2661,7 @@ static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) } struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 bytes_used, u64 type, + u64 type, u64 chunk_offset, u64 size) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2687,7 +2676,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->length = size; set_free_space_tree_thresholds(cache); - cache->used = bytes_used; cache->flags = type; cache->cached = BTRFS_CACHE_FINISHED; cache->global_root_id = calculate_global_root_id(fs_info, cache->start); @@ -2738,9 +2726,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran #ifdef CONFIG_BTRFS_DEBUG if (btrfs_should_fragment_free_space(cache)) { - u64 new_bytes_used = size - bytes_used; - - cache->space_info->bytes_used += new_bytes_used >> 1; + cache->space_info->bytes_used += size >> 1; fragment_free_space(cache); } #endif diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 6e4a0b429ac3..cc0e4b37db2d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -91,14 +91,6 @@ struct btrfs_caching_control { /* Once caching_thread() finds this much free space, it will wake up waiters. */ #define CACHING_CTL_WAKE_UP SZ_2M -/* - * Tree to record all locked full stripes of a RAID5/6 block group - */ -struct btrfs_full_stripe_locks_tree { - struct rb_root root; - struct mutex lock; -}; - struct btrfs_block_group { struct btrfs_fs_info *fs_info; struct inode *inode; @@ -229,9 +221,6 @@ struct btrfs_block_group { */ int swap_extents; - /* Record locked full stripes for RAID5/6 block group */ - struct btrfs_full_stripe_locks_tree full_stripe_locks_root; - /* * Allocation offset for the block group to implement sequential * allocation. This is used only on a zoned filesystem. @@ -302,7 +291,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 bytes_used, u64 type, + u64 type, u64 chunk_offset, u64 size); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 5367a14d44d2..ac18c43fadad 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -124,7 +124,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } else { num_bytes = 0; } - if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { + if (qgroup_to_release_ret && + block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { qgroup_to_release = block_rsv->qgroup_rsv_reserved - block_rsv->qgroup_rsv_size; block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; @@ -232,9 +233,6 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) u64 num_bytes = 0; int ret = -ENOSPC; - if (!block_rsv) - return 0; - spin_lock(&block_rsv->lock); num_bytes = mult_perc(block_rsv->size, min_percent); if (block_rsv->reserved >= num_bytes) @@ -245,17 +243,15 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) } int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) { - u64 num_bytes = 0; int ret = -ENOSPC; if (!block_rsv) return 0; spin_lock(&block_rsv->lock); - num_bytes = min_reserved; if (block_rsv->reserved >= num_bytes) ret = 0; else @@ -355,17 +351,19 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) /* * But we also want to reserve enough space so we can do the fallback - * global reserve for an unlink, which is an additional 5 items (see the - * comment in __unlink_start_trans for what we're modifying.) + * global reserve for an unlink, which is an additional + * BTRFS_UNLINK_METADATA_UNITS items. * * But we also need space for the delayed ref updates from the unlink, - * so its 10, 5 for the actual operation, and 5 for the delayed ref - * updates. + * so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for + * each unlink metadata item. */ - min_items += 10; + min_items += BTRFS_UNLINK_METADATA_UNITS; num_bytes = max_t(u64, num_bytes, - btrfs_calc_insert_metadata_size(fs_info, min_items)); + btrfs_calc_insert_metadata_size(fs_info, min_items) + + btrfs_calc_delayed_ref_bytes(fs_info, + BTRFS_UNLINK_METADATA_UNITS)); spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 4cc41c9aaa82..6dc781709aca 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -65,7 +65,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent); int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9dc21622806e..ec2ae4406c16 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -142,11 +142,22 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - /* - * Total number of bytes pending delalloc, used by stat to calculate the - * real block usage of the file. This is used only for files. - */ - u64 delalloc_bytes; + union { + /* + * Total number of bytes pending delalloc, used by stat to + * calculate the real block usage of the file. This is used + * only for files. + */ + u64 delalloc_bytes; + /* + * The lowest possible index of the next dir index key which + * points to an inode that needs to be logged. + * This is used only for directories. + * Use the helpers btrfs_get_first_dir_index_to_log() and + * btrfs_set_first_dir_index_to_log() to access this field. + */ + u64 first_dir_index_to_log; + }; union { /* @@ -247,6 +258,17 @@ struct btrfs_inode { struct inode vfs_inode; }; +static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode) +{ + return READ_ONCE(inode->first_dir_index_to_log); +} + +static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, + u64 index) +{ + WRITE_ONCE(inode->first_dir_index_to_log, index); +} + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); @@ -407,7 +429,8 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); +int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, + struct btrfs_ordered_extent *ordered); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index f42f31f22d13..2d0493f0a184 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -37,6 +37,8 @@ #include "file-item.h" #include "super.h" +struct bio_set btrfs_compressed_bioset; + static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; const char* btrfs_compress_type2str(enum btrfs_compression_type type) @@ -54,6 +56,25 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type) return NULL; } +static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio) +{ + return container_of(bbio, struct compressed_bio, bbio); +} + +static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode, + u64 start, blk_opf_t op, + btrfs_bio_end_io_t end_io) +{ + struct btrfs_bio *bbio; + + bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op, + GFP_NOFS, &btrfs_compressed_bioset)); + btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL); + bbio->inode = inode; + bbio->file_offset = start; + return to_compressed_bio(bbio); +} + bool btrfs_compress_is_valid_type(const char *str, size_t len) { int i; @@ -139,32 +160,25 @@ static int compression_decompress(int type, struct list_head *ws, } } +static void btrfs_free_compressed_pages(struct compressed_bio *cb) +{ + for (unsigned int i = 0; i < cb->nr_pages; i++) + put_page(cb->compressed_pages[i]); + kfree(cb->compressed_pages); +} + static int btrfs_decompress_bio(struct compressed_bio *cb); static void end_compressed_bio_read(struct btrfs_bio *bbio) { - struct compressed_bio *cb = bbio->private; - unsigned int index; - struct page *page; + struct compressed_bio *cb = to_compressed_bio(bbio); + blk_status_t status = bbio->bio.bi_status; - if (bbio->bio.bi_status) - cb->status = bbio->bio.bi_status; - else - cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); - - /* Release the compressed pages */ - for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; - page->mapping = NULL; - put_page(page); - } - - /* Do io completion on the original bio */ - btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status); + if (!status) + status = errno_to_blk_status(btrfs_decompress_bio(cb)); - /* Finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); + btrfs_free_compressed_pages(cb); + btrfs_bio_end_io(cb->orig_bbio, status); bio_put(&bbio->bio); } @@ -172,14 +186,14 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline void end_compressed_writeback(struct inode *inode, - const struct compressed_bio *cb) +static noinline void end_compressed_writeback(const struct compressed_bio *cb) { + struct inode *inode = &cb->bbio.inode->vfs_inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; - const int errno = blk_status_to_errno(cb->status); + const int errno = blk_status_to_errno(cb->bbio.bio.bi_status); int i; int ret; @@ -207,45 +221,25 @@ static noinline void end_compressed_writeback(struct inode *inode, /* the inode may be gone now */ } -static void finish_compressed_bio_write(struct compressed_bio *cb) +static void btrfs_finish_compressed_write_work(struct work_struct *work) { - struct inode *inode = cb->inode; - unsigned int index; + struct compressed_bio *cb = + container_of(work, struct compressed_bio, write_end_work); /* * Ok, we're the last bio for this extent, step one is to call back * into the FS and do all the end_io operations. */ - btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, + btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL, cb->start, cb->start + cb->len - 1, - cb->status == BLK_STS_OK); + cb->bbio.bio.bi_status == BLK_STS_OK); if (cb->writeback) - end_compressed_writeback(inode, cb); + end_compressed_writeback(cb); /* Note, our inode could be gone now */ - /* - * Release the compressed pages, these came from alloc_page and - * are not attached to the inode at all - */ - for (index = 0; index < cb->nr_pages; index++) { - struct page *page = cb->compressed_pages[index]; - - page->mapping = NULL; - put_page(page); - } - - /* Finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); -} - -static void btrfs_finish_compressed_write_work(struct work_struct *work) -{ - struct compressed_bio *cb = - container_of(work, struct compressed_bio, write_end_work); - - finish_compressed_bio_write(cb); + btrfs_free_compressed_pages(cb); + bio_put(&cb->bbio.bio); } /* @@ -257,13 +251,25 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) */ static void end_compressed_bio_write(struct btrfs_bio *bbio) { - struct compressed_bio *cb = bbio->private; - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + struct compressed_bio *cb = to_compressed_bio(bbio); + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - cb->status = bbio->bio.bi_status; queue_work(fs_info->compressed_write_workers, &cb->write_end_work); +} - bio_put(&bbio->bio); +static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) +{ + struct bio *bio = &cb->bbio.bio; + u32 offset = 0; + + while (offset < cb->compressed_len) { + u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); + + /* Maximum compressed extent is smaller than bio size limit. */ + __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT], + len, 0); + offset += len; + } } /* @@ -275,28 +281,24 @@ static void end_compressed_bio_write(struct btrfs_bio *bbio) * This also checksums the file bytes and gets things ready for * the end io hooks. */ -blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, +void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int len, u64 disk_start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, blk_opf_t write_flags, - struct cgroup_subsys_state *blkcg_css, bool writeback) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *bio = NULL; struct compressed_bio *cb; - u64 cur_disk_bytenr = disk_start; - blk_status_t ret = BLK_STS_OK; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); - cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); - if (!cb) - return BLK_STS_RESOURCE; - cb->status = BLK_STS_OK; - cb->inode = &inode->vfs_inode; + + write_flags |= REQ_BTRFS_ONE_ORDERED; + + cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, + end_compressed_bio_write); cb->start = start; cb->len = len; cb->compressed_pages = compressed_pages; @@ -304,56 +306,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->writeback = writeback; INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; + cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; + btrfs_add_compressed_bio_pages(cb); - if (blkcg_css) { - kthread_associate_blkcg(blkcg_css); - write_flags |= REQ_CGROUP_PUNT; - } - - write_flags |= REQ_BTRFS_ONE_ORDERED; - bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, - BTRFS_I(cb->inode), end_compressed_bio_write, cb); - bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; - btrfs_bio(bio)->file_offset = start; - - while (cur_disk_bytenr < disk_start + compressed_len) { - u64 offset = cur_disk_bytenr - disk_start; - unsigned int index = offset >> PAGE_SHIFT; - unsigned int real_size; - unsigned int added; - struct page *page = compressed_pages[index]; - - /* - * We have various limits on the real read size: - * - page boundary - * - compressed length boundary - */ - real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); - real_size = min_t(u64, real_size, compressed_len - offset); - ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - - added = bio_add_page(bio, page, real_size, offset_in_page(offset)); - /* - * Maximum compressed extent is smaller than bio size limit, - * thus bio_add_page() should always success. - */ - ASSERT(added == real_size); - cur_disk_bytenr += added; - } - - /* Finished the range. */ - ASSERT(bio->bi_iter.bi_size); - btrfs_submit_bio(bio, 0); - if (blkcg_css) - kthread_associate_blkcg(NULL); - return ret; -} - -static u64 bio_end_offset(struct bio *bio) -{ - struct bio_vec *last = bio_last_bvec_all(bio); - - return page_offset(last->bv_page) + last->bv_len + last->bv_offset; + btrfs_submit_bio(&cb->bbio, 0); } /* @@ -374,7 +330,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; - u64 cur = bio_end_offset(cb->orig_bio); + struct bio *orig_bio = &cb->orig_bbio->bio; + u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); int ret; struct page *page; @@ -464,7 +421,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, */ if (!em || cur < em->start || (cur + fs_info->sectorsize > extent_map_end(em)) || - (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { + (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); unlock_page(page); @@ -484,7 +441,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, } add_size = min(em->start + em->len, page_end + 1) - cur; - ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); + ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); if (ret != add_size) { unlock_extent(tree, cur, page_end, NULL); unlock_page(page); @@ -515,17 +472,14 @@ static noinline int add_ra_bio_pages(struct inode *inode, * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num) +void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map_tree *em_tree; + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *em_tree = &inode->extent_tree; struct compressed_bio *cb; unsigned int compressed_len; - struct bio *comp_bio; - const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; - u64 cur_disk_byte = disk_bytenr; - u64 file_offset; + u64 file_offset = bbio->file_offset; u64 em_len; u64 em_start; struct extent_map *em; @@ -533,12 +487,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int memstall = 0; blk_status_t ret; int ret2; - int i; - - em_tree = &BTRFS_I(inode)->extent_tree; - - file_offset = bio_first_bvec_all(bio)->bv_offset + - page_offset(bio_first_page_all(bio)); /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); @@ -551,102 +499,54 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; - cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); - if (!cb) { - ret = BLK_STS_RESOURCE; - goto out; - } - cb->status = BLK_STS_OK; - cb->inode = inode; + cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, + end_compressed_bio_read); cb->start = em->orig_start; em_len = em->len; em_start = em->start; - cb->len = bio->bi_iter.bi_size; + cb->len = bbio->bio.bi_iter.bi_size; cb->compressed_len = compressed_len; cb->compress_type = em->compress_type; - cb->orig_bio = bio; + cb->orig_bbio = bbio; free_extent_map(em); - em = NULL; cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); if (!cb->compressed_pages) { ret = BLK_STS_RESOURCE; - goto fail; + goto out_free_bio; } ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); if (ret2) { ret = BLK_STS_RESOURCE; - goto fail; + goto out_free_compressed_pages; } - add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags); + add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, + &pflags); /* include any pages we added in add_ra-bio_pages */ - cb->len = bio->bi_iter.bi_size; - - comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), - end_compressed_bio_read, cb); - comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); - - while (cur_disk_byte < disk_bytenr + compressed_len) { - u64 offset = cur_disk_byte - disk_bytenr; - unsigned int index = offset >> PAGE_SHIFT; - unsigned int real_size; - unsigned int added; - struct page *page = cb->compressed_pages[index]; - - /* - * We have various limit on the real read size: - * - page boundary - * - compressed length boundary - */ - real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); - real_size = min_t(u64, real_size, compressed_len - offset); - ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); - - added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset)); - /* - * Maximum compressed extent is smaller than bio size limit, - * thus bio_add_page() should always success. - */ - ASSERT(added == real_size); - cur_disk_byte += added; - } + cb->len = bbio->bio.bi_iter.bi_size; + cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; + btrfs_add_compressed_bio_pages(cb); if (memstall) psi_memstall_leave(&pflags); - /* - * Stash the initial offset of this chunk, as there is no direct - * correlation between compressed pages and the original file offset. - * The field is only used for printing error messages anyway. - */ - btrfs_bio(comp_bio)->file_offset = file_offset; - - ASSERT(comp_bio->bi_iter.bi_size); - btrfs_submit_bio(comp_bio, mirror_num); + btrfs_submit_bio(&cb->bbio, mirror_num); return; -fail: - if (cb->compressed_pages) { - for (i = 0; i < cb->nr_pages; i++) { - if (cb->compressed_pages[i]) - __free_page(cb->compressed_pages[i]); - } - } - +out_free_compressed_pages: kfree(cb->compressed_pages); - kfree(cb); +out_free_bio: + bio_put(&cb->bbio.bio); out: - free_extent_map(em); - btrfs_bio_end_io(btrfs_bio(bio), ret); - return; + btrfs_bio_end_io(bbio, ret); } /* @@ -1038,6 +938,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) ret = compression_decompress_bio(workspace, cb); put_workspace(type, workspace); + if (!ret) + zero_fill_bio(&cb->orig_bbio->bio); return ret; } @@ -1062,6 +964,10 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, int __init btrfs_init_compress(void) { + if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE, + offsetof(struct compressed_bio, bbio.bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); @@ -1075,6 +981,7 @@ void __cold btrfs_exit_compress(void) btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); zstd_cleanup_workspace_manager(); + bioset_exit(&btrfs_compressed_bioset); } /* @@ -1110,7 +1017,7 @@ void __cold btrfs_exit_compress(void) int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed) { - struct bio *orig_bio = cb->orig_bio; + struct bio *orig_bio = &cb->orig_bbio->bio; /* Offset inside the full decompressed extent */ u32 cur_offset; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index a5e3377db9ad..19ab2abeddc0 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -6,8 +6,8 @@ #ifndef BTRFS_COMPRESSION_H #define BTRFS_COMPRESSION_H -#include <linux/blk_types.h> #include <linux/sizes.h> +#include "bio.h" struct btrfs_inode; @@ -23,6 +23,7 @@ struct btrfs_inode; /* Maximum length of compressed data stored on disk */ #define BTRFS_MAX_COMPRESSED (SZ_128K) +#define BTRFS_MAX_COMPRESSED_PAGES (BTRFS_MAX_COMPRESSED / PAGE_SIZE) static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); /* Maximum size of data before compression */ @@ -37,9 +38,6 @@ struct compressed_bio { /* the pages with the compressed data on them */ struct page **compressed_pages; - /* inode that owns this data */ - struct inode *inode; - /* starting offset in the inode for our pages */ u64 start; @@ -55,14 +53,14 @@ struct compressed_bio { /* Whether this is a write for writeback. */ bool writeback; - /* IO errors */ - blk_status_t status; - union { /* For reads, this is the bio we are copying the data into */ - struct bio *orig_bio; + struct btrfs_bio *orig_bbio; struct work_struct write_end_work; }; + + /* Must be last. */ + struct btrfs_bio bbio; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) @@ -88,16 +86,14 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); -blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, +void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int len, u64 disk_start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, blk_opf_t write_flags, - struct cgroup_subsys_state *blkcg_css, bool writeback); -void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num); +void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a5b6bb54545f..2ff2961b1183 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -854,7 +854,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Search for a key in the given extent_buffer. * * The lower boundary for the search is specified by the slot number @first_slot. - * Use a value of 0 to search over the whole extent buffer. + * Use a value of 0 to search over the whole extent buffer. Works for both + * leaves and nodes. * * The slot in the extent buffer is returned via @slot. If the key exists in the * extent buffer, then @slot will point to the slot where the key is, otherwise @@ -863,8 +864,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, * Slot may point to the total number of items (i.e. one position beyond the last * key) if the key is bigger than the last key in the extent buffer. */ -int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, - const struct btrfs_key *key, int *slot) +int btrfs_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot) { unsigned long p; int item_size; @@ -959,7 +960,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); - BUG_ON(level == 0); + ASSERT(level); check.level = level - 1; check.transid = btrfs_node_ptr_generation(parent, slot); @@ -1064,11 +1065,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) return 0; - left = btrfs_read_node_slot(parent, pslot - 1); - if (IS_ERR(left)) - left = NULL; + if (pslot) { + left = btrfs_read_node_slot(parent, pslot - 1); + if (IS_ERR(left)) { + ret = PTR_ERR(left); + left = NULL; + goto enospc; + } - if (left) { __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, @@ -1079,11 +1083,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } - right = btrfs_read_node_slot(parent, pslot + 1); - if (IS_ERR(right)) - right = NULL; + if (pslot + 1 < btrfs_header_nritems(parent)) { + right = btrfs_read_node_slot(parent, pslot + 1); + if (IS_ERR(right)) { + ret = PTR_ERR(right); + right = NULL; + goto enospc; + } - if (right) { __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, @@ -1240,14 +1247,14 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (!parent) return 1; - left = btrfs_read_node_slot(parent, pslot - 1); - if (IS_ERR(left)) - left = NULL; - /* first, try to make some room in the middle buffer */ - if (left) { + if (pslot) { u32 left_nr; + left = btrfs_read_node_slot(parent, pslot - 1); + if (IS_ERR(left)) + return PTR_ERR(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); left_nr = btrfs_header_nritems(left); @@ -1292,16 +1299,17 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_tree_unlock(left); free_extent_buffer(left); } - right = btrfs_read_node_slot(parent, pslot + 1); - if (IS_ERR(right)) - right = NULL; /* * then try to empty the right most buffer into the middle */ - if (right) { + if (pslot + 1 < btrfs_header_nritems(parent)) { u32 right_nr; + right = btrfs_read_node_slot(parent, pslot + 1); + if (IS_ERR(right)) + return PTR_ERR(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); right_nr = btrfs_header_nritems(right); @@ -1864,7 +1872,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, return 0; } - return btrfs_generic_bin_search(eb, search_low_slot, key, slot); + return btrfs_bin_search(eb, search_low_slot, key, slot); } static int search_leaf(struct btrfs_trans_handle *trans, @@ -2321,7 +2329,7 @@ again: */ btrfs_unlock_up_safe(p, level + 1); - ret = btrfs_bin_search(b, key, &slot); + ret = btrfs_bin_search(b, 0, key, &slot); if (ret < 0) goto done; @@ -2482,26 +2490,15 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path) { - while (1) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { int ret; - const int slot = path->slots[0]; - const struct extent_buffer *leaf = path->nodes[0]; - /* This is where we start walking the path. */ - if (slot >= btrfs_header_nritems(leaf)) { - /* - * If we've reached the last slot in this leaf we need - * to go to the next leaf and reset the path. - */ - ret = btrfs_next_leaf(root, path); - if (ret) - return ret; - continue; - } - /* Store the found, valid item in @key. */ - btrfs_item_key_to_cpu(leaf, key, slot); - break; + ret = btrfs_next_leaf(root, path); + if (ret) + return ret; } + + btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); return 0; } @@ -2630,6 +2627,10 @@ static bool check_sibling_keys(struct extent_buffer *left, } if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + btrfs_crit(left->fs_info, "left extent buffer:"); + btrfs_print_tree(left, false); + btrfs_crit(left->fs_info, "right extent buffer:"); + btrfs_print_tree(right, false); btrfs_crit(left->fs_info, "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", left_last.objectid, left_last.type, @@ -3198,12 +3199,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_write_locked(path->nodes[1]); right = btrfs_read_node_slot(upper, slot + 1); - /* - * slot + 1 is not valid or we fail to read the right node, - * no big deal, just return. - */ if (IS_ERR(right)) - return 1; + return PTR_ERR(right); __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); @@ -3222,6 +3219,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (check_sibling_keys(left, right)) { ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); btrfs_tree_unlock(right); free_extent_buffer(right); return ret; @@ -3417,12 +3415,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_write_locked(path->nodes[1]); left = btrfs_read_node_slot(path->nodes[1], slot - 1); - /* - * slot - 1 is not valid or we fail to read the left node, - * no big deal, just return. - */ if (IS_ERR(left)) - return 1; + return PTR_ERR(left); __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); @@ -3444,6 +3438,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (check_sibling_keys(left, right)) { ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); goto out; } return __push_leaf_left(trans, path, min_data_size, empty, left, @@ -4489,10 +4484,12 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { struct btrfs_key key; + struct btrfs_key orig_key; struct btrfs_disk_key found_key; int ret; btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + orig_key = key; if (key.offset > 0) { key.offset--; @@ -4509,8 +4506,36 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) btrfs_release_path(path); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) + if (ret <= 0) return ret; + + /* + * Previous key not found. Even if we were at slot 0 of the leaf we had + * before releasing the path and calling btrfs_search_slot(), we now may + * be in a slot pointing to the same original key - this can happen if + * after we released the path, one of more items were moved from a + * sibling leaf into the front of the leaf we had due to an insertion + * (see push_leaf_right()). + * If we hit this case and our slot is > 0 and just decrement the slot + * so that the caller does not process the same key again, which may or + * may not break the caller, depending on its logic. + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); + ret = comp_keys(&found_key, &orig_key); + if (ret == 0) { + if (path->slots[0] > 0) { + path->slots[0]--; + return 0; + } + /* + * At slot 0, same key as before, it means orig_key is + * the lowest, leftmost, key in the tree. We're done. + */ + return 1; + } + } + btrfs_item_key(path->nodes[0], &found_key, 0); ret = comp_keys(&found_key, &key); /* @@ -4576,7 +4601,7 @@ again: while (1) { nritems = btrfs_header_nritems(cur); level = btrfs_header_level(cur); - sret = btrfs_bin_search(cur, min_key, &slot); + sret = btrfs_bin_search(cur, 0, min_key, &slot); if (sret < 0) { ret = sret; goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 97897107fab5..4c1986cd5bed 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -508,22 +508,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); -int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, - const struct btrfs_key *key, int *slot); +int btrfs_bin_search(struct extent_buffer *eb, int first_slot, + const struct btrfs_key *key, int *slot); -/* - * Simple binary search on an extent buffer. Works for both leaves and nodes, and - * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). - */ -static inline int btrfs_bin_search(struct extent_buffer *eb, - const struct btrfs_key *key, - int *slot) -{ - return btrfs_generic_bin_search(eb, 0, key, slot); -} - -int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 7ddb1d104e8e..427abaf608b8 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -358,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * racing with an ordered completion or some such that would think it * needs to free the reservation we just made. */ - spin_lock(&inode->lock); nr_extents = count_max_extents(fs_info, num_bytes); + spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, nr_extents); inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 886ffb232eac..0b32432d7d56 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -53,24 +53,6 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) return ret; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) -{ - u64 num_entries = - atomic_read(&trans->transaction->delayed_refs.num_entries); - u64 avg_runtime; - u64 val; - - smp_mb(); - avg_runtime = trans->fs_info->avg_delayed_ref_runtime; - val = num_entries * avg_runtime; - if (val >= NSEC_PER_SEC) - return 1; - if (val >= NSEC_PER_SEC / 2) - return 2; - - return btrfs_check_space_for_delayed_refs(trans->fs_info); -} - /* * Release a ref head's reservation. * @@ -83,20 +65,9 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); + const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr); u64 released = 0; - /* - * We have to check the mount option here because we could be enabling - * the free space tree for the first time and don't have the compat_ro - * option set yet. - * - * We need extra reservations if we have the free space tree because - * we'll have to modify that tree as well. - */ - if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) - num_bytes *= 2; - released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", @@ -118,18 +89,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) if (!trans->delayed_ref_updates) return; - num_bytes = btrfs_calc_insert_metadata_size(fs_info, - trans->delayed_ref_updates); - /* - * We have to check the mount option here because we could be enabling - * the free space tree for the first time and don't have the compat_ro - * option set yet. - * - * We need extra reservations if we have the free space tree because - * we'll have to modify that tree as well. - */ - if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) - num_bytes *= 2; + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, + trans->delayed_ref_updates); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; @@ -200,7 +161,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; int ret = -ENOSPC; @@ -217,7 +178,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); if (ret) return ret; - btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, num_bytes, 1); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 2eb34abf700f..b54261fe509b 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -253,6 +253,27 @@ extern struct kmem_cache *btrfs_delayed_extent_op_cachep; int __init btrfs_delayed_ref_init(void); void __cold btrfs_delayed_ref_exit(void); +static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info, + int num_delayed_refs) +{ + u64 num_bytes; + + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs); + + /* + * We have to check the mount option here because we could be enabling + * the free space tree for the first time and don't have the compat_ro + * option set yet. + * + * We need extra reservations if we have the free space tree because + * we'll have to modify that tree as well. + */ + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + num_bytes *= 2; + + return num_bytes; +} + static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, int action, u64 bytenr, u64 len, u64 parent) { @@ -385,7 +406,6 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *src, u64 num_bytes); -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); /* diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 317aeff6c1da..a6d77fe41e1a 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -56,11 +56,9 @@ #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC) -/* Target completion latency of discarding all discardable extents */ -#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC) #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL) #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) -#define BTRFS_DISCARD_MAX_IOPS (10U) +#define BTRFS_DISCARD_MAX_IOPS (1000U) /* Monotonically decreasing minimum length filters after index 0 */ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { @@ -577,6 +575,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) s32 discardable_extents; s64 discardable_bytes; u32 iops_limit; + unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC; unsigned long delay; discardable_extents = atomic_read(&discard_ctl->discardable_extents); @@ -607,13 +606,19 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) } iops_limit = READ_ONCE(discard_ctl->iops_limit); - if (iops_limit) + + if (iops_limit) { delay = MSEC_PER_SEC / iops_limit; - else - delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; + } else { + /* + * Unset iops_limit means go as fast as possible, so allow a + * delay of 0. + */ + delay = 0; + min_delay = 0; + } - delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC, - BTRFS_DISCARD_MAX_DELAY_MSEC); + delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC); discard_ctl->delay_ms = delay; spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9e1596bb208d..fbf9006c6234 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1341,17 +1341,8 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { int ret; - unsigned int nofs_flag; - /* - * We might be called under a transaction (e.g. indirect backref - * resolution) which could deadlock if it triggers memory reclaim - */ - nofs_flag = memalloc_nofs_save(); - ret = btrfs_drew_lock_init(&root->snapshot_lock); - memalloc_nofs_restore(nofs_flag); - if (ret) - goto fail; + btrfs_drew_lock_init(&root->snapshot_lock); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root)) { @@ -2065,7 +2056,6 @@ void btrfs_put_root(struct btrfs_root *root) WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); - btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG spin_lock(&root->fs_info->fs_roots_radix_lock); @@ -2125,11 +2115,16 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->reloc_cancel_req, 0); } -static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) +static int btrfs_init_btree_inode(struct super_block *sb) { - struct inode *inode = fs_info->btree_inode; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, fs_info->tree_root); + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return -ENOMEM; inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; set_nlink(inode, 1); @@ -2140,6 +2135,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) */ inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &btree_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, @@ -2152,6 +2148,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); __insert_inode_hash(inode, hash); + fs_info->btree_inode = inode; + + return 0; } static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) @@ -2966,7 +2965,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, @@ -3123,23 +3121,34 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) { int ret; const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE); - bool clear_free_space_tree = false; + bool rebuild_free_space_tree = false; if (btrfs_test_opt(fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - clear_free_space_tree = true; + rebuild_free_space_tree = true; } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { btrfs_warn(fs_info, "free space tree is invalid"); - clear_free_space_tree = true; + rebuild_free_space_tree = true; + } + + if (rebuild_free_space_tree) { + btrfs_info(fs_info, "rebuilding free space tree"); + ret = btrfs_rebuild_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to rebuild free space tree: %d", ret); + goto out; + } } - if (clear_free_space_tree) { - btrfs_info(fs_info, "clearing free space tree"); - ret = btrfs_clear_free_space_tree(fs_info); + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, "disabling free space tree"); + ret = btrfs_delete_free_space_tree(fs_info); if (ret) { btrfs_warn(fs_info, - "failed to clear free space tree: %d", ret); + "failed to disable free space tree: %d", ret); goto out; } } @@ -3344,14 +3353,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device struct btrfs_root *tree_root; struct btrfs_root *chunk_root; int ret; - int err = -EINVAL; int level; ret = init_mount_fs_info(fs_info, sb); - if (ret) { - err = ret; + if (ret) goto fail; - } /* These need to be init'ed before we start creating inodes and such. */ tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, @@ -3361,17 +3367,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device GFP_KERNEL); fs_info->chunk_root = chunk_root; if (!tree_root || !chunk_root) { - err = -ENOMEM; + ret = -ENOMEM; goto fail; } - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; + ret = btrfs_init_btree_inode(sb); + if (ret) goto fail; - } - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - btrfs_init_btree_inode(fs_info); invalidate_bdev(fs_devices->latest_dev->bdev); @@ -3380,7 +3382,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); if (IS_ERR(disk_super)) { - err = PTR_ERR(disk_super); + ret = PTR_ERR(disk_super); goto fail_alloc; } @@ -3392,7 +3394,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (!btrfs_supported_super_csum(csum_type)) { btrfs_err(fs_info, "unsupported checksum algorithm: %u", csum_type); - err = -EINVAL; + ret = -EINVAL; btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -3401,7 +3403,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ret = btrfs_init_csum_hash(fs_info, csum_type); if (ret) { - err = ret; btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -3412,7 +3413,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ if (btrfs_check_super_csum(fs_info, disk_super)) { btrfs_err(fs_info, "superblock checksum mismatch"); - err = -EINVAL; + ret = -EINVAL; btrfs_release_disk_super(disk_super); goto fail_alloc; } @@ -3442,12 +3443,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ret = btrfs_validate_mount_super(fs_info); if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); - err = -EINVAL; + ret = -EINVAL; goto fail_alloc; } - if (!btrfs_super_root(disk_super)) + if (!btrfs_super_root(disk_super)) { + btrfs_err(fs_info, "invalid superblock tree root bytenr"); + ret = -EINVAL; goto fail_alloc; + } /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -3474,16 +3478,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->stripesize = stripesize; ret = btrfs_parse_options(fs_info, options, sb->s_flags); - if (ret) { - err = ret; + if (ret) goto fail_alloc; - } ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); - if (ret < 0) { - err = ret; + if (ret < 0) goto fail_alloc; - } if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; @@ -3503,17 +3503,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) + if (!subpage_info) { + ret = -ENOMEM; goto fail_alloc; + } btrfs_init_subpage_info(subpage_info, sectorsize); fs_info->subpage_info = subpage_info; } ret = btrfs_init_workqueues(fs_info); - if (ret) { - err = ret; + if (ret) goto fail_sb_buffer; - } sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); @@ -3559,6 +3559,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_free_extra_devids(fs_devices); if (!fs_devices->latest_dev->bdev) { btrfs_err(fs_info, "failed to read devices"); + ret = -EIO; goto fail_tree_roots; } @@ -3574,8 +3575,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ret = btrfs_get_dev_zone_info_all_devices(fs_info); if (ret) { btrfs_err(fs_info, - "zoned: failed to read device zone info: %d", - ret); + "zoned: failed to read device zone info: %d", ret); goto fail_block_groups; } @@ -3654,19 +3654,24 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writable mount is not allowed due to too many missing devices"); + ret = -EINVAL; goto fail_sysfs; } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, "btrfs-cleaner"); - if (IS_ERR(fs_info->cleaner_kthread)) + if (IS_ERR(fs_info->cleaner_kthread)) { + ret = PTR_ERR(fs_info->cleaner_kthread); goto fail_sysfs; + } fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, "btrfs-transaction"); - if (IS_ERR(fs_info->transaction_kthread)) + if (IS_ERR(fs_info->transaction_kthread)) { + ret = PTR_ERR(fs_info->transaction_kthread); goto fail_cleaner; + } if (!btrfs_test_opt(fs_info, NOSSD) && !fs_info->fs_devices->rotating) { @@ -3684,7 +3689,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->fs_devices->discardable) { btrfs_set_and_info(fs_info, DISCARD_ASYNC, "auto enabling async discard"); - btrfs_clear_opt(fs_info->mount_opt, NODISCARD); } #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY @@ -3711,16 +3715,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device !btrfs_test_opt(fs_info, NOLOGREPLAY)) { btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); - if (ret) { - err = ret; + if (ret) goto fail_qgroup; - } } fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); if (IS_ERR(fs_info->fs_root)) { - err = PTR_ERR(fs_info->fs_root); - btrfs_warn(fs_info, "failed to read fs tree: %d", err); + ret = PTR_ERR(fs_info->fs_root); + btrfs_warn(fs_info, "failed to read fs tree: %d", ret); fs_info->fs_root = NULL; goto fail_qgroup; } @@ -3797,7 +3799,8 @@ fail_alloc: iput(fs_info->btree_inode); fail: btrfs_close_devices(fs_info->fs_devices); - return err; + ASSERT(ret < 0); + return ret; } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); @@ -4094,6 +4097,8 @@ static void write_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; + device->last_flush_error = BLK_STS_OK; + #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* * When a disk has write caching disabled, we skip submission of a bio @@ -4122,25 +4127,24 @@ static void write_dev_flush(struct btrfs_device *device) /* * If the flush bio has been submitted by write_dev_flush, wait for it. + * Return true for any error, and false otherwise. */ -static blk_status_t wait_dev_flush(struct btrfs_device *device) +static bool wait_dev_flush(struct btrfs_device *device) { struct bio *bio = &device->flush_bio; - if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) - return BLK_STS_OK; + if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) + return false; - clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); wait_for_completion_io(&device->flush_wait); - return bio->bi_status; -} + if (bio->bi_status) { + device->last_flush_error = bio->bi_status; + btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); + return true; + } -static int check_barrier_error(struct btrfs_fs_info *fs_info) -{ - if (!btrfs_check_rw_degradable(fs_info, NULL)) - return -EIO; - return 0; + return false; } /* @@ -4152,7 +4156,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info) struct list_head *head; struct btrfs_device *dev; int errors_wait = 0; - blk_status_t ret; lockdep_assert_held(&info->fs_devices->device_list_mutex); /* send down all the barriers */ @@ -4167,7 +4170,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info) continue; write_dev_flush(dev); - dev->last_flush_error = BLK_STS_OK; } /* wait for all the barriers */ @@ -4182,23 +4184,17 @@ static int barrier_all_devices(struct btrfs_fs_info *info) !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; - ret = wait_dev_flush(dev); - if (ret) { - dev->last_flush_error = ret; - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_FLUSH_ERRS); + if (wait_dev_flush(dev)) errors_wait++; - } } - if (errors_wait) { - /* - * At some point we need the status of all disks - * to arrive at the volume status. So error checking - * is being pushed to a separate loop. - */ - return check_barrier_error(info); - } + /* + * Checks last_flush_error of disks in order to determine the device + * state. + */ + if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) + return -EIO; + return 0; } @@ -4404,12 +4400,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) root_objectid = gang[i]->root_key.objectid; err = btrfs_orphan_cleanup(gang[i]); if (err) - break; + goto out; btrfs_put_root(gang[i]); } root_objectid++; } - +out: /* release the uncleaned roots due to error */ for (; i < ret; i++) { if (gang[i]) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 824c657f59e8..5cd289de4e92 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1894,8 +1894,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( } static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *locked_ref, - unsigned long *run_refs) + struct btrfs_delayed_ref_head *locked_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; @@ -1917,7 +1916,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, return -EAGAIN; } - (*run_refs)++; ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); @@ -1981,10 +1979,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *locked_ref = NULL; - ktime_t start = ktime_get(); int ret; unsigned long count = 0; - unsigned long actual_count = 0; delayed_refs = &trans->transaction->delayed_refs; do { @@ -2014,8 +2010,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, - &actual_count); + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); if (ret < 0 && ret != -EAGAIN) { /* * Error, btrfs_run_delayed_refs_for_head already @@ -2046,24 +2041,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, cond_resched(); } while ((nr != -1 && count < nr) || locked_ref); - /* - * We don't want to include ref heads since we can have empty ref heads - * and those will drastically skew our runtime down since we just do - * accounting, no actual extent tree updates. - */ - if (actual_count > 0) { - u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); - u64 avg; - - /* - * We weigh the current average higher than our current runtime - * to avoid large swings in the average. - */ - spin_lock(&delayed_refs->lock); - avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; - fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ - spin_unlock(&delayed_refs->lock); - } return 0; } @@ -5509,11 +5486,11 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, { int level = wc->level; int lookup_info = 1; - int ret; + int ret = 0; while (level >= 0) { ret = walk_down_proc(trans, root, path, wc, lookup_info); - if (ret > 0) + if (ret) break; if (level == 0) @@ -5528,10 +5505,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, path->slots[level]++; continue; } else if (ret < 0) - return ret; + break; level = wc->level; } - return 0; + return (ret == 1) ? 0 : ret; } static noinline int walk_up_tree(struct btrfs_trans_handle *trans, @@ -5708,12 +5685,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { + btrfs_abort_transaction(trans, ret); err = ret; break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); if (ret < 0) { + btrfs_abort_transaction(trans, ret); err = ret; break; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 40300e8e5f99..a1adadd5d25d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -97,11 +97,13 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) * how many bytes are there before stripe/ordered extent boundary. */ struct btrfs_bio_ctrl { - struct bio *bio; + struct btrfs_bio *bbio; int mirror_num; enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; + blk_opf_t opf; btrfs_bio_end_io_t end_io_func; + struct writeback_control *wbc; /* * This is for metadata read, to provide the extra needed verification @@ -117,51 +119,41 @@ struct btrfs_bio_ctrl { * does the unlocking. */ bool extent_locked; - - /* Tell the submit_bio code to use REQ_SYNC */ - bool sync_io; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { - struct bio *bio; - struct bio_vec *bv; - struct inode *inode; - int mirror_num; + struct btrfs_bio *bbio = bio_ctrl->bbio; + int mirror_num = bio_ctrl->mirror_num; - if (!bio_ctrl->bio) + if (!bbio) return; - bio = bio_ctrl->bio; - bv = bio_first_bvec_all(bio); - inode = bv->bv_page->mapping->host; - mirror_num = bio_ctrl->mirror_num; - /* Caller should ensure the bio has at least some range added */ - ASSERT(bio->bi_iter.bi_size); + ASSERT(bbio->bio.bi_iter.bi_size); - if (!is_data_inode(inode)) { - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { + if (!is_data_inode(&bbio->inode->vfs_inode)) { + if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) { /* * For metadata read, we should have the parent_check, * and copy it to bbio for metadata verification. */ ASSERT(bio_ctrl->parent_check); - memcpy(&btrfs_bio(bio)->parent_check, + memcpy(&bbio->parent_check, bio_ctrl->parent_check, sizeof(struct btrfs_tree_parent_check)); } - bio->bi_opf |= REQ_META; + bbio->bio.bi_opf |= REQ_META; } - if (btrfs_op(bio) == BTRFS_MAP_READ && + if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) - btrfs_submit_compressed_read(inode, bio, mirror_num); + btrfs_submit_compressed_read(bbio, mirror_num); else - btrfs_submit_bio(bio, mirror_num); + btrfs_submit_bio(bbio, mirror_num); - /* The bio is owned by the end_io handler now */ - bio_ctrl->bio = NULL; + /* The bbio is owned by the end_io handler now */ + bio_ctrl->bbio = NULL; } /* @@ -169,16 +161,16 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) */ static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) { - struct bio *bio = bio_ctrl->bio; + struct btrfs_bio *bbio = bio_ctrl->bbio; - if (!bio) + if (!bbio) return; if (ret) { ASSERT(ret < 0); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); /* The bio is owned by the end_io handler now */ - bio_ctrl->bio = NULL; + bio_ctrl->bbio = NULL; } else { submit_one_bio(bio_ctrl); } @@ -867,89 +859,52 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) return 0; } -/* - * Attempt to add a page to bio. - * - * @bio_ctrl: record both the bio, and its bio_flags - * @page: page to add to the bio - * @disk_bytenr: offset of the new bio or to check whether we are adding - * a contiguous page to the previous one - * @size: portion of page that we want to write - * @pg_offset: starting offset in the page - * @compress_type: compression type of the current bio to see if we can merge them - * - * Attempt to add a page to bio considering stripe alignment etc. - * - * Return >= 0 for the number of bytes added to the bio. - * Can return 0 if the current bio is already at stripe/zone boundary. - * Return <0 for error. - */ -static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, - u64 disk_bytenr, unsigned int size, - unsigned int pg_offset, - enum btrfs_compression_type compress_type) +static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, + struct page *page, u64 disk_bytenr, + unsigned int pg_offset) { - struct bio *bio = bio_ctrl->bio; - u32 bio_size = bio->bi_iter.bi_size; - u32 real_size; + struct bio *bio = &bio_ctrl->bbio->bio; + struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - bool contig = false; - ASSERT(bio); - /* The limit should be calculated when bio_ctrl->bio is allocated */ - ASSERT(bio_ctrl->len_to_oe_boundary); - if (bio_ctrl->compress_type != compress_type) - return 0; - - - if (bio->bi_iter.bi_size == 0) { - /* We can always add a page into an empty bio. */ - contig = true; - } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) { - struct bio_vec *bvec = bio_last_bvec_all(bio); - - /* - * The contig check requires the following conditions to be met: - * 1) The pages are belonging to the same inode - * This is implied by the call chain. - * - * 2) The range has adjacent logical bytenr - * - * 3) The range has adjacent file offset - * This is required for the usage of btrfs_bio->file_offset. - */ - if (bio_end_sector(bio) == sector && - page_offset(bvec->bv_page) + bvec->bv_offset + - bvec->bv_len == page_offset(page) + pg_offset) - contig = true; - } else { + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* - * For compression, all IO should have its logical bytenr - * set to the starting bytenr of the compressed extent. + * For compression, all IO should have its logical bytenr set + * to the starting bytenr of the compressed extent. */ - contig = bio->bi_iter.bi_sector == sector; + return bio->bi_iter.bi_sector == sector; } - if (!contig) - return 0; - - real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); - /* - * If real_size is 0, never call bio_add_*_page(), as even size is 0, - * bio will still execute its endio function on the page! + * The contig check requires the following conditions to be met: + * + * 1) The pages are belonging to the same inode + * This is implied by the call chain. + * + * 2) The range has adjacent logical bytenr + * + * 3) The range has adjacent file offset + * This is required for the usage of btrfs_bio->file_offset. */ - if (real_size == 0) - return 0; - - return bio_add_page(bio, page, real_size, pg_offset); + return bio_end_sector(bio) == sector && + page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == + page_offset(page) + pg_offset; } -static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, - struct btrfs_inode *inode, u64 file_offset) +static void alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, u64 file_offset) { - struct btrfs_ordered_extent *ordered; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_bio *bbio; + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, + bio_ctrl->end_io_func, NULL); + bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->inode = inode; + bbio->file_offset = file_offset; + bio_ctrl->bbio = bbio; + bio_ctrl->len_to_oe_boundary = U32_MAX; /* * Limit the extent to the ordered boundary for Zone Append. @@ -957,132 +912,89 @@ static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, * them. */ if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && - btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { + btrfs_use_zone_append(bbio)) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, file_offset); if (ordered) { bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, ordered->file_offset + ordered->disk_num_bytes - file_offset); btrfs_put_ordered_extent(ordered); - return; } } - bio_ctrl->len_to_oe_boundary = U32_MAX; -} - -static void alloc_new_bio(struct btrfs_inode *inode, - struct btrfs_bio_ctrl *bio_ctrl, - struct writeback_control *wbc, blk_opf_t opf, - u64 disk_bytenr, u32 offset, u64 file_offset, - enum btrfs_compression_type compress_type) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio *bio; - - bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, - NULL); - /* - * For compressed page range, its disk_bytenr is always @disk_bytenr - * passed in, no matter if we have added any range into previous bio. - */ - if (compress_type != BTRFS_COMPRESS_NONE) - bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - else - bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; - btrfs_bio(bio)->file_offset = file_offset; - bio_ctrl->bio = bio; - bio_ctrl->compress_type = compress_type; - calc_bio_boundaries(bio_ctrl, inode, file_offset); - - if (wbc) { + if (bio_ctrl->wbc) { /* * Pick the last added device to support cgroup writeback. For * multi-device file systems this means blk-cgroup policies have * to always be set on the last added/replaced device. * This is a bit odd but has been like that for a long time. */ - bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); - wbc_init_bio(wbc, bio); + bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); + wbc_init_bio(bio_ctrl->wbc, &bbio->bio); } } /* - * @opf: bio REQ_OP_* and REQ_* flags as one value - * @wbc: optional writeback control for io accounting * @disk_bytenr: logical bytenr where the write will be * @page: page to add to the bio * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @compress_type: compress type for current bio * - * The will either add the page into the existing @bio_ctrl->bio, or allocate a - * new one in @bio_ctrl->bio. + * The will either add the page into the existing @bio_ctrl->bbio, or allocate a + * new one in @bio_ctrl->bbio. * The mirror number for this IO should already be initizlied in * @bio_ctrl->mirror_num. */ -static int submit_extent_page(blk_opf_t opf, - struct writeback_control *wbc, - struct btrfs_bio_ctrl *bio_ctrl, - u64 disk_bytenr, struct page *page, - size_t size, unsigned long pg_offset, - enum btrfs_compression_type compress_type, - bool force_bio_submit) +static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, struct page *page, + size_t size, unsigned long pg_offset) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - unsigned int cur = pg_offset; - - ASSERT(bio_ctrl); - - ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && - pg_offset + size <= PAGE_SIZE); + ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); - if (force_bio_submit) + if (bio_ctrl->bbio && + !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) submit_one_bio(bio_ctrl); - while (cur < pg_offset + size) { - u32 offset = cur - pg_offset; - int added; + do { + u32 len = size; /* Allocate new bio if needed */ - if (!bio_ctrl->bio) { - alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, - offset, page_offset(page) + cur, - compress_type); + if (!bio_ctrl->bbio) { + alloc_new_bio(inode, bio_ctrl, disk_bytenr, + page_offset(page) + pg_offset); } - /* - * We must go through btrfs_bio_add_page() to ensure each - * page range won't cross various boundaries. - */ - if (compress_type != BTRFS_COMPRESS_NONE) - added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, - size - offset, pg_offset + offset, - compress_type); - else - added = btrfs_bio_add_page(bio_ctrl, page, - disk_bytenr + offset, size - offset, - pg_offset + offset, compress_type); - - /* Metadata page range should never be split */ - if (!is_data_inode(&inode->vfs_inode)) - ASSERT(added == 0 || added == size - offset); - - /* At least we added some page, update the account */ - if (wbc && added) - wbc_account_cgroup_owner(wbc, page, added); - - /* We have reached boundary, submit right now */ - if (added < size - offset) { - /* The bio should contain some page(s) */ - ASSERT(bio_ctrl->bio->bi_iter.bi_size); + + /* Cap to the current ordered extent boundary if there is one. */ + if (len > bio_ctrl->len_to_oe_boundary) { + ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE); + ASSERT(is_data_inode(&inode->vfs_inode)); + len = bio_ctrl->len_to_oe_boundary; + } + + if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { + /* bio full: move on to a new one */ submit_one_bio(bio_ctrl); + continue; } - cur += added; - } - return 0; + + if (bio_ctrl->wbc) + wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); + + size -= len; + pg_offset += len; + disk_bytenr += len; + bio_ctrl->len_to_oe_boundary -= len; + + /* Ordered extent boundary: move on to a new bio. */ + if (bio_ctrl->len_to_oe_boundary == 0) + submit_one_bio(bio_ctrl); + } while (size); } static int attach_extent_buffer_page(struct extent_buffer *eb, @@ -1193,8 +1105,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * return 0 on success, otherwise return error */ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - blk_opf_t read_flags, u64 *prev_em_start) + struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1216,7 +1127,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, unlock_extent(tree, start, end, NULL); btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); unlock_page(page); - goto out; + return ret; } if (page->index == last_byte >> PAGE_SHIFT) { @@ -1230,7 +1141,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, bio_ctrl->end_io_func = end_bio_extent_readpage; begin_page_read(fs_info, page); while (cur <= end) { - unsigned long this_bio_flag = 0; + enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; bool force_bio_submit = false; u64 disk_bytenr; @@ -1247,19 +1158,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); end_page_read(page, false, cur, end + 1 - cur); - ret = PTR_ERR(em); - break; + return PTR_ERR(em); } extent_offset = cur - em->start; BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - this_bio_flag = em->compress_type; + compress_type = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag != BTRFS_COMPRESS_NONE) + if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; else disk_bytenr = em->block_start + extent_offset; @@ -1331,24 +1241,20 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, continue; } - ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - bio_ctrl, disk_bytenr, page, iosize, - pg_offset, this_bio_flag, - force_bio_submit); - if (ret) { - /* - * We have to unlock the remaining range, or the page - * will never be unlocked. - */ - unlock_extent(tree, cur, end, NULL); - end_page_read(page, false, cur, end + 1 - cur); - goto out; + if (bio_ctrl->compress_type != compress_type) { + submit_one_bio(bio_ctrl); + bio_ctrl->compress_type = compress_type; } + + if (force_bio_submit) + submit_one_bio(bio_ctrl); + submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, + pg_offset); cur = cur + iosize; pg_offset += iosize; } -out: - return ret; + + return 0; } int btrfs_read_folio(struct file *file, struct folio *folio) @@ -1357,12 +1263,12 @@ int btrfs_read_folio(struct file *file, struct folio *folio) struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL); /* * If btrfs_do_readpage() failed we will want to submit the assembled * bio to do the cleanup. @@ -1384,7 +1290,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, for (index = 0; index < nr_pages; index++) { btrfs_do_readpage(pages[index], em_cached, bio_ctrl, - REQ_RAHEAD, prev_em_start); + prev_em_start); put_page(pages[index]); } } @@ -1520,7 +1426,6 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, */ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct page *page, - struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl, loff_t i_size, int *nr_ret) @@ -1531,18 +1436,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u64 extent_offset; u64 block_start; struct extent_map *em; - int saved_ret = 0; int ret = 0; int nr = 0; - enum req_op op = REQ_OP_WRITE; - const blk_opf_t write_flags = wbc_to_write_flags(wbc); - bool has_error = false; bool compressed; ret = btrfs_writepage_cow_fixup(page); if (ret) { /* Fixup worker will requeue */ - redirty_page_for_writepage(wbc, page); + redirty_page_for_writepage(bio_ctrl->wbc, page); unlock_page(page); return 1; } @@ -1551,7 +1452,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - wbc->nr_to_write--; + bio_ctrl->wbc->nr_to_write--; bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { @@ -1587,10 +1488,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (IS_ERR(em)) { btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); - has_error = true; - if (!saved_ret) - saved_ret = ret; - break; + goto out_error; } extent_offset = cur - em->start; @@ -1642,33 +1540,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ btrfs_page_clear_dirty(fs_info, page, cur, iosize); - ret = submit_extent_page(op | write_flags, wbc, - bio_ctrl, disk_bytenr, - page, iosize, - cur - page_offset(page), - 0, false); - if (ret) { - has_error = true; - if (!saved_ret) - saved_ret = ret; - - btrfs_page_set_error(fs_info, page, cur, iosize); - if (PageWriteback(page)) - btrfs_page_clear_writeback(fs_info, page, cur, - iosize); - } - + submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, + cur - page_offset(page)); cur += iosize; nr++; } + + btrfs_page_assert_not_dirty(fs_info, page); + *nr_ret = nr; + return 0; + +out_error: /* * If we finish without problem, we should not only clear page dirty, * but also empty subpage dirty bits */ - if (!has_error) - btrfs_page_assert_not_dirty(fs_info, page); - else - ret = saved_ret; *nr_ret = nr; return ret; } @@ -1682,8 +1568,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * Return 0 if everything goes well. * Return <0 for error. */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - struct btrfs_bio_ctrl *bio_ctrl) +static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; @@ -1696,7 +1581,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace___extent_writepage(page, inode, wbc); + trace___extent_writepage(page, inode, bio_ctrl->wbc); WARN_ON(!PageLocked(page)); @@ -1721,15 +1606,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (!bio_ctrl->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, wbc); + ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); if (ret == 1) return 0; if (ret) goto done; } - ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size, - &nr); + ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr); if (ret == 1) return 0; @@ -1773,6 +1657,8 @@ done: if (PageError(page)) end_extent_writepage(page, ret, page_start, page_end); if (bio_ctrl->extent_locked) { + struct writeback_control *wbc = bio_ctrl->wbc; + /* * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), * the page can either be locked by lock_page() or @@ -1828,7 +1714,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); - if (!bio_ctrl->sync_io) + if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL) return 0; if (!flush) { submit_write_bio(bio_ctrl, 0); @@ -2113,15 +1999,12 @@ static void prepare_eb_write(struct extent_buffer *eb) * Unlike the work in write_one_eb(), we rely completely on extent locking. * Page locking is only utilized at minimum to keep the VMM code happy. */ -static int write_one_subpage_eb(struct extent_buffer *eb, - struct writeback_control *wbc, - struct btrfs_bio_ctrl *bio_ctrl) +static void write_one_subpage_eb(struct extent_buffer *eb, + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; - blk_opf_t write_flags = wbc_to_write_flags(wbc); bool no_dirty_ebs = false; - int ret; prepare_eb_write(eb); @@ -2137,36 +2020,22 @@ static int write_one_subpage_eb(struct extent_buffer *eb, bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - bio_ctrl, eb->start, page, eb->len, - eb->start - page_offset(page), 0, false); - if (ret) { - btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); - set_btree_ioerr(page, eb); - unlock_page(page); - - if (atomic_dec_and_test(&eb->io_pages)) - end_extent_buffer_writeback(eb); - return -EIO; - } + submit_extent_page(bio_ctrl, eb->start, page, eb->len, + eb->start - page_offset(page)); unlock_page(page); /* * Submission finished without problem, if no range of the page is * dirty anymore, we have submitted a page. Update nr_written in wbc. */ if (no_dirty_ebs) - wbc->nr_to_write--; - return ret; + bio_ctrl->wbc->nr_to_write--; } -static noinline_for_stack int write_one_eb(struct extent_buffer *eb, - struct writeback_control *wbc, +static noinline_for_stack void write_one_eb(struct extent_buffer *eb, struct btrfs_bio_ctrl *bio_ctrl) { u64 disk_bytenr = eb->start; int i, num_pages; - blk_opf_t write_flags = wbc_to_write_flags(wbc); - int ret = 0; prepare_eb_write(eb); @@ -2178,32 +2047,11 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - bio_ctrl, disk_bytenr, p, - PAGE_SIZE, 0, 0, false); - if (ret) { - set_btree_ioerr(p, eb); - if (PageWriteback(p)) - end_page_writeback(p); - if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) - end_extent_buffer_writeback(eb); - ret = -EIO; - break; - } + submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); disk_bytenr += PAGE_SIZE; - wbc->nr_to_write--; + bio_ctrl->wbc->nr_to_write--; unlock_page(p); } - - if (unlikely(ret)) { - for (; i < num_pages; i++) { - struct page *p = eb->pages[i]; - clear_page_dirty_for_io(p); - unlock_page(p); - } - } - - return ret; } /* @@ -2220,9 +2068,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. */ -static int submit_eb_subpage(struct page *page, - struct writeback_control *wbc, - struct btrfs_bio_ctrl *bio_ctrl) +static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); int submitted = 0; @@ -2284,10 +2130,8 @@ static int submit_eb_subpage(struct page *page, free_extent_buffer(eb); goto cleanup; } - ret = write_one_subpage_eb(eb, wbc, bio_ctrl); + write_one_subpage_eb(eb, bio_ctrl); free_extent_buffer(eb); - if (ret < 0) - goto cleanup; submitted++; } return submitted; @@ -2318,8 +2162,7 @@ cleanup: * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct btrfs_bio_ctrl *bio_ctrl, +static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; @@ -2331,7 +2174,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc, bio_ctrl); + return submit_eb_subpage(page, bio_ctrl); spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { @@ -2364,7 +2207,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, * If for_sync, this hole will be filled with * trasnsaction commit. */ - if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL && + !bio_ctrl->wbc->for_sync) ret = -EAGAIN; else ret = 0; @@ -2389,10 +2233,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } - ret = write_one_eb(eb, wbc, bio_ctrl); + write_one_eb(eb, bio_ctrl); free_extent_buffer(eb); - if (ret < 0) - return ret; return 1; } @@ -2401,8 +2243,9 @@ int btree_write_cache_pages(struct address_space *mapping, { struct extent_buffer *eb_context = NULL; struct btrfs_bio_ctrl bio_ctrl = { + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), .extent_locked = 0, - .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; @@ -2445,8 +2288,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, wbc, &bio_ctrl, - &eb_context); + ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -2529,9 +2371,9 @@ retry: * existing IO to complete. */ static int extent_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl) { + struct writeback_control *wbc = bio_ctrl->wbc; struct inode *inode = mapping->host; int ret = 0; int done = 0; @@ -2632,7 +2474,7 @@ retry: continue; } - ret = __extent_writepage(&folio->page, wbc, bio_ctrl); + ret = __extent_writepage(&folio->page, bio_ctrl); if (ret < 0) { done = 1; break; @@ -2688,18 +2530,19 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) u64 cur = start; unsigned long nr_pages; const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; - struct btrfs_bio_ctrl bio_ctrl = { - .extent_locked = 1, - .sync_io = 1, - }; struct writeback_control wbc_writepages = { .sync_mode = WB_SYNC_ALL, .range_start = start, .range_end = end + 1, - /* We're called from an async helper function */ - .punt_to_cgroup = 1, .no_cgroup_owner = 1, }; + struct btrfs_bio_ctrl bio_ctrl = { + .wbc = &wbc_writepages, + /* We're called from an async helper function */ + .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT | + wbc_to_write_flags(&wbc_writepages), + .extent_locked = 1, + }; ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> @@ -2719,7 +2562,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) ASSERT(PageLocked(page)); ASSERT(PageDirty(page)); clear_page_dirty_for_io(page); - ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl); + ret = __extent_writepage(page, &bio_ctrl); ASSERT(ret <= 0); if (ret < 0) { found_error = true; @@ -2743,8 +2586,9 @@ int extent_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int ret = 0; struct btrfs_bio_ctrl bio_ctrl = { + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), .extent_locked = 0, - .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; /* @@ -2752,7 +2596,7 @@ int extent_writepages(struct address_space *mapping, * protect the write pointer updates. */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); - ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl); + ret = extent_write_cache_pages(mapping, &bio_ctrl); submit_write_bio(&bio_ctrl, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; @@ -2760,7 +2604,7 @@ int extent_writepages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac) { - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; struct page *pagepool[16]; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; @@ -4407,10 +4251,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct page *page = eb->pages[0]; struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ, .mirror_num = mirror_num, .parent_check = check, }; - int ret = 0; + int ret; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); ASSERT(PagePrivate(page)); @@ -4428,14 +4273,13 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, return ret; } - ret = 0; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); - return ret; + return 0; } clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); @@ -4447,28 +4291,19 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, - eb->start, page, eb->len, - eb->start - page_offset(page), 0, true); - if (ret) { - /* - * In the endio function, if we hit something wrong we will - * increase the io_pages, so here we need to decrease it for - * error path. - */ - atomic_dec(&eb->io_pages); - } + submit_extent_page(&bio_ctrl, eb->start, page, eb->len, + eb->start - page_offset(page)); submit_one_bio(&bio_ctrl); - if (ret || wait != WAIT_COMPLETE) { + if (wait != WAIT_COMPLETE) { free_extent_state(cached_state); - return ret; + return 0; } wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED, &cached_state); if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - ret = -EIO; - return ret; + return -EIO; + return 0; } int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, @@ -4476,13 +4311,12 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, { int i; struct page *page; - int err; - int ret = 0; int locked_pages = 0; int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; struct btrfs_bio_ctrl bio_ctrl = { + .opf = REQ_OP_READ, .mirror_num = mirror_num, .parent_check = check, }; @@ -4550,27 +4384,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, page = eb->pages[i]; if (!PageUptodate(page)) { - if (ret) { - atomic_dec(&eb->io_pages); - unlock_page(page); - continue; - } - ClearPageError(page); - err = submit_extent_page(REQ_OP_READ, NULL, - &bio_ctrl, page_offset(page), page, - PAGE_SIZE, 0, 0, false); - if (err) { - /* - * We failed to submit the bio so it's the - * caller's responsibility to perform cleanup - * i.e unlock page/set error bit. - */ - ret = err; - SetPageError(page); - unlock_page(page); - atomic_dec(&eb->io_pages); - } + submit_extent_page(&bio_ctrl, page_offset(page), page, + PAGE_SIZE, 0); } else { unlock_page(page); } @@ -4578,17 +4394,17 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, submit_one_bio(&bio_ctrl); - if (ret || wait != WAIT_COMPLETE) - return ret; + if (wait != WAIT_COMPLETE) + return 0; for (i = 0; i < num_pages; i++) { page = eb->pages[i]; wait_on_page_locked(page); if (!PageUptodate(page)) - ret = -EIO; + return -EIO; } - return ret; + return 0; unlock_exit: while (locked_pages > 0) { @@ -4596,7 +4412,7 @@ unlock_exit: page = eb->pages[locked_pages]; unlock_page(page); } - return ret; + return 0; } static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 41c77a100853..cd4cce9ba443 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -52,13 +52,13 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz u64 start, end, i_size; int ret; + spin_lock(&inode->lock); i_size = new_i_size ?: i_size_read(&inode->vfs_inode); if (btrfs_fs_incompat(fs_info, NO_HOLES)) { inode->disk_i_size = i_size; - return; + goto out_unlock; } - spin_lock(&inode->lock); ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start, &end, EXTENT_DIRTY); if (!ret && start == 0) @@ -66,6 +66,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz else i_size = 0; inode->disk_i_size = i_size; +out_unlock: spin_unlock(&inode->lock); } @@ -336,48 +337,6 @@ out: } /* - * Locate the file_offset of @cur_disk_bytenr of a @bio. - * - * Bio of btrfs represents read range of - * [bi_sector << 9, bi_sector << 9 + bi_size). - * Knowing this, we can iterate through each bvec to locate the page belong to - * @cur_disk_bytenr and get the file offset. - * - * @inode is used to determine if the bvec page really belongs to @inode. - * - * Return 0 if we can't find the file offset - * Return >0 if we find the file offset and restore it to @file_offset_ret - */ -static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, - u64 disk_bytenr, u64 *file_offset_ret) -{ - struct bvec_iter iter; - struct bio_vec bvec; - u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT; - int ret = 0; - - bio_for_each_segment(bvec, bio, iter) { - struct page *page = bvec.bv_page; - - if (cur > disk_bytenr) - break; - if (cur + bvec.bv_len <= disk_bytenr) { - cur += bvec.bv_len; - continue; - } - ASSERT(in_range(disk_bytenr, cur, bvec.bv_len)); - if (page->mapping && page->mapping->host && - page->mapping->host == inode) { - ret = 1; - *file_offset_ret = page_offset(page) + bvec.bv_offset + - disk_bytenr - cur; - break; - } - } - return ret; -} - -/* * Lookup the checksum for the read bio in csum tree. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. @@ -386,17 +345,15 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_io_tree *io_tree = &inode->io_tree; struct bio *bio = &bbio->bio; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; - u64 cur_disk_bytenr; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; - int count = 0; blk_status_t ret = BLK_STS_OK; + u32 bio_offset = 0; if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) @@ -447,28 +404,14 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) path->skip_locking = 1; } - for (cur_disk_bytenr = orig_disk_bytenr; - cur_disk_bytenr < orig_disk_bytenr + orig_len; - cur_disk_bytenr += (count * sectorsize)) { - u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr; - unsigned int sector_offset; - u8 *csum_dst; - - /* - * Although both cur_disk_bytenr and orig_disk_bytenr is u64, - * we're calculating the offset to the bio start. - * - * Bio size is limited to UINT_MAX, thus unsigned int is large - * enough to contain the raw result, not to mention the right - * shifted result. - */ - ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); - sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> - fs_info->sectorsize_bits; - csum_dst = bbio->csum + sector_offset * csum_size; + while (bio_offset < orig_len) { + int count; + u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; + u8 *csum_dst = bbio->csum + + (bio_offset >> fs_info->sectorsize_bits) * csum_size; count = search_csum_tree(fs_info, path, cur_disk_bytenr, - search_len, csum_dst); + orig_len - bio_offset, csum_dst); if (count < 0) { ret = errno_to_blk_status(count); if (bbio->csum != bbio->csum_inline) @@ -493,14 +436,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (inode->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { - u64 file_offset; - int ret; - - ret = search_file_offset_in_bio(bio, - &inode->vfs_inode, - cur_disk_bytenr, &file_offset); - if (ret) - set_extent_bits(io_tree, file_offset, + u64 file_offset = bbio->file_offset + bio_offset; + + set_extent_bits(&inode->io_tree, file_offset, file_offset + sectorsize - 1, EXTENT_NODATASUM); } else { @@ -509,6 +447,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) cur_disk_bytenr, cur_disk_bytenr + sectorsize); } } + bio_offset += count * sectorsize; } btrfs_free_path(path); @@ -659,7 +598,8 @@ fail: * in is large enough to contain all csums. */ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap) + u8 *csum_buf, unsigned long *csum_bitmap, + bool search_commit) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -676,6 +616,12 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, if (!path) return -ENOMEM; + if (search_commit) { + path->skip_locking = 1; + path->reada = READA_FORWARD; + path->search_commit_root = 1; + } + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = start; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index cd7f2ae515c0..6be8725cd574 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -57,7 +57,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap); + u8 *csum_buf, unsigned long *csum_bitmap, + bool search_commit); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5cc5a1faaef5..f649647392e0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3730,10 +3730,15 @@ static int check_direct_read(struct btrfs_fs_info *fs_info, if (!iter_is_iovec(iter)) return 0; - for (seg = 0; seg < iter->nr_segs; seg++) - for (i = seg + 1; i < iter->nr_segs; i++) - if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + const struct iovec *iov1 = iter_iov(iter) + seg; + const struct iovec *iov2 = iter_iov(iter) + i; + + if (iov1->iov_base == iov2->iov_base) return -EINVAL; + } + } return 0; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d84cef89cdff..cf98a3c05480 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -870,15 +870,16 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - recalculate_thresholds(ctl); - spin_unlock(&ctl->tree_lock); if (ret) { + spin_unlock(&ctl->tree_lock); btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } + ctl->total_bitmaps++; + recalculate_thresholds(ctl); + spin_unlock(&ctl->tree_lock); list_add_tail(&e->list, &bitmaps); } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 4d155a48ec59..b21da1446f2a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1252,7 +1252,7 @@ out: return ret; } -int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) +int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) { struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; @@ -1298,6 +1298,54 @@ abort: return ret; } +int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key = { + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, + .type = BTRFS_ROOT_ITEM_KEY, + .offset = 0, + }; + struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key); + struct rb_node *node; + int ret; + + trans = btrfs_start_transaction(free_space_root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + + ret = clear_free_space_tree(trans, free_space_root); + if (ret) + goto abort; + + node = rb_first_cached(&fs_info->block_group_cache_tree); + while (node) { + struct btrfs_block_group *block_group; + + block_group = rb_entry(node, struct btrfs_block_group, + cache_node); + ret = populate_free_space_tree(trans, block_group); + if (ret) + goto abort; + node = rb_next(node); + } + + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); + clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + + ret = btrfs_commit_transaction(trans); + clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + return ret; +abort: + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; +} + static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index dc2463e4cfe3..6d5551d0ced8 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -18,7 +18,8 @@ struct btrfs_caching_control; void set_free_space_tree_thresholds(struct btrfs_block_group *block_group); int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); -int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info); int load_free_space_tree(struct btrfs_caching_control *caching_ctl); int add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 24cd49229408..0d98fc5f6f44 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -25,6 +25,18 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); /* + * Number of metadata items necessary for an unlink operation: + * + * 1 for the possible orphan item + * 1 for the dir item + * 1 for the dir index + * 1 for the inode ref + * 1 for the inode + * 1 for the parent inode + */ +#define BTRFS_UNLINK_METADATA_UNITS 6 + +/* * The reserved space at the beginning of each device. It covers the primary * super block and leaves space for potential use by other tools like * bootloaders or to lower potential damage of accidental overwrite. @@ -193,11 +205,7 @@ enum { #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL -#ifdef CONFIG_BTRFS_DEBUG -/* - * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG - */ -#define BTRFS_FEATURE_INCOMPAT_SUPP \ +#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ @@ -210,23 +218,22 @@ enum { BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_ZONED) + +#ifdef CONFIG_BTRFS_DEBUG + /* + * Features under developmen like Extent tree v2 support is enabled + * only under CONFIG_BTRFS_DEBUG. + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) + #else -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) + +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE) + #endif #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ @@ -412,7 +419,6 @@ struct btrfs_fs_info { * Must be written and read while holding btrfs_fs_info::commit_root_sem. */ u64 last_reloc_trans; - u64 avg_delayed_ref_runtime; /* * This is updated to the current trans every time a full commit is @@ -638,7 +644,6 @@ struct btrfs_fs_info { refcount_t scrub_workers_refcnt; struct workqueue_struct *scrub_workers; struct workqueue_struct *scrub_wr_completion_workers; - struct workqueue_struct *scrub_parity_workers; struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -828,7 +833,7 @@ static inline u64 btrfs_csum_bytes_to_leaves( * Use this if we would be adding new items, as we could split nodes as we cow * down the tree. */ -static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, +static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; @@ -838,7 +843,7 @@ static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, * Doing a truncate or a modification won't result in new nodes or leaves, just * what we need for COW. */ -static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, +static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b65c45b5d681..4c322b720a80 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -527,7 +527,7 @@ search_again: while (1) { u64 clear_start = 0, clear_len = 0, extent_start = 0; - bool should_throttle = false; + bool refill_delayed_refs_rsv = false; fi = NULL; leaf = path->nodes[0]; @@ -660,8 +660,7 @@ delete: /* No pending yet, add ourselves */ pending_del_slot = path->slots[0]; pending_del_nr = 1; - } else if (pending_del_nr && - path->slots[0] + 1 == pending_del_slot) { + } else if (path->slots[0] + 1 == pending_del_slot) { /* Hop on the pending chunk */ pending_del_nr++; pending_del_slot = path->slots[0]; @@ -686,10 +685,8 @@ delete: btrfs_abort_transaction(trans, ret); break; } - if (be_nice) { - if (btrfs_should_throttle_delayed_refs(trans)) - should_throttle = true; - } + if (be_nice && btrfs_check_space_for_delayed_refs(fs_info)) + refill_delayed_refs_rsv = true; } if (found_type == BTRFS_INODE_ITEM_KEY) @@ -697,7 +694,7 @@ delete: if (path->slots[0] == 0 || path->slots[0] != pending_del_slot || - should_throttle) { + refill_delayed_refs_rsv) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -720,7 +717,7 @@ delete: * actually allocate, so just bail if we're short and * let the normal reservation dance happen higher up. */ - if (should_throttle) { + if (refill_delayed_refs_rsv) { ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_NO_FLUSH); if (ret) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 957e4d76a7b6..19c707bc8801 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -79,6 +79,7 @@ struct btrfs_iget_args { struct btrfs_dio_data { ssize_t submitted; struct extent_changeset *data_reserved; + struct btrfs_ordered_extent *ordered; bool data_space_reserved; bool nocow_done; }; @@ -669,8 +670,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, - BTRFS_MAX_COMPRESSED / PAGE_SIZE); + nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through @@ -945,10 +945,9 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, ret = cow_file_range(inode, locked_page, start, end, &page_started, &nr_written, 0, NULL); /* Inline extent inserted, page gets unlocked and everything is done */ - if (page_started) { - ret = 0; - goto out; - } + if (page_started) + return 0; + if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); if (locked_page) { @@ -962,14 +961,11 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, end_extent_writepage(locked_page, ret, page_start, page_end); unlock_page(locked_page); } - goto out; + return ret; } - ret = extent_write_locked_range(&inode->vfs_inode, start, end); /* All pages will be unlocked, including @locked_page */ -out: - kfree(async_extent); - return ret; + return extent_write_locked_range(&inode->vfs_inode, start, end); } static int submit_one_async_extent(struct btrfs_inode *inode, @@ -987,6 +983,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode, u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; + if (async_chunk->blkcg_css) + kthread_associate_blkcg(async_chunk->blkcg_css); + /* * If async_chunk->locked_page is in the async_extent range, we need to * handle it. @@ -1001,8 +1000,10 @@ static int submit_one_async_extent(struct btrfs_inode *inode, lock_extent(io_tree, start, end, NULL); /* We have fall back to uncompressed write */ - if (!async_extent->pages) - return submit_uncompressed_range(inode, async_extent, locked_page); + if (!async_extent->pages) { + ret = submit_uncompressed_range(inode, async_extent, locked_page); + goto done; + } ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, @@ -1054,24 +1055,18 @@ static int submit_one_async_extent(struct btrfs_inode *inode, extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); - if (btrfs_submit_compressed_write(inode, start, /* file_offset */ + + btrfs_submit_compressed_write(inode, start, /* file_offset */ async_extent->ram_size, /* num_bytes */ ins.objectid, /* disk_bytenr */ ins.offset, /* compressed_len */ async_extent->pages, /* compressed_pages */ async_extent->nr_pages, - async_chunk->write_flags, - async_chunk->blkcg_css, true)) { - const u64 start = async_extent->start; - const u64 end = start + async_extent->ram_size - 1; - - btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0); - - extent_clear_unlock_delalloc(inode, start, end, NULL, 0, - PAGE_END_WRITEBACK | PAGE_SET_ERROR); - free_async_extent_pages(async_extent); - } + async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; +done: + if (async_chunk->blkcg_css) + kthread_associate_blkcg(NULL); kfree(async_extent); return ret; @@ -1086,8 +1081,7 @@ out_free: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); - kfree(async_extent); - return ret; + goto done; } /* @@ -1622,6 +1616,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, if (blkcg_css != blkcg_root_css) { css_get(blkcg_css); async_chunk[i].blkcg_css = blkcg_css; + async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT; } else { async_chunk[i].blkcg_css = NULL; } @@ -2521,37 +2516,31 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } /* - * Split an extent_map at [start, start + len] + * Split off the first pre bytes from the extent_map at [start, start + len] * * This function is intended to be used only for extract_ordered_extent(). */ -static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, - u64 pre, u64 post) +static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; struct extent_map *split_pre = NULL; struct extent_map *split_mid = NULL; - struct extent_map *split_post = NULL; int ret = 0; unsigned long flags; - /* Sanity check */ - if (pre == 0 && post == 0) - return 0; + ASSERT(pre != 0); + ASSERT(pre < len); split_pre = alloc_extent_map(); - if (pre) - split_mid = alloc_extent_map(); - if (post) - split_post = alloc_extent_map(); - if (!split_pre || (pre && !split_mid) || (post && !split_post)) { + if (!split_pre) + return -ENOMEM; + split_mid = alloc_extent_map(); + if (!split_mid) { ret = -ENOMEM; - goto out; + goto out_free_pre; } - ASSERT(pre + post < len); - lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -2572,7 +2561,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; - split_pre->len = (pre ? pre : em->len - post); + split_pre->len = pre; split_pre->orig_start = split_pre->start; split_pre->block_start = em->block_start; split_pre->block_len = split_pre->len; @@ -2586,38 +2575,21 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, /* * Now we only have an extent_map at: - * [em->start, em->start + pre] if pre != 0 - * [em->start, em->start + em->len - post] if pre == 0 - */ - - if (pre) { - /* Insert the middle extent_map */ - split_mid->start = em->start + pre; - split_mid->len = em->len - pre - post; - split_mid->orig_start = split_mid->start; - split_mid->block_start = em->block_start + pre; - split_mid->block_len = split_mid->len; - split_mid->orig_block_len = split_mid->block_len; - split_mid->ram_bytes = split_mid->len; - split_mid->flags = flags; - split_mid->compress_type = em->compress_type; - split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, 1); - } - - if (post) { - split_post->start = em->start + em->len - post; - split_post->len = post; - split_post->orig_start = split_post->start; - split_post->block_start = em->block_start + em->len - post; - split_post->block_len = split_post->len; - split_post->orig_block_len = split_post->block_len; - split_post->ram_bytes = split_post->len; - split_post->flags = flags; - split_post->compress_type = em->compress_type; - split_post->generation = em->generation; - add_extent_mapping(em_tree, split_post, 1); - } + * [em->start, em->start + pre] + */ + + /* Insert the middle extent_map. */ + split_mid->start = em->start + pre; + split_mid->len = em->len - pre; + split_mid->orig_start = split_mid->start; + split_mid->block_start = em->block_start + pre; + split_mid->block_len = split_mid->len; + split_mid->orig_block_len = split_mid->block_len; + split_mid->ram_bytes = split_mid->len; + split_mid->flags = flags; + split_mid->compress_type = em->compress_type; + split_mid->generation = em->generation; + add_extent_mapping(em_tree, split_mid, 1); /* Once for us */ free_extent_map(em); @@ -2627,72 +2599,41 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, out_unlock: write_unlock(&em_tree->lock); unlock_extent(&inode->io_tree, start, start + len - 1, NULL); -out: - free_extent_map(split_pre); free_extent_map(split_mid); - free_extent_map(split_post); - +out_free_pre: + free_extent_map(split_pre); return ret; } -blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) +int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, + struct btrfs_ordered_extent *ordered) { u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 len = bbio->bio.bi_iter.bi_size; struct btrfs_inode *inode = bbio->inode; - struct btrfs_ordered_extent *ordered; - u64 file_len; - u64 end = start + len; - u64 ordered_end; - u64 pre, post; + u64 ordered_len = ordered->num_bytes; int ret = 0; - ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); - if (WARN_ON_ONCE(!ordered)) - return BLK_STS_IOERR; + /* Must always be called for the beginning of an ordered extent. */ + if (WARN_ON_ONCE(start != ordered->disk_bytenr)) + return -EINVAL; - /* No need to split */ + /* No need to split if the ordered extent covers the entire bio. */ if (ordered->disk_num_bytes == len) - goto out; - - /* We cannot split once end_bio'd ordered extent */ - if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { - ret = -EINVAL; - goto out; - } - - /* We cannot split a compressed ordered extent */ - if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) { - ret = -EINVAL; - goto out; - } - - ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; - /* bio must be in one ordered extent */ - if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { - ret = -EINVAL; - goto out; - } - - /* Checksum list should be empty */ - if (WARN_ON_ONCE(!list_empty(&ordered->list))) { - ret = -EINVAL; - goto out; - } - - file_len = ordered->num_bytes; - pre = start - ordered->disk_bytenr; - post = ordered_end - end; + return 0; - ret = btrfs_split_ordered_extent(ordered, pre, post); + ret = btrfs_split_ordered_extent(ordered, len); if (ret) - goto out; - ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); + return ret; -out: - btrfs_put_ordered_extent(ordered); + /* + * Don't split the extent_map for NOCOW extents, as we're writing into + * a pre-existing one. + */ + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + return 0; - return errno_to_blk_status(ret); + return split_extent_map(inode, bbio->file_offset, ordered_len, len); } /* @@ -3167,6 +3108,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_rewrite_logical_zoned(ordered_extent); btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); + } else if (btrfs_is_data_reloc_root(inode->root)) { + btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { @@ -3367,13 +3311,6 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, return 0; } -static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset) -{ - u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; - - return csums + offset_in_sectors * fs_info->csum_size; -} - /* * Verify the checksum of a single data sector. * @@ -3411,7 +3348,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, return true; } - csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); + csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * + fs_info->csum_size; if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, csum_expected)) goto zeroit; @@ -3691,6 +3629,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + iput(inode); goto out; } btrfs_debug(fs_info, "auto deleting %Lu", @@ -3698,8 +3637,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = btrfs_del_orphan_item(trans, root, found_key.objectid); btrfs_end_transaction(trans); - if (ret) + if (ret) { + iput(inode); goto out; + } continue; } @@ -4261,15 +4202,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) { struct btrfs_root *root = dir->root; - /* - * 1 for the possible orphan item - * 1 for the dir item - * 1 for the dir index - * 1 for the inode ref - * 1 for the inode - * 1 for the parent inode - */ - return btrfs_start_transaction_fallback_global_rsv(root, 6); + return btrfs_start_transaction_fallback_global_rsv(root, + BTRFS_UNLINK_METADATA_UNITS); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -5243,7 +5177,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1); int ret; /* @@ -5281,7 +5215,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, trans->block_rsv = &fs_info->trans_block_rsv; trans->bytes_reserved = delayed_refs_extra; btrfs_block_rsv_migrate(rsv, trans->block_rsv, - delayed_refs_extra, 1); + delayed_refs_extra, true); } return trans; } @@ -5291,7 +5225,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv *rsv = NULL; int ret; trace_btrfs_inode_evict(inode); @@ -5308,18 +5242,18 @@ void btrfs_evict_inode(struct inode *inode) ((btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || btrfs_is_free_space_inode(BTRFS_I(inode)))) - goto no_delete; + goto out; if (is_bad_inode(inode)) - goto no_delete; + goto out; if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) - goto no_delete; + goto out; if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); - goto no_delete; + goto out; } /* @@ -5328,7 +5262,7 @@ void btrfs_evict_inode(struct inode *inode) */ ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) - goto no_delete; + goto out; /* * This drops any pending insert or delete operations we have for this @@ -5340,7 +5274,7 @@ void btrfs_evict_inode(struct inode *inode) rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) - goto no_delete; + goto out; rsv->size = btrfs_calc_metadata_size(fs_info, 1); rsv->failfast = true; @@ -5356,16 +5290,21 @@ void btrfs_evict_inode(struct inode *inode) trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) - goto free_rsv; + goto out; trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(fs_info); + /* + * We have not added new delayed items for our inode after we + * have flushed its delayed items, so no need to throttle on + * delayed items. However we have modified extent buffers. + */ + btrfs_btree_balance_dirty_nodelay(fs_info); if (ret && ret != -ENOSPC && ret != -EAGAIN) - goto free_rsv; + goto out; else if (!ret) break; } @@ -5387,9 +5326,8 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans); } -free_rsv: +out: btrfs_free_block_rsv(fs_info, rsv); -no_delete: /* * If we didn't successfully delete, the orphan item will still be in * the tree and we'll retry on the next mount. Again, we might also want @@ -6981,6 +6919,7 @@ out: } static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, const u64 start, const u64 len, const u64 orig_start, @@ -6991,7 +6930,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, const int type) { struct extent_map *em = NULL; - int ret; + struct btrfs_ordered_extent *ordered; if (type != BTRFS_ORDERED_NOCOW) { em = create_io_em(inode, start, len, orig_start, block_start, @@ -7001,18 +6940,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, if (IS_ERR(em)) goto out; } - ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, - block_len, 0, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT), - BTRFS_COMPRESS_NONE); - if (ret) { + ordered = btrfs_alloc_ordered_extent(inode, start, len, len, + block_start, block_len, 0, + (1 << type) | + (1 << BTRFS_ORDERED_DIRECT), + BTRFS_COMPRESS_NONE); + if (IS_ERR(ordered)) { if (em) { free_extent_map(em); btrfs_drop_extent_map_range(inode, start, start + len - 1, false); } - em = ERR_PTR(ret); + em = ERR_CAST(ordered); + } else { + ASSERT(!dio_data->ordered); + dio_data->ordered = ordered; } out: @@ -7020,6 +6962,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, } static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, + struct btrfs_dio_data *dio_data, u64 start, u64 len) { struct btrfs_root *root = inode->root; @@ -7035,7 +6978,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, if (ret) return ERR_PTR(ret); - em = btrfs_create_dio_extent(inode, start, ins.offset, start, + em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -7380,7 +7323,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } space_reserved = true; - em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, + em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, orig_start, block_start, len, orig_block_len, ram_bytes, type); @@ -7422,7 +7365,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, goto out; space_reserved = true; - em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); + em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -7728,6 +7671,10 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos + length - 1, NULL); ret = -ENOTBLK; } + if (write) { + btrfs_put_ordered_extent(dio_data->ordered); + dio_data->ordered = NULL; + } if (write) extent_changeset_free(dio_data->data_reserved); @@ -7767,14 +7714,34 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; - btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); + btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, + btrfs_dio_end_io, bio->bi_private); + bbio->inode = BTRFS_I(iter->inode); bbio->file_offset = file_offset; dip->file_offset = file_offset; dip->bytes = bio->bi_iter.bi_size; dio_data->submitted += bio->bi_iter.bi_size; - btrfs_submit_bio(bio, 0); + + /* + * Check if we are doing a partial write. If we are, we need to split + * the ordered extent to match the submitted bio. Hang on to the + * remaining unfinishable ordered_extent in dio_data so that it can be + * cancelled in iomap_end to avoid a deadlock wherein faulting the + * remaining pages is blocked on the outstanding ordered extent. + */ + if (iter->flags & IOMAP_WRITE) { + int ret; + + ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); + if (ret) { + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); + return; + } + } + + btrfs_submit_bio(bbio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { @@ -7789,7 +7756,7 @@ static const struct iomap_dio_ops btrfs_dio_ops = { ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { - struct btrfs_dio_data data; + struct btrfs_dio_data data = { 0 }; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, IOMAP_DIO_PARTIAL, &data, done_before); @@ -7798,7 +7765,7 @@ ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_be struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { - struct btrfs_dio_data data; + struct btrfs_dio_data data = { 0 }; return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, IOMAP_DIO_PARTIAL, &data, done_before); @@ -9908,8 +9875,6 @@ out: } struct btrfs_encoded_read_private { - struct btrfs_inode *inode; - u64 file_offset; wait_queue_head_t wait; atomic_t pending; blk_status_t status; @@ -9939,45 +9904,41 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { - .inode = inode, - .file_offset = file_offset, .pending = ATOMIC_INIT(1), }; unsigned long i = 0; - u64 cur = 0; + struct btrfs_bio *bbio; init_waitqueue_head(&priv.wait); - /* Submit bios for the extent, splitting due to bio limits as necessary. */ - while (cur < disk_io_size) { - struct bio *bio = NULL; - u64 remaining = disk_io_size - cur; - - while (bio || remaining) { - size_t bytes = min_t(u64, remaining, PAGE_SIZE); - - if (!bio) { - bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, - inode, - btrfs_encoded_read_endio, - &priv); - bio->bi_iter.bi_sector = - (disk_bytenr + cur) >> SECTOR_SHIFT; - } - if (!bytes || - bio_add_page(bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv.pending); - btrfs_submit_bio(bio, 0); - bio = NULL; - continue; - } + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + btrfs_encoded_read_endio, &priv); + bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->inode = inode; - i++; - cur += bytes; - remaining -= bytes; + do { + size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); + + if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { + atomic_inc(&priv.pending); + btrfs_submit_bio(bbio, 0); + + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + btrfs_encoded_read_endio, &priv); + bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bbio->inode = inode; + continue; } - } + + i++; + disk_bytenr += bytes; + disk_io_size -= bytes; + } while (disk_io_size); + + atomic_inc(&priv.pending); + btrfs_submit_bio(bbio, 0); if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); @@ -10398,13 +10359,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, - ins.offset, pages, nr_pages, 0, NULL, - false)) { - btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); - ret = -EIO; - goto out_pages; - } + btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, + ins.offset, pages, nr_pages, 0, false); ret = orig_count; goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ba769a1eb87a..2fa36f694daa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -454,7 +454,9 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, case BTRFS_EXCLOP_BALANCE_PAUSED: spin_lock(&fs_info->super_lock); ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE || - fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD); + fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD || + fs_info->exclusive_operation == BTRFS_EXCLOP_NONE || + fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED; spin_unlock(&fs_info->super_lock); break; @@ -3161,6 +3163,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) if (IS_ERR(sa)) return PTR_ERR(sa); + if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) { + ret = -EOPNOTSUPP; + goto out; + } + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { ret = mnt_want_write_file(file); if (ret) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 870528d87526..3a496b0d3d2b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -325,24 +325,12 @@ struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root) * acquire the lock. */ -int btrfs_drew_lock_init(struct btrfs_drew_lock *lock) +void btrfs_drew_lock_init(struct btrfs_drew_lock *lock) { - int ret; - - ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL); - if (ret) - return ret; - atomic_set(&lock->readers, 0); + atomic_set(&lock->writers, 0); init_waitqueue_head(&lock->pending_readers); init_waitqueue_head(&lock->pending_writers); - - return 0; -} - -void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock) -{ - percpu_counter_destroy(&lock->writers); } /* Return true if acquisition is successful, false otherwise */ @@ -351,10 +339,10 @@ bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) if (atomic_read(&lock->readers)) return false; - percpu_counter_inc(&lock->writers); + atomic_inc(&lock->writers); /* Ensure writers count is updated before we check for pending readers */ - smp_mb(); + smp_mb__after_atomic(); if (atomic_read(&lock->readers)) { btrfs_drew_write_unlock(lock); return false; @@ -374,7 +362,7 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) { - percpu_counter_dec(&lock->writers); + atomic_dec(&lock->writers); cond_wake_up(&lock->pending_readers); } @@ -390,8 +378,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) */ smp_mb__after_atomic(); - wait_event(lock->pending_readers, - percpu_counter_sum(&lock->writers) == 0); + wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0); } void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 11c2269b4b6f..edb9b4a0dba1 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -195,13 +195,12 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) struct btrfs_drew_lock { atomic_t readers; - struct percpu_counter writers; + atomic_t writers; wait_queue_head_t pending_writers; wait_queue_head_t pending_readers; }; -int btrfs_drew_lock_init(struct btrfs_drew_lock *lock); -void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock); +void btrfs_drew_lock_init(struct btrfs_drew_lock *lock); void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h index de3e18bce24a..00328c856be6 100644 --- a/fs/btrfs/lru_cache.h +++ b/fs/btrfs/lru_cache.h @@ -55,11 +55,6 @@ static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *ca return cache->size; } -static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) -{ - return cache->size >= cache->max_size; -} - static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( struct btrfs_lru_cache *cache) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 71f6d8302d50..3a095b9c6373 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -17,6 +17,7 @@ #include "compression.h" #include "ctree.h" #include "super.h" +#include "btrfs_inode.h" #define LZO_LEN 4 @@ -329,7 +330,7 @@ static void copy_compressed_segment(struct compressed_bio *cb, int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); - const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; char *kaddr; int ret; @@ -388,8 +389,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ btrfs_err(fs_info, "unexpectedly large lzo segment len %u", seg_len); - ret = -EIO; - goto out; + return -EIO; } /* Copy the compressed segment payload into workspace */ @@ -400,8 +400,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->buf, &out_len); if (ret != LZO_E_OK) { btrfs_err(fs_info, "failed to decompress"); - ret = -EIO; - goto out; + return -EIO; } /* Copy the data into inode pages */ @@ -410,7 +409,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* All data read, exit */ if (ret == 0) - goto out; + return 0; ret = 0; /* Check if the sector has enough space for a segment header */ @@ -421,10 +420,8 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Skip the padding zeros */ cur_in += sector_bytes_left; } -out: - if (!ret) - zero_fill_bio(cb->orig_bio); - return ret; + + return 0; } int lzo_decompress(struct list_head *ws, const u8 *data_in, diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index fde5aaa6e7c9..310a05cf95ef 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -253,7 +253,7 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, #endif #ifdef CONFIG_BTRFS_ASSERT -void __cold btrfs_assertfail(const char *expr, const char *file, int line) +void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line) { pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); BUG(); diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 8c516ee58ff9..ac2d1982ba3d 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -160,7 +160,7 @@ do { \ } while (0) #ifdef CONFIG_BTRFS_ASSERT -void __cold btrfs_assertfail(const char *expr, const char *file, int line); +void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line); #define ASSERT(expr) \ (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6c24b69e2d0a..a9778a91511e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * @compress_type: Compression algorithm used for data. * * Most of these parameters correspond to &struct btrfs_file_extent_item. The - * tree is given a single reference on the ordered extent that was inserted. + * tree is given a single reference on the ordered extent that was inserted, and + * the returned pointer is given a second reference. * - * Return: 0 or -ENOMEM. + * Return: the new ordered extent or error pointer. */ -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned flags, - int compress_type) +struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); if (ret < 0) - return ret; + return ERR_PTR(ret); ret = 0; } else { /* @@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, */ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); if (ret < 0) - return ret; + return ERR_PTR(ret); } entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); if (!entry) - return -ENOMEM; + return ERR_PTR(-ENOMEM); entry->file_offset = file_offset; entry->num_bytes = num_bytes; @@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, btrfs_mod_outstanding_extents(inode, 1); spin_unlock(&inode->lock); + /* One ref for the returned entry to match semantics of lookup. */ + refcount_inc(&entry->refs); + + return entry; +} + +/* + * Add a new btrfs_ordered_extent for the range, but drop the reference instead + * of returning it to the caller. + */ +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type) +{ + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, + ram_bytes, disk_bytenr, + disk_num_bytes, offset, flags, + compress_type); + + if (IS_ERR(ordered)) + return PTR_ERR(ordered); + btrfs_put_ordered_extent(ordered); + return 0; } @@ -1088,39 +1116,37 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, return false; } - -static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, - u64 len) -{ - struct inode *inode = ordered->inode; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - u64 file_offset = ordered->file_offset + pos; - u64 disk_bytenr = ordered->disk_bytenr + pos; - unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; - - /* - * The splitting extent is already counted and will be added again in - * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting. - */ - percpu_counter_add_batch(&fs_info->ordered_bytes, -len, - fs_info->delalloc_batch); - WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED)); - return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, - disk_bytenr, len, 0, flags, - ordered->compress_type); -} - -int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, - u64 post) +/* Split out a new ordered extent for this first @len bytes of @ordered. */ +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) { struct inode *inode = ordered->inode; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct rb_node *node; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret = 0; + u64 file_offset = ordered->file_offset; + u64 disk_bytenr = ordered->disk_bytenr; + unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; + struct rb_node *node; trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED))); + + /* + * The entire bio must be covered by the ordered extent, but we can't + * reduce the original extent to a zero length either. + */ + if (WARN_ON_ONCE(len >= ordered->num_bytes)) + return -EINVAL; + /* We cannot split once ordered extent is past end_bio. */ + if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) + return -EINVAL; + /* We cannot split a compressed ordered extent. */ + if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) + return -EINVAL; + /* Checksum list should be empty. */ + if (WARN_ON_ONCE(!list_empty(&ordered->list))) + return -EINVAL; + spin_lock_irq(&tree->lock); /* Remove from tree once */ node = &ordered->rb_node; @@ -1129,11 +1155,11 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, if (tree->last == node) tree->last = NULL; - ordered->file_offset += pre; - ordered->disk_bytenr += pre; - ordered->num_bytes -= (pre + post); - ordered->disk_num_bytes -= (pre + post); - ordered->bytes_left -= (pre + post); + ordered->file_offset += len; + ordered->disk_bytenr += len; + ordered->num_bytes -= len; + ordered->disk_num_bytes -= len; + ordered->bytes_left -= len; /* Re-insert the node */ node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); @@ -1144,13 +1170,15 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, spin_unlock_irq(&tree->lock); - if (pre) - ret = clone_ordered_extent(ordered, 0, pre); - if (ret == 0 && post) - ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, - post); + /* + * The splitting extent is already counted and will be added again in + * btrfs_add_ordered_extent(). Subtract len to avoid double counting. + */ + percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); - return ret; + return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, + disk_bytenr, len, 0, flags, + ordered->compress_type); } int __init ordered_data_init(void) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index eb40cb39f842..f0f1138d23c3 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -178,9 +178,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); +struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned flags, + u64 disk_num_bytes, u64 offset, unsigned long flags, int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); @@ -207,8 +212,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, struct extent_state **cached_state); bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); -int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, - u64 post); +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index b93c96213304..497b9dbd8a13 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -151,10 +151,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) pr_cont("shared data backref parent %llu count %u\n", offset, btrfs_shared_data_ref_count(eb, sref)); /* - * offset is supposed to be a tree block which - * must be aligned to nodesize. + * Offset is supposed to be a tree block which must be + * aligned to sectorsize. */ - if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) + if (!IS_ALIGNED(offset, eb->fs_info->sectorsize)) pr_info( "\t\t\t(parent %llu not aligned to sectorsize %u)\n", offset, eb->fs_info->sectorsize); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 642828c1b299..2fab37f062de 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -202,7 +202,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { - u64 num = rbio->bioc->raid_map[0]; + u64 num = rbio->bioc->full_stripe_logical; /* * we shift down quite a bit. We're using byte @@ -407,16 +407,15 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; - unsigned long flags; if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); __remove_rbio_from_cache(rbio); - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -425,19 +424,18 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) { struct btrfs_stripe_hash_table *table; - unsigned long flags; struct btrfs_raid_bio *rbio; table = info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); while (!list_empty(&table->stripe_cache)) { rbio = list_entry(table->stripe_cache.next, struct btrfs_raid_bio, stripe_cache); __remove_rbio_from_cache(rbio); } - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -467,14 +465,13 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) static void cache_rbio(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; - unsigned long flags; if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; - spin_lock_irqsave(&table->cache_lock, flags); + spin_lock(&table->cache_lock); spin_lock(&rbio->bio_list_lock); /* bump our ref if we were not in the list before */ @@ -501,7 +498,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) __remove_rbio_from_cache(found); } - spin_unlock_irqrestore(&table->cache_lock, flags); + spin_unlock(&table->cache_lock); } /* @@ -530,15 +527,14 @@ static void run_xor(void **pages, int src_cnt, ssize_t len) */ static int rbio_is_full(struct btrfs_raid_bio *rbio) { - unsigned long flags; unsigned long size = rbio->bio_list_bytes; int ret = 1; - spin_lock_irqsave(&rbio->bio_list_lock, flags); + spin_lock(&rbio->bio_list_lock); if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); + spin_unlock(&rbio->bio_list_lock); return ret; } @@ -571,7 +567,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->bioc->raid_map[0] != cur->bioc->raid_map[0]) + if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) return 0; /* we can't merge with different operations */ @@ -657,16 +653,15 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_stripe_hash *h; struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; - unsigned long flags; struct btrfs_raid_bio *freeit = NULL; struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); - spin_lock_irqsave(&h->lock, flags); + spin_lock(&h->lock); list_for_each_entry(cur, &h->hash_list, hash_list) { - if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) + if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) continue; spin_lock(&cur->bio_list_lock); @@ -724,7 +719,7 @@ lockit: refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) @@ -742,7 +737,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) { int bucket; struct btrfs_stripe_hash *h; - unsigned long flags; int keep_cache = 0; bucket = rbio_bucket(rbio); @@ -751,7 +745,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) if (list_empty(&rbio->plug_list)) cache_rbio(rbio); - spin_lock_irqsave(&h->lock, flags); + spin_lock(&h->lock); spin_lock(&rbio->bio_list_lock); if (!list_empty(&rbio->hash_list)) { @@ -788,7 +782,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) list_add(&next->hash_list, &h->hash_list); refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); if (next->operation == BTRFS_RBIO_READ_REBUILD) start_async_work(next, recover_rbio_work_locked); @@ -808,7 +802,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } done: spin_unlock(&rbio->bio_list_lock); - spin_unlock_irqrestore(&h->lock, flags); + spin_unlock(&h->lock); done_nolock: if (!keep_cache) @@ -891,16 +885,16 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, index = stripe_nr * rbio->stripe_nsectors + sector_nr; ASSERT(index >= 0 && index < rbio->nr_sectors); - spin_lock_irq(&rbio->bio_list_lock); + spin_lock(&rbio->bio_list_lock); sector = &rbio->bio_sectors[index]; if (sector->page || bio_list_only) { /* Don't return sector without a valid page pointer */ if (!sector->page) sector = NULL; - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); return sector; } - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); return &rbio->stripe_sectors[index]; } @@ -912,7 +906,7 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, struct btrfs_io_context *bioc) { - const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; + const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; const unsigned int num_pages = stripe_npages * real_stripes; const unsigned int stripe_nsectors = @@ -1108,7 +1102,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_sector = disk_start >> 9; bio->bi_private = rbio; - bio_add_page(bio, sector->page, sectorsize, sector->pgoff); + __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); bio_list_add(bio_list, bio); return 0; } @@ -1119,7 +1113,7 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) struct bio_vec bvec; struct bvec_iter iter; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - - rbio->bioc->raid_map[0]; + rbio->bioc->full_stripe_logical; bio_for_each_segment(bvec, bio, iter) { u32 bvec_offset; @@ -1148,11 +1142,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; - spin_lock_irq(&rbio->bio_list_lock); + spin_lock(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) index_one_bio(rbio, bio); - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); } static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, @@ -1282,10 +1276,16 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, goto error; } - if (likely(!rbio->bioc->num_tgtdevs)) + if (likely(!rbio->bioc->replace_nr_stripes)) return 0; - /* Make a copy for the replace target device. */ + /* + * Make a copy for the replace target device. + * + * Thus the source stripe number (in replace_stripe_src) should be valid. + */ + ASSERT(rbio->bioc->replace_stripe_src >= 0); + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; @@ -1293,7 +1293,12 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; - if (!rbio->bioc->tgtdev_map[stripe]) { + /* + * For RAID56, there is only one device that can be replaced, + * and replace_stripe_src[0] indicates the stripe number we + * need to copy from. + */ + if (stripe != rbio->bioc->replace_stripe_src) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway. @@ -1316,7 +1321,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, } ret = rbio_add_io_sector(rbio, bio_list, sector, - rbio->bioc->tgtdev_map[stripe], + rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) goto error; @@ -1332,7 +1337,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - - rbio->bioc->raid_map[0]; + rbio->bioc->full_stripe_logical; int total_nr_sector = offset >> fs_info->sectorsize_bits; ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); @@ -1609,7 +1614,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; - const u64 full_stripe_start = rbio->bioc->raid_map[0]; + const u64 full_stripe_start = rbio->bioc->full_stripe_logical; const u32 orig_len = orig_bio->bi_iter.bi_size; const u32 sectorsize = fs_info->sectorsize; u64 cur_logical; @@ -1796,9 +1801,8 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * here due to a crc mismatch and we can't give them the * data they want. */ - if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bioc->raid_map[faila] == - RAID5_P_STRIPE) + if (failb == rbio->real_stripes - 1) { + if (faila == rbio->real_stripes - 2) /* * Only P and Q are corrupted. * We only care about data stripes recovery, @@ -1812,7 +1816,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, goto pstripe; } - if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { + if (failb == rbio->real_stripes - 2) { raid6_datap_recov(rbio->real_stripes, sectorsize, faila, pointers); } else { @@ -1895,9 +1899,9 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - spin_lock_irq(&rbio->bio_list_lock); + spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); } index_rbio_pages(rbio); @@ -2075,8 +2079,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct btrfs_root *csum_root = btrfs_csum_root(fs_info, - rbio->bioc->raid_map[0]); - const u64 start = rbio->bioc->raid_map[0]; + rbio->bioc->full_stripe_logical); + const u64 start = rbio->bioc->full_stripe_logical; const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << fs_info->sectorsize_bits; int ret; @@ -2109,7 +2113,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) } ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, - rbio->csum_buf, rbio->csum_bitmap); + rbio->csum_buf, rbio->csum_bitmap, false); if (ret < 0) goto error; if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) @@ -2124,7 +2128,7 @@ error: */ btrfs_warn_rl(fs_info, "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", - rbio->bioc->raid_map[0], ret); + rbio->bioc->full_stripe_logical, ret); no_csum: kfree(rbio->csum_buf); bitmap_free(rbio->csum_bitmap); @@ -2265,9 +2269,9 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio) * bio list any more, anyone else that wants to change this stripe * needs to do their own rmw. */ - spin_lock_irq(&rbio->bio_list_lock); + spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); + spin_unlock(&rbio->bio_list_lock); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); @@ -2372,23 +2376,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, return rbio; } -/* Used for both parity scrub and missing. */ -void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - unsigned int pgoff, u64 logical) -{ - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int stripe_offset; - int index; - - ASSERT(logical >= rbio->bioc->raid_map[0]); - ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + - BTRFS_STRIPE_LEN * rbio->nr_data); - stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); - index = stripe_offset / sectorsize; - rbio->bio_sectors[index].page = page; - rbio->bio_sectors[index].pgoff = pgoff; -} - /* * We just scrub the parity that we have correct data on the same horizontal, * so we needn't allocate all pages for all the stripes. @@ -2442,7 +2429,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) else BUG(); - if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { + /* + * Replace is running and our P/Q stripe is being replaced, then we + * need to duplicate the final write to replace target. + */ + if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { is_replace = 1; bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } @@ -2544,13 +2535,18 @@ writeback: if (!is_replace) goto submit_write; + /* + * Replace is running and our parity stripe needs to be duplicated to + * the target device. Check we have a valid source stripe number. + */ + ASSERT(rbio->bioc->replace_stripe_src >= 0); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, - bioc->tgtdev_map[rbio->scrubp], - sectornr, REQ_OP_WRITE); + rbio->real_stripes, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2751,33 +2747,3 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) if (!lock_stripe_add(rbio)) start_async_work(rbio, scrub_rbio_work_locked); } - -/* The following code is used for dev replace of a missing RAID 5/6 device. */ - -struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) -{ - struct btrfs_fs_info *fs_info = bioc->fs_info; - struct btrfs_raid_bio *rbio; - - rbio = alloc_rbio(fs_info, bioc); - if (IS_ERR(rbio)) - return NULL; - - rbio->operation = BTRFS_RBIO_REBUILD_MISSING; - bio_list_add(&rbio->bio_list, bio); - /* - * This is a special bio which is used to hold the completion handler - * and make the scrub rbio is similar to the other types - */ - ASSERT(!bio->bi_iter.bi_size); - - set_rbio_range_error(rbio, bio); - - return rbio; -} - -void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) -{ - start_async_work(rbio, recover_rbio_work); -} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index df0e0abdeb1f..0f7f31c8cb98 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -170,6 +170,11 @@ static inline int nr_data_stripes(const struct map_lookup *map) return map->num_stripes - btrfs_nr_parity_stripes(map->type); } +static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc) +{ + return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type); +} + #define RAID5_P_STRIPE ((u64)-2) #define RAID6_Q_STRIPE ((u64)-1) @@ -182,19 +187,12 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, int mirror_num); void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); -void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - unsigned int pgoff, u64 logical); - struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); -struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc); -void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); - int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ef13a9d4e370..59a06499c647 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1266,7 +1266,7 @@ again: level = btrfs_header_level(parent); ASSERT(level >= lowest_level); - ret = btrfs_bin_search(parent, &key, &slot); + ret = btrfs_bin_search(parent, 0, &key, &slot); if (ret < 0) break; if (ret && slot > 0) @@ -2407,7 +2407,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (upper->eb && !upper->locked) { if (!lowest) { - ret = btrfs_bin_search(upper->eb, key, &slot); + ret = btrfs_bin_search(upper->eb, 0, key, &slot); if (ret < 0) goto next; BUG_ON(ret); @@ -2441,7 +2441,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, slot = path->slots[upper->level]; btrfs_release_path(path); } else { - ret = btrfs_bin_search(upper->eb, key, &slot); + ret = btrfs_bin_search(upper->eb, 0, key, &slot); if (ret < 0) goto next; BUG_ON(ret); @@ -3422,7 +3422,7 @@ int add_data_references(struct reloc_control *rc, btrfs_release_path(path); ctx.bytenr = extent_key->objectid; - ctx.ignore_extent_item_pos = true; + ctx.skip_inode_ref_list = true; ctx.fs_info = rc->extent_root->fs_info; ret = btrfs_find_all_leafs(&ctx); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 69c93ae333f6..836725a19661 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -38,18 +38,14 @@ * - add a mode to also read unallocated space */ -struct scrub_block; struct scrub_ctx; /* - * The following three values only influence the performance. + * The following value only influences the performance. * - * The last one configures the number of parallel and outstanding I/O - * operations. The first one configures an upper limit for the number - * of (dynamically allocated) pages that are added to a bio. + * This determines the batch size for stripe submitted in one go. */ -#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */ -#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */ +#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */ /* * The following value times PAGE_SIZE needs to be large enough to match the @@ -57,128 +53,124 @@ struct scrub_ctx; */ #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) -#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) +/* Represent one sector and its needed info to verify the content. */ +struct scrub_sector_verification { + bool is_metadata; -/* - * Maximum number of mirrors that can be available for all profiles counting - * the target device of dev-replace as one. During an active device replace - * procedure, the target device of the copy operation is a mirror for the - * filesystem data as well that can be used to read data in order to repair - * read errors on other disks. - * - * Current value is derived from RAID1C4 with 4 copies. - */ -#define BTRFS_MAX_MIRRORS (4 + 1) + union { + /* + * Csum pointer for data csum verification. Should point to a + * sector csum inside scrub_stripe::csums. + * + * NULL if this data sector has no csum. + */ + u8 *csum; -struct scrub_recover { - refcount_t refs; - struct btrfs_io_context *bioc; - u64 map_length; + /* + * Extra info for metadata verification. All sectors inside a + * tree block share the same generation. + */ + u64 generation; + }; }; -struct scrub_sector { - struct scrub_block *sblock; - struct list_head list; - u64 flags; /* extent flags */ - u64 generation; - /* Offset in bytes to @sblock. */ - u32 offset; - atomic_t refs; - unsigned int have_csum:1; - unsigned int io_error:1; - u8 csum[BTRFS_CSUM_SIZE]; - - struct scrub_recover *recover; -}; +enum scrub_stripe_flags { + /* Set when @mirror_num, @dev, @physical and @logical are set. */ + SCRUB_STRIPE_FLAG_INITIALIZED, -struct scrub_bio { - int index; - struct scrub_ctx *sctx; - struct btrfs_device *dev; - struct bio *bio; - blk_status_t status; - u64 logical; - u64 physical; - struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO]; - int sector_count; - int next_free; - struct work_struct work; -}; + /* Set when the read-repair is finished. */ + SCRUB_STRIPE_FLAG_REPAIR_DONE, -struct scrub_block { /* - * Each page will have its page::private used to record the logical - * bytenr. + * Set for data stripes if it's triggered from P/Q stripe. + * During such scrub, we should not report errors in data stripes, nor + * update the accounting. */ - struct page *pages[SCRUB_MAX_PAGES]; - struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK]; - struct btrfs_device *dev; - /* Logical bytenr of the sblock */ - u64 logical; - u64 physical; - u64 physical_for_dev_replace; - /* Length of sblock in bytes */ - u32 len; - int sector_count; - int mirror_num; - - atomic_t outstanding_sectors; - refcount_t refs; /* free mem on transition to zero */ - struct scrub_ctx *sctx; - struct scrub_parity *sparity; - struct { - unsigned int header_error:1; - unsigned int checksum_error:1; - unsigned int no_io_error_seen:1; - unsigned int generation_error:1; /* also sets header_error */ - - /* The following is for the data used to check parity */ - /* It is for the data with checksum */ - unsigned int data_corrected:1; - }; - struct work_struct work; + SCRUB_STRIPE_FLAG_NO_REPORT, }; -/* Used for the chunks with parity stripe such RAID5/6 */ -struct scrub_parity { - struct scrub_ctx *sctx; +#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) - struct btrfs_device *scrub_dev; +/* + * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. + */ +struct scrub_stripe { + struct scrub_ctx *sctx; + struct btrfs_block_group *bg; - u64 logic_start; + struct page *pages[SCRUB_STRIPE_PAGES]; + struct scrub_sector_verification *sectors; - u64 logic_end; + struct btrfs_device *dev; + u64 logical; + u64 physical; + + u16 mirror_num; + + /* Should be BTRFS_STRIPE_LEN / sectorsize. */ + u16 nr_sectors; + + /* + * How many data/meta extents are in this stripe. Only for scrub status + * reporting purposes. + */ + u16 nr_data_extents; + u16 nr_meta_extents; + + atomic_t pending_io; + wait_queue_head_t io_wait; + wait_queue_head_t repair_wait; - int nsectors; + /* + * Indicate the states of the stripe. Bits are defined in + * scrub_stripe_flags enum. + */ + unsigned long state; - u32 stripe_len; + /* Indicate which sectors are covered by extent items. */ + unsigned long extent_sector_bitmap; - refcount_t refs; + /* + * The errors hit during the initial read of the stripe. + * + * Would be utilized for error reporting and repair. + */ + unsigned long init_error_bitmap; - struct list_head sectors_list; + /* + * The following error bitmaps are all for the current status. + * Every time we submit a new read, these bitmaps may be updated. + * + * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; + * + * IO and csum errors can happen for both metadata and data. + */ + unsigned long error_bitmap; + unsigned long io_error_bitmap; + unsigned long csum_error_bitmap; + unsigned long meta_error_bitmap; - /* Work of parity check and repair */ - struct work_struct work; + /* For writeback (repair or replace) error reporting. */ + unsigned long write_error_bitmap; - /* Mark the parity blocks which have data */ - unsigned long dbitmap; + /* Writeback can be concurrent, thus we need to protect the bitmap. */ + spinlock_t write_error_lock; /* - * Mark the parity blocks which have data, but errors happen when - * read data or check data + * Checksum for the whole stripe if this stripe is inside a data block + * group. */ - unsigned long ebitmap; + u8 *csums; + + struct work_struct work; }; struct scrub_ctx { - struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; + struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX]; + struct scrub_stripe *raid56_data_stripes; struct btrfs_fs_info *fs_info; int first_free; - int curr; - atomic_t bios_in_flight; - atomic_t workers_pending; - spinlock_t list_lock; - wait_queue_head_t list_wait; + int cur_stripe; struct list_head csum_list; atomic_t cancel_req; int readonly; @@ -191,10 +183,8 @@ struct scrub_ctx { int is_dev_replace; u64 write_pointer; - struct scrub_bio *wr_curr_bio; struct mutex wr_lock; struct btrfs_device *wr_tgtdev; - bool flush_all_writes; /* * statistics @@ -221,239 +211,66 @@ struct scrub_warning { struct btrfs_device *dev; }; -struct full_stripe_lock { - struct rb_node node; - u64 logical; - u64 refs; - struct mutex mutex; -}; - -#ifndef CONFIG_64BIT -/* This structure is for architectures whose (void *) is smaller than u64 */ -struct scrub_page_private { - u64 logical; -}; -#endif - -static int attach_scrub_page_private(struct page *page, u64 logical) -{ -#ifdef CONFIG_64BIT - attach_page_private(page, (void *)logical); - return 0; -#else - struct scrub_page_private *spp; - - spp = kmalloc(sizeof(*spp), GFP_KERNEL); - if (!spp) - return -ENOMEM; - spp->logical = logical; - attach_page_private(page, (void *)spp); - return 0; -#endif -} - -static void detach_scrub_page_private(struct page *page) -{ -#ifdef CONFIG_64BIT - detach_page_private(page); - return; -#else - struct scrub_page_private *spp; - - spp = detach_page_private(page); - kfree(spp); - return; -#endif -} - -static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx, - struct btrfs_device *dev, - u64 logical, u64 physical, - u64 physical_for_dev_replace, - int mirror_num) -{ - struct scrub_block *sblock; - - sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); - if (!sblock) - return NULL; - refcount_set(&sblock->refs, 1); - sblock->sctx = sctx; - sblock->logical = logical; - sblock->physical = physical; - sblock->physical_for_dev_replace = physical_for_dev_replace; - sblock->dev = dev; - sblock->mirror_num = mirror_num; - sblock->no_io_error_seen = 1; - /* - * Scrub_block::pages will be allocated at alloc_scrub_sector() when - * the corresponding page is not allocated. - */ - return sblock; -} - -/* - * Allocate a new scrub sector and attach it to @sblock. - * - * Will also allocate new pages for @sblock if needed. - */ -static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, - u64 logical) +static void release_scrub_stripe(struct scrub_stripe *stripe) { - const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT; - struct scrub_sector *ssector; - - /* We must never have scrub_block exceed U32_MAX in size. */ - ASSERT(logical - sblock->logical < U32_MAX); - - ssector = kzalloc(sizeof(*ssector), GFP_KERNEL); - if (!ssector) - return NULL; - - /* Allocate a new page if the slot is not allocated */ - if (!sblock->pages[page_index]) { - int ret; + if (!stripe) + return; - sblock->pages[page_index] = alloc_page(GFP_KERNEL); - if (!sblock->pages[page_index]) { - kfree(ssector); - return NULL; - } - ret = attach_scrub_page_private(sblock->pages[page_index], - sblock->logical + (page_index << PAGE_SHIFT)); - if (ret < 0) { - kfree(ssector); - __free_page(sblock->pages[page_index]); - sblock->pages[page_index] = NULL; - return NULL; - } + for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) { + if (stripe->pages[i]) + __free_page(stripe->pages[i]); + stripe->pages[i] = NULL; } - - atomic_set(&ssector->refs, 1); - ssector->sblock = sblock; - /* The sector to be added should not be used */ - ASSERT(sblock->sectors[sblock->sector_count] == NULL); - ssector->offset = logical - sblock->logical; - - /* The sector count must be smaller than the limit */ - ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK); - - sblock->sectors[sblock->sector_count] = ssector; - sblock->sector_count++; - sblock->len += sblock->sctx->fs_info->sectorsize; - - return ssector; -} - -static struct page *scrub_sector_get_page(struct scrub_sector *ssector) -{ - struct scrub_block *sblock = ssector->sblock; - pgoff_t index; - /* - * When calling this function, ssector must be alreaday attached to the - * parent sblock. - */ - ASSERT(sblock); - - /* The range should be inside the sblock range */ - ASSERT(ssector->offset < sblock->len); - - index = ssector->offset >> PAGE_SHIFT; - ASSERT(index < SCRUB_MAX_PAGES); - ASSERT(sblock->pages[index]); - ASSERT(PagePrivate(sblock->pages[index])); - return sblock->pages[index]; + kfree(stripe->sectors); + kfree(stripe->csums); + stripe->sectors = NULL; + stripe->csums = NULL; + stripe->sctx = NULL; + stripe->state = 0; } -static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector) +static int init_scrub_stripe(struct btrfs_fs_info *fs_info, + struct scrub_stripe *stripe) { - struct scrub_block *sblock = ssector->sblock; + int ret; - /* - * When calling this function, ssector must be already attached to the - * parent sblock. - */ - ASSERT(sblock); + memset(stripe, 0, sizeof(*stripe)); - /* The range should be inside the sblock range */ - ASSERT(ssector->offset < sblock->len); + stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; + stripe->state = 0; - return offset_in_page(ssector->offset); -} + init_waitqueue_head(&stripe->io_wait); + init_waitqueue_head(&stripe->repair_wait); + atomic_set(&stripe->pending_io, 0); + spin_lock_init(&stripe->write_error_lock); -static char *scrub_sector_get_kaddr(struct scrub_sector *ssector) -{ - return page_address(scrub_sector_get_page(ssector)) + - scrub_sector_get_page_offset(ssector); + ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages); + if (ret < 0) + goto error; + + stripe->sectors = kcalloc(stripe->nr_sectors, + sizeof(struct scrub_sector_verification), + GFP_KERNEL); + if (!stripe->sectors) + goto error; + + stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits, + fs_info->csum_size, GFP_KERNEL); + if (!stripe->csums) + goto error; + return 0; +error: + release_scrub_stripe(stripe); + return -ENOMEM; } -static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector, - unsigned int len) +static void wait_scrub_stripe_io(struct scrub_stripe *stripe) { - return bio_add_page(bio, scrub_sector_get_page(ssector), len, - scrub_sector_get_page_offset(ssector)); + wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0); } -static int scrub_setup_recheck_block(struct scrub_block *original_sblock, - struct scrub_block *sblocks_for_recheck[]); -static void scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, - int retry_failed_mirror); -static void scrub_recheck_block_checksum(struct scrub_block *sblock); -static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good); -static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int sector_num, int force_write); -static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); -static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, - int sector_num); -static int scrub_checksum_data(struct scrub_block *sblock); -static int scrub_checksum_tree_block(struct scrub_block *sblock); -static int scrub_checksum_super(struct scrub_block *sblock); -static void scrub_block_put(struct scrub_block *sblock); -static void scrub_sector_get(struct scrub_sector *sector); -static void scrub_sector_put(struct scrub_sector *sector); -static void scrub_parity_get(struct scrub_parity *sparity); -static void scrub_parity_put(struct scrub_parity *sparity); -static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, - u64 physical_for_dev_replace); -static void scrub_bio_end_io(struct bio *bio); -static void scrub_bio_end_io_worker(struct work_struct *work); -static void scrub_block_complete(struct scrub_block *sblock); -static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num); -static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_sector *sector); -static void scrub_wr_submit(struct scrub_ctx *sctx); -static void scrub_wr_bio_end_io(struct bio *bio); -static void scrub_wr_bio_end_io_worker(struct work_struct *work); static void scrub_put_ctx(struct scrub_ctx *sctx); -static inline int scrub_is_page_on_raid56(struct scrub_sector *sector) -{ - return sector->recover && - (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); -} - -static void scrub_pending_bio_inc(struct scrub_ctx *sctx) -{ - refcount_inc(&sctx->refs); - atomic_inc(&sctx->bios_in_flight); -} - -static void scrub_pending_bio_dec(struct scrub_ctx *sctx) -{ - atomic_dec(&sctx->bios_in_flight); - wake_up(&sctx->list_wait); - scrub_put_ctx(sctx); -} - static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) { while (atomic_read(&fs_info->scrub_pause_req)) { @@ -486,223 +303,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) scrub_pause_off(fs_info); } -/* - * Insert new full stripe lock into full stripe locks tree - * - * Return pointer to existing or newly inserted full_stripe_lock structure if - * everything works well. - * Return ERR_PTR(-ENOMEM) if we failed to allocate memory - * - * NOTE: caller must hold full_stripe_locks_root->lock before calling this - * function - */ -static struct full_stripe_lock *insert_full_stripe_lock( - struct btrfs_full_stripe_locks_tree *locks_root, - u64 fstripe_logical) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct full_stripe_lock *entry; - struct full_stripe_lock *ret; - - lockdep_assert_held(&locks_root->lock); - - p = &locks_root->root.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct full_stripe_lock, node); - if (fstripe_logical < entry->logical) { - p = &(*p)->rb_left; - } else if (fstripe_logical > entry->logical) { - p = &(*p)->rb_right; - } else { - entry->refs++; - return entry; - } - } - - /* - * Insert new lock. - */ - ret = kmalloc(sizeof(*ret), GFP_KERNEL); - if (!ret) - return ERR_PTR(-ENOMEM); - ret->logical = fstripe_logical; - ret->refs = 1; - mutex_init(&ret->mutex); - - rb_link_node(&ret->node, parent, p); - rb_insert_color(&ret->node, &locks_root->root); - return ret; -} - -/* - * Search for a full stripe lock of a block group - * - * Return pointer to existing full stripe lock if found - * Return NULL if not found - */ -static struct full_stripe_lock *search_full_stripe_lock( - struct btrfs_full_stripe_locks_tree *locks_root, - u64 fstripe_logical) -{ - struct rb_node *node; - struct full_stripe_lock *entry; - - lockdep_assert_held(&locks_root->lock); - - node = locks_root->root.rb_node; - while (node) { - entry = rb_entry(node, struct full_stripe_lock, node); - if (fstripe_logical < entry->logical) - node = node->rb_left; - else if (fstripe_logical > entry->logical) - node = node->rb_right; - else - return entry; - } - return NULL; -} - -/* - * Helper to get full stripe logical from a normal bytenr. - * - * Caller must ensure @cache is a RAID56 block group. - */ -static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr) -{ - u64 ret; - - /* - * Due to chunk item size limit, full stripe length should not be - * larger than U32_MAX. Just a sanity check here. - */ - WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX); - - /* - * round_down() can only handle power of 2, while RAID56 full - * stripe length can be 64KiB * n, so we need to manually round down. - */ - ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) * - cache->full_stripe_len + cache->start; - return ret; -} - -/* - * Lock a full stripe to avoid concurrency of recovery and read - * - * It's only used for profiles with parities (RAID5/6), for other profiles it - * does nothing. - * - * Return 0 if we locked full stripe covering @bytenr, with a mutex held. - * So caller must call unlock_full_stripe() at the same context. - * - * Return <0 if encounters error. - */ -static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, - bool *locked_ret) -{ - struct btrfs_block_group *bg_cache; - struct btrfs_full_stripe_locks_tree *locks_root; - struct full_stripe_lock *existing; - u64 fstripe_start; - int ret = 0; - - *locked_ret = false; - bg_cache = btrfs_lookup_block_group(fs_info, bytenr); - if (!bg_cache) { - ASSERT(0); - return -ENOENT; - } - - /* Profiles not based on parity don't need full stripe lock */ - if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) - goto out; - locks_root = &bg_cache->full_stripe_locks_root; - - fstripe_start = get_full_stripe_logical(bg_cache, bytenr); - - /* Now insert the full stripe lock */ - mutex_lock(&locks_root->lock); - existing = insert_full_stripe_lock(locks_root, fstripe_start); - mutex_unlock(&locks_root->lock); - if (IS_ERR(existing)) { - ret = PTR_ERR(existing); - goto out; - } - mutex_lock(&existing->mutex); - *locked_ret = true; -out: - btrfs_put_block_group(bg_cache); - return ret; -} - -/* - * Unlock a full stripe. - * - * NOTE: Caller must ensure it's the same context calling corresponding - * lock_full_stripe(). - * - * Return 0 if we unlock full stripe without problem. - * Return <0 for error - */ -static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, - bool locked) -{ - struct btrfs_block_group *bg_cache; - struct btrfs_full_stripe_locks_tree *locks_root; - struct full_stripe_lock *fstripe_lock; - u64 fstripe_start; - bool freeit = false; - int ret = 0; - - /* If we didn't acquire full stripe lock, no need to continue */ - if (!locked) - return 0; - - bg_cache = btrfs_lookup_block_group(fs_info, bytenr); - if (!bg_cache) { - ASSERT(0); - return -ENOENT; - } - if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) - goto out; - - locks_root = &bg_cache->full_stripe_locks_root; - fstripe_start = get_full_stripe_logical(bg_cache, bytenr); - - mutex_lock(&locks_root->lock); - fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start); - /* Unpaired unlock_full_stripe() detected */ - if (!fstripe_lock) { - WARN_ON(1); - ret = -ENOENT; - mutex_unlock(&locks_root->lock); - goto out; - } - - if (fstripe_lock->refs == 0) { - WARN_ON(1); - btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow", - fstripe_lock->logical); - } else { - fstripe_lock->refs--; - } - - if (fstripe_lock->refs == 0) { - rb_erase(&fstripe_lock->node, &locks_root->root); - freeit = true; - } - mutex_unlock(&locks_root->lock); - - mutex_unlock(&fstripe_lock->mutex); - if (freeit) - kfree(fstripe_lock); -out: - btrfs_put_block_group(bg_cache); - return ret; -} - static void scrub_free_csums(struct scrub_ctx *sctx) { while (!list_empty(&sctx->csum_list)) { @@ -721,24 +321,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - /* this can happen when scrub is cancelled */ - if (sctx->curr != -1) { - struct scrub_bio *sbio = sctx->bios[sctx->curr]; - - for (i = 0; i < sbio->sector_count; i++) - scrub_block_put(sbio->sectors[i]->sblock); - bio_put(sbio->bio); - } + for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) + release_scrub_stripe(&sctx->stripes[i]); - for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { - struct scrub_bio *sbio = sctx->bios[i]; - - if (!sbio) - break; - kfree(sbio); - } - - kfree(sctx->wr_curr_bio); scrub_free_csums(sctx); kfree(sctx); } @@ -760,45 +345,26 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO; - sctx->curr = -1; sctx->fs_info = fs_info; INIT_LIST_HEAD(&sctx->csum_list); - for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { - struct scrub_bio *sbio; + for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { + int ret; - sbio = kzalloc(sizeof(*sbio), GFP_KERNEL); - if (!sbio) + ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); + if (ret < 0) goto nomem; - sctx->bios[i] = sbio; - - sbio->index = i; - sbio->sctx = sctx; - sbio->sector_count = 0; - INIT_WORK(&sbio->work, scrub_bio_end_io_worker); - - if (i != SCRUB_BIOS_PER_SCTX - 1) - sctx->bios[i]->next_free = i + 1; - else - sctx->bios[i]->next_free = -1; + sctx->stripes[i].sctx = sctx; } sctx->first_free = 0; - atomic_set(&sctx->bios_in_flight, 0); - atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); - spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); - init_waitqueue_head(&sctx->list_wait); sctx->throttle_deadline = 0; - WARN_ON(sctx->wr_curr_bio != NULL); mutex_init(&sctx->wr_lock); - sctx->wr_curr_bio = NULL; if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; - sctx->flush_all_writes = false; } return sctx; @@ -898,10 +464,10 @@ err: return 0; } -static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) +static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev, + bool is_super, u64 logical, u64 physical) { - struct btrfs_device *dev; - struct btrfs_fs_info *fs_info; + struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_path *path; struct btrfs_key found_key; struct extent_buffer *eb; @@ -914,22 +480,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) u8 ref_level = 0; int ret; - WARN_ON(sblock->sector_count < 1); - dev = sblock->dev; - fs_info = sblock->sctx->fs_info; - /* Super block error, no need to search extent tree. */ - if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + if (is_super) { btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", - errstr, btrfs_dev_name(dev), sblock->physical); + errstr, btrfs_dev_name(dev), physical); return; } path = btrfs_alloc_path(); if (!path) return; - swarn.physical = sblock->physical; - swarn.logical = sblock->logical; + swarn.physical = physical; + swarn.logical = logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -978,447 +540,6 @@ out: btrfs_free_path(path); } -static inline void scrub_get_recover(struct scrub_recover *recover) -{ - refcount_inc(&recover->refs); -} - -static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, - struct scrub_recover *recover) -{ - if (refcount_dec_and_test(&recover->refs)) { - btrfs_bio_counter_dec(fs_info); - btrfs_put_bioc(recover->bioc); - kfree(recover); - } -} - -/* - * scrub_handle_errored_block gets called when either verification of the - * sectors failed or the bio failed to read, e.g. with EIO. In the latter - * case, this function handles all sectors in the bio, even though only one - * may be bad. - * The goal of this function is to repair the errored block by using the - * contents of one of the mirrors. - */ -static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) -{ - struct scrub_ctx *sctx = sblock_to_check->sctx; - struct btrfs_device *dev = sblock_to_check->dev; - struct btrfs_fs_info *fs_info; - u64 logical; - unsigned int failed_mirror_index; - unsigned int is_metadata; - unsigned int have_csum; - /* One scrub_block for each mirror */ - struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 }; - struct scrub_block *sblock_bad; - int ret; - int mirror_index; - int sector_num; - int success; - bool full_stripe_locked; - unsigned int nofs_flag; - static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - BUG_ON(sblock_to_check->sector_count < 1); - fs_info = sctx->fs_info; - if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { - /* - * If we find an error in a super block, we just report it. - * They will get written with the next transaction commit - * anyway - */ - scrub_print_warning("super block error", sblock_to_check); - spin_lock(&sctx->stat_lock); - ++sctx->stat.super_errors; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); - return 0; - } - logical = sblock_to_check->logical; - ASSERT(sblock_to_check->mirror_num); - failed_mirror_index = sblock_to_check->mirror_num - 1; - is_metadata = !(sblock_to_check->sectors[0]->flags & - BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->sectors[0]->have_csum; - - if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) - return 0; - - /* - * We must use GFP_NOFS because the scrub task might be waiting for a - * worker task executing this function and in turn a transaction commit - * might be waiting the scrub task to pause (which needs to wait for all - * the worker tasks to complete before pausing). - * We do allocations in the workers through insert_full_stripe_lock() - * and scrub_add_sector_to_wr_bio(), which happens down the call chain of - * this function. - */ - nofs_flag = memalloc_nofs_save(); - /* - * For RAID5/6, race can happen for a different device scrub thread. - * For data corruption, Parity and Data threads will both try - * to recovery the data. - * Race can lead to doubly added csum error, or even unrecoverable - * error. - */ - ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); - if (ret < 0) { - memalloc_nofs_restore(nofs_flag); - spin_lock(&sctx->stat_lock); - if (ret == -ENOMEM) - sctx->stat.malloc_errors++; - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - return ret; - } - - /* - * read all mirrors one after the other. This includes to - * re-read the extent or metadata block that failed (that was - * the cause that this fixup code is called) another time, - * sector by sector this time in order to know which sectors - * caused I/O errors and which ones are good (for all mirrors). - * It is the goal to handle the situation when more than one - * mirror contains I/O errors, but the errors do not - * overlap, i.e. the data can be repaired by selecting the - * sectors from those mirrors without I/O error on the - * particular sectors. One example (with blocks >= 2 * sectorsize) - * would be that mirror #1 has an I/O error on the first sector, - * the second sector is good, and mirror #2 has an I/O error on - * the second sector, but the first sector is good. - * Then the first sector of the first mirror can be repaired by - * taking the first sector of the second mirror, and the - * second sector of the second mirror can be repaired by - * copying the contents of the 2nd sector of the 1st mirror. - * One more note: if the sectors of one mirror contain I/O - * errors, the checksum cannot be verified. In order to get - * the best data for repairing, the first attempt is to find - * a mirror without I/O errors and with a validated checksum. - * Only if this is not possible, the sectors are picked from - * mirrors with I/O errors without considering the checksum. - * If the latter is the case, at the end, the checksum of the - * repaired area is verified in order to correctly maintain - * the statistics. - */ - for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { - /* - * Note: the two members refs and outstanding_sectors are not - * used in the blocks that are used for the recheck procedure. - * - * But alloc_scrub_block() will initialize sblock::ref anyway, - * so we can use scrub_block_put() to clean them up. - * - * And here we don't setup the physical/dev for the sblock yet, - * they will be correctly initialized in scrub_setup_recheck_block(). - */ - sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL, - logical, 0, 0, mirror_index); - if (!sblocks_for_recheck[mirror_index]) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - goto out; - } - } - - /* Setup the context, map the logical blocks and alloc the sectors */ - ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); - if (ret) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors++; - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - goto out; - } - BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); - sblock_bad = sblocks_for_recheck[failed_mirror_index]; - - /* build and submit the bios for the failed mirror, check checksums */ - scrub_recheck_block(fs_info, sblock_bad, 1); - - if (!sblock_bad->header_error && !sblock_bad->checksum_error && - sblock_bad->no_io_error_seen) { - /* - * The error disappeared after reading sector by sector, or - * the area was part of a huge bio and other parts of the - * bio caused I/O errors, or the block layer merged several - * read requests into one and the error is caused by a - * different bio (usually one of the two latter cases is - * the cause) - */ - spin_lock(&sctx->stat_lock); - sctx->stat.unverified_errors++; - sblock_to_check->data_corrected = 1; - spin_unlock(&sctx->stat_lock); - - if (sctx->is_dev_replace) - scrub_write_block_to_dev_replace(sblock_bad); - goto out; - } - - if (!sblock_bad->no_io_error_seen) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors++; - spin_unlock(&sctx->stat_lock); - if (__ratelimit(&rs)) - scrub_print_warning("i/o error", sblock_to_check); - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - } else if (sblock_bad->checksum_error) { - spin_lock(&sctx->stat_lock); - sctx->stat.csum_errors++; - spin_unlock(&sctx->stat_lock); - if (__ratelimit(&rs)) - scrub_print_warning("checksum error", sblock_to_check); - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - } else if (sblock_bad->header_error) { - spin_lock(&sctx->stat_lock); - sctx->stat.verify_errors++; - spin_unlock(&sctx->stat_lock); - if (__ratelimit(&rs)) - scrub_print_warning("checksum/header error", - sblock_to_check); - if (sblock_bad->generation_error) - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_GENERATION_ERRS); - else - btrfs_dev_stat_inc_and_print(dev, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - } - - if (sctx->readonly) { - ASSERT(!sctx->is_dev_replace); - goto out; - } - - /* - * now build and submit the bios for the other mirrors, check - * checksums. - * First try to pick the mirror which is completely without I/O - * errors and also does not have a checksum error. - * If one is found, and if a checksum is present, the full block - * that is known to contain an error is rewritten. Afterwards - * the block is known to be corrected. - * If a mirror is found which is completely correct, and no - * checksum is present, only those sectors are rewritten that had - * an I/O error in the block to be repaired, since it cannot be - * determined, which copy of the other sectors is better (and it - * could happen otherwise that a correct sector would be - * overwritten by a bad one). - */ - for (mirror_index = 0; ;mirror_index++) { - struct scrub_block *sblock_other; - - if (mirror_index == failed_mirror_index) - continue; - - /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ - if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { - if (mirror_index >= BTRFS_MAX_MIRRORS) - break; - if (!sblocks_for_recheck[mirror_index]->sector_count) - break; - - sblock_other = sblocks_for_recheck[mirror_index]; - } else { - struct scrub_recover *r = sblock_bad->sectors[0]->recover; - int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; - - if (mirror_index >= max_allowed) - break; - if (!sblocks_for_recheck[1]->sector_count) - break; - - ASSERT(failed_mirror_index == 0); - sblock_other = sblocks_for_recheck[1]; - sblock_other->mirror_num = 1 + mirror_index; - } - - /* build and submit the bios, check checksums */ - scrub_recheck_block(fs_info, sblock_other, 0); - - if (!sblock_other->header_error && - !sblock_other->checksum_error && - sblock_other->no_io_error_seen) { - if (sctx->is_dev_replace) { - scrub_write_block_to_dev_replace(sblock_other); - goto corrected_error; - } else { - ret = scrub_repair_block_from_good_copy( - sblock_bad, sblock_other); - if (!ret) - goto corrected_error; - } - } - } - - if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace) - goto did_not_correct_error; - - /* - * In case of I/O errors in the area that is supposed to be - * repaired, continue by picking good copies of those sectors. - * Select the good sectors from mirrors to rewrite bad sectors from - * the area to fix. Afterwards verify the checksum of the block - * that is supposed to be repaired. This verification step is - * only done for the purpose of statistic counting and for the - * final scrub report, whether errors remain. - * A perfect algorithm could make use of the checksum and try - * all possible combinations of sectors from the different mirrors - * until the checksum verification succeeds. For example, when - * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector - * of mirror #2 is readable but the final checksum test fails, - * then the 2nd sector of mirror #3 could be tried, whether now - * the final checksum succeeds. But this would be a rare - * exception and is therefore not implemented. At least it is - * avoided that the good copy is overwritten. - * A more useful improvement would be to pick the sectors - * without I/O error based on sector sizes (512 bytes on legacy - * disks) instead of on sectorsize. Then maybe 512 byte of one - * mirror could be repaired by taking 512 byte of a different - * mirror, even if other 512 byte sectors in the same sectorsize - * area are unreadable. - */ - success = 1; - for (sector_num = 0; sector_num < sblock_bad->sector_count; - sector_num++) { - struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; - struct scrub_block *sblock_other = NULL; - - /* Skip no-io-error sectors in scrub */ - if (!sector_bad->io_error && !sctx->is_dev_replace) - continue; - - if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) { - /* - * In case of dev replace, if raid56 rebuild process - * didn't work out correct data, then copy the content - * in sblock_bad to make sure target device is identical - * to source device, instead of writing garbage data in - * sblock_for_recheck array to target device. - */ - sblock_other = NULL; - } else if (sector_bad->io_error) { - /* Try to find no-io-error sector in mirrors */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index]->sector_count > 0; - mirror_index++) { - if (!sblocks_for_recheck[mirror_index]-> - sectors[sector_num]->io_error) { - sblock_other = sblocks_for_recheck[mirror_index]; - break; - } - } - if (!sblock_other) - success = 0; - } - - if (sctx->is_dev_replace) { - /* - * Did not find a mirror to fetch the sector from. - * scrub_write_sector_to_dev_replace() handles this - * case (sector->io_error), by filling the block with - * zeros before submitting the write request - */ - if (!sblock_other) - sblock_other = sblock_bad; - - if (scrub_write_sector_to_dev_replace(sblock_other, - sector_num) != 0) { - atomic64_inc( - &fs_info->dev_replace.num_write_errors); - success = 0; - } - } else if (sblock_other) { - ret = scrub_repair_sector_from_good_copy(sblock_bad, - sblock_other, - sector_num, 0); - if (0 == ret) - sector_bad->io_error = 0; - else - success = 0; - } - } - - if (success && !sctx->is_dev_replace) { - if (is_metadata || have_csum) { - /* - * need to verify the checksum now that all - * sectors on disk are repaired (the write - * request for data to be repaired is on its way). - * Just be lazy and use scrub_recheck_block() - * which re-reads the data before the checksum - * is verified, but most likely the data comes out - * of the page cache. - */ - scrub_recheck_block(fs_info, sblock_bad, 1); - if (!sblock_bad->header_error && - !sblock_bad->checksum_error && - sblock_bad->no_io_error_seen) - goto corrected_error; - else - goto did_not_correct_error; - } else { -corrected_error: - spin_lock(&sctx->stat_lock); - sctx->stat.corrected_errors++; - sblock_to_check->data_corrected = 1; - spin_unlock(&sctx->stat_lock); - btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on dev %s", - logical, btrfs_dev_name(dev)); - } - } else { -did_not_correct_error: - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on dev %s", - logical, btrfs_dev_name(dev)); - } - -out: - for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { - struct scrub_block *sblock = sblocks_for_recheck[mirror_index]; - struct scrub_recover *recover; - int sector_index; - - /* Not allocated, continue checking the next mirror */ - if (!sblock) - continue; - - for (sector_index = 0; sector_index < sblock->sector_count; - sector_index++) { - /* - * Here we just cleanup the recover, each sector will be - * properly cleaned up by later scrub_block_put() - */ - recover = sblock->sectors[sector_index]->recover; - if (recover) { - scrub_put_recover(fs_info, recover); - sblock->sectors[sector_index]->recover = NULL; - } - } - scrub_block_put(sblock); - } - - ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); - memalloc_nofs_restore(nofs_flag); - if (ret < 0) - return ret; - return 0; -} - static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) { if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) @@ -1430,7 +551,7 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) } static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, - u64 *raid_map, + u64 full_stripe_logical, int nstripes, int mirror, int *stripe_index, u64 *stripe_offset) @@ -1438,19 +559,22 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, int i; if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ? + nstripes - 1 : nstripes - 2; + /* RAID5/6 */ - for (i = 0; i < nstripes; i++) { - if (raid_map[i] == RAID6_Q_STRIPE || - raid_map[i] == RAID5_P_STRIPE) - continue; + for (i = 0; i < nr_data_stripes; i++) { + const u64 data_stripe_start = full_stripe_logical + + (i * BTRFS_STRIPE_LEN); - if (logical >= raid_map[i] && - logical < raid_map[i] + BTRFS_STRIPE_LEN) + if (logical >= data_stripe_start && + logical < data_stripe_start + BTRFS_STRIPE_LEN) break; } *stripe_index = i; - *stripe_offset = logical - raid_map[i]; + *stripe_offset = (logical - full_stripe_logical) & + BTRFS_STRIPE_LEN_MASK; } else { /* The other RAID type */ *stripe_index = mirror; @@ -1458,336 +582,6 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, } } -static int scrub_setup_recheck_block(struct scrub_block *original_sblock, - struct scrub_block *sblocks_for_recheck[]) -{ - struct scrub_ctx *sctx = original_sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 logical = original_sblock->logical; - u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; - u64 generation = original_sblock->sectors[0]->generation; - u64 flags = original_sblock->sectors[0]->flags; - u64 have_csum = original_sblock->sectors[0]->have_csum; - struct scrub_recover *recover; - struct btrfs_io_context *bioc; - u64 sublen; - u64 mapped_length; - u64 stripe_offset; - int stripe_index; - int sector_index = 0; - int mirror_index; - int nmirrors; - int ret; - - while (length > 0) { - sublen = min_t(u64, length, fs_info->sectorsize); - mapped_length = sublen; - bioc = NULL; - - /* - * With a length of sectorsize, each returned stripe represents - * one mirror - */ - btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &mapped_length, &bioc); - if (ret || !bioc || mapped_length < sublen) { - btrfs_put_bioc(bioc); - btrfs_bio_counter_dec(fs_info); - return -EIO; - } - - recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL); - if (!recover) { - btrfs_put_bioc(bioc); - btrfs_bio_counter_dec(fs_info); - return -ENOMEM; - } - - refcount_set(&recover->refs, 1); - recover->bioc = bioc; - recover->map_length = mapped_length; - - ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK); - - nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); - - for (mirror_index = 0; mirror_index < nmirrors; - mirror_index++) { - struct scrub_block *sblock; - struct scrub_sector *sector; - - sblock = sblocks_for_recheck[mirror_index]; - sblock->sctx = sctx; - - sector = alloc_scrub_sector(sblock, logical); - if (!sector) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - scrub_put_recover(fs_info, recover); - return -ENOMEM; - } - sector->flags = flags; - sector->generation = generation; - sector->have_csum = have_csum; - if (have_csum) - memcpy(sector->csum, - original_sblock->sectors[0]->csum, - sctx->fs_info->csum_size); - - scrub_stripe_index_and_offset(logical, - bioc->map_type, - bioc->raid_map, - bioc->num_stripes - - bioc->num_tgtdevs, - mirror_index, - &stripe_index, - &stripe_offset); - /* - * We're at the first sector, also populate @sblock - * physical and dev. - */ - if (sector_index == 0) { - sblock->physical = - bioc->stripes[stripe_index].physical + - stripe_offset; - sblock->dev = bioc->stripes[stripe_index].dev; - sblock->physical_for_dev_replace = - original_sblock->physical_for_dev_replace; - } - - BUG_ON(sector_index >= original_sblock->sector_count); - scrub_get_recover(recover); - sector->recover = recover; - } - scrub_put_recover(fs_info, recover); - length -= sublen; - logical += sublen; - sector_index++; - } - - return 0; -} - -static void scrub_bio_wait_endio(struct bio *bio) -{ - complete(bio->bi_private); -} - -static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, - struct bio *bio, - struct scrub_sector *sector) -{ - DECLARE_COMPLETION_ONSTACK(done); - - bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >> - SECTOR_SHIFT; - bio->bi_private = &done; - bio->bi_end_io = scrub_bio_wait_endio; - raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num); - - wait_for_completion_io(&done); - return blk_status_to_errno(bio->bi_status); -} - -static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock) -{ - struct scrub_sector *first_sector = sblock->sectors[0]; - struct bio *bio; - int i; - - /* All sectors in sblock belong to the same stripe on the same device. */ - ASSERT(sblock->dev); - if (!sblock->dev->bdev) - goto out; - - bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); - - for (i = 0; i < sblock->sector_count; i++) { - struct scrub_sector *sector = sblock->sectors[i]; - - bio_add_scrub_sector(bio, sector, fs_info->sectorsize); - } - - if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { - bio_put(bio); - goto out; - } - - bio_put(bio); - - scrub_recheck_block_checksum(sblock); - - return; -out: - for (i = 0; i < sblock->sector_count; i++) - sblock->sectors[i]->io_error = 1; - - sblock->no_io_error_seen = 0; -} - -/* - * This function will check the on disk data for checksum errors, header errors - * and read I/O errors. If any I/O errors happen, the exact sectors which are - * errored are marked as being bad. The goal is to enable scrub to take those - * sectors that are not errored from all the mirrors so that the sectors that - * are errored in the just handled mirror can be repaired. - */ -static void scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, - int retry_failed_mirror) -{ - int i; - - sblock->no_io_error_seen = 1; - - /* short cut for raid56 */ - if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0])) - return scrub_recheck_block_on_raid56(fs_info, sblock); - - for (i = 0; i < sblock->sector_count; i++) { - struct scrub_sector *sector = sblock->sectors[i]; - struct bio bio; - struct bio_vec bvec; - - if (sblock->dev->bdev == NULL) { - sector->io_error = 1; - sblock->no_io_error_seen = 0; - continue; - } - - bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ); - bio_add_scrub_sector(&bio, sector, fs_info->sectorsize); - bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >> - SECTOR_SHIFT; - - btrfsic_check_bio(&bio); - if (submit_bio_wait(&bio)) { - sector->io_error = 1; - sblock->no_io_error_seen = 0; - } - - bio_uninit(&bio); - } - - if (sblock->no_io_error_seen) - scrub_recheck_block_checksum(sblock); -} - -static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) -{ - struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices; - int ret; - - ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); - return !ret; -} - -static void scrub_recheck_block_checksum(struct scrub_block *sblock) -{ - sblock->header_error = 0; - sblock->checksum_error = 0; - sblock->generation_error = 0; - - if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA) - scrub_checksum_data(sblock); - else - scrub_checksum_tree_block(sblock); -} - -static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good) -{ - int i; - int ret = 0; - - for (i = 0; i < sblock_bad->sector_count; i++) { - int ret_sub; - - ret_sub = scrub_repair_sector_from_good_copy(sblock_bad, - sblock_good, i, 1); - if (ret_sub) - ret = ret_sub; - } - - return ret; -} - -static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int sector_num, int force_write) -{ - struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; - struct scrub_sector *sector_good = sblock_good->sectors[sector_num]; - struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; - const u32 sectorsize = fs_info->sectorsize; - - if (force_write || sblock_bad->header_error || - sblock_bad->checksum_error || sector_bad->io_error) { - struct bio bio; - struct bio_vec bvec; - int ret; - - if (!sblock_bad->dev->bdev) { - btrfs_warn_rl(fs_info, - "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); - return -EIO; - } - - bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); - bio.bi_iter.bi_sector = (sblock_bad->physical + - sector_bad->offset) >> SECTOR_SHIFT; - ret = bio_add_scrub_sector(&bio, sector_good, sectorsize); - - btrfsic_check_bio(&bio); - ret = submit_bio_wait(&bio); - bio_uninit(&bio); - - if (ret) { - btrfs_dev_stat_inc_and_print(sblock_bad->dev, - BTRFS_DEV_STAT_WRITE_ERRS); - atomic64_inc(&fs_info->dev_replace.num_write_errors); - return -EIO; - } - } - - return 0; -} - -static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) -{ - struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - int i; - - /* - * This block is used for the check of the parity on the source device, - * so the data needn't be written into the destination device. - */ - if (sblock->sparity) - return; - - for (i = 0; i < sblock->sector_count; i++) { - int ret; - - ret = scrub_write_sector_to_dev_replace(sblock, i); - if (ret) - atomic64_inc(&fs_info->dev_replace.num_write_errors); - } -} - -static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) -{ - const u32 sectorsize = sblock->sctx->fs_info->sectorsize; - struct scrub_sector *sector = sblock->sectors[sector_num]; - - if (sector->io_error) - memset(scrub_sector_get_kaddr(sector), 0, sectorsize); - - return scrub_add_sector_to_wr_bio(sblock->sctx, sector); -} - static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) { int ret = 0; @@ -1810,1089 +604,653 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } -static void scrub_block_get(struct scrub_block *sblock) +static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr) { - refcount_inc(&sblock->refs); -} - -static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_sector *sector) -{ - struct scrub_block *sblock = sector->sblock; - struct scrub_bio *sbio; - int ret; - const u32 sectorsize = sctx->fs_info->sectorsize; - - mutex_lock(&sctx->wr_lock); -again: - if (!sctx->wr_curr_bio) { - sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), - GFP_KERNEL); - if (!sctx->wr_curr_bio) { - mutex_unlock(&sctx->wr_lock); - return -ENOMEM; - } - sctx->wr_curr_bio->sctx = sctx; - sctx->wr_curr_bio->sector_count = 0; - } - sbio = sctx->wr_curr_bio; - if (sbio->sector_count == 0) { - ret = fill_writer_pointer_gap(sctx, sector->offset + - sblock->physical_for_dev_replace); - if (ret) { - mutex_unlock(&sctx->wr_lock); - return ret; - } - - sbio->physical = sblock->physical_for_dev_replace + sector->offset; - sbio->logical = sblock->logical + sector->offset; - sbio->dev = sctx->wr_tgtdev; - if (!sbio->bio) { - sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, - REQ_OP_WRITE, GFP_NOFS); - } - sbio->bio->bi_private = sbio; - sbio->bio->bi_end_io = scrub_wr_bio_end_io; - sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; - sbio->status = 0; - } else if (sbio->physical + sbio->sector_count * sectorsize != - sblock->physical_for_dev_replace + sector->offset || - sbio->logical + sbio->sector_count * sectorsize != - sblock->logical + sector->offset) { - scrub_wr_submit(sctx); - goto again; - } + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT; - ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); - if (ret != sectorsize) { - if (sbio->sector_count < 1) { - bio_put(sbio->bio); - sbio->bio = NULL; - mutex_unlock(&sctx->wr_lock); - return -EIO; - } - scrub_wr_submit(sctx); - goto again; - } - - sbio->sectors[sbio->sector_count] = sector; - scrub_sector_get(sector); - /* - * Since ssector no longer holds a page, but uses sblock::pages, we - * have to ensure the sblock had not been freed before our write bio - * finished. - */ - scrub_block_get(sector->sblock); - - sbio->sector_count++; - if (sbio->sector_count == sctx->sectors_per_bio) - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - - return 0; + return stripe->pages[page_index]; } -static void scrub_wr_submit(struct scrub_ctx *sctx) +static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe, + int sector_nr) { - struct scrub_bio *sbio; - - if (!sctx->wr_curr_bio) - return; - - sbio = sctx->wr_curr_bio; - sctx->wr_curr_bio = NULL; - scrub_pending_bio_inc(sctx); - /* process all writes in a single worker thread. Then the block layer - * orders the requests before sending them to the driver which - * doubled the write performance on spinning disks when measured - * with Linux 3.5 */ - btrfsic_check_bio(sbio->bio); - submit_bio(sbio->bio); - - if (btrfs_is_zoned(sctx->fs_info)) - sctx->write_pointer = sbio->physical + sbio->sector_count * - sctx->fs_info->sectorsize; -} - -static void scrub_wr_bio_end_io(struct bio *bio) -{ - struct scrub_bio *sbio = bio->bi_private; - struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - - sbio->status = bio->bi_status; - sbio->bio = bio; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker); - queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); + return offset_in_page(sector_nr << fs_info->sectorsize_bits); } -static void scrub_wr_bio_end_io_worker(struct work_struct *work) +static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) { - struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_ctx *sctx = sbio->sctx; - int i; - - ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); - if (sbio->status) { - struct btrfs_dev_replace *dev_replace = - &sbio->sctx->fs_info->dev_replace; - - for (i = 0; i < sbio->sector_count; i++) { - struct scrub_sector *sector = sbio->sectors[i]; - - sector->io_error = 1; - atomic64_inc(&dev_replace->num_write_errors); - } - } - - /* - * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in - * endio we should put the sblock. - */ - for (i = 0; i < sbio->sector_count; i++) { - scrub_block_put(sbio->sectors[i]->sblock); - scrub_sector_put(sbio->sectors[i]); - } - - bio_put(sbio->bio); - kfree(sbio); - scrub_pending_bio_dec(sctx); -} - -static int scrub_checksum(struct scrub_block *sblock) -{ - u64 flags; - int ret; - - /* - * No need to initialize these stats currently, - * because this function only use return value - * instead of these stats value. - * - * Todo: - * always use stats - */ - sblock->header_error = 0; - sblock->generation_error = 0; - sblock->checksum_error = 0; - - WARN_ON(sblock->sector_count < 1); - flags = sblock->sectors[0]->flags; - ret = 0; - if (flags & BTRFS_EXTENT_FLAG_DATA) - ret = scrub_checksum_data(sblock); - else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) - ret = scrub_checksum_tree_block(sblock); - else if (flags & BTRFS_EXTENT_FLAG_SUPER) - ret = scrub_checksum_super(sblock); - else - WARN_ON(1); - if (ret) - scrub_handle_errored_block(sblock); - - return ret; -} - -static int scrub_checksum_data(struct scrub_block *sblock) -{ - struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; + const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); + const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr); + const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - u8 csum[BTRFS_CSUM_SIZE]; - struct scrub_sector *sector; - char *kaddr; - - BUG_ON(sblock->sector_count < 1); - sector = sblock->sectors[0]; - if (!sector->have_csum) - return 0; - - kaddr = scrub_sector_get_kaddr(sector); - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - - crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - - if (memcmp(csum, sector->csum, fs_info->csum_size)) - sblock->checksum_error = 1; - return sblock->checksum_error; -} - -static int scrub_checksum_tree_block(struct scrub_block *sblock) -{ - struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_header *h; - struct btrfs_fs_info *fs_info = sctx->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; - /* - * This is done in sectorsize steps even for metadata as there's a - * constraint for nodesize to be aligned to sectorsize. This will need - * to change so we don't misuse data and metadata units like that. - */ - const u32 sectorsize = sctx->fs_info->sectorsize; - const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; - int i; - struct scrub_sector *sector; - char *kaddr; - - BUG_ON(sblock->sector_count < 1); - - /* Each member in sectors is just one sector */ - ASSERT(sblock->sector_count == num_sectors); - - sector = sblock->sectors[0]; - kaddr = scrub_sector_get_kaddr(sector); - h = (struct btrfs_header *)kaddr; - memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); + u8 calculated_csum[BTRFS_CSUM_SIZE]; + struct btrfs_header *header; /* - * we don't use the getter functions here, as we - * a) don't have an extent buffer and - * b) the page is already kmapped + * Here we don't have a good way to attach the pages (and subpages) + * to a dummy extent buffer, thus we have to directly grab the members + * from pages. */ - if (sblock->logical != btrfs_stack_header_bytenr(h)) { - sblock->header_error = 1; + header = (struct btrfs_header *)(page_address(first_page) + first_off); + memcpy(on_disk_csum, header->csum, fs_info->csum_size); + + if (logical != btrfs_stack_header_bytenr(header)) { + bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad bytenr, has %llu want %llu", - sblock->logical, sblock->mirror_num, - btrfs_stack_header_bytenr(h), - sblock->logical); - goto out; + logical, stripe->mirror_num, + btrfs_stack_header_bytenr(header), logical); + return; } - - if (!scrub_check_fsid(h->fsid, sector)) { - sblock->header_error = 1; + if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad fsid, has %pU want %pU", - sblock->logical, sblock->mirror_num, - h->fsid, sblock->dev->fs_devices->fsid); - goto out; + logical, stripe->mirror_num, + header->fsid, fs_info->fs_devices->fsid); + return; } - - if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { - sblock->header_error = 1; + if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, + BTRFS_UUID_SIZE) != 0) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", - sblock->logical, sblock->mirror_num, - h->chunk_tree_uuid, fs_info->chunk_tree_uuid); - goto out; + logical, stripe->mirror_num, + header->chunk_tree_uuid, fs_info->chunk_tree_uuid); + return; } + /* Now check tree block csum. */ shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, - sectorsize - BTRFS_CSUM_SIZE); + crypto_shash_update(shash, page_address(first_page) + first_off + + BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE); + + for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { + struct page *page = scrub_stripe_get_page(stripe, i); + unsigned int page_off = scrub_stripe_get_page_offset(stripe, i); - for (i = 1; i < num_sectors; i++) { - kaddr = scrub_sector_get_kaddr(sblock->sectors[i]); - crypto_shash_update(shash, kaddr, sectorsize); + crypto_shash_update(shash, page_address(page) + page_off, + fs_info->sectorsize); } crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { - sblock->checksum_error = 1; + if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, - sblock->logical, sblock->mirror_num, + logical, stripe->mirror_num, CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); - goto out; + return; } - - if (sector->generation != btrfs_stack_header_generation(h)) { - sblock->header_error = 1; - sblock->generation_error = 1; + if (stripe->sectors[sector_nr].generation != + btrfs_stack_header_generation(header)) { + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad generation, has %llu want %llu", - sblock->logical, sblock->mirror_num, - btrfs_stack_header_generation(h), - sector->generation); + logical, stripe->mirror_num, + btrfs_stack_header_generation(header), + stripe->sectors[sector_nr].generation); + return; } - -out: - return sblock->header_error || sblock->checksum_error; + bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); + bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); + bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); } -static int scrub_checksum_super(struct scrub_block *sblock) +static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) { - struct btrfs_super_block *s; - struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - u8 calculated_csum[BTRFS_CSUM_SIZE]; - struct scrub_sector *sector; - char *kaddr; - int fail_gen = 0; - int fail_cor = 0; - - BUG_ON(sblock->sector_count < 1); - sector = sblock->sectors[0]; - kaddr = scrub_sector_get_kaddr(sector); - s = (struct btrfs_super_block *)kaddr; - - if (sblock->logical != btrfs_super_bytenr(s)) - ++fail_cor; - - if (sector->generation != btrfs_super_generation(s)) - ++fail_gen; - - if (!scrub_check_fsid(s->fsid, sector)) - ++fail_cor; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; + const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; + struct page *page = scrub_stripe_get_page(stripe, sector_nr); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + u8 csum_buf[BTRFS_CSUM_SIZE]; + int ret; - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum); + ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); - if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size)) - ++fail_cor; + /* Sector not utilized, skip it. */ + if (!test_bit(sector_nr, &stripe->extent_sector_bitmap)) + return; - return fail_cor + fail_gen; -} + /* IO error, no need to check. */ + if (test_bit(sector_nr, &stripe->io_error_bitmap)) + return; -static void scrub_block_put(struct scrub_block *sblock) -{ - if (refcount_dec_and_test(&sblock->refs)) { - int i; - - if (sblock->sparity) - scrub_parity_put(sblock->sparity); - - for (i = 0; i < sblock->sector_count; i++) - scrub_sector_put(sblock->sectors[i]); - for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) { - if (sblock->pages[i]) { - detach_scrub_page_private(sblock->pages[i]); - __free_page(sblock->pages[i]); - } + /* Metadata, verify the full tree block. */ + if (sector->is_metadata) { + /* + * Check if the tree block crosses the stripe boudary. If + * crossed the boundary, we cannot verify it but only give a + * warning. + * + * This can only happen on a very old filesystem where chunks + * are not ensured to be stripe aligned. + */ + if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { + btrfs_warn_rl(fs_info, + "tree block at %llu crosses stripe boundary %llu", + stripe->logical + + (sector_nr << fs_info->sectorsize_bits), + stripe->logical); + return; } - kfree(sblock); - } -} - -static void scrub_sector_get(struct scrub_sector *sector) -{ - atomic_inc(§or->refs); -} - -static void scrub_sector_put(struct scrub_sector *sector) -{ - if (atomic_dec_and_test(§or->refs)) - kfree(sector); -} - -/* - * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 - * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. - */ -static void scrub_throttle(struct scrub_ctx *sctx) -{ - const int time_slice = 1000; - struct scrub_bio *sbio; - struct btrfs_device *device; - s64 delta; - ktime_t now; - u32 div; - u64 bwlimit; - - sbio = sctx->bios[sctx->curr]; - device = sbio->dev; - bwlimit = READ_ONCE(device->scrub_speed_max); - if (bwlimit == 0) + scrub_verify_one_metadata(stripe, sector_nr); return; + } /* - * Slice is divided into intervals when the IO is submitted, adjust by - * bwlimit and maximum of 64 intervals. + * Data is easier, we just verify the data csum (if we have it). For + * cases without csum, we have no other choice but to trust it. */ - div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); - div = min_t(u32, 64, div); - - /* Start new epoch, set deadline */ - now = ktime_get(); - if (sctx->throttle_deadline == 0) { - sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); - sctx->throttle_sent = 0; + if (!sector->csum) { + clear_bit(sector_nr, &stripe->error_bitmap); + return; } - /* Still in the time to send? */ - if (ktime_before(now, sctx->throttle_deadline)) { - /* If current bio is within the limit, send it */ - sctx->throttle_sent += sbio->bio->bi_iter.bi_size; - if (sctx->throttle_sent <= div_u64(bwlimit, div)) - return; - - /* We're over the limit, sleep until the rest of the slice */ - delta = ktime_ms_delta(sctx->throttle_deadline, now); + ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum); + if (ret < 0) { + set_bit(sector_nr, &stripe->csum_error_bitmap); + set_bit(sector_nr, &stripe->error_bitmap); } else { - /* New request after deadline, start new epoch */ - delta = 0; - } - - if (delta) { - long timeout; - - timeout = div_u64(delta * HZ, 1000); - schedule_timeout_interruptible(timeout); + clear_bit(sector_nr, &stripe->csum_error_bitmap); + clear_bit(sector_nr, &stripe->error_bitmap); } - - /* Next call will start the deadline period */ - sctx->throttle_deadline = 0; -} - -static void scrub_submit(struct scrub_ctx *sctx) -{ - struct scrub_bio *sbio; - - if (sctx->curr == -1) - return; - - scrub_throttle(sctx); - - sbio = sctx->bios[sctx->curr]; - sctx->curr = -1; - scrub_pending_bio_inc(sctx); - btrfsic_check_bio(sbio->bio); - submit_bio(sbio->bio); } -static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx, - struct scrub_sector *sector) +/* Verify specified sectors of a stripe. */ +static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap) { - struct scrub_block *sblock = sector->sblock; - struct scrub_bio *sbio; - const u32 sectorsize = sctx->fs_info->sectorsize; - int ret; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; + int sector_nr; -again: - /* - * grab a fresh bio or wait for one to become available - */ - while (sctx->curr == -1) { - spin_lock(&sctx->list_lock); - sctx->curr = sctx->first_free; - if (sctx->curr != -1) { - sctx->first_free = sctx->bios[sctx->curr]->next_free; - sctx->bios[sctx->curr]->next_free = -1; - sctx->bios[sctx->curr]->sector_count = 0; - spin_unlock(&sctx->list_lock); - } else { - spin_unlock(&sctx->list_lock); - wait_event(sctx->list_wait, sctx->first_free != -1); - } + for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { + scrub_verify_one_sector(stripe, sector_nr); + if (stripe->sectors[sector_nr].is_metadata) + sector_nr += sectors_per_tree - 1; } - sbio = sctx->bios[sctx->curr]; - if (sbio->sector_count == 0) { - sbio->physical = sblock->physical + sector->offset; - sbio->logical = sblock->logical + sector->offset; - sbio->dev = sblock->dev; - if (!sbio->bio) { - sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, - REQ_OP_READ, GFP_NOFS); - } - sbio->bio->bi_private = sbio; - sbio->bio->bi_end_io = scrub_bio_end_io; - sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; - sbio->status = 0; - } else if (sbio->physical + sbio->sector_count * sectorsize != - sblock->physical + sector->offset || - sbio->logical + sbio->sector_count * sectorsize != - sblock->logical + sector->offset || - sbio->dev != sblock->dev) { - scrub_submit(sctx); - goto again; - } - - sbio->sectors[sbio->sector_count] = sector; - ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); - if (ret != sectorsize) { - if (sbio->sector_count < 1) { - bio_put(sbio->bio); - sbio->bio = NULL; - return -EIO; - } - scrub_submit(sctx); - goto again; - } - - scrub_block_get(sblock); /* one for the page added to the bio */ - atomic_inc(&sblock->outstanding_sectors); - sbio->sector_count++; - if (sbio->sector_count == sctx->sectors_per_bio) - scrub_submit(sctx); - - return 0; } -static void scrub_missing_raid56_end_io(struct bio *bio) +static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec) { - struct scrub_block *sblock = bio->bi_private; - struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - - btrfs_bio_counter_dec(fs_info); - if (bio->bi_status) - sblock->no_io_error_seen = 0; - - bio_put(bio); + int i; - queue_work(fs_info->scrub_workers, &sblock->work); + for (i = 0; i < stripe->nr_sectors; i++) { + if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page && + scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset) + break; + } + ASSERT(i < stripe->nr_sectors); + return i; } -static void scrub_missing_raid56_worker(struct work_struct *work) +/* + * Repair read is different to the regular read: + * + * - Only reads the failed sectors + * - May have extra blocksize limits + */ +static void scrub_repair_read_endio(struct btrfs_bio *bbio) { - struct scrub_block *sblock = container_of(work, struct scrub_block, work); - struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 logical; - struct btrfs_device *dev; + struct scrub_stripe *stripe = bbio->private; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + u32 bio_size = 0; + int i; - logical = sblock->logical; - dev = sblock->dev; + ASSERT(sector_nr < stripe->nr_sectors); - if (sblock->no_io_error_seen) - scrub_recheck_block_checksum(sblock); + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; - if (!sblock->no_io_error_seen) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_err_rl_in_rcu(fs_info, - "IO error rebuilding logical %llu for dev %s", - logical, btrfs_dev_name(dev)); - } else if (sblock->header_error || sblock->checksum_error) { - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_err_rl_in_rcu(fs_info, - "failed to rebuild valid logical %llu for dev %s", - logical, btrfs_dev_name(dev)); + if (bbio->bio.bi_status) { + bitmap_set(&stripe->io_error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); + bitmap_set(&stripe->error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); } else { - scrub_write_block_to_dev_replace(sblock); - } - - if (sctx->is_dev_replace && sctx->flush_all_writes) { - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); + bitmap_clear(&stripe->io_error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); } + bio_put(&bbio->bio); + if (atomic_dec_and_test(&stripe->pending_io)) + wake_up(&stripe->io_wait); +} - scrub_block_put(sblock); - scrub_pending_bio_dec(sctx); +static int calc_next_mirror(int mirror, int num_copies) +{ + ASSERT(mirror <= num_copies); + return (mirror + 1 > num_copies) ? 1 : mirror + 1; } -static void scrub_missing_raid56_pages(struct scrub_block *sblock) +static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, + int mirror, int blocksize, bool wait) { - struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = sblock->sector_count << fs_info->sectorsize_bits; - u64 logical = sblock->logical; - struct btrfs_io_context *bioc = NULL; - struct bio *bio; - struct btrfs_raid_bio *rbio; - int ret; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct btrfs_bio *bbio = NULL; + const unsigned long old_error_bitmap = stripe->error_bitmap; int i; - btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bioc); - if (ret || !bioc || !bioc->raid_map) - goto bioc_out; - - if (WARN_ON(!sctx->is_dev_replace || - !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { - /* - * We shouldn't be scrubbing a missing device. Even for dev - * replace, we should only get here for RAID 5/6. We either - * managed to mount something with no mirrors remaining or - * there's a bug in scrub_find_good_copy()/btrfs_map_block(). - */ - goto bioc_out; - } + ASSERT(stripe->mirror_num >= 1); + ASSERT(atomic_read(&stripe->pending_io) == 0); - bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = logical >> 9; - bio->bi_private = sblock; - bio->bi_end_io = scrub_missing_raid56_end_io; + for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { + struct page *page; + int pgoff; + int ret; - rbio = raid56_alloc_missing_rbio(bio, bioc); - if (!rbio) - goto rbio_out; + page = scrub_stripe_get_page(stripe, i); + pgoff = scrub_stripe_get_page_offset(stripe, i); + + /* The current sector cannot be merged, submit the bio. */ + if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) || + bbio->bio.bi_iter.bi_size >= blocksize)) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + if (wait) + wait_scrub_stripe_io(stripe); + bbio = NULL; + } - for (i = 0; i < sblock->sector_count; i++) { - struct scrub_sector *sector = sblock->sectors[i]; + if (!bbio) { + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_repair_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = (stripe->logical + + (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; + } - raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector), - scrub_sector_get_page_offset(sector), - sector->offset + sector->sblock->logical); + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + ASSERT(ret == fs_info->sectorsize); + } + if (bbio) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + if (wait) + wait_scrub_stripe_io(stripe); } - - INIT_WORK(&sblock->work, scrub_missing_raid56_worker); - scrub_block_get(sblock); - scrub_pending_bio_inc(sctx); - raid56_submit_missing_rbio(rbio); - btrfs_put_bioc(bioc); - return; - -rbio_out: - bio_put(bio); -bioc_out: - btrfs_bio_counter_dec(fs_info); - btrfs_put_bioc(bioc); - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); } -static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, - u64 physical_for_dev_replace) +static void scrub_stripe_report_errors(struct scrub_ctx *sctx, + struct scrub_stripe *stripe) { - struct scrub_block *sblock; - const u32 sectorsize = sctx->fs_info->sectorsize; - int index; + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_device *dev = NULL; + u64 physical = 0; + int nr_data_sectors = 0; + int nr_meta_sectors = 0; + int nr_nodatacsum_sectors = 0; + int nr_repaired_sectors = 0; + int sector_nr; + + if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state)) + return; - sblock = alloc_scrub_block(sctx, dev, logical, physical, - physical_for_dev_replace, mirror_num); - if (!sblock) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; - } + /* + * Init needed infos for error reporting. + * + * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio() + * thus no need for dev/physical, error reporting still needs dev and physical. + */ + if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { + u64 mapped_len = fs_info->sectorsize; + struct btrfs_io_context *bioc = NULL; + int stripe_index = stripe->mirror_num - 1; + int ret; - for (index = 0; len > 0; index++) { - struct scrub_sector *sector; + /* For scrub, our mirror_num should always start at 1. */ + ASSERT(stripe->mirror_num >= 1); + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + stripe->logical, &mapped_len, &bioc); /* - * Here we will allocate one page for one sector to scrub. - * This is fine if PAGE_SIZE == sectorsize, but will cost - * more memory for PAGE_SIZE > sectorsize case. + * If we failed, dev will be NULL, and later detailed reports + * will just be skipped. */ - u32 l = min(sectorsize, len); + if (ret < 0) + goto skip; + physical = bioc->stripes[stripe_index].physical; + dev = bioc->stripes[stripe_index].dev; + btrfs_put_bioc(bioc); + } - sector = alloc_scrub_sector(sblock, logical); - if (!sector) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - scrub_block_put(sblock); - return -ENOMEM; - } - sector->flags = flags; - sector->generation = gen; - if (csum) { - sector->have_csum = 1; - memcpy(sector->csum, csum, sctx->fs_info->csum_size); +skip: + for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + bool repaired = false; + + if (stripe->sectors[sector_nr].is_metadata) { + nr_meta_sectors++; } else { - sector->have_csum = 0; + nr_data_sectors++; + if (!stripe->sectors[sector_nr].csum) + nr_nodatacsum_sectors++; } - len -= l; - logical += l; - physical += l; - physical_for_dev_replace += l; - } - WARN_ON(sblock->sector_count == 0); - if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { + if (test_bit(sector_nr, &stripe->init_error_bitmap) && + !test_bit(sector_nr, &stripe->error_bitmap)) { + nr_repaired_sectors++; + repaired = true; + } + + /* Good sector from the beginning, nothing need to be done. */ + if (!test_bit(sector_nr, &stripe->init_error_bitmap)) + continue; + /* - * This case should only be hit for RAID 5/6 device replace. See - * the comment in scrub_missing_raid56_pages() for details. + * Report error for the corrupted sectors. If repaired, just + * output the message of repaired message. */ - scrub_missing_raid56_pages(sblock); - } else { - for (index = 0; index < sblock->sector_count; index++) { - struct scrub_sector *sector = sblock->sectors[index]; - int ret; - - ret = scrub_add_sector_to_rd_bio(sctx, sector); - if (ret) { - scrub_block_put(sblock); - return ret; + if (repaired) { + if (dev) { + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on dev %s physical %llu", + stripe->logical, btrfs_dev_name(dev), + physical); + } else { + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on mirror %u", + stripe->logical, stripe->mirror_num); } + continue; } - if (flags & BTRFS_EXTENT_FLAG_SUPER) - scrub_submit(sctx); - } - - /* last one frees, either here or in bio completion for last page */ - scrub_block_put(sblock); - return 0; -} - -static void scrub_bio_end_io(struct bio *bio) -{ - struct scrub_bio *sbio = bio->bi_private; - struct btrfs_fs_info *fs_info = sbio->dev->fs_info; + /* The remaining are all for unrepaired. */ + if (dev) { + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on dev %s physical %llu", + stripe->logical, btrfs_dev_name(dev), + physical); + } else { + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on mirror %u", + stripe->logical, stripe->mirror_num); + } - sbio->status = bio->bi_status; - sbio->bio = bio; + if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("i/o error", dev, false, + stripe->logical, physical); + if (test_bit(sector_nr, &stripe->csum_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("checksum error", dev, false, + stripe->logical, physical); + if (test_bit(sector_nr, &stripe->meta_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("header error", dev, false, + stripe->logical, physical); + } - queue_work(fs_info->scrub_workers, &sbio->work); + spin_lock(&sctx->stat_lock); + sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; + sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; + sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; + sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; + sctx->stat.no_csum += nr_nodatacsum_sectors; + sctx->stat.read_errors += + bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors); + sctx->stat.csum_errors += + bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors); + sctx->stat.verify_errors += + bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors); + sctx->stat.uncorrectable_errors += + bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); + sctx->stat.corrected_errors += nr_repaired_sectors; + spin_unlock(&sctx->stat_lock); } -static void scrub_bio_end_io_worker(struct work_struct *work) +/* + * The main entrance for all read related scrub work, including: + * + * - Wait for the initial read to finish + * - Verify and locate any bad sectors + * - Go through the remaining mirrors and try to read as large blocksize as + * possible + * - Go through all mirrors (including the failed mirror) sector-by-sector + * + * Writeback does not happen here, it needs extra synchronization. + */ +static void scrub_stripe_read_repair_worker(struct work_struct *work) { - struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_ctx *sctx = sbio->sctx; + struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, + stripe->bg->length); + int mirror; int i; - ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); - if (sbio->status) { - for (i = 0; i < sbio->sector_count; i++) { - struct scrub_sector *sector = sbio->sectors[i]; + ASSERT(stripe->mirror_num > 0); - sector->io_error = 1; - sector->sblock->no_io_error_seen = 0; - } - } + wait_scrub_stripe_io(stripe); + scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); + /* Save the initial failed bitmap for later repair and report usage. */ + stripe->init_error_bitmap = stripe->error_bitmap; - /* Now complete the scrub_block items that have all pages completed */ - for (i = 0; i < sbio->sector_count; i++) { - struct scrub_sector *sector = sbio->sectors[i]; - struct scrub_block *sblock = sector->sblock; + if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) + goto out; - if (atomic_dec_and_test(&sblock->outstanding_sectors)) - scrub_block_complete(sblock); - scrub_block_put(sblock); + /* + * Try all remaining mirrors. + * + * Here we still try to read as large block as possible, as this is + * faster and we have extra safety nets to rely on. + */ + for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); + mirror != stripe->mirror_num; + mirror = calc_next_mirror(mirror, num_copies)) { + const unsigned long old_error_bitmap = stripe->error_bitmap; + + scrub_stripe_submit_repair_read(stripe, mirror, + BTRFS_STRIPE_LEN, false); + wait_scrub_stripe_io(stripe); + scrub_verify_one_stripe(stripe, old_error_bitmap); + if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + goto out; } - bio_put(sbio->bio); - sbio->bio = NULL; - spin_lock(&sctx->list_lock); - sbio->next_free = sctx->first_free; - sctx->first_free = sbio->index; - spin_unlock(&sctx->list_lock); + /* + * Last safety net, try re-checking all mirrors, including the failed + * one, sector-by-sector. + * + * As if one sector failed the drive's internal csum, the whole read + * containing the offending sector would be marked as error. + * Thus here we do sector-by-sector read. + * + * This can be slow, thus we only try it as the last resort. + */ - if (sctx->is_dev_replace && sctx->flush_all_writes) { - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - } + for (i = 0, mirror = stripe->mirror_num; + i < num_copies; + i++, mirror = calc_next_mirror(mirror, num_copies)) { + const unsigned long old_error_bitmap = stripe->error_bitmap; - scrub_pending_bio_dec(sctx); + scrub_stripe_submit_repair_read(stripe, mirror, + fs_info->sectorsize, true); + wait_scrub_stripe_io(stripe); + scrub_verify_one_stripe(stripe, old_error_bitmap); + if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + goto out; + } +out: + scrub_stripe_report_errors(stripe->sctx, stripe); + set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); + wake_up(&stripe->repair_wait); } -static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, - unsigned long *bitmap, - u64 start, u32 len) +static void scrub_read_endio(struct btrfs_bio *bbio) { - u64 offset; - u32 nsectors; - u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits; + struct scrub_stripe *stripe = bbio->private; - if (len >= sparity->stripe_len) { - bitmap_set(bitmap, 0, sparity->nsectors); - return; + if (bbio->bio.bi_status) { + bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); + bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); + } else { + bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); } - - start -= sparity->logic_start; - start = div64_u64_rem(start, sparity->stripe_len, &offset); - offset = offset >> sectorsize_bits; - nsectors = len >> sectorsize_bits; - - if (offset + nsectors <= sparity->nsectors) { - bitmap_set(bitmap, offset, nsectors); - return; + bio_put(&bbio->bio); + if (atomic_dec_and_test(&stripe->pending_io)) { + wake_up(&stripe->io_wait); + INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); + queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); } - - bitmap_set(bitmap, offset, sparity->nsectors - offset); - bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); -} - -static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, - u64 start, u32 len) -{ - __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len); } -static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, - u64 start, u32 len) +static void scrub_write_endio(struct btrfs_bio *bbio) { - __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len); -} - -static void scrub_block_complete(struct scrub_block *sblock) -{ - int corrupted = 0; + struct scrub_stripe *stripe = bbio->private; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct bio_vec *bvec; + int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); + u32 bio_size = 0; + int i; - if (!sblock->no_io_error_seen) { - corrupted = 1; - scrub_handle_errored_block(sblock); - } else { - /* - * if has checksum error, write via repair mechanism in - * dev replace case, otherwise write here in dev replace - * case. - */ - corrupted = scrub_checksum(sblock); - if (!corrupted && sblock->sctx->is_dev_replace) - scrub_write_block_to_dev_replace(sblock); - } + bio_for_each_bvec_all(bvec, &bbio->bio, i) + bio_size += bvec->bv_len; - if (sblock->sparity && corrupted && !sblock->data_corrected) { - u64 start = sblock->logical; - u64 end = sblock->logical + - sblock->sectors[sblock->sector_count - 1]->offset + - sblock->sctx->fs_info->sectorsize; + if (bbio->bio.bi_status) { + unsigned long flags; - ASSERT(end - start <= U32_MAX); - scrub_parity_mark_sectors_error(sblock->sparity, - start, end - start); + spin_lock_irqsave(&stripe->write_error_lock, flags); + bitmap_set(&stripe->write_error_bitmap, sector_nr, + bio_size >> fs_info->sectorsize_bits); + spin_unlock_irqrestore(&stripe->write_error_lock, flags); } -} + bio_put(&bbio->bio); -static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum) -{ - sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits; - list_del(&sum->list); - kfree(sum); + if (atomic_dec_and_test(&stripe->pending_io)) + wake_up(&stripe->io_wait); } /* - * Find the desired csum for range [logical, logical + sectorsize), and store - * the csum into @csum. + * Submit the write bio(s) for the sectors specified by @write_bitmap. * - * The search source is sctx->csum_list, which is a pre-populated list - * storing bytenr ordered csum ranges. We're responsible to cleanup any range - * that is before @logical. + * Here we utilize btrfs_submit_repair_write(), which has some extra benefits: * - * Return 0 if there is no csum for the range. - * Return 1 if there is csum for the range and copied to @csum. + * - Only needs logical bytenr and mirror_num + * Just like the scrub read path + * + * - Would only result in writes to the specified mirror + * Unlike the regular writeback path, which would write back to all stripes + * + * - Handle dev-replace and read-repair writeback differently */ -static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) +static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, + unsigned long write_bitmap, bool dev_replace) { - bool found = false; + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct btrfs_bio *bbio = NULL; + const bool zoned = btrfs_is_zoned(fs_info); + int sector_nr; - while (!list_empty(&sctx->csum_list)) { - struct btrfs_ordered_sum *sum = NULL; - unsigned long index; - unsigned long num_sectors; - - sum = list_first_entry(&sctx->csum_list, - struct btrfs_ordered_sum, list); - /* The current csum range is beyond our range, no csum found */ - if (sum->bytenr > logical) - break; + for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { + struct page *page = scrub_stripe_get_page(stripe, sector_nr); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + int ret; - /* - * The current sum is before our bytenr, since scrub is always - * done in bytenr order, the csum will never be used anymore, - * clean it up so that later calls won't bother with the range, - * and continue search the next range. - */ - if (sum->bytenr + sum->len <= logical) { - drop_csum_range(sctx, sum); - continue; + /* We should only writeback sectors covered by an extent. */ + ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap)); + + /* Cannot merge with previous sector, submit the current one. */ + if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { + fill_writer_pointer_gap(sctx, stripe->physical + + (sector_nr << fs_info->sectorsize_bits)); + atomic_inc(&stripe->pending_io); + btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); + /* For zoned writeback, queue depth must be 1. */ + if (zoned) + wait_scrub_stripe_io(stripe); + bbio = NULL; } - - /* Now the csum range covers our bytenr, copy the csum */ - found = true; - index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits; - num_sectors = sum->len >> sctx->fs_info->sectorsize_bits; - - memcpy(csum, sum->sums + index * sctx->fs_info->csum_size, - sctx->fs_info->csum_size); - - /* Cleanup the range if we're at the end of the csum range */ - if (index == num_sectors - 1) - drop_csum_range(sctx, sum); - break; + if (!bbio) { + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE, + fs_info, scrub_write_endio, stripe); + bbio->bio.bi_iter.bi_sector = (stripe->logical + + (sector_nr << fs_info->sectorsize_bits)) >> + SECTOR_SHIFT; + } + ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + ASSERT(ret == fs_info->sectorsize); + } + if (bbio) { + fill_writer_pointer_gap(sctx, bbio->bio.bi_iter.bi_sector << + SECTOR_SHIFT); + atomic_inc(&stripe->pending_io); + btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); + if (zoned) + wait_scrub_stripe_io(stripe); } - if (!found) - return 0; - return 1; } -/* scrub extent tries to collect up to 64 kB for each bio */ -static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, - u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num) +/* + * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 + * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. + */ +static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device, + unsigned int bio_size) { - struct btrfs_device *src_dev = dev; - u64 src_physical = physical; - int src_mirror = mirror_num; - int ret; - u8 csum[BTRFS_CSUM_SIZE]; - u32 blocksize; + const int time_slice = 1000; + s64 delta; + ktime_t now; + u32 div; + u64 bwlimit; - if (flags & BTRFS_EXTENT_FLAG_DATA) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - blocksize = map->stripe_len; - else - blocksize = sctx->fs_info->sectorsize; - spin_lock(&sctx->stat_lock); - sctx->stat.data_extents_scrubbed++; - sctx->stat.data_bytes_scrubbed += len; - spin_unlock(&sctx->stat_lock); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - blocksize = map->stripe_len; - else - blocksize = sctx->fs_info->nodesize; - spin_lock(&sctx->stat_lock); - sctx->stat.tree_extents_scrubbed++; - sctx->stat.tree_bytes_scrubbed += len; - spin_unlock(&sctx->stat_lock); - } else { - blocksize = sctx->fs_info->sectorsize; - WARN_ON(1); - } + bwlimit = READ_ONCE(device->scrub_speed_max); + if (bwlimit == 0) + return; /* - * For dev-replace case, we can have @dev being a missing device. - * Regular scrub will avoid its execution on missing device at all, - * as that would trigger tons of read error. - * - * Reading from missing device will cause read error counts to - * increase unnecessarily. - * So here we change the read source to a good mirror. + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. */ - if (sctx->is_dev_replace && !dev->bdev) - scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical, - &src_dev, &src_mirror); - while (len) { - u32 l = min(len, blocksize); - int have_csum = 0; - - if (flags & BTRFS_EXTENT_FLAG_DATA) { - /* push csums to sbio */ - have_csum = scrub_find_csum(sctx, logical, csum); - if (have_csum == 0) - ++sctx->stat.no_csum; - } - ret = scrub_sectors(sctx, logical, l, src_physical, src_dev, - flags, gen, src_mirror, - have_csum ? csum : NULL, physical); - if (ret) - return ret; - len -= l; - logical += l; - physical += l; - src_physical += l; - } - return 0; -} - -static int scrub_sectors_for_parity(struct scrub_parity *sparity, - u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, - u64 flags, u64 gen, int mirror_num, u8 *csum) -{ - struct scrub_ctx *sctx = sparity->sctx; - struct scrub_block *sblock; - const u32 sectorsize = sctx->fs_info->sectorsize; - int index; - - ASSERT(IS_ALIGNED(len, sectorsize)); - - sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num); - if (!sblock) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; - } - - sblock->sparity = sparity; - scrub_parity_get(sparity); - - for (index = 0; len > 0; index++) { - struct scrub_sector *sector; - - sector = alloc_scrub_sector(sblock, logical); - if (!sector) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - scrub_block_put(sblock); - return -ENOMEM; - } - sblock->sectors[index] = sector; - /* For scrub parity */ - scrub_sector_get(sector); - list_add_tail(§or->list, &sparity->sectors_list); - sector->flags = flags; - sector->generation = gen; - if (csum) { - sector->have_csum = 1; - memcpy(sector->csum, csum, sctx->fs_info->csum_size); - } else { - sector->have_csum = 0; - } - - /* Iterate over the stripe range in sectorsize steps */ - len -= sectorsize; - logical += sectorsize; - physical += sectorsize; - } - - WARN_ON(sblock->sector_count == 0); - for (index = 0; index < sblock->sector_count; index++) { - struct scrub_sector *sector = sblock->sectors[index]; - int ret; + div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); + div = min_t(u32, 64, div); - ret = scrub_add_sector_to_rd_bio(sctx, sector); - if (ret) { - scrub_block_put(sblock); - return ret; - } + /* Start new epoch, set deadline */ + now = ktime_get(); + if (sctx->throttle_deadline == 0) { + sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); + sctx->throttle_sent = 0; } - /* Last one frees, either here or in bio completion for last sector */ - scrub_block_put(sblock); - return 0; -} - -static int scrub_extent_for_parity(struct scrub_parity *sparity, - u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, - u64 flags, u64 gen, int mirror_num) -{ - struct scrub_ctx *sctx = sparity->sctx; - int ret; - u8 csum[BTRFS_CSUM_SIZE]; - u32 blocksize; - - if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { - scrub_parity_mark_sectors_error(sparity, logical, len); - return 0; - } + /* Still in the time to send? */ + if (ktime_before(now, sctx->throttle_deadline)) { + /* If current bio is within the limit, send it */ + sctx->throttle_sent += bio_size; + if (sctx->throttle_sent <= div_u64(bwlimit, div)) + return; - if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sparity->stripe_len; - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sparity->stripe_len; + /* We're over the limit, sleep until the rest of the slice */ + delta = ktime_ms_delta(sctx->throttle_deadline, now); } else { - blocksize = sctx->fs_info->sectorsize; - WARN_ON(1); + /* New request after deadline, start new epoch */ + delta = 0; } - while (len) { - u32 l = min(len, blocksize); - int have_csum = 0; + if (delta) { + long timeout; - if (flags & BTRFS_EXTENT_FLAG_DATA) { - /* push csums to sbio */ - have_csum = scrub_find_csum(sctx, logical, csum); - if (have_csum == 0) - goto skip; - } - ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev, - flags, gen, mirror_num, - have_csum ? csum : NULL); - if (ret) - return ret; -skip: - len -= l; - logical += l; - physical += l; + timeout = div_u64(delta * HZ, 1000); + schedule_timeout_interruptible(timeout); } - return 0; + + /* Next call will start the deadline period */ + sctx->throttle_deadline = 0; } /* @@ -2908,10 +1266,7 @@ static int get_raid56_logic_offset(u64 physical, int num, { int i; int j = 0; - u64 stripe_nr; u64 last_offset; - u32 stripe_index; - u32 rot; const int data_stripes = nr_data_stripes(map); last_offset = (physical - map->stripes[num].physical) * data_stripes; @@ -2920,13 +1275,17 @@ static int get_raid56_logic_offset(u64 physical, int num, *offset = last_offset; for (i = 0; i < data_stripes; i++) { - *offset = last_offset + i * map->stripe_len; + u32 stripe_nr; + u32 stripe_index; + u32 rot; - stripe_nr = div64_u64(*offset, map->stripe_len); - stripe_nr = div_u64(stripe_nr, data_stripes); + *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT); + + stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; /* Work out the disk rotation on this stripe-set */ - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); + rot = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; /* calculate which stripe this data locates */ rot += i; stripe_index = rot % map->num_stripes; @@ -2935,123 +1294,10 @@ static int get_raid56_logic_offset(u64 physical, int num, if (stripe_index < num) j++; } - *offset = last_offset + j * map->stripe_len; + *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT); return 1; } -static void scrub_free_parity(struct scrub_parity *sparity) -{ - struct scrub_ctx *sctx = sparity->sctx; - struct scrub_sector *curr, *next; - int nbits; - - nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors); - if (nbits) { - spin_lock(&sctx->stat_lock); - sctx->stat.read_errors += nbits; - sctx->stat.uncorrectable_errors += nbits; - spin_unlock(&sctx->stat_lock); - } - - list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) { - list_del_init(&curr->list); - scrub_sector_put(curr); - } - - kfree(sparity); -} - -static void scrub_parity_bio_endio_worker(struct work_struct *work) -{ - struct scrub_parity *sparity = container_of(work, struct scrub_parity, - work); - struct scrub_ctx *sctx = sparity->sctx; - - btrfs_bio_counter_dec(sctx->fs_info); - scrub_free_parity(sparity); - scrub_pending_bio_dec(sctx); -} - -static void scrub_parity_bio_endio(struct bio *bio) -{ - struct scrub_parity *sparity = bio->bi_private; - struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; - - if (bio->bi_status) - bitmap_or(&sparity->ebitmap, &sparity->ebitmap, - &sparity->dbitmap, sparity->nsectors); - - bio_put(bio); - - INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker); - queue_work(fs_info->scrub_parity_workers, &sparity->work); -} - -static void scrub_parity_check_and_repair(struct scrub_parity *sparity) -{ - struct scrub_ctx *sctx = sparity->sctx; - struct btrfs_fs_info *fs_info = sctx->fs_info; - struct bio *bio; - struct btrfs_raid_bio *rbio; - struct btrfs_io_context *bioc = NULL; - u64 length; - int ret; - - if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap, - &sparity->ebitmap, sparity->nsectors)) - goto out; - - length = sparity->logic_end - sparity->logic_start; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, - &length, &bioc); - if (ret || !bioc || !bioc->raid_map) - goto bioc_out; - - bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = sparity->logic_start >> 9; - bio->bi_private = sparity; - bio->bi_end_io = scrub_parity_bio_endio; - - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, - sparity->scrub_dev, - &sparity->dbitmap, - sparity->nsectors); - btrfs_put_bioc(bioc); - if (!rbio) - goto rbio_out; - - scrub_pending_bio_inc(sctx); - raid56_parity_submit_scrub_rbio(rbio); - return; - -rbio_out: - bio_put(bio); -bioc_out: - btrfs_bio_counter_dec(fs_info); - bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, - sparity->nsectors); - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); -out: - scrub_free_parity(sparity); -} - -static void scrub_parity_get(struct scrub_parity *sparity) -{ - refcount_inc(&sparity->refs); -} - -static void scrub_parity_put(struct scrub_parity *sparity) -{ - if (!refcount_dec_and_test(&sparity->refs)) - return; - - scrub_parity_check_and_repair(sparity); -} - /* * Return 0 if the extent item range covers any byte of the range. * Return <0 if the extent item is before @search_start. @@ -3178,226 +1424,533 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, *generation_ret = btrfs_extent_generation(path->nodes[0], ei); } -static bool does_range_cross_boundary(u64 extent_start, u64 extent_len, - u64 boundary_start, u64 boudary_len) +static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, + u64 physical, u64 physical_end) { - return (extent_start < boundary_start && - extent_start + extent_len > boundary_start) || - (extent_start < boundary_start + boudary_len && - extent_start + extent_len > boundary_start + boudary_len); + struct btrfs_fs_info *fs_info = sctx->fs_info; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + mutex_lock(&sctx->wr_lock); + if (sctx->write_pointer < physical_end) { + ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, + physical, + sctx->write_pointer); + if (ret) + btrfs_err(fs_info, + "zoned: failed to recover write pointer"); + } + mutex_unlock(&sctx->wr_lock); + btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + + return ret; } -static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, - struct scrub_parity *sparity, - struct map_lookup *map, - struct btrfs_device *sdev, - struct btrfs_path *path, - u64 logical) +static void fill_one_extent_info(struct btrfs_fs_info *fs_info, + struct scrub_stripe *stripe, + u64 extent_start, u64 extent_len, + u64 extent_flags, u64 extent_gen) +{ + for (u64 cur_logical = max(stripe->logical, extent_start); + cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN, + extent_start + extent_len); + cur_logical += fs_info->sectorsize) { + const int nr_sector = (cur_logical - stripe->logical) >> + fs_info->sectorsize_bits; + struct scrub_sector_verification *sector = + &stripe->sectors[nr_sector]; + + set_bit(nr_sector, &stripe->extent_sector_bitmap); + if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + sector->is_metadata = true; + sector->generation = extent_gen; + } + } +} + +static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) { - struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); - struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical); - u64 cur_logical = logical; + stripe->extent_sector_bitmap = 0; + stripe->init_error_bitmap = 0; + stripe->error_bitmap = 0; + stripe->io_error_bitmap = 0; + stripe->csum_error_bitmap = 0; + stripe->meta_error_bitmap = 0; +} + +/* + * Locate one stripe which has at least one extent in its range. + * + * Return 0 if found such stripe, and store its info into @stripe. + * Return >0 if there is no such stripe in the specified range. + * Return <0 for error. + */ +static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, + struct btrfs_device *dev, u64 physical, + int mirror_num, u64 logical_start, + u32 logical_len, + struct scrub_stripe *stripe) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); + const u64 logical_end = logical_start + logical_len; + struct btrfs_path path = { 0 }; + u64 cur_logical = logical_start; + u64 stripe_end; + u64 extent_start; + u64 extent_len; + u64 extent_flags; + u64 extent_gen; int ret; - ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * + stripe->nr_sectors); + scrub_stripe_reset_bitmaps(stripe); - /* Path must not be populated */ - ASSERT(!path->nodes[0]); + /* The range must be inside the bg. */ + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); - while (cur_logical < logical + map->stripe_len) { - struct btrfs_io_context *bioc = NULL; - struct btrfs_device *extent_dev; - u64 extent_start; - u64 extent_size; - u64 mapped_length; - u64 extent_flags; - u64 extent_gen; - u64 extent_physical; - u64 extent_mirror_num; - - ret = find_first_extent_item(extent_root, path, cur_logical, - logical + map->stripe_len - cur_logical); - /* No more extent item in this data stripe */ + path.search_commit_root = 1; + path.skip_locking = 1; + + ret = find_first_extent_item(extent_root, &path, logical_start, logical_len); + /* Either error or not found. */ + if (ret) + goto out; + get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen); + if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + stripe->nr_meta_extents++; + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) + stripe->nr_data_extents++; + cur_logical = max(extent_start, cur_logical); + + /* + * Round down to stripe boundary. + * + * The extra calculation against bg->start is to handle block groups + * whose logical bytenr is not BTRFS_STRIPE_LEN aligned. + */ + stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) + + bg->start; + stripe->physical = physical + stripe->logical - logical_start; + stripe->dev = dev; + stripe->bg = bg; + stripe->mirror_num = mirror_num; + stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1; + + /* Fill the first extent info into stripe->sectors[] array. */ + fill_one_extent_info(fs_info, stripe, extent_start, extent_len, + extent_flags, extent_gen); + cur_logical = extent_start + extent_len; + + /* Fill the extent info for the remaining sectors. */ + while (cur_logical <= stripe_end) { + ret = find_first_extent_item(extent_root, &path, cur_logical, + stripe_end - cur_logical + 1); + if (ret < 0) + goto out; if (ret > 0) { ret = 0; break; } + get_extent_info(&path, &extent_start, &extent_len, + &extent_flags, &extent_gen); + if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + stripe->nr_meta_extents++; + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) + stripe->nr_data_extents++; + fill_one_extent_info(fs_info, stripe, extent_start, extent_len, + extent_flags, extent_gen); + cur_logical = extent_start + extent_len; + } + + /* Now fill the data csum. */ + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) { + int sector_nr; + unsigned long csum_bitmap = 0; + + /* Csum space should have already been allocated. */ + ASSERT(stripe->csums); + + /* + * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN + * should contain at most 16 sectors. + */ + ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); + + ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical, + stripe_end, stripe->csums, + &csum_bitmap, true); if (ret < 0) - break; - get_extent_info(path, &extent_start, &extent_size, &extent_flags, - &extent_gen); + goto out; + if (ret > 0) + ret = 0; + + for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { + stripe->sectors[sector_nr].csum = stripe->csums + + sector_nr * fs_info->csum_size; + } + } + set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); +out: + btrfs_release_path(&path); + return ret; +} + +static void scrub_reset_stripe(struct scrub_stripe *stripe) +{ + scrub_stripe_reset_bitmaps(stripe); + + stripe->nr_meta_extents = 0; + stripe->nr_data_extents = 0; + stripe->state = 0; + + for (int i = 0; i < stripe->nr_sectors; i++) { + stripe->sectors[i].is_metadata = false; + stripe->sectors[i].csum = NULL; + stripe->sectors[i].generation = 0; + } +} + +static void scrub_submit_initial_read(struct scrub_ctx *sctx, + struct scrub_stripe *stripe) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_bio *bbio; + int mirror = stripe->mirror_num; + + ASSERT(stripe->bg); + ASSERT(stripe->mirror_num > 0); + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + + bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, + scrub_read_endio, stripe); + + /* Read the whole stripe. */ + bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; + for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { + int ret; + + ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); + /* We should have allocated enough bio vectors. */ + ASSERT(ret == PAGE_SIZE); + } + atomic_inc(&stripe->pending_io); + + /* + * For dev-replace, either user asks to avoid the source dev, or + * the device is missing, we try the next mirror instead. + */ + if (sctx->is_dev_replace && + (fs_info->dev_replace.cont_reading_from_srcdev_mode == + BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID || + !stripe->dev->bdev)) { + int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, + stripe->bg->length); + + mirror = calc_next_mirror(mirror, num_copies); + } + btrfs_submit_bio(bbio, mirror); +} + +static bool stripe_has_metadata_error(struct scrub_stripe *stripe) +{ + int i; + + for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) { + if (stripe->sectors[i].is_metadata) { + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - /* Metadata should not cross stripe boundaries */ - if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - does_range_cross_boundary(extent_start, extent_size, - logical, map->stripe_len)) { btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. logical=%llu", - extent_start, logical); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - cur_logical += extent_size; - continue; + "stripe %llu has unrepaired metadata sector at %llu", + stripe->logical, + stripe->logical + (i << fs_info->sectorsize_bits)); + return true; } + } + return false; +} - /* Skip hole range which doesn't have any extent */ - cur_logical = max(extent_start, cur_logical); +static int flush_scrub_stripes(struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct scrub_stripe *stripe; + const int nr_stripes = sctx->cur_stripe; + int ret = 0; - /* Truncate the range inside this data stripe */ - extent_size = min(extent_start + extent_size, - logical + map->stripe_len) - cur_logical; - extent_start = cur_logical; - ASSERT(extent_size <= U32_MAX); + if (!nr_stripes) + return 0; - scrub_parity_mark_sectors_data(sparity, extent_start, extent_size); + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); - mapped_length = extent_size; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start, - &mapped_length, &bioc, 0); - if (!ret && (!bioc || mapped_length < extent_size)) - ret = -EIO; - if (ret) { - btrfs_put_bioc(bioc); - scrub_parity_mark_sectors_error(sparity, extent_start, - extent_size); - break; + scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, + nr_stripes << BTRFS_STRIPE_LEN_SHIFT); + for (int i = 0; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + scrub_submit_initial_read(sctx, stripe); + } + + for (int i = 0; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + + wait_event(stripe->repair_wait, + test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); + } + + /* + * Submit the repaired sectors. For zoned case, we cannot do repair + * in-place, but queue the bg to be relocated. + */ + if (btrfs_is_zoned(fs_info)) { + for (int i = 0; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + + if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) { + btrfs_repair_one_zone(fs_info, + sctx->stripes[0].bg->start); + break; + } } - extent_physical = bioc->stripes[0].physical; - extent_mirror_num = bioc->mirror_num; - extent_dev = bioc->stripes[0].dev; - btrfs_put_bioc(bioc); + } else { + for (int i = 0; i < nr_stripes; i++) { + unsigned long repaired; - ret = btrfs_lookup_csums_list(csum_root, extent_start, - extent_start + extent_size - 1, - &sctx->csum_list, 1, false); - if (ret) { - scrub_parity_mark_sectors_error(sparity, extent_start, - extent_size); - break; + stripe = &sctx->stripes[i]; + + bitmap_andnot(&repaired, &stripe->init_error_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, repaired, false); + } + } + + /* Submit for dev-replace. */ + if (sctx->is_dev_replace) { + /* + * For dev-replace, if we know there is something wrong with + * metadata, we should immedately abort. + */ + for (int i = 0; i < nr_stripes; i++) { + if (stripe_has_metadata_error(&sctx->stripes[i])) { + ret = -EIO; + goto out; + } } + for (int i = 0; i < nr_stripes; i++) { + unsigned long good; - ret = scrub_extent_for_parity(sparity, extent_start, - extent_size, extent_physical, - extent_dev, extent_flags, - extent_gen, extent_mirror_num); - scrub_free_csums(sctx); + stripe = &sctx->stripes[i]; - if (ret) { - scrub_parity_mark_sectors_error(sparity, extent_start, - extent_size); - break; + ASSERT(stripe->dev == fs_info->dev_replace.srcdev); + + bitmap_andnot(&good, &stripe->extent_sector_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, good, true); } + } - cond_resched(); - cur_logical += extent_size; + /* Wait for the above writebacks to finish. */ + for (int i = 0; i < nr_stripes; i++) { + stripe = &sctx->stripes[i]; + + wait_scrub_stripe_io(stripe); + scrub_reset_stripe(stripe); } - btrfs_release_path(path); +out: + sctx->cur_stripe = 0; return ret; } -static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, - struct map_lookup *map, - struct btrfs_device *sdev, - u64 logic_start, - u64 logic_end) +static void raid56_scrub_wait_endio(struct bio *bio) { - struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_path *path; - u64 cur_logical; + complete(bio->bi_private); +} + +static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, + struct btrfs_device *dev, int mirror_num, + u64 logical, u32 length, u64 physical) +{ + struct scrub_stripe *stripe; int ret; - struct scrub_parity *sparity; - int nsectors; - path = btrfs_alloc_path(); - if (!path) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - return -ENOMEM; + /* No available slot, submit all stripes and wait for them. */ + if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) { + ret = flush_scrub_stripes(sctx); + if (ret < 0) + return ret; } - path->search_commit_root = 1; - path->skip_locking = 1; - ASSERT(map->stripe_len <= U32_MAX); - nsectors = map->stripe_len >> fs_info->sectorsize_bits; - ASSERT(nsectors <= BITS_PER_LONG); - sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS); - if (!sparity) { - spin_lock(&sctx->stat_lock); - sctx->stat.malloc_errors++; - spin_unlock(&sctx->stat_lock); - btrfs_free_path(path); - return -ENOMEM; - } + stripe = &sctx->stripes[sctx->cur_stripe]; - ASSERT(map->stripe_len <= U32_MAX); - sparity->stripe_len = map->stripe_len; - sparity->nsectors = nsectors; - sparity->sctx = sctx; - sparity->scrub_dev = sdev; - sparity->logic_start = logic_start; - sparity->logic_end = logic_end; - refcount_set(&sparity->refs, 1); - INIT_LIST_HEAD(&sparity->sectors_list); + /* We can queue one stripe using the remaining slot. */ + scrub_reset_stripe(stripe); + ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num, + logical, length, stripe); + /* Either >0 as no more extents or <0 for error. */ + if (ret) + return ret; + sctx->cur_stripe++; + return 0; +} - ret = 0; - for (cur_logical = logic_start; cur_logical < logic_end; - cur_logical += map->stripe_len) { - ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map, - sdev, path, cur_logical); +static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, + struct btrfs_block_group *bg, + struct map_lookup *map, + u64 full_stripe_start) +{ + DECLARE_COMPLETION_ONSTACK(io_done); + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_raid_bio *rbio; + struct btrfs_io_context *bioc = NULL; + struct bio *bio; + struct scrub_stripe *stripe; + bool all_empty = true; + const int data_stripes = nr_data_stripes(map); + unsigned long extent_bitmap = 0; + u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT; + int ret; + + ASSERT(sctx->raid56_data_stripes); + + for (int i = 0; i < data_stripes; i++) { + int stripe_index; + int rot; + u64 physical; + + stripe = &sctx->raid56_data_stripes[i]; + rot = div_u64(full_stripe_start - bg->start, + data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; + stripe_index = (i + rot) % map->num_stripes; + physical = map->stripes[stripe_index].physical + + (rot << BTRFS_STRIPE_LEN_SHIFT); + + scrub_reset_stripe(stripe); + set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); + ret = scrub_find_fill_first_stripe(bg, + map->stripes[stripe_index].dev, physical, 1, + full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT), + BTRFS_STRIPE_LEN, stripe); if (ret < 0) + goto out; + /* + * No extent in this data stripe, need to manually mark them + * initialized to make later read submission happy. + */ + if (ret > 0) { + stripe->logical = full_stripe_start + + (i << BTRFS_STRIPE_LEN_SHIFT); + stripe->dev = map->stripes[stripe_index].dev; + stripe->mirror_num = 1; + set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); + } + } + + /* Check if all data stripes are empty. */ + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) { + all_empty = false; break; + } + } + if (all_empty) { + ret = 0; + goto out; } - scrub_parity_put(sparity); - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + scrub_submit_initial_read(sctx, stripe); + } + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; - btrfs_free_path(path); - return ret < 0 ? ret : 0; -} + wait_event(stripe->repair_wait, + test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); + } + /* For now, no zoned support for RAID56. */ + ASSERT(!btrfs_is_zoned(sctx->fs_info)); -static void sync_replace_for_zoned(struct scrub_ctx *sctx) -{ - if (!btrfs_is_zoned(sctx->fs_info)) - return; + /* Writeback for the repaired sectors. */ + for (int i = 0; i < data_stripes; i++) { + unsigned long repaired; - sctx->flush_all_writes = true; - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); + stripe = &sctx->raid56_data_stripes[i]; - wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); -} + bitmap_andnot(&repaired, &stripe->init_error_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, repaired, false); + } -static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, - u64 physical, u64 physical_end) -{ - struct btrfs_fs_info *fs_info = sctx->fs_info; - int ret = 0; + /* Wait for the above writebacks to finish. */ + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; - if (!btrfs_is_zoned(fs_info)) - return 0; + wait_scrub_stripe_io(stripe); + } - wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + /* + * Now all data stripes are properly verified. Check if we have any + * unrepaired, if so abort immediately or we could further corrupt the + * P/Q stripes. + * + * During the loop, also populate extent_bitmap. + */ + for (int i = 0; i < data_stripes; i++) { + unsigned long error; - mutex_lock(&sctx->wr_lock); - if (sctx->write_pointer < physical_end) { - ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, - physical, - sctx->write_pointer); - if (ret) + stripe = &sctx->raid56_data_stripes[i]; + + /* + * We should only check the errors where there is an extent. + * As we may hit an empty data stripe while it's missing. + */ + bitmap_and(&error, &stripe->error_bitmap, + &stripe->extent_sector_bitmap, stripe->nr_sectors); + if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, - "zoned: failed to recover write pointer"); +"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", + full_stripe_start, i, stripe->nr_sectors, + &error); + ret = -EIO; + goto out; + } + bitmap_or(&extent_bitmap, &extent_bitmap, + &stripe->extent_sector_bitmap, stripe->nr_sectors); } - mutex_unlock(&sctx->wr_lock); - btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + /* Now we can check and regenerate the P/Q stripe. */ + bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS); + bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; + bio->bi_private = &io_done; + bio->bi_end_io = raid56_scrub_wait_endio; + + btrfs_bio_counter_inc_blocked(fs_info); + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start, + &length, &bioc); + if (ret < 0) { + btrfs_put_bioc(bioc); + btrfs_bio_counter_dec(fs_info); + goto out; + } + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap, + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); + btrfs_put_bioc(bioc); + if (!rbio) { + ret = -ENOMEM; + btrfs_bio_counter_dec(fs_info); + goto out; + } + raid56_parity_submit_scrub_rbio(rbio); + wait_for_completion_io(&io_done); + ret = blk_status_to_errno(bio->bi_status); + bio_put(bio); + btrfs_bio_counter_dec(fs_info); + +out: return ret; } @@ -3410,8 +1963,6 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, * and @logical_length parameter. */ static int scrub_simple_mirror(struct scrub_ctx *sctx, - struct btrfs_root *extent_root, - struct btrfs_root *csum_root, struct btrfs_block_group *bg, struct map_lookup *map, u64 logical_start, u64 logical_length, @@ -3421,7 +1972,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; const u64 logical_end = logical_start + logical_length; /* An artificial limit, inherit from old scrub behavior */ - const u32 max_length = SZ_64K; struct btrfs_path path = { 0 }; u64 cur_logical = logical_start; int ret; @@ -3433,11 +1983,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, path.skip_locking = 1; /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { - u64 extent_start; - u64 extent_len; - u64 extent_flags; - u64 extent_gen; - u64 scrub_len; + u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ if (atomic_read(&fs_info->scrub_cancel_req) || @@ -3448,14 +1994,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, /* Paused? */ if (atomic_read(&fs_info->scrub_pause_req)) { /* Push queued extents */ - sctx->flush_all_writes = true; - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - wait_event(sctx->list_wait, - atomic_read(&sctx->bios_in_flight) == 0); - sctx->flush_all_writes = false; scrub_blocked_if_needed(fs_info); } /* Block group removed? */ @@ -3467,8 +2005,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, } spin_unlock(&bg->lock); - ret = find_first_extent_item(extent_root, &path, cur_logical, - logical_end - cur_logical); + ret = queue_scrub_stripe(sctx, bg, device, mirror_num, + cur_logical, logical_end - cur_logical, + cur_physical); if (ret > 0) { /* No more extent, just update the accounting */ sctx->stat.last_physical = physical + logical_length; @@ -3477,52 +2016,11 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, } if (ret < 0) break; - get_extent_info(&path, &extent_start, &extent_len, - &extent_flags, &extent_gen); - /* Skip hole range which doesn't have any extent */ - cur_logical = max(extent_start, cur_logical); - /* - * Scrub len has three limits: - * - Extent size limit - * - Scrub range limit - * This is especially imporatant for RAID0/RAID10 to reuse - * this function - * - Max scrub size limit - */ - scrub_len = min(min(extent_start + extent_len, - logical_end), cur_logical + max_length) - - cur_logical; - - if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_list(csum_root, cur_logical, - cur_logical + scrub_len - 1, - &sctx->csum_list, 1, false); - if (ret) - break; - } - if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - does_range_cross_boundary(extent_start, extent_len, - logical_start, logical_length)) { - btrfs_err(fs_info, -"scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)", - extent_start, logical_start, logical_end); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - cur_logical += scrub_len; - continue; - } - ret = scrub_extent(sctx, map, cur_logical, scrub_len, - cur_logical - logical_start + physical, - device, extent_flags, extent_gen, - mirror_num); - scrub_free_csums(sctx); - if (ret) - break; - if (sctx->is_dev_replace) - sync_replace_for_zoned(sctx); - cur_logical += scrub_len; + ASSERT(sctx->cur_stripe > 0); + cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical + + BTRFS_STRIPE_LEN; + /* Don't hold CPU for too long time */ cond_resched(); } @@ -3536,7 +2034,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); - return map->num_stripes / map->sub_stripes * map->stripe_len; + return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; } /* Get the logical bytenr for the stripe */ @@ -3552,7 +2050,8 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, * (stripe_index / sub_stripes) gives how many data stripes we need to * skip. */ - return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start; + return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) + + bg->start; } /* Get the mirror number for the stripe */ @@ -3567,8 +2066,6 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) } static int scrub_simple_stripe(struct scrub_ctx *sctx, - struct btrfs_root *extent_root, - struct btrfs_root *csum_root, struct btrfs_block_group *bg, struct map_lookup *map, struct btrfs_device *device, @@ -3588,15 +2085,15 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, * just RAID1, so we can reuse scrub_simple_mirror() to scrub * this stripe. */ - ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map, - cur_logical, map->stripe_len, device, - cur_physical, mirror_num); + ret = scrub_simple_mirror(sctx, bg, map, cur_logical, + BTRFS_STRIPE_LEN, device, cur_physical, + mirror_num); if (ret) return ret; /* Skip to next stripe which belongs to the target device */ cur_logical += logical_increment; /* For physical offset, we just go to next stripe */ - cur_physical += map->stripe_len; + cur_physical += BTRFS_STRIPE_LEN; } return ret; } @@ -3607,15 +2104,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, int stripe_index) { - struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root; - struct btrfs_root *csum_root; - struct blk_plug plug; struct map_lookup *map = em->map_lookup; const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; int ret; + int ret2; u64 physical = map->stripes[stripe_index].physical; const u64 dev_stripe_len = btrfs_calc_stripe_length(em); const u64 physical_end = physical + dev_stripe_len; @@ -3626,43 +2120,37 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Offset inside the chunk */ u64 offset; u64 stripe_logical; - u64 stripe_end; int stop_loop = 0; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* - * work on commit root. The related disk blocks are static as - * long as COW is applied. This means, it is save to rewrite - * them to repair disk errors without any race conditions - */ - path->search_commit_root = 1; - path->skip_locking = 1; - path->reada = READA_FORWARD; - - wait_event(sctx->list_wait, - atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); - root = btrfs_extent_root(fs_info, bg->start); - csum_root = btrfs_csum_root(fs_info, bg->start); - - /* - * collect all data csums for the stripe to avoid seeking during - * the scrub. This might currently (crc32) end up to be about 1MB - */ - blk_start_plug(&plug); - if (sctx->is_dev_replace && btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { mutex_lock(&sctx->wr_lock); sctx->write_pointer = physical; mutex_unlock(&sctx->wr_lock); - sctx->flush_all_writes = true; } + /* Prepare the extra data stripes used by RAID56. */ + if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ASSERT(sctx->raid56_data_stripes == NULL); + + sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map), + sizeof(struct scrub_stripe), + GFP_KERNEL); + if (!sctx->raid56_data_stripes) { + ret = -ENOMEM; + goto out; + } + for (int i = 0; i < nr_data_stripes(map); i++) { + ret = init_scrub_stripe(fs_info, + &sctx->raid56_data_stripes[i]); + if (ret < 0) + goto out; + sctx->raid56_data_stripes[i].bg = bg; + sctx->raid56_data_stripes[i].sctx = sctx; + } + } /* * There used to be a big double loop to handle all profiles using the * same routine, which grows larger and more gross over time. @@ -3680,17 +2168,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * Only @physical and @mirror_num needs to calculated using * @stripe_index. */ - ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, - bg->start, bg->length, scrub_dev, - map->stripes[stripe_index].physical, + ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, + scrub_dev, map->stripes[stripe_index].physical, stripe_index + 1); offset = 0; goto out; } if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - ret = scrub_simple_stripe(sctx, root, csum_root, bg, map, - scrub_dev, stripe_index); - offset = map->stripe_len * (stripe_index / map->sub_stripes); + ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); + offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; goto out; } @@ -3705,7 +2191,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Initialize @offset in case we need to go to out: label */ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); - increment = map->stripe_len * nr_data_stripes(map); + increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; /* * Due to the rotation, for RAID56 it's better to iterate each stripe @@ -3718,10 +2204,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (ret) { /* it is parity strip */ stripe_logical += chunk_logical; - stripe_end = stripe_logical + increment; - ret = scrub_raid56_parity(sctx, map, scrub_dev, - stripe_logical, - stripe_end); + ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, + map, stripe_logical); if (ret) goto out; goto next; @@ -3735,14 +2219,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, * We can reuse scrub_simple_mirror() here, as the repair part * is still based on @mirror_num. */ - ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, - logical, map->stripe_len, + ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, scrub_dev, physical, 1); if (ret < 0) goto out; next: logical += increment; - physical += map->stripe_len; + physical += BTRFS_STRIPE_LEN; spin_lock(&sctx->stat_lock); if (stop_loop) sctx->stat.last_physical = @@ -3754,14 +2237,15 @@ next: break; } out: - /* push queued extents */ - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - - blk_finish_plug(&plug); - btrfs_free_path(path); + ret2 = flush_scrub_stripes(sctx); + if (!ret2) + ret = ret2; + if (sctx->raid56_data_stripes) { + for (int i = 0; i < nr_data_stripes(map); i++) + release_scrub_stripe(&sctx->raid56_data_stripes[i]); + kfree(sctx->raid56_data_stripes); + sctx->raid56_data_stripes = NULL; + } if (sctx->is_dev_replace && ret >= 0) { int ret2; @@ -4079,39 +2563,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, dev_extent_len); - - /* - * flush, submit all pending read and write bios, afterwards - * wait for them. - * Note that in the dev replace case, a read request causes - * write requests that are submitted in the read completion - * worker. Therefore in the current situation, it is required - * that all write requests are flushed, so that all read and - * write requests are really completed when bios_in_flight - * changes to 0. - */ - sctx->flush_all_writes = true; - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - - wait_event(sctx->list_wait, - atomic_read(&sctx->bios_in_flight) == 0); - - scrub_pause_on(fs_info); - - /* - * must be called before we decrease @scrub_paused. - * make sure we don't block transaction commit while - * we are waiting pending workers finished. - */ - wait_event(sctx->list_wait, - atomic_read(&sctx->workers_pending) == 0); - sctx->flush_all_writes = false; - - scrub_pause_off(fs_info); - if (sctx->is_dev_replace && !btrfs_finish_block_group_to_copy(dev_replace->srcdev, cache, found_key.offset)) @@ -4168,18 +2619,62 @@ skip: return ret; } +static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, + struct page *page, u64 physical, u64 generation) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct bio_vec bvec; + struct bio bio; + struct btrfs_super_block *sb = page_address(page); + int ret; + + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; + __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0); + ret = submit_bio_wait(&bio); + bio_uninit(&bio); + + if (ret < 0) + return ret; + ret = btrfs_check_super_csum(fs_info, sb); + if (ret != 0) { + btrfs_err_rl(fs_info, + "super block at physical %llu devid %llu has bad csum", + physical, dev->devid); + return -EIO; + } + if (btrfs_super_generation(sb) != generation) { + btrfs_err_rl(fs_info, +"super block at physical %llu devid %llu has bad generation %llu expect %llu", + physical, dev->devid, + btrfs_super_generation(sb), generation); + return -EUCLEAN; + } + + return btrfs_validate_super(fs_info, sb, -1); +} + static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev) { int i; u64 bytenr; u64 gen; - int ret; + int ret = 0; + struct page *page; struct btrfs_fs_info *fs_info = sctx->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; + page = alloc_page(GFP_KERNEL); + if (!page) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + /* Seed devices of a new filesystem has their own generation. */ if (scrub_dev->fs_devices != fs_info->fs_devices) gen = scrub_dev->generation; @@ -4194,14 +2689,14 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (!btrfs_check_super_location(scrub_dev, bytenr)) continue; - ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, bytenr); - if (ret) - return ret; + ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); + if (ret) { + spin_lock(&sctx->stat_lock); + sctx->stat.super_errors++; + spin_unlock(&sctx->stat_lock); + } } - wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - + __free_page(page); return 0; } @@ -4212,20 +2707,15 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) struct workqueue_struct *scrub_workers = fs_info->scrub_workers; struct workqueue_struct *scrub_wr_comp = fs_info->scrub_wr_completion_workers; - struct workqueue_struct *scrub_parity = - fs_info->scrub_parity_workers; fs_info->scrub_workers = NULL; fs_info->scrub_wr_completion_workers = NULL; - fs_info->scrub_parity_workers = NULL; mutex_unlock(&fs_info->scrub_lock); if (scrub_workers) destroy_workqueue(scrub_workers); if (scrub_wr_comp) destroy_workqueue(scrub_wr_comp); - if (scrub_parity) - destroy_workqueue(scrub_parity); } } @@ -4237,7 +2727,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, { struct workqueue_struct *scrub_workers = NULL; struct workqueue_struct *scrub_wr_comp = NULL; - struct workqueue_struct *scrub_parity = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; @@ -4254,18 +2743,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (!scrub_wr_comp) goto fail_scrub_wr_completion_workers; - scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active); - if (!scrub_parity) - goto fail_scrub_parity_workers; - mutex_lock(&fs_info->scrub_lock); if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { ASSERT(fs_info->scrub_workers == NULL && - fs_info->scrub_wr_completion_workers == NULL && - fs_info->scrub_parity_workers == NULL); + fs_info->scrub_wr_completion_workers == NULL); fs_info->scrub_workers = scrub_workers; fs_info->scrub_wr_completion_workers = scrub_wr_comp; - fs_info->scrub_parity_workers = scrub_parity; refcount_set(&fs_info->scrub_workers_refcnt, 1); mutex_unlock(&fs_info->scrub_lock); return 0; @@ -4275,8 +2758,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->scrub_lock); ret = 0; - destroy_workqueue(scrub_parity); -fail_scrub_parity_workers: + destroy_workqueue(scrub_wr_comp); fail_scrub_wr_completion_workers: destroy_workqueue(scrub_workers); @@ -4411,12 +2893,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, ret = scrub_enumerate_chunks(sctx, dev, start, end); memalloc_nofs_restore(nofs_flag); - wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); - wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - if (progress) memcpy(progress, &sctx->stat, sizeof(*progress)); @@ -4541,28 +3020,3 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; } - -static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num) -{ - u64 mapped_length; - struct btrfs_io_context *bioc = NULL; - int ret; - - mapped_length = extent_len; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, - &mapped_length, &bioc, 0); - if (ret || !bioc || mapped_length < extent_len || - !bioc->stripes[0].dev->bdev) { - btrfs_put_bioc(bioc); - return; - } - - *extent_physical = bioc->stripes[0].physical; - *extent_mirror_num = bioc->mirror_num; - *extent_dev = bioc->stripes[0].dev; - btrfs_put_bioc(bioc); -} diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e5c963bb873d..af2e153543a5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1875,7 +1875,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, int left_ret; int right_ret; u64 left_gen; - u64 right_gen; + u64 right_gen = 0; struct btrfs_inode_info info; ret = get_inode_info(sctx->send_root, ino, &info); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3eecce86f63f..75e7fa337e66 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -537,7 +537,7 @@ again: up_read(&info->groups_sem); } -static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, +static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, u64 to_reclaim) { u64 bytes; @@ -550,6 +550,18 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, return nr; } +static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, + u64 to_reclaim) +{ + const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); + u64 nr; + + nr = div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + #define EXTENT_SIZE_PER_ITEM SZ_256K /* @@ -727,7 +739,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_reclaim_items_nr(fs_info, num_bytes); + nr = calc_delayed_refs_nr(fs_info, num_bytes); else nr = 0; btrfs_run_delayed_refs(trans, nr); @@ -1599,11 +1611,22 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, struct reserve_ticket ticket; u64 start_ns = 0; u64 used; - int ret = 0; + int ret = -ENOSPC; bool pending_tickets; ASSERT(orig_bytes); - ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + /* + * If have a transaction handle (current->journal_info != NULL), then + * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor + * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those + * flushing methods can trigger transaction commits. + */ + if (current->journal_info) { + /* One assert per line for easier debugging. */ + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); + ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); + } if (flush == BTRFS_RESERVE_FLUSH_DATA) async_work = &fs_info->async_data_reclaim_work; @@ -1611,7 +1634,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, async_work = &fs_info->async_reclaim_work; spin_lock(&space_info->lock); - ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); /* diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 2033b71b18ce..0bb9d14e60a8 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -27,6 +27,7 @@ enum btrfs_reserve_flush_enum { * - Running delayed refs * - Running delalloc and waiting for ordered extents * - Allocating a new chunk + * - Committing transaction */ BTRFS_RESERVE_FLUSH_EVICT, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 366fb4cde145..ec18e2210602 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -826,7 +826,11 @@ out: !btrfs_test_opt(info, CLEAR_CACHE)) { btrfs_err(info, "cannot disable free space tree"); ret = -EINVAL; - + } + if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && + !btrfs_test_opt(info, FREE_SPACE_TREE)) { + btrfs_err(info, "cannot disable free space tree with block-group-tree feature"); + ret = -EINVAL; } if (!ret) ret = btrfs_check_mountopts_zoned(info); @@ -1158,6 +1162,7 @@ static int btrfs_fill_super(struct super_block *sb, inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { err = PTR_ERR(inode); + btrfs_handle_fs_error(fs_info, err, NULL); goto fail_close; } @@ -2412,7 +2417,7 @@ static int __init btrfs_print_mod_info(void) ", fsverity=no" #endif ; - pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); + pr_info("Btrfs loaded%s\n", options); return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 37fc58a7f27e..25294e624851 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1262,8 +1262,13 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, if (ret) return ret; +#ifdef CONFIG_BTRFS_DEBUG + if (thresh != 0 && (thresh > 100)) + return -EINVAL; +#else if (thresh != 0 && (thresh <= 50 || thresh > 100)) return -EINVAL; +#endif WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index f2f2e11dac4c..ed0f36ae5346 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -486,7 +486,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, em->map_lookup = map; map->num_stripes = test->num_stripes; - map->stripe_len = BTRFS_STRIPE_LEN; map->type = test->raid_type; for (i = 0; i < map->num_stripes; i++) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b8d5b1fa9a03..8b6a99b8d7f6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -601,15 +601,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, /* * We want to reserve all the bytes we may need all at once, so * we only do 1 enospc flushing cycle per transaction start. We - * accomplish this by simply assuming we'll do 2 x num_items - * worth of delayed refs updates in this trans handle, and - * refill that amount for whatever is missing in the reserve. + * accomplish this by simply assuming we'll do num_items worth + * of delayed refs updates in this trans handle, and refill that + * amount for whatever is missing in the reserve. */ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); if (flush == BTRFS_RESERVE_FLUSH_ALL && - btrfs_block_rsv_full(delayed_refs_rsv) == 0) { - delayed_refs_bytes = num_bytes; - num_bytes <<= 1; + !btrfs_block_rsv_full(delayed_refs_rsv)) { + delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, + num_items); + num_bytes += delayed_refs_bytes; } /* @@ -942,16 +943,6 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info) wait_current_trans(fs_info); } -static bool should_end_transaction(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - if (btrfs_check_space_for_delayed_refs(fs_info)) - return true; - - return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50); -} - bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; @@ -960,7 +951,10 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) return true; - return should_end_transaction(trans); + if (btrfs_check_space_for_delayed_refs(trans->fs_info)) + return true; + + return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50); } static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index baad1ed7e111..e2b54793bf0c 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -849,6 +849,20 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, stripe_len); return -EUCLEAN; } + /* + * We artificially limit the chunk size, so that the number of stripes + * inside a chunk can be fit into a U32. The current limit (256G) is + * way too large for real world usage anyway, and it's also much larger + * than our existing limit (10G). + * + * Thus it should be a good way to catch obvious bitflips. + */ + if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) { + chunk_err(leaf, chunk, logical, + "chunk length too large: have %llu limit %llu", + length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT); + return -EUCLEAN; + } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK))) { chunk_err(leaf, chunk, logical, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 200cea6e49e5..9b212e8c70cc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2563,6 +2563,28 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) btrfs_put_block_group(cache); } +static int clean_log_buffer(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) +{ + int ret; + + btrfs_tree_lock(eb); + btrfs_clear_buffer_dirty(trans, eb); + wait_on_extent_buffer_writeback(eb); + btrfs_tree_unlock(eb); + + if (trans) { + ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); + if (ret) + return ret; + btrfs_redirty_list_add(trans->transaction, eb); + } else { + unaccount_log_buffer(eb->fs_info, eb->start); + } + + return 0; +} + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, @@ -2573,7 +2595,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, u64 ptr_gen; struct extent_buffer *next; struct extent_buffer *cur; - u32 blocksize; int ret = 0; while (*level > 0) { @@ -2593,7 +2614,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, check.level = *level - 1; check.has_first_key = true; btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); - blocksize = fs_info->nodesize; next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_header_owner(cur), @@ -2617,22 +2637,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } - btrfs_tree_lock(next); - btrfs_clear_buffer_dirty(trans, next); - wait_on_extent_buffer_writeback(next); - btrfs_tree_unlock(next); - - if (trans) { - ret = btrfs_pin_reserved_extent(trans, - bytenr, blocksize); - if (ret) { - free_extent_buffer(next); - return ret; - } - btrfs_redirty_list_add( - trans->transaction, next); - } else { - unaccount_log_buffer(fs_info, bytenr); + ret = clean_log_buffer(trans, next); + if (ret) { + free_extent_buffer(next); + return ret; } } free_extent_buffer(next); @@ -2662,7 +2670,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = root->fs_info; int i; int slot; int ret; @@ -2682,27 +2689,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, return ret; if (wc->free) { - struct extent_buffer *next; - - next = path->nodes[*level]; - - btrfs_tree_lock(next); - btrfs_clear_buffer_dirty(trans, next); - wait_on_extent_buffer_writeback(next); - btrfs_tree_unlock(next); - - if (trans) { - ret = btrfs_pin_reserved_extent(trans, - path->nodes[*level]->start, - path->nodes[*level]->len); - if (ret) - return ret; - btrfs_redirty_list_add(trans->transaction, - next); - } else { - unaccount_log_buffer(fs_info, - path->nodes[*level]->start); - } + ret = clean_log_buffer(trans, path->nodes[*level]); + if (ret) + return ret; } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2720,7 +2709,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, static int walk_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; int wret; int level; @@ -2762,26 +2750,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, orig_level); if (ret) goto out; - if (wc->free) { - struct extent_buffer *next; - - next = path->nodes[orig_level]; - - btrfs_tree_lock(next); - btrfs_clear_buffer_dirty(trans, next); - wait_on_extent_buffer_writeback(next); - btrfs_tree_unlock(next); - - if (trans) { - ret = btrfs_pin_reserved_extent(trans, - next->start, next->len); - if (ret) - goto out; - btrfs_redirty_list_add(trans->transaction, next); - } else { - unaccount_log_buffer(fs_info, next->start); - } - } + if (wc->free) + ret = clean_log_buffer(trans, path->nodes[orig_level]); } out: @@ -3648,6 +3618,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, ret = BTRFS_LOG_FORCE_COMMIT; else inode->last_dir_index_offset = last_index; + + if (btrfs_get_first_dir_index_to_log(inode) == 0) + btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); out: kfree(ins_data); @@ -4099,7 +4072,7 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, found_key.offset = 0; found_key.type = 0; - ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); + ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot); if (ret < 0) break; @@ -5406,6 +5379,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, LIST_HEAD(dir_list); struct btrfs_dir_list *dir_elem; u64 ino = btrfs_ino(start_inode); + struct btrfs_inode *curr_inode = start_inode; int ret = 0; /* @@ -5420,43 +5394,39 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; + /* Pairs with btrfs_add_delayed_iput below. */ + ihold(&curr_inode->vfs_inode); + while (true) { - struct extent_buffer *leaf; - struct btrfs_key min_key; + struct inode *vfs_inode; + struct btrfs_key key; + struct btrfs_key found_key; + u64 next_index; bool continue_curr_inode = true; - int nritems; - int i; + int iter_ret; - min_key.objectid = ino; - min_key.type = BTRFS_DIR_INDEX_KEY; - min_key.offset = 0; + key.objectid = ino; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = btrfs_get_first_dir_index_to_log(curr_inode); + next_index = key.offset; again: - btrfs_release_path(path); - ret = btrfs_search_forward(root, &min_key, path, trans->transid); - if (ret < 0) { - break; - } else if (ret > 0) { - ret = 0; - goto next; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - for (i = path->slots[0]; i < nritems; i++) { + btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) { + struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; struct inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; - btrfs_item_key_to_cpu(leaf, &min_key, i); - if (min_key.objectid != ino || - min_key.type != BTRFS_DIR_INDEX_KEY) { + if (found_key.objectid != ino || + found_key.type != BTRFS_DIR_INDEX_KEY) { continue_curr_inode = false; break; } - di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + next_index = found_key.offset + 1; + + di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); type = btrfs_dir_ftype(leaf, di); if (btrfs_dir_transid(leaf, di) < trans->transid) continue; @@ -5496,12 +5466,24 @@ again: break; } - if (continue_curr_inode && min_key.offset < (u64)-1) { - min_key.offset++; + btrfs_release_path(path); + + if (iter_ret < 0) { + ret = iter_ret; + goto out; + } else if (iter_ret > 0) { + continue_curr_inode = false; + } else { + key = found_key; + } + + if (continue_curr_inode && key.offset < (u64)-1) { + key.offset++; goto again; } -next: + btrfs_set_first_dir_index_to_log(curr_inode, next_index); + if (list_empty(&dir_list)) break; @@ -5509,9 +5491,22 @@ next: ino = dir_elem->ino; list_del(&dir_elem->list); kfree(dir_elem); + + btrfs_add_delayed_iput(curr_inode); + curr_inode = NULL; + + vfs_inode = btrfs_iget(fs_info->sb, ino, root); + if (IS_ERR(vfs_inode)) { + ret = PTR_ERR(vfs_inode); + break; + } + curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); + if (curr_inode) + btrfs_add_delayed_iput(curr_inode); + if (ret) { struct btrfs_dir_list *next; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c6d592870400..841e799dece5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1150,10 +1150,10 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->last_flush_error = 0; /* Verify the device is back in a pristine state */ - ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); - ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); - ASSERT(list_empty(&device->dev_alloc_list)); - ASSERT(list_empty(&device->post_commit_list)); + WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + WARN_ON(!list_empty(&device->dev_alloc_list)); + WARN_ON(!list_empty(&device->post_commit_list)); } static void close_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -2618,7 +2618,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct block_device *bdev; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_fs_devices *seed_devices; + struct btrfs_fs_devices *seed_devices = NULL; u64 orig_super_total_bytes; u64 orig_super_num_devices; int ret = 0; @@ -5125,7 +5125,7 @@ static void init_alloc_chunk_ctl_policy_regular( /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); - ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; + ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT; } static void init_alloc_chunk_ctl_policy_zoned( @@ -5407,7 +5407,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, j * ctl->stripe_size; } } - map->stripe_len = BTRFS_STRIPE_LEN; map->io_align = BTRFS_STRIPE_LEN; map->io_width = BTRFS_STRIPE_LEN; map->type = type; @@ -5438,7 +5437,7 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, } write_unlock(&em_tree->lock); - block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); + block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); if (IS_ERR(block_group)) goto error_del_extent; @@ -5615,11 +5614,11 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, btrfs_set_stack_chunk_length(chunk, bg->length); btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); - btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_type(chunk, map->type); btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); - btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); - btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN); + btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN); btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); @@ -5784,13 +5783,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ ret = map->num_stripes; free_extent_map(em); - - down_read(&fs_info->dev_replace.rwsem); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && - fs_info->dev_replace.tgtdev) - ret++; - up_read(&fs_info->dev_replace.rwsem); - return ret; } @@ -5809,7 +5801,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - len = map->stripe_len * nr_data_stripes(map); + len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; free_extent_map(em); } return len; @@ -5895,41 +5887,16 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, return preferred_mirror; } -/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ -static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) -{ - int i; - int again = 1; - - while (again) { - again = 0; - for (i = 0; i < num_stripes - 1; i++) { - /* Swap if parity is on a smaller index */ - if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { - swap(bioc->stripes[i], bioc->stripes[i + 1]); - swap(bioc->raid_map[i], bioc->raid_map[i + 1]); - again = 1; - } - } - } -} - static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, - int total_stripes, - int real_stripes) + u16 total_stripes) { - struct btrfs_io_context *bioc = kzalloc( + struct btrfs_io_context *bioc; + + bioc = kzalloc( /* The size of btrfs_io_context */ sizeof(struct btrfs_io_context) + /* Plus the variable array for the stripes */ - sizeof(struct btrfs_io_stripe) * (total_stripes) + - /* Plus the variable array for the tgt dev */ - sizeof(int) * (real_stripes) + - /* - * Plus the raid_map, which includes both the tgt dev - * and the stripes. - */ - sizeof(u64) * (total_stripes), + sizeof(struct btrfs_io_stripe) * (total_stripes), GFP_NOFS); if (!bioc) @@ -5938,8 +5905,8 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ refcount_set(&bioc->refs, 1); bioc->fs_info = fs_info; - bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); - bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); + bioc->replace_stripe_src = -1; + bioc->full_stripe_logical = (u64)-1; return bioc; } @@ -5971,16 +5938,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; - u64 stripe_nr; - u64 stripe_nr_end; + u32 stripe_nr; + u32 stripe_nr_end; + u32 stripe_cnt; u64 stripe_end_offset; - u64 stripe_cnt; - u64 stripe_len; u64 stripe_offset; u32 stripe_index; u32 factor = 0; u32 sub_stripes = 0; - u64 stripes_per_dev = 0; + u32 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; int ret; @@ -5996,26 +5962,25 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; goto out_free_map; -} + } offset = logical - em->start; length = min_t(u64, em->start + em->len - logical, length); *length_ret = length; - stripe_len = map->stripe_len; /* * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - stripe_nr = div64_u64(offset, stripe_len); + stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - stripe_nr * stripe_len; + stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); - stripe_nr_end = round_up(offset + length, map->stripe_len); - stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); + stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> + BTRFS_STRIPE_LEN_SHIFT; stripe_cnt = stripe_nr_end - stripe_nr; - stripe_end_offset = stripe_nr_end * map->stripe_len - + stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) - (offset + length); /* * after this, stripe_nr is the number of stripes on this @@ -6034,18 +5999,19 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, factor = map->num_stripes / sub_stripes; *num_stripes = min_t(u64, map->num_stripes, sub_stripes * stripe_cnt); - stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + stripe_index = stripe_nr % factor; + stripe_nr /= factor; stripe_index *= sub_stripes; - stripes_per_dev = div_u64_rem(stripe_cnt, factor, - &remaining_stripes); - div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); - last_stripe *= sub_stripes; + + remaining_stripes = stripe_cnt % factor; + stripes_per_dev = stripe_cnt / factor; + last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { *num_stripes = map->num_stripes; } else { - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, - &stripe_index); + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; } stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); @@ -6057,15 +6023,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, for (i = 0; i < *num_stripes; i++) { stripes[i].physical = map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; + stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - stripes[i].length = stripes_per_dev * map->stripe_len; + stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT; if (i / sub_stripes < remaining_stripes) - stripes[i].length += map->stripe_len; + stripes[i].length += BTRFS_STRIPE_LEN; /* * Special for the first stripe and @@ -6103,83 +6069,6 @@ out_free_map: return ERR_PTR(ret); } -/* - * In dev-replace case, for repair case (that's the only case where the mirror - * is selected explicitly when calling btrfs_map_block), blocks left of the - * left cursor can also be read from the target drive. - * - * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the - * array of stripes. - * For READ, it also needs to be supported using the same mirror number. - * - * If the requested block is not left of the left cursor, EIO is returned. This - * can happen because btrfs_num_copies() returns one more in the dev-replace - * case. - */ -static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, - u64 logical, u64 length, - u64 srcdev_devid, int *mirror_num, - u64 *physical) -{ - struct btrfs_io_context *bioc = NULL; - int num_stripes; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - int i; - int ret = 0; - - ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &length, &bioc, NULL, NULL, 0); - if (ret) { - ASSERT(bioc == NULL); - return ret; - } - - num_stripes = bioc->num_stripes; - if (*mirror_num > num_stripes) { - /* - * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, - * that means that the requested area is not left of the left - * cursor - */ - btrfs_put_bioc(bioc); - return -EIO; - } - - /* - * process the rest of the function using the mirror_num of the source - * drive. Therefore look it up first. At the end, patch the device - * pointer to the one of the target drive. - */ - for (i = 0; i < num_stripes; i++) { - if (bioc->stripes[i].dev->devid != srcdev_devid) - continue; - - /* - * In case of DUP, in order to keep it simple, only add the - * mirror with the lowest physical address - */ - if (found && - physical_of_found <= bioc->stripes[i].physical) - continue; - - index_srcdev = i; - found = 1; - physical_of_found = bioc->stripes[i].physical; - } - - btrfs_put_bioc(bioc); - - ASSERT(found); - if (!found) - return -EIO; - - *mirror_num = index_srcdev + 1; - *physical = physical_of_found; - return ret; -} - static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) { struct btrfs_block_group *cache; @@ -6198,101 +6087,80 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) } static void handle_ops_on_dev_replace(enum btrfs_map_op op, - struct btrfs_io_context **bioc_ret, + struct btrfs_io_context *bioc, struct btrfs_dev_replace *dev_replace, u64 logical, int *num_stripes_ret, int *max_errors_ret) { - struct btrfs_io_context *bioc = *bioc_ret; u64 srcdev_devid = dev_replace->srcdev->devid; - int tgtdev_indexes = 0; + /* + * At this stage, num_stripes is still the real number of stripes, + * excluding the duplicated stripes. + */ int num_stripes = *num_stripes_ret; + int nr_extra_stripes = 0; int max_errors = *max_errors_ret; int i; - if (op == BTRFS_MAP_WRITE) { - int index_where_to_add; + /* + * A block group which has "to_copy" set will eventually be copied by + * the dev-replace process. We can avoid cloning IO here. + */ + if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) + return; - /* - * A block group which have "to_copy" set will eventually - * copied by dev-replace process. We can avoid cloning IO here. - */ - if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) - return; + /* + * Duplicate the write operations while the dev-replace procedure is + * running. Since the copying of the old disk to the new disk takes + * place at run time while the filesystem is mounted writable, the + * regular write operations to the old disk have to be duplicated to go + * to the new disk as well. + * + * Note that device->missing is handled by the caller, and that the + * write to the old disk is already set up in the stripes array. + */ + for (i = 0; i < num_stripes; i++) { + struct btrfs_io_stripe *old = &bioc->stripes[i]; + struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes]; - /* - * duplicate the write operations while the dev replace - * procedure is running. Since the copying of the old disk to - * the new disk takes place at run time while the filesystem is - * mounted writable, the regular write operations to the old - * disk have to be duplicated to go to the new disk as well. - * - * Note that device->missing is handled by the caller, and that - * the write to the old disk is already set up in the stripes - * array. - */ - index_where_to_add = num_stripes; - for (i = 0; i < num_stripes; i++) { - if (bioc->stripes[i].dev->devid == srcdev_devid) { - /* write to new disk, too */ - struct btrfs_io_stripe *new = - bioc->stripes + index_where_to_add; - struct btrfs_io_stripe *old = - bioc->stripes + i; - - new->physical = old->physical; - new->dev = dev_replace->tgtdev; - bioc->tgtdev_map[i] = index_where_to_add; - index_where_to_add++; - max_errors++; - tgtdev_indexes++; - } - } - num_stripes = index_where_to_add; - } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; + if (old->dev->devid != srcdev_devid) + continue; - /* - * During the dev-replace procedure, the target drive can also - * be used to read data in case it is needed to repair a corrupt - * block elsewhere. This is possible if the requested area is - * left of the left cursor. In this area, the target drive is a - * full copy of the source drive. - */ - for (i = 0; i < num_stripes; i++) { - if (bioc->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it simple, - * only add the mirror with the lowest physical - * address - */ - if (found && - physical_of_found <= bioc->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = bioc->stripes[i].physical; - } - } - if (found) { - struct btrfs_io_stripe *tgtdev_stripe = - bioc->stripes + num_stripes; + new->physical = old->physical; + new->dev = dev_replace->tgtdev; + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + bioc->replace_stripe_src = i; + nr_extra_stripes++; + } + + /* We can only have at most 2 extra nr_stripes (for DUP). */ + ASSERT(nr_extra_stripes <= 2); + /* + * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for + * replace. + * If we have 2 extra stripes, only choose the one with smaller physical. + */ + if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { + struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; + struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->dev = dev_replace->tgtdev; - bioc->tgtdev_map[index_srcdev] = num_stripes; + /* Only DUP can have two extra stripes. */ + ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); - tgtdev_indexes++; - num_stripes++; + /* + * Swap the last stripe stripes and reduce @nr_extra_stripes. + * The extra stripe would still be there, but won't be accessed. + */ + if (first->physical > second->physical) { + swap(second->physical, first->physical); + swap(second->dev, first->dev); + nr_extra_stripes--; } } - *num_stripes_ret = num_stripes; - *max_errors_ret = max_errors; - bioc->num_tgtdevs = tgtdev_indexes; - *bioc_ret = bioc; + *num_stripes_ret = num_stripes + nr_extra_stripes; + *max_errors_ret = max_errors + nr_extra_stripes; + bioc->replace_nr_stripes = nr_extra_stripes; } static bool need_full_stripe(enum btrfs_map_op op) @@ -6301,25 +6169,35 @@ static bool need_full_stripe(enum btrfs_map_op op) } static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, - u64 offset, u64 *stripe_nr, u64 *stripe_offset, + u64 offset, u32 *stripe_nr, u64 *stripe_offset, u64 *full_stripe_start) { - u32 stripe_len = map->stripe_len; - ASSERT(op != BTRFS_MAP_DISCARD); /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. */ - *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); + *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; + *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; ASSERT(*stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + unsigned long full_stripe_len = nr_data_stripes(map) << + BTRFS_STRIPE_LEN_SHIFT; + /* + * For full stripe start, we use previously calculated + * @stripe_nr. Align it to nr_data_stripes, then multiply with + * STRIPE_LEN. + * + * By this we can avoid u64 division completely. And we have + * to go rounddown(), not round_down(), as nr_data_stripes is + * not ensured to be power of 2. + */ *full_stripe_start = - div64_u64(offset, full_stripe_len) * full_stripe_len; + rounddown(*stripe_nr, nr_data_stripes(map)) << + BTRFS_STRIPE_LEN_SHIFT; /* * For writes to RAID56, allow to write a full stripe set, but @@ -6334,16 +6212,16 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * a single disk). */ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) - return stripe_len - *stripe_offset; + return BTRFS_STRIPE_LEN - *stripe_offset; return U64_MAX; } static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, - u32 stripe_index, u64 stripe_offset, u64 stripe_nr) + u32 stripe_index, u64 stripe_offset, u32 stripe_nr) { dst->dev = map->stripes[stripe_index].dev; dst->physical = map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; + stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); } int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -6356,35 +6234,35 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct map_lookup *map; u64 map_offset; u64 stripe_offset; - u64 stripe_nr; - u64 stripe_len; + u32 stripe_nr; u32 stripe_index; int data_stripes; int i; int ret = 0; int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); int num_stripes; + int num_copies; int max_errors = 0; - int tgtdev_indexes = 0; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; - int num_alloc_stripes; - int patch_the_first_stripe_for_dev_replace = 0; - u64 physical_to_patch_in_first_stripe = 0; + u16 num_alloc_stripes; u64 raid56_full_stripe_start = (u64)-1; u64 max_len; ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); + num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); + if (mirror_num > num_copies) + return -EINVAL; + em = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(em)) return PTR_ERR(em); map = em->map_lookup; data_stripes = nr_data_stripes(map); - stripe_len = map->stripe_len; map_offset = logical - em->start; max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, @@ -6400,25 +6278,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (!dev_replace_is_ongoing) up_read(&dev_replace->rwsem); - if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - !need_full_stripe(op) && dev_replace->tgtdev != NULL) { - ret = get_extra_mirror_from_replace(fs_info, logical, *length, - dev_replace->srcdev->devid, - &mirror_num, - &physical_to_patch_in_first_stripe); - if (ret) - goto out; - else - patch_the_first_stripe_for_dev_replace = 1; - } else if (mirror_num > map->num_stripes) { - mirror_num = 0; - } - num_stripes = 1; stripe_index = 0; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, - &stripe_index); + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; if (!need_full_stripe(op)) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { @@ -6444,8 +6308,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u32 factor = map->num_stripes / map->sub_stripes; - stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); - stripe_index *= map->sub_stripes; + stripe_index = (stripe_nr % factor) * map->sub_stripes; + stripe_nr /= factor; if (need_full_stripe(op)) num_stripes = map->sub_stripes; @@ -6460,11 +6324,17 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { - /* push stripe_nr back to the start of the full stripe */ - stripe_nr = div64_u64(raid56_full_stripe_start, - stripe_len * data_stripes); + /* + * Push stripe_nr back to the start of the full stripe + * For those cases needing a full stripe, @stripe_nr + * is the full stripe number. + * + * Originally we go raid56_full_stripe_start / full_stripe_len, + * but that can be expensive. Here we just divide + * @stripe_nr with @data_stripes. + */ + stripe_nr /= data_stripes; /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -6473,7 +6343,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* Return the length to the full stripe end */ *length = min(logical + *length, raid56_full_stripe_start + em->start + - data_stripes * stripe_len) - logical; + (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical; stripe_index = 0; stripe_offset = 0; } else { @@ -6482,25 +6352,24 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * Mirror #2 is RAID5 parity block. * Mirror #3 is RAID6 Q block. */ - stripe_nr = div_u64_rem(stripe_nr, - data_stripes, &stripe_index); + stripe_index = stripe_nr % data_stripes; + stripe_nr /= data_stripes; if (mirror_num > 1) stripe_index = data_stripes + mirror_num - 2; /* We distribute the parity blocks across stripes */ - div_u64_rem(stripe_nr + stripe_index, map->num_stripes, - &stripe_index); + stripe_index = (stripe_nr + stripe_index) % map->num_stripes; if (!need_full_stripe(op) && mirror_num <= 1) mirror_num = 1; } } else { /* - * after this, stripe_nr is the number of stripes on this + * After this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, - &stripe_index); + stripe_index = stripe_nr % map->num_stripes; + stripe_nr /= map->num_stripes; mirror_num = stripe_index + 1; } if (stripe_index >= map->num_stripes) { @@ -6512,13 +6381,16 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { - if (op == BTRFS_MAP_WRITE) - num_alloc_stripes <<= 1; - if (op == BTRFS_MAP_GET_READ_MIRRORS) - num_alloc_stripes++; - tgtdev_indexes = num_stripes; - } + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + op != BTRFS_MAP_READ) + /* + * For replace case, we need to add extra stripes for extra + * duplicated stripes. + * + * For both WRITE and GET_READ_MIRRORS, we may have at most + * 2 more stripes (DUP types, otherwise 1). + */ + num_alloc_stripes += 2; /* * If this I/O maps to a single device, try to return the device and @@ -6529,53 +6401,53 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && (!need_full_stripe(op) || !dev_replace_is_ongoing || !dev_replace->tgtdev)) { - if (patch_the_first_stripe_for_dev_replace) { - smap->dev = dev_replace->tgtdev; - smap->physical = physical_to_patch_in_first_stripe; - *mirror_num_ret = map->num_stripes + 1; - } else { - set_io_stripe(smap, map, stripe_index, stripe_offset, - stripe_nr); - *mirror_num_ret = mirror_num; - } + set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); + *mirror_num_ret = mirror_num; *bioc_ret = NULL; ret = 0; goto out; } - bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes); if (!bioc) { ret = -ENOMEM; goto out; } + bioc->map_type = map->type; - for (i = 0; i < num_stripes; i++) { - set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, - stripe_nr); - stripe_index++; - } - - /* Build raid_map */ + /* + * For RAID56 full map, we need to make sure the stripes[] follows the + * rule that data stripes are all ordered, then followed with P and Q + * (if we have). + * + * It's still mostly the same as other profiles, just with extra rotation. + */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { - u64 tmp; - unsigned rot; - - /* Work out the disk rotation on this stripe-set */ - div_u64_rem(stripe_nr, num_stripes, &rot); - - /* Fill in the logical address of each stripe */ - tmp = stripe_nr * data_stripes; - for (i = 0; i < data_stripes; i++) - bioc->raid_map[(i + rot) % num_stripes] = - em->start + (tmp + i) * map->stripe_len; - - bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; - if (map->type & BTRFS_BLOCK_GROUP_RAID6) - bioc->raid_map[(i + rot + 1) % num_stripes] = - RAID6_Q_STRIPE; - - sort_parity_stripes(bioc, num_stripes); + /* + * For RAID56 @stripe_nr is already the number of full stripes + * before us, which is also the rotation value (needs to modulo + * with num_stripes). + * + * In this case, we just add @stripe_nr with @i, then do the + * modulo, to reduce one modulo call. + */ + bioc->full_stripe_logical = em->start + + ((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT); + for (i = 0; i < num_stripes; i++) + set_io_stripe(&bioc->stripes[i], map, + (i + stripe_nr) % num_stripes, + stripe_offset, stripe_nr); + } else { + /* + * For all other non-RAID56 profiles, just copy the target + * stripe into the bioc. + */ + for (i = 0; i < num_stripes; i++) { + set_io_stripe(&bioc->stripes[i], map, stripe_index, + stripe_offset, stripe_nr); + stripe_index++; + } } if (need_full_stripe(op)) @@ -6583,27 +6455,15 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, + handle_ops_on_dev_replace(op, bioc, dev_replace, logical, &num_stripes, &max_errors); } *bioc_ret = bioc; - bioc->map_type = map->type; bioc->num_stripes = num_stripes; bioc->max_errors = max_errors; bioc->mirror_num = mirror_num; - /* - * this is the case that REQ_READ && dev_replace_is_ongoing && - * mirror_num == num_stripes + 1 && dev_replace target drive is - * available as a mirror - */ - if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { - WARN_ON(num_stripes > 1); - bioc->stripes[0].dev = dev_replace->tgtdev; - bioc->stripes[0].physical = physical_to_patch_in_first_stripe; - bioc->mirror_num = map->num_stripes + 1; - } out: if (dev_replace_is_ongoing) { lockdep_assert_held(&dev_replace->rwsem); @@ -6941,7 +6801,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); - map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); map->type = type; /* * We can't use the sub_stripes value, as for profiles other than @@ -8161,3 +8020,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) return true; } + +static void map_raid56_repair_block(struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, + u64 logical) +{ + int data_stripes = nr_bioc_data_stripes(bioc); + int i; + + for (i = 0; i < data_stripes; i++) { + u64 stripe_start = bioc->full_stripe_logical + + (i << BTRFS_STRIPE_LEN_SHIFT); + + if (logical >= stripe_start && + logical < stripe_start + BTRFS_STRIPE_LEN) + break; + } + ASSERT(i < data_stripes); + smap->dev = bioc->stripes[i].dev; + smap->physical = bioc->stripes[i].physical + + ((logical - bioc->full_stripe_logical) & + BTRFS_STRIPE_LEN_MASK); +} + +/* + * Map a repair write into a single device. + * + * A repair write is triggered by read time repair or scrub, which would only + * update the contents of a single device. + * Not update any other mirrors nor go through RMW path. + * + * Callers should ensure: + * + * - Call btrfs_bio_counter_inc_blocked() first + * - The range does not cross stripe boundary + * - Has a valid @mirror_num passed in. + */ +int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, + struct btrfs_io_stripe *smap, u64 logical, + u32 length, int mirror_num) +{ + struct btrfs_io_context *bioc = NULL; + u64 map_length = length; + int mirror_ret = mirror_num; + int ret; + + ASSERT(mirror_num > 0); + + ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, + &bioc, smap, &mirror_ret, true); + if (ret < 0) + return ret; + + /* The map range should not cross stripe boundary. */ + ASSERT(map_length >= length); + + /* Already mapped to single stripe. */ + if (!bioc) + goto out; + + /* Map the RAID56 multi-stripe writes to a single one. */ + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + map_raid56_repair_block(bioc, smap, logical); + goto out; + } + + ASSERT(mirror_num <= bioc->num_stripes); + smap->dev = bioc->stripes[mirror_num - 1].dev; + smap->physical = bioc->stripes[mirror_num - 1].physical; +out: + btrfs_put_bioc(bioc); + ASSERT(smap->dev); + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7e51f2238f72..bf47a1a70813 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,7 +17,11 @@ extern struct mutex uuid_mutex; -#define BTRFS_STRIPE_LEN SZ_64K +#define BTRFS_STRIPE_LEN SZ_64K +#define BTRFS_STRIPE_LEN_SHIFT (16) +#define BTRFS_STRIPE_LEN_MASK (BTRFS_STRIPE_LEN - 1) + +static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); /* Used by sanity check for btrfs_raid_types. */ #define const_ffs(n) (__builtin_ctzll(n) + 1) @@ -404,17 +408,74 @@ struct btrfs_io_context { u64 map_type; /* get from map_lookup->type */ struct bio *orig_bio; atomic_t error; - int max_errors; - int num_stripes; - int mirror_num; - int num_tgtdevs; - int *tgtdev_map; + u16 max_errors; + + /* + * The total number of stripes, including the extra duplicated + * stripe for replace. + */ + u16 num_stripes; + + /* + * The mirror_num of this bioc. + * + * This is for reads which use 0 as mirror_num, thus we should return a + * valid mirror_num (>0) for the reader. + */ + u16 mirror_num; + + /* + * The following two members are for dev-replace case only. + * + * @replace_nr_stripes: Number of duplicated stripes which need to be + * written to replace target. + * Should be <= 2 (2 for DUP, otherwise <= 1). + * @replace_stripe_src: The array indicates where the duplicated stripes + * are from. + * + * The @replace_stripe_src[] array is mostly for RAID56 cases. + * As non-RAID56 stripes share the same contents of the mapped range, + * thus no need to bother where the duplicated ones are from. + * + * But for RAID56 case, all stripes contain different contents, thus + * we need a way to know the mapping. + * + * There is an example for the two members, using a RAID5 write: + * + * num_stripes: 4 (3 + 1 duplicated write) + * stripes[0]: dev = devid 1, physical = X + * stripes[1]: dev = devid 2, physical = Y + * stripes[2]: dev = devid 3, physical = Z + * stripes[3]: dev = devid 0, physical = Y + * + * replace_nr_stripes = 1 + * replace_stripe_src = 1 <- Means stripes[1] is involved in replace. + * The duplicated stripe index would be + * (@num_stripes - 1). + * + * Note, that we can still have cases replace_nr_stripes = 2 for DUP. + * In that case, all stripes share the same content, thus we don't + * need to bother @replace_stripe_src value at all. + */ + u16 replace_nr_stripes; + s16 replace_stripe_src; /* - * logical block numbers for the start of each stripe - * The last one or two are p/q. These are sorted, - * so raid_map[0] is the start of our full stripe + * Logical bytenr of the full stripe start, only for RAID56 cases. + * + * When this value is set to other than (u64)-1, the stripes[] should + * follow this pattern: + * + * (real_stripes = num_stripes - replace_nr_stripes) + * (data_stripes = (is_raid6) ? (real_stripes - 2) : (real_stripes - 1)) + * + * stripes[0]: The first data stripe + * stripes[1]: The second data stripe + * ... + * stripes[data_stripes - 1]: The last data stripe + * stripes[data_stripes]: The P stripe + * stripes[data_stripes + 1]: The Q stripe (only for RAID6). */ - u64 *raid_map; + u64 full_stripe_logical; struct btrfs_io_stripe stripes[]; }; @@ -446,7 +507,6 @@ struct map_lookup { u64 type; int io_align; int io_width; - u32 stripe_len; int num_stripes; int sub_stripes; int verified_stripes; /* For mount time dev extent verification */ @@ -527,6 +587,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_io_context **bioc_ret, struct btrfs_io_stripe *smap, int *mirror_num_ret, int need_raid_map); +int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, + struct btrfs_io_stripe *smap, u64 logical, + u32 length, int mirror_num); struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 0ebeaf4e81f9..fc4b20c2688a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -444,10 +444,6 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = { const struct xattr_handler *btrfs_xattr_handlers[] = { &btrfs_security_xattr_handler, -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &btrfs_trusted_xattr_handler, &btrfs_user_xattr_handler, &btrfs_btrfs_xattr_handler, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index da7bb9187b68..8acb05e176c5 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -350,8 +350,6 @@ done: zlib_inflateEnd(&workspace->strm); if (data_in) kunmap_local(data_in); - if (!ret) - zero_fill_bio(cb->orig_bio); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 45d04092f2f8..39828af4a4e8 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -122,10 +122,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, int i; for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { - u64 bytenr; - - bytenr = ((zones[i].start + zones[i].len) - << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE; + u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT; + u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) - + BTRFS_SUPER_INFO_SIZE; page[i] = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); @@ -1168,12 +1167,12 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) return -ERANGE; /* All the zones are conventional */ - if (find_next_bit(zinfo->seq_zones, begin, end) == end) + if (find_next_bit(zinfo->seq_zones, end, begin) == end) return 0; /* All the zones are sequential and empty */ - if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end && - find_next_zero_bit(zinfo->empty_zones, begin, end) == end) + if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end && + find_next_zero_bit(zinfo->empty_zones, end, begin) == end) return 0; for (pos = start; pos < start + size; pos += zinfo->zone_size) { @@ -1610,11 +1609,11 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, !list_empty(&eb->release_list)) return; + memzero_extent_buffer(eb, 0, eb->len); + set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); set_extent_bits_nowait(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, EXTENT_DIRTY); - memzero_extent_buffer(eb, 0, eb->len); - set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); spin_lock(&trans->releasing_ebs_lock); list_add_tail(&eb->release_list, &trans->releasing_ebs); @@ -1640,14 +1639,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) { u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_fs_info *fs_info = bbio->fs_info; struct btrfs_block_group *cache; bool ret = false; if (!btrfs_is_zoned(fs_info)) return false; - if (!is_data_inode(&inode->vfs_inode)) + if (!inode || !is_data_inode(&inode->vfs_inode)) return false; if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index e34f1ab99d56..f798da267590 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -609,7 +609,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } } ret = 0; - zero_fill_bio(cb->orig_bio); done: if (workspace->in_buf.src) kunmap_local(workspace->in_buf.src); diff --git a/fs/buffer.c b/fs/buffer.c index 9e1e2add541e..a7fc561758b1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -308,20 +308,19 @@ static void verify_bh(struct work_struct *work) struct buffer_head *bh = ctx->bh; bool valid; - valid = fsverity_verify_blocks(page_folio(bh->b_page), bh->b_size, - bh_offset(bh)); + valid = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh)); end_buffer_async_read(bh, valid); kfree(ctx); } static bool need_fsverity(struct buffer_head *bh) { - struct page *page = bh->b_page; - struct inode *inode = page->mapping->host; + struct folio *folio = bh->b_folio; + struct inode *inode = folio->mapping->host; return fsverity_active(inode) && /* needed by ext4 */ - page->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); + folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } static void decrypt_bh(struct work_struct *work) @@ -331,8 +330,8 @@ static void decrypt_bh(struct work_struct *work) struct buffer_head *bh = ctx->bh; int err; - err = fscrypt_decrypt_pagecache_blocks(page_folio(bh->b_page), - bh->b_size, bh_offset(bh)); + err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size, + bh_offset(bh)); if (err == 0 && need_fsverity(bh)) { /* * We use different work queues for decryption and for verity @@ -843,7 +842,7 @@ int remove_inode_buffers(struct inode *inode) } /* - * Create the appropriate buffers when given a page for data area and + * Create the appropriate buffers when given a folio for data area and * the size of each buffer.. Use the bh->b_this_page linked list to * follow the buffers created. Return NULL if unable to create more * buffers. @@ -851,8 +850,8 @@ int remove_inode_buffers(struct inode *inode) * The retry flag is used to differentiate async IO (paging, swapping) * which may not fail from ordinary buffer allocations. */ -struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - bool retry) +struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, + bool retry) { struct buffer_head *bh, *head; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; @@ -862,12 +861,12 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, if (retry) gfp |= __GFP_NOFAIL; - /* The page lock pins the memcg */ - memcg = page_memcg(page); + /* The folio lock pins the memcg */ + memcg = folio_memcg(folio); old_memcg = set_active_memcg(memcg); head = NULL; - offset = PAGE_SIZE; + offset = folio_size(folio); while ((offset -= size) >= 0) { bh = alloc_buffer_head(gfp); if (!bh) @@ -879,8 +878,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bh->b_size = size; - /* Link the buffer to its page */ - set_bh_page(bh, page, offset); + /* Link the buffer to its folio */ + folio_set_bh(bh, folio, offset); } out: set_active_memcg(old_memcg); @@ -899,6 +898,13 @@ no_grow: goto out; } +EXPORT_SYMBOL_GPL(folio_alloc_buffers); + +struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, + bool retry) +{ + return folio_alloc_buffers(page_folio(page), size, retry); +} EXPORT_SYMBOL_GPL(alloc_page_buffers); static inline void @@ -1485,6 +1491,21 @@ void set_bh_page(struct buffer_head *bh, } EXPORT_SYMBOL(set_bh_page); +void folio_set_bh(struct buffer_head *bh, struct folio *folio, + unsigned long offset) +{ + bh->b_folio = folio; + BUG_ON(offset >= folio_size(folio)); + if (folio_test_highmem(folio)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = folio_address(folio) + offset; +} +EXPORT_SYMBOL(folio_set_bh); + /* * Called when truncating a buffer on a page completely. */ @@ -1572,18 +1593,17 @@ out: } EXPORT_SYMBOL(block_invalidate_folio); - /* * We attach and possibly dirty the buffers atomically wrt * block_dirty_folio() via private_lock. try_to_free_buffers - * is already excluded via the page lock. + * is already excluded via the folio lock. */ -void create_empty_buffers(struct page *page, - unsigned long blocksize, unsigned long b_state) +void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, + unsigned long b_state) { struct buffer_head *bh, *head, *tail; - head = alloc_page_buffers(page, blocksize, true); + head = folio_alloc_buffers(folio, blocksize, true); bh = head; do { bh->b_state |= b_state; @@ -1592,19 +1612,26 @@ void create_empty_buffers(struct page *page, } while (bh); tail->b_this_page = head; - spin_lock(&page->mapping->private_lock); - if (PageUptodate(page) || PageDirty(page)) { + spin_lock(&folio->mapping->private_lock); + if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { bh = head; do { - if (PageDirty(page)) + if (folio_test_dirty(folio)) set_buffer_dirty(bh); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); bh = bh->b_this_page; } while (bh != head); } - attach_page_private(page, head); - spin_unlock(&page->mapping->private_lock); + folio_attach_private(folio, head); + spin_unlock(&folio->mapping->private_lock); +} +EXPORT_SYMBOL(folio_create_empty_buffers); + +void create_empty_buffers(struct page *page, + unsigned long blocksize, unsigned long b_state) +{ + folio_create_empty_buffers(page_folio(page), blocksize, b_state); } EXPORT_SYMBOL(create_empty_buffers); @@ -1695,14 +1722,17 @@ static inline int block_size_bits(unsigned int blocksize) return ilog2(blocksize); } -static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state) +static struct buffer_head *folio_create_buffers(struct folio *folio, + struct inode *inode, + unsigned int b_state) { - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits), - b_state); - return page_buffers(page); + if (!folio_buffers(folio)) + folio_create_empty_buffers(folio, + 1 << READ_ONCE(inode->i_blkbits), + b_state); + return folio_buffers(folio); } /* @@ -1746,8 +1776,8 @@ int __block_write_full_page(struct inode *inode, struct page *page, int nr_underway = 0; blk_opf_t write_flags = wbc_to_write_flags(wbc); - head = create_page_buffers(page, inode, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + head = folio_create_buffers(page_folio(page), inode, + (1 << BH_Dirty) | (1 << BH_Uptodate)); /* * Be very careful. We have no exclusion from block_dirty_folio @@ -2010,7 +2040,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); - head = create_page_buffers(&folio->page, inode, 0); + head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); @@ -2296,7 +2326,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - head = create_page_buffers(&folio->page, inode, 0); + head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; bbits = block_size_bits(blocksize); @@ -2581,7 +2611,7 @@ int block_truncate_page(struct address_space *mapping, struct inode *inode = mapping->host; struct page *page; struct buffer_head *bh; - int err; + int err = 0; blocksize = i_blocksize(inode); length = offset & (blocksize - 1); @@ -2594,9 +2624,8 @@ int block_truncate_page(struct address_space *mapping, iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits); page = grab_cache_page(mapping, index); - err = -ENOMEM; if (!page) - goto out; + return -ENOMEM; if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -2610,7 +2639,6 @@ int block_truncate_page(struct address_space *mapping, pos += blocksize; } - err = 0; if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); @@ -2634,12 +2662,11 @@ int block_truncate_page(struct address_space *mapping, zero_user(page, offset, length); mark_buffer_dirty(bh); - err = 0; unlock: unlock_page(page); put_page(page); -out: + return err; } EXPORT_SYMBOL(block_truncate_page); diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c index 58f8aec964e4..18de8a876b02 100644 --- a/fs/cachefiles/error_inject.c +++ b/fs/cachefiles/error_inject.c @@ -22,18 +22,9 @@ static struct ctl_table cachefiles_sysctls[] = { {} }; -static struct ctl_table cachefiles_sysctls_root[] = { - { - .procname = "cachefiles", - .mode = 0555, - .child = cachefiles_sysctls, - }, - {} -}; - int __init cachefiles_register_error_injection(void) { - cachefiles_sysctl = register_sysctl_table(cachefiles_sysctls_root); + cachefiles_sysctl = register_sysctl("cachefiles", cachefiles_sysctls); if (!cachefiles_sysctl) return -ENOMEM; return 0; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d5335f445233..6bb251a4d613 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -808,6 +808,7 @@ static int ceph_writepages_start(struct address_space *mapping, bool should_loop, range_whole = false; bool done = false; bool caching = ceph_is_cache_enabled(inode); + xa_mark_t tag; if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) @@ -834,6 +835,11 @@ static int ceph_writepages_start(struct address_space *mapping, start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { + tag = PAGECACHE_TAG_TOWRITE; + } else { + tag = PAGECACHE_TAG_DIRTY; + } retry: /* find oldest snap context with dirty data */ snapc = get_oldest_context(inode, &ceph_wbc, NULL); @@ -872,6 +878,9 @@ retry: dout(" non-head snapc, range whole\n"); } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + ceph_put_snap_context(last_snapc); last_snapc = snapc; @@ -888,7 +897,7 @@ retry: get_more_pages: nr_folios = filemap_get_folios_tag(mapping, &index, - end, PAGECACHE_TAG_DIRTY, &fbatch); + end, tag, &fbatch); dout("pagevec_lookup_range_tag got %d\n", nr_folios); if (!nr_folios && !locked_pages) break; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7cc20772eac9..789be30d6ee2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -431,7 +431,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, * * Called with i_ceph_lock held. */ -static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) +struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; struct rb_node *n = ci->i_caps.rb_node; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index bec3c4549c07..3904333fa6c3 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -248,14 +248,20 @@ static int metrics_caps_show(struct seq_file *s, void *p) return 0; } -static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) +static int caps_show_cb(struct inode *inode, int mds, void *p) { + struct ceph_inode_info *ci = ceph_inode(inode); struct seq_file *s = p; - - seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), - cap->session->s_mds, - ceph_cap_string(cap->issued), - ceph_cap_string(cap->implemented)); + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (cap) + seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), + cap->session->s_mds, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented)); + spin_unlock(&ci->i_ceph_lock); return 0; } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0ced8b570e42..cb67ac821f0e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1050,6 +1050,9 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct ceph_mds_request *req; int err; + if (dentry->d_flags & DCACHE_DISCONNECTED) + return -EINVAL; + err = ceph_wait_on_conflict_unlink(dentry); if (err) return err; @@ -1057,8 +1060,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - dout("link in dir %p old_dentry %p dentry %p\n", dir, - old_dentry, dentry); + dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n", + dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); if (IS_ERR(req)) { d_drop(dentry); @@ -1067,6 +1070,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); + /* + * The old_dentry maybe a DCACHE_DISCONNECTED dentry, then we + * will just pass the ino# to MDSs. + */ + if (old_dentry->d_flags & DCACHE_DISCONNECTED) + req->r_ino2 = ceph_vino(d_inode(old_dentry)); req->r_parent = dir; ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 27a245d959c0..29cf00220b09 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1632,8 +1632,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, * Caller must hold session s_mutex. */ int ceph_iterate_session_caps(struct ceph_mds_session *session, - int (*cb)(struct inode *, struct ceph_cap *, - void *), void *arg) + int (*cb)(struct inode *, int mds, void *), + void *arg) { struct list_head *p; struct ceph_cap *cap; @@ -1645,6 +1645,8 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, spin_lock(&session->s_cap_lock); p = session->s_caps.next; while (p != &session->s_caps) { + int mds; + cap = list_entry(p, struct ceph_cap, session_caps); inode = igrab(&cap->ci->netfs.inode); if (!inode) { @@ -1652,6 +1654,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, continue; } session->s_cap_iterator = cap; + mds = cap->mds; spin_unlock(&session->s_cap_lock); if (last_inode) { @@ -1663,7 +1666,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, old_cap = NULL; } - ret = cb(inode, cap, arg); + ret = cb(inode, mds, arg); last_inode = inode; spin_lock(&session->s_cap_lock); @@ -1696,20 +1699,25 @@ out: return ret; } -static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int remove_session_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); bool invalidate = false; - int iputs; + struct ceph_cap *cap; + int iputs = 0; - dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->netfs.inode); spin_lock(&ci->i_ceph_lock); - iputs = ceph_purge_inode_cap(inode, cap, &invalidate); + cap = __get_cap_for_mds(ci, mds); + if (cap) { + dout(" removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->netfs.inode); + + iputs = ceph_purge_inode_cap(inode, cap, &invalidate); + } spin_unlock(&ci->i_ceph_lock); - wake_up_all(&ci->i_cap_wq); + if (cap) + wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); while (iputs--) @@ -1780,8 +1788,7 @@ enum { * * caller must hold s_mutex. */ -static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int wake_up_session_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); unsigned long ev = (unsigned long)arg; @@ -1792,12 +1799,14 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } else if (ev == RENEWCAPS) { - if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) { - /* mds did not re-issue stale cap */ - spin_lock(&ci->i_ceph_lock); + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + /* mds did not re-issue stale cap */ + if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) cap->issued = cap->implemented = CEPH_CAP_PIN; - spin_unlock(&ci->i_ceph_lock); - } + spin_unlock(&ci->i_ceph_lock); } else if (ev == FORCE_RO) { } wake_up_all(&ci->i_cap_wq); @@ -1959,16 +1968,22 @@ out: * Yes, this is a bit sloppy. Our only real goal here is to respond to * memory pressure from the MDS, though, so it needn't be perfect. */ -static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) +static int trim_caps_cb(struct inode *inode, int mds, void *arg) { int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; + struct ceph_cap *cap; if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + spin_unlock(&ci->i_ceph_lock); + return 0; + } mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); wanted = __ceph_caps_file_wanted(ci); @@ -2555,6 +2570,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; bool freepath1 = false, freepath2 = false; + struct dentry *old_dentry = NULL; int len; u16 releases; void *p, *end; @@ -2572,7 +2588,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, } /* If r_old_dentry is set, then assume that its parent is locked */ - ret = set_request_path_attr(NULL, req->r_old_dentry, + if (req->r_old_dentry && + !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) + old_dentry = req->r_old_dentry; + ret = set_request_path_attr(NULL, old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, &path2, &pathlen2, &ino2, &freepath2, true); @@ -3911,26 +3930,22 @@ out_unlock: /* * Encode information about a cap for a reconnect with the MDS. */ -static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) +static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) { union { struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; - struct ceph_inode_info *ci = cap->ci; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; + struct ceph_cap *cap; char *path; - int pathlen = 0, err; + int pathlen = 0, err = 0; u64 pathbase; u64 snap_follows; - dout(" adding %p ino %llx.%llx cap %p %lld %s\n", - inode, ceph_vinop(inode), cap, cap->cap_id, - ceph_cap_string(cap->issued)); - dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ @@ -3947,6 +3962,15 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, } spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + spin_unlock(&ci->i_ceph_lock); + goto out_err; + } + dout(" adding %p ino %llx.%llx cap %p %lld %s\n", + inode, ceph_vinop(inode), cap, cap->cap_id, + ceph_cap_string(cap->issued)); + cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ cap->mseq = 0; /* and migrate_seq */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 0598faa50e2e..724307ff89cd 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -355,8 +355,8 @@ struct ceph_snapid_map { struct rb_node node; struct list_head lru; atomic_t ref; - u64 snap; dev_t dev; + u64 snap; unsigned long last_used; }; @@ -541,8 +541,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); extern int ceph_iterate_session_caps(struct ceph_mds_session *session, - int (*cb)(struct inode *, - struct ceph_cap *, void *), + int (*cb)(struct inode *, int mds, void *), void *arg); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6ecca2c6d137..d24bf0db5234 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1192,6 +1192,8 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci); +extern struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, + int mds); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f65b07cc33a2..806183959c47 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -535,6 +535,8 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, return NULL; } +#define MAX_XATTR_VAL_PRINT_LEN 256 + static int __set_xattr(struct ceph_inode_info *ci, const char *name, int name_len, const char *val, int val_len, @@ -597,7 +599,7 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr->should_free_name = update_xattr; ci->i_xattrs.count++; - dout("__set_xattr count=%d\n", ci->i_xattrs.count); + dout("%s count=%d\n", __func__, ci->i_xattrs.count); } else { kfree(*newxattr); *newxattr = NULL; @@ -625,11 +627,13 @@ static int __set_xattr(struct ceph_inode_info *ci, if (new) { rb_link_node(&xattr->node, parent, p); rb_insert_color(&xattr->node, &ci->i_xattrs.index); - dout("__set_xattr_val p=%p\n", p); + dout("%s p=%p\n", __func__, p); } - dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n", - ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val); + dout("%s added %llx.%llx xattr %p %.*s=%.*s%s\n", __func__, + ceph_vinop(&ci->netfs.inode), xattr, name_len, name, + min(val_len, MAX_XATTR_VAL_PRINT_LEN), val, + val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : ""); return 0; } @@ -655,13 +659,15 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, else if (c > 0) p = &(*p)->rb_right; else { - dout("__get_xattr %s: found %.*s\n", name, - xattr->val_len, xattr->val); + int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN); + + dout("%s %s: found %.*s%s\n", __func__, name, len, + xattr->val, xattr->val_len > len ? "..." : ""); return xattr; } } - dout("__get_xattr %s: not found\n", name); + dout("%s %s: not found\n", __func__, name); return NULL; } @@ -1411,10 +1417,6 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) * attributes are handled directly. */ const struct xattr_handler *ceph_xattr_handlers[] = { -#ifdef CONFIG_CEPH_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &ceph_other_xattr_handler, NULL, }; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index e9c8c088d948..d4ed200a9471 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -280,8 +280,10 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_printf(m, "\n%d) ConnectionId: 0x%llx ", c, server->conn_id); + spin_lock(&server->srv_lock); if (server->hostname) seq_printf(m, "Hostname: %s ", server->hostname); + spin_unlock(&server->srv_lock); #ifdef CONFIG_CIFS_SMB_DIRECT if (!server->rdma) goto skip_rdma; @@ -623,10 +625,13 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) server->fastest_cmd[j], server->slowest_cmd[j]); for (j = 0; j < NUMBER_OF_SMB2_COMMANDS; j++) - if (atomic_read(&server->smb2slowcmd[j])) + if (atomic_read(&server->smb2slowcmd[j])) { + spin_lock(&server->srv_lock); seq_printf(m, " %d slow responses from %s for command %d\n", atomic_read(&server->smb2slowcmd[j]), server->hostname, j); + spin_unlock(&server->srv_lock); + } #endif /* STATS2 */ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index d44808263cfb..ce5cfd236fdb 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -81,19 +81,19 @@ do { \ #define cifs_server_dbg_func(ratefunc, type, fmt, ...) \ do { \ - const char *sn = ""; \ - if (server && server->hostname) \ - sn = server->hostname; \ + spin_lock(&server->srv_lock); \ if ((type) & FYI && cifsFYI & CIFS_INFO) { \ pr_debug_ ## ratefunc("%s: \\\\%s " fmt, \ - __FILE__, sn, ##__VA_ARGS__); \ + __FILE__, server->hostname, \ + ##__VA_ARGS__); \ } else if ((type) & VFS) { \ pr_err_ ## ratefunc("VFS: \\\\%s " fmt, \ - sn, ##__VA_ARGS__); \ + server->hostname, ##__VA_ARGS__); \ } else if ((type) & NOISY && (NOISY != 0)) { \ pr_debug_ ## ratefunc("\\\\%s " fmt, \ - sn, ##__VA_ARGS__); \ + server->hostname, ##__VA_ARGS__); \ } \ + spin_unlock(&server->srv_lock); \ } while (0) #define cifs_server_dbg(type, fmt, ...) \ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index cb40074feb3e..0329a907bdfe 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -171,8 +171,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) mnt = ERR_CAST(full_path); goto out; } - - convert_delimiter(full_path, '/'); cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path); tmp = *cur_ctx; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ac9034fce409..43a4d8603db3 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -246,7 +246,7 @@ cifs_read_super(struct super_block *sb) if (cifs_sb->ctx->rasize) sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE; else - sb->s_bdi->ra_pages = cifs_sb->ctx->rsize / PAGE_SIZE; + sb->s_bdi->ra_pages = 2 * (cifs_sb->ctx->rsize / PAGE_SIZE); sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ @@ -744,6 +744,7 @@ static void cifs_umount_begin(struct super_block *sb) spin_unlock(&tcon->tc_lock); spin_unlock(&cifs_tcp_ses_lock); + cifs_close_all_deferred_files(tcon); /* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */ /* cancel_notify_requests(tcon); */ if (tcon->ses && tcon->ses->server) { @@ -759,6 +760,20 @@ static void cifs_umount_begin(struct super_block *sb) return; } +static int cifs_freeze(struct super_block *sb) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon; + + if (cifs_sb == NULL) + return 0; + + tcon = cifs_sb_master_tcon(cifs_sb); + + cifs_close_all_deferred_files(tcon); + return 0; +} + #ifdef CONFIG_CIFS_STATS2 static int cifs_show_stats(struct seq_file *s, struct dentry *root) { @@ -797,6 +812,7 @@ static const struct super_operations cifs_super_ops = { as opens */ .show_options = cifs_show_options, .umount_begin = cifs_umount_begin, + .freeze_fs = cifs_freeze, #ifdef CONFIG_CIFS_STATS2 .show_stats = cifs_show_stats, #endif @@ -874,14 +890,12 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, struct cifs_mnt_data mnt_data; struct dentry *root; - /* - * Prints in Kernel / CIFS log the attempted mount operation - * If CIFS_DEBUG && cifs_FYI - */ - if (cifsFYI) - cifs_dbg(FYI, "Devname: %s flags: %d\n", old_ctx->UNC, flags); - else - cifs_info("Attempting to mount %s\n", old_ctx->UNC); + if (cifsFYI) { + cifs_dbg(FYI, "%s: devname=%s flags=0x%x\n", __func__, + old_ctx->source, flags); + } else { + cifs_info("Attempting to mount %s\n", old_ctx->source); + } cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); if (cifs_sb == NULL) { diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 415176b2cf32..74cd6fafb33e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -162,6 +162,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 41 -#define CIFS_VERSION "2.42" +#define SMB3_PRODUCT_BUILD 43 +#define CIFS_VERSION "2.43" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 08a73dcb7786..414685c5d530 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -736,17 +736,23 @@ struct TCP_Server_Info { #endif struct mutex refpath_lock; /* protects leaf_fullpath */ /* - * Canonical DFS full paths that were used to chase referrals in mount and reconnect. + * origin_fullpath: Canonical copy of smb3_fs_context::source. + * It is used for matching existing DFS tcons. * - * origin_fullpath: first or original referral path - * leaf_fullpath: last referral path (might be changed due to nested links in reconnect) + * leaf_fullpath: Canonical DFS referral path related to this + * connection. + * It is used in DFS cache refresher, reconnect and may + * change due to nested DFS links. * - * current_fullpath: pointer to either origin_fullpath or leaf_fullpath - * NOTE: cannot be accessed outside cifs_reconnect() and smb2_reconnect() + * Both protected by @refpath_lock and @srv_lock. The @refpath_lock is + * mosly used for not requiring a copy of @leaf_fullpath when getting + * cached or new DFS referrals (which might also sleep during I/O). + * While @srv_lock is held for making string and NULL comparions against + * both fields as in mount(2) and cache refresh. * - * format: \\HOST\SHARE\[OPTIONAL PATH] + * format: \\HOST\SHARE[\OPTIONAL PATH] */ - char *origin_fullpath, *leaf_fullpath, *current_fullpath; + char *origin_fullpath, *leaf_fullpath; }; static inline bool is_smb1(struct TCP_Server_Info *server) @@ -1232,8 +1238,8 @@ struct cifs_tcon { struct cached_fids *cfids; /* BB add field for back pointer to sb struct(s)? */ #ifdef CONFIG_CIFS_DFS_UPCALL - struct list_head ulist; /* cache update list */ struct list_head dfs_ses_list; + struct delayed_work dfs_cache_work; #endif struct delayed_work query_interfaces; /* query interfaces workqueue job */ }; @@ -1750,7 +1756,6 @@ struct cifs_mount_ctx { struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; - char *origin_fullpath, *leaf_fullpath; struct list_head dfs_ses_list; }; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e2eff66eefab..c1c704990b98 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -8,6 +8,7 @@ #ifndef _CIFSPROTO_H #define _CIFSPROTO_H #include <linux/nls.h> +#include <linux/ctype.h> #include "trace.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" @@ -572,7 +573,7 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, extern struct TCP_Server_Info * cifs_find_tcp_session(struct smb3_fs_context *ctx); -extern void cifs_put_smb_ses(struct cifs_ses *ses); +void __cifs_put_smb_ses(struct cifs_ses *ses); extern struct cifs_ses * cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx); @@ -696,4 +697,45 @@ struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon); void cifs_put_tcon_super(struct super_block *sb); int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry); +/* Put references of @ses and @ses->dfs_root_ses */ +static inline void cifs_put_smb_ses(struct cifs_ses *ses) +{ + struct cifs_ses *rses = ses->dfs_root_ses; + + __cifs_put_smb_ses(ses); + if (rses) + __cifs_put_smb_ses(rses); +} + +/* Get an active reference of @ses and @ses->dfs_root_ses. + * + * NOTE: make sure to call this function when incrementing reference count of + * @ses to ensure that any DFS root session attached to it (@ses->dfs_root_ses) + * will also get its reference count incremented. + * + * cifs_put_smb_ses() will put both references, so call it when you're done. + */ +static inline void cifs_smb_ses_inc_refcount(struct cifs_ses *ses) +{ + lockdep_assert_held(&cifs_tcp_ses_lock); + + ses->ses_count++; + if (ses->dfs_root_ses) + ses->dfs_root_ses->ses_count++; +} + +static inline bool dfs_src_pathname_equal(const char *s1, const char *s2) +{ + if (strlen(s1) != strlen(s2)) + return false; + for (; *s1; s1++, s2++) { + if (*s1 == '/' || *s1 == '\\') { + if (*s2 != '/' && *s2 != '\\') + return false; + } else if (tolower(*s1) != tolower(*s2)) + return false; + } + return true; +} + #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1cbb90587995..8e9a672320ab 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -403,8 +403,10 @@ static int __reconnect_target_unlocked(struct TCP_Server_Info *server, const cha if (server->hostname != target) { hostname = extract_hostname(target); if (!IS_ERR(hostname)) { + spin_lock(&server->srv_lock); kfree(server->hostname); server->hostname = hostname; + spin_unlock(&server->srv_lock); } else { cifs_dbg(FYI, "%s: couldn't extract hostname or address from dfs target: %ld\n", __func__, PTR_ERR(hostname)); @@ -452,7 +454,6 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_ static int reconnect_dfs_server(struct TCP_Server_Info *server) { int rc = 0; - const char *refpath = server->current_fullpath + 1; struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); struct dfs_cache_tgt_iterator *target_hint = NULL; int num_targets = 0; @@ -465,8 +466,10 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) * through /proc/fs/cifs/dfscache or the target list is empty due to server settings after * refreshing the referral, so, in this case, default it to 1. */ - if (!dfs_cache_noreq_find(refpath, NULL, &tl)) + mutex_lock(&server->refpath_lock); + if (!dfs_cache_noreq_find(server->leaf_fullpath + 1, NULL, &tl)) num_targets = dfs_cache_get_nr_tgts(&tl); + mutex_unlock(&server->refpath_lock); if (!num_targets) num_targets = 1; @@ -510,7 +513,9 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } while (server->tcpStatus == CifsNeedReconnect); - dfs_cache_noreq_update_tgthint(refpath, target_hint); + mutex_lock(&server->refpath_lock); + dfs_cache_noreq_update_tgthint(server->leaf_fullpath + 1, target_hint); + mutex_unlock(&server->refpath_lock); dfs_cache_free_tgts(&tl); /* Need to set up echo worker again once connection has been established */ @@ -561,9 +566,7 @@ cifs_echo_request(struct work_struct *work) goto requeue_echo; rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; - if (rc) - cifs_dbg(FYI, "Unable to send echo request to server: %s\n", - server->hostname); + cifs_server_dbg(FYI, "send echo request: rc = %d\n", rc); /* Check witness registrations */ cifs_swn_check(); @@ -993,10 +996,8 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) */ } -#ifdef CONFIG_CIFS_DFS_UPCALL kfree(server->origin_fullpath); kfree(server->leaf_fullpath); -#endif kfree(server); length = atomic_dec_return(&tcpSesAllocCount); @@ -1384,26 +1385,13 @@ match_security(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) return true; } -static bool dfs_src_pathname_equal(const char *s1, const char *s2) -{ - if (strlen(s1) != strlen(s2)) - return false; - for (; *s1; s1++, s2++) { - if (*s1 == '/' || *s1 == '\\') { - if (*s2 != '/' && *s2 != '\\') - return false; - } else if (tolower(*s1) != tolower(*s2)) - return false; - } - return true; -} - /* this function must be called with srv_lock held */ -static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *ctx, - bool dfs_super_cmp) +static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) { struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; + lockdep_assert_held(&server->srv_lock); + if (ctx->nosharesock) return 0; @@ -1429,27 +1417,41 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context * (struct sockaddr *)&server->srcaddr)) return 0; /* - * When matching DFS superblocks, we only check for original source pathname as the - * currently connected target might be different than the one parsed earlier in i.e. - * mount.cifs(8). + * - Match for an DFS tcon (@server->origin_fullpath). + * - Match for an DFS root server connection (@server->leaf_fullpath). + * - If none of the above and @ctx->leaf_fullpath is set, then + * it is a new DFS connection. + * - If 'nodfs' mount option was passed, then match only connections + * that have no DFS referrals set + * (e.g. can't failover to other targets). */ - if (dfs_super_cmp) { - if (!ctx->source || !server->origin_fullpath || - !dfs_src_pathname_equal(server->origin_fullpath, ctx->source)) - return 0; - } else { - /* Skip addr, hostname and port matching for DFS connections */ - if (server->leaf_fullpath) { + if (!ctx->nodfs) { + if (ctx->source && server->origin_fullpath) { + if (!dfs_src_pathname_equal(ctx->source, + server->origin_fullpath)) + return 0; + } else if (server->leaf_fullpath) { if (!ctx->leaf_fullpath || - strcasecmp(server->leaf_fullpath, ctx->leaf_fullpath)) + strcasecmp(server->leaf_fullpath, + ctx->leaf_fullpath)) return 0; - } else if (strcasecmp(server->hostname, ctx->server_hostname) || - !match_server_address(server, addr) || - !match_port(server, addr)) { + } else if (ctx->leaf_fullpath) { return 0; } + } else if (server->origin_fullpath || server->leaf_fullpath) { + return 0; } + /* + * Match for a regular connection (address/hostname/port) which has no + * DFS referrals set. + */ + if (!server->origin_fullpath && !server->leaf_fullpath && + (strcasecmp(server->hostname, ctx->server_hostname) || + !match_server_address(server, addr) || + !match_port(server, addr))) + return 0; + if (!match_security(server, ctx)) return 0; @@ -1480,7 +1482,7 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx) * Skip ses channels since they're only handled in lower layers * (e.g. cifs_send_recv). */ - if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx, false)) { + if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx)) { spin_unlock(&server->srv_lock); continue; } @@ -1580,7 +1582,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, rc = -ENOMEM; goto out_err; } - tcp_ses->current_fullpath = tcp_ses->leaf_fullpath; } if (ctx->nosharesock) @@ -1810,7 +1811,9 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) if (tcon == NULL) return -ENOMEM; + spin_lock(&server->srv_lock); scnprintf(unc, sizeof(unc), "\\\\%s\\IPC$", server->hostname); + spin_unlock(&server->srv_lock); xid = get_xid(); tcon->ses = ses; @@ -1863,7 +1866,7 @@ cifs_free_ipc(struct cifs_ses *ses) static struct cifs_ses * cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) { - struct cifs_ses *ses; + struct cifs_ses *ses, *ret = NULL; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { @@ -1873,23 +1876,22 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) continue; } spin_lock(&ses->chan_lock); - if (!match_session(ses, ctx)) { + if (match_session(ses, ctx)) { spin_unlock(&ses->chan_lock); spin_unlock(&ses->ses_lock); - continue; + ret = ses; + break; } spin_unlock(&ses->chan_lock); spin_unlock(&ses->ses_lock); - - ++ses->ses_count; - spin_unlock(&cifs_tcp_ses_lock); - return ses; } + if (ret) + cifs_smb_ses_inc_refcount(ret); spin_unlock(&cifs_tcp_ses_lock); - return NULL; + return ret; } -void cifs_put_smb_ses(struct cifs_ses *ses) +void __cifs_put_smb_ses(struct cifs_ses *ses) { unsigned int rc, xid; unsigned int chan_count; @@ -1916,18 +1918,22 @@ void cifs_put_smb_ses(struct cifs_ses *ses) /* ses_count can never go negative */ WARN_ON(ses->ses_count < 0); + spin_lock(&ses->ses_lock); if (ses->ses_status == SES_GOOD) ses->ses_status = SES_EXITING; - cifs_free_ipc(ses); - if (ses->ses_status == SES_EXITING && server->ops->logoff) { + spin_unlock(&ses->ses_lock); + cifs_free_ipc(ses); xid = get_xid(); rc = server->ops->logoff(xid, ses); if (rc) cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n", __func__, rc); _free_xid(xid); + } else { + spin_unlock(&ses->ses_lock); + cifs_free_ipc(ses); } spin_lock(&cifs_tcp_ses_lock); @@ -2240,6 +2246,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) */ spin_lock(&cifs_tcp_ses_lock); ses->dfs_root_ses = ctx->dfs_root_ses; + if (ses->dfs_root_ses) + ses->dfs_root_ses->ses_count++; list_add(&ses->smb_ses_list, &server->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); @@ -2256,12 +2264,15 @@ get_ses_fail: } /* this function must be called with tc_lock held */ -static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx, bool dfs_super_cmp) +static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { + struct TCP_Server_Info *server = tcon->ses->server; + if (tcon->status == TID_EXITING) return 0; - /* Skip UNC validation when matching DFS superblocks */ - if (!dfs_super_cmp && strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE)) + /* Skip UNC validation when matching DFS connections or superblocks */ + if (!server->origin_fullpath && !server->leaf_fullpath && + strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE)) return 0; if (tcon->seal != ctx->seal) return 0; @@ -2284,7 +2295,7 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->tc_lock); - if (!match_tcon(tcon, ctx, false)) { + if (!match_tcon(tcon, ctx)) { spin_unlock(&tcon->tc_lock); continue; } @@ -2330,6 +2341,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) /* cancel polling of interfaces */ cancel_delayed_work_sync(&tcon->query_interfaces); +#ifdef CONFIG_CIFS_DFS_UPCALL + cancel_delayed_work_sync(&tcon->dfs_cache_work); +#endif if (tcon->use_witness) { int rc; @@ -2577,7 +2591,9 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) queue_delayed_work(cifsiod_wq, &tcon->query_interfaces, (SMB_INTERFACE_POLL_INTERVAL * HZ)); } - +#ifdef CONFIG_CIFS_DFS_UPCALL + INIT_DELAYED_WORK(&tcon->dfs_cache_work, dfs_cache_refresh); +#endif spin_lock(&cifs_tcp_ses_lock); list_add(&tcon->tcon_list, &ses->tcon_list); spin_unlock(&cifs_tcp_ses_lock); @@ -2655,9 +2671,11 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) return 1; } -static int -match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) +static int match_prepath(struct super_block *sb, + struct TCP_Server_Info *server, + struct cifs_mnt_data *mnt_data) { + struct smb3_fs_context *ctx = mnt_data->ctx; struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; bool old_set = (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && @@ -2665,6 +2683,10 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) bool new_set = (new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && new->prepath; + if (server->origin_fullpath && + dfs_src_pathname_equal(server->origin_fullpath, ctx->source)) + return 1; + if (old_set && new_set && !strcmp(new->prepath, old->prepath)) return 1; else if (!old_set && !new_set) @@ -2683,11 +2705,17 @@ cifs_match_super(struct super_block *sb, void *data) struct cifs_ses *ses; struct cifs_tcon *tcon; struct tcon_link *tlink; - bool dfs_super_cmp; int rc = 0; spin_lock(&cifs_tcp_ses_lock); cifs_sb = CIFS_SB(sb); + + /* We do not want to use a superblock that has been shutdown */ + if (CIFS_MOUNT_SHUTDOWN & cifs_sb->mnt_cifs_flags) { + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); if (tlink == NULL) { /* can not match superblock if tlink were ever null */ @@ -2698,18 +2726,16 @@ cifs_match_super(struct super_block *sb, void *data) ses = tcon->ses; tcp_srv = ses->server; - dfs_super_cmp = IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && tcp_srv->origin_fullpath; - ctx = mnt_data->ctx; spin_lock(&tcp_srv->srv_lock); spin_lock(&ses->ses_lock); spin_lock(&ses->chan_lock); spin_lock(&tcon->tc_lock); - if (!match_server(tcp_srv, ctx, dfs_super_cmp) || + if (!match_server(tcp_srv, ctx) || !match_session(ses, ctx) || - !match_tcon(tcon, ctx, dfs_super_cmp) || - !match_prepath(sb, mnt_data)) { + !match_tcon(tcon, ctx) || + !match_prepath(sb, tcp_srv, mnt_data)) { rc = 0; goto out; } @@ -3454,8 +3480,6 @@ out: error: dfs_put_root_smb_sessions(&mnt_ctx.dfs_ses_list); - kfree(mnt_ctx.origin_fullpath); - kfree(mnt_ctx.leaf_fullpath); cifs_mount_put_conns(&mnt_ctx); return rc; } diff --git a/fs/cifs/dfs.c b/fs/cifs/dfs.c index 3a11716b6e13..a93dbca1411b 100644 --- a/fs/cifs/dfs.c +++ b/fs/cifs/dfs.c @@ -99,7 +99,7 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path) return rc; } -static int get_root_smb_session(struct cifs_mount_ctx *mnt_ctx) +static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; struct dfs_root_ses *root_ses; @@ -127,7 +127,7 @@ static int get_dfs_conn(struct cifs_mount_ctx *mnt_ctx, const char *ref_path, co { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; struct dfs_info3_param ref = {}; - bool is_refsrv = false; + bool is_refsrv; int rc, rc2; rc = dfs_cache_get_tgt_referral(ref_path + 1, tit, &ref); @@ -157,8 +157,10 @@ static int get_dfs_conn(struct cifs_mount_ctx *mnt_ctx, const char *ref_path, co rc = cifs_is_path_remote(mnt_ctx); } + dfs_cache_noreq_update_tgthint(ref_path + 1, tit); + if (rc == -EREMOTE && is_refsrv) { - rc2 = get_root_smb_session(mnt_ctx); + rc2 = add_root_smb_session(mnt_ctx); if (rc2) rc = rc2; } @@ -248,16 +250,19 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) tcon = mnt_ctx->tcon; mutex_lock(&server->refpath_lock); + spin_lock(&server->srv_lock); if (!server->origin_fullpath) { server->origin_fullpath = origin_fullpath; - server->current_fullpath = server->leaf_fullpath; origin_fullpath = NULL; } + spin_unlock(&server->srv_lock); mutex_unlock(&server->refpath_lock); if (list_empty(&tcon->dfs_ses_list)) { list_replace_init(&mnt_ctx->dfs_ses_list, &tcon->dfs_ses_list); + queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, + dfs_cache_get_ttl() * HZ); } else { dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list); } @@ -272,15 +277,21 @@ out: int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) { - struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; + struct cifs_ses *ses; + char *source = ctx->source; + bool nodfs = ctx->nodfs; int rc; *isdfs = false; - + /* Temporarily set @ctx->source to NULL as we're not matching DFS + * superblocks yet. See cifs_match_super() and match_server(). + */ + ctx->source = NULL; rc = get_session(mnt_ctx, NULL); if (rc) - return rc; + goto out; + ctx->dfs_root_ses = mnt_ctx->ses; /* * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally @@ -289,23 +300,41 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) * Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem * to respond with PATH_NOT_COVERED to requests that include the prefix. */ - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) || - dfs_get_referral(mnt_ctx, ctx->UNC + 1, NULL, NULL)) { + if (!nodfs) { + rc = dfs_get_referral(mnt_ctx, ctx->UNC + 1, NULL, NULL); + if (rc) { + if (rc != -ENOENT && rc != -EOPNOTSUPP) + goto out; + nodfs = true; + } + } + if (nodfs) { rc = cifs_mount_get_tcon(mnt_ctx); - if (rc) - return rc; - - rc = cifs_is_path_remote(mnt_ctx); - if (!rc || rc != -EREMOTE) - return rc; + if (!rc) + rc = cifs_is_path_remote(mnt_ctx); + goto out; } *isdfs = true; - rc = get_root_smb_session(mnt_ctx); - if (rc) - return rc; - - return __dfs_mount_share(mnt_ctx); + /* + * Prevent DFS root session of being put in the first call to + * cifs_mount_put_conns(). If another DFS root server was not found + * while chasing the referrals (@ctx->dfs_root_ses == @ses), then we + * can safely put extra refcount of @ses. + */ + ses = mnt_ctx->ses; + mnt_ctx->ses = NULL; + mnt_ctx->server = NULL; + rc = __dfs_mount_share(mnt_ctx); + if (ses == ctx->dfs_root_ses) + cifs_put_smb_ses(ses); +out: + /* + * Restore previous value of @ctx->source so DFS superblock can be + * matched in cifs_match_super(). + */ + ctx->source = source; + return rc; } /* Update dfs referral path of superblock */ @@ -342,10 +371,11 @@ static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb rc = PTR_ERR(npath); } else { mutex_lock(&server->refpath_lock); + spin_lock(&server->srv_lock); kfree(server->leaf_fullpath); server->leaf_fullpath = npath; + spin_unlock(&server->srv_lock); mutex_unlock(&server->refpath_lock); - server->current_fullpath = server->leaf_fullpath; } return rc; } @@ -374,6 +404,54 @@ static int target_share_matches_server(struct TCP_Server_Info *server, char *sha return rc; } +static void __tree_connect_ipc(const unsigned int xid, char *tree, + struct cifs_sb_info *cifs_sb, + struct cifs_ses *ses) +{ + struct TCP_Server_Info *server = ses->server; + struct cifs_tcon *tcon = ses->tcon_ipc; + int rc; + + spin_lock(&ses->ses_lock); + spin_lock(&ses->chan_lock); + if (cifs_chan_needs_reconnect(ses, server) || + ses->ses_status != SES_GOOD) { + spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); + cifs_server_dbg(FYI, "%s: skipping ipc reconnect due to disconnected ses\n", + __func__); + return; + } + spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); + + cifs_server_lock(server); + scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); + cifs_server_unlock(server); + + rc = server->ops->tree_connect(xid, ses, tree, tcon, + cifs_sb->local_nls); + cifs_server_dbg(FYI, "%s: tree_reconnect %s: %d\n", __func__, tree, rc); + spin_lock(&tcon->tc_lock); + if (rc) { + tcon->status = TID_NEED_TCON; + } else { + tcon->status = TID_GOOD; + tcon->need_reconnect = false; + } + spin_unlock(&tcon->tc_lock); +} + +static void tree_connect_ipc(const unsigned int xid, char *tree, + struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon) +{ + struct cifs_ses *ses = tcon->ses; + + __tree_connect_ipc(xid, tree, cifs_sb, ses); + __tree_connect_ipc(xid, tree, cifs_sb, CIFS_DFS_ROOT_SES(ses)); +} + static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, char *tree, bool islink, struct dfs_cache_tgt_list *tl) @@ -382,7 +460,6 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t struct TCP_Server_Info *server = tcon->ses->server; const struct smb_version_operations *ops = server->ops; struct cifs_ses *root_ses = CIFS_DFS_ROOT_SES(tcon->ses); - struct cifs_tcon *ipc = root_ses->tcon_ipc; char *share = NULL, *prefix = NULL; struct dfs_cache_tgt_iterator *tit; bool target_match; @@ -403,7 +480,7 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t share = prefix = NULL; /* Check if share matches with tcp ses */ - rc = dfs_cache_get_tgt_share(server->current_fullpath + 1, tit, &share, &prefix); + rc = dfs_cache_get_tgt_share(server->leaf_fullpath + 1, tit, &share, &prefix); if (rc) { cifs_dbg(VFS, "%s: failed to parse target share: %d\n", __func__, rc); break; @@ -417,19 +494,15 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t continue; } - dfs_cache_noreq_update_tgthint(server->current_fullpath + 1, tit); - - if (ipc->need_reconnect) { - scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); - rc = ops->tree_connect(xid, ipc->ses, tree, ipc, cifs_sb->local_nls); - cifs_dbg(FYI, "%s: reconnect ipc: %d\n", __func__, rc); - } + dfs_cache_noreq_update_tgthint(server->leaf_fullpath + 1, tit); + tree_connect_ipc(xid, tree, cifs_sb, tcon); scnprintf(tree, MAX_TREE_SIZE, "\\%s", share); if (!islink) { rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls); break; } + /* * If no dfs referrals were returned from link target, then just do a TREE_CONNECT * to it. Otherwise, cache the dfs referral and then mark current tcp ses for @@ -539,8 +612,8 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru cifs_sb = CIFS_SB(sb); /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */ - if (!server->current_fullpath || - dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) { + if (!server->leaf_fullpath || + dfs_cache_noreq_find(server->leaf_fullpath + 1, &ref, &tl)) { rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls); goto out; } diff --git a/fs/cifs/dfs.h b/fs/cifs/dfs.h index 13f26e01f7b9..1c90df5ecfbd 100644 --- a/fs/cifs/dfs.h +++ b/fs/cifs/dfs.h @@ -34,19 +34,42 @@ static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *p cifs_remap(cifs_sb), path, ref, tl); } +/* Return DFS full path out of a dentry set for automount */ static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page) { struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct TCP_Server_Info *server = tcon->ses->server; + size_t len; + char *s; - if (unlikely(!server->origin_fullpath)) + spin_lock(&server->srv_lock); + if (unlikely(!server->origin_fullpath)) { + spin_unlock(&server->srv_lock); return ERR_PTR(-EREMOTE); + } + spin_unlock(&server->srv_lock); + + s = dentry_path_raw(dentry, page, PATH_MAX); + if (IS_ERR(s)) + return s; + /* for root, we want "" */ + if (!s[1]) + s++; + + spin_lock(&server->srv_lock); + len = strlen(server->origin_fullpath); + if (s < (char *)page + len) { + spin_unlock(&server->srv_lock); + return ERR_PTR(-ENAMETOOLONG); + } + + s -= len; + memcpy(s, server->origin_fullpath, len); + spin_unlock(&server->srv_lock); + convert_delimiter(s, '/'); - return __build_path_from_dentry_optional_prefix(dentry, page, - server->origin_fullpath, - strlen(server->origin_fullpath), - true); + return s; } static inline void dfs_put_root_smb_sessions(struct list_head *head) diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 30cbdf8514a5..1513b2709889 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -20,12 +20,14 @@ #include "cifs_unicode.h" #include "smb2glob.h" #include "dns_resolve.h" +#include "dfs.h" #include "dfs_cache.h" -#define CACHE_HTABLE_SIZE 32 -#define CACHE_MAX_ENTRIES 64 -#define CACHE_MIN_TTL 120 /* 2 minutes */ +#define CACHE_HTABLE_SIZE 32 +#define CACHE_MAX_ENTRIES 64 +#define CACHE_MIN_TTL 120 /* 2 minutes */ +#define CACHE_DEFAULT_TTL 300 /* 5 minutes */ #define IS_DFS_INTERLINK(v) (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER)) @@ -50,10 +52,9 @@ struct cache_entry { }; static struct kmem_cache *cache_slab __read_mostly; -static struct workqueue_struct *dfscache_wq __read_mostly; +struct workqueue_struct *dfscache_wq; -static int cache_ttl; -static DEFINE_SPINLOCK(cache_ttl_lock); +atomic_t dfs_cache_ttl; static struct nls_table *cache_cp; @@ -65,10 +66,6 @@ static atomic_t cache_count; static struct hlist_head cache_htable[CACHE_HTABLE_SIZE]; static DECLARE_RWSEM(htable_rw_lock); -static void refresh_cache_worker(struct work_struct *work); - -static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker); - /** * dfs_cache_canonical_path - get a canonical DFS path * @@ -290,7 +287,9 @@ int dfs_cache_init(void) int rc; int i; - dfscache_wq = alloc_workqueue("cifs-dfscache", WQ_FREEZABLE | WQ_UNBOUND, 1); + dfscache_wq = alloc_workqueue("cifs-dfscache", + WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, + 0); if (!dfscache_wq) return -ENOMEM; @@ -306,6 +305,7 @@ int dfs_cache_init(void) INIT_HLIST_HEAD(&cache_htable[i]); atomic_set(&cache_count, 0); + atomic_set(&dfs_cache_ttl, CACHE_DEFAULT_TTL); cache_cp = load_nls("utf8"); if (!cache_cp) cache_cp = load_nls_default(); @@ -480,6 +480,7 @@ static struct cache_entry *add_cache_entry_locked(struct dfs_info3_param *refs, int rc; struct cache_entry *ce; unsigned int hash; + int ttl; WARN_ON(!rwsem_is_locked(&htable_rw_lock)); @@ -496,15 +497,8 @@ static struct cache_entry *add_cache_entry_locked(struct dfs_info3_param *refs, if (IS_ERR(ce)) return ce; - spin_lock(&cache_ttl_lock); - if (!cache_ttl) { - cache_ttl = ce->ttl; - queue_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ); - } else { - cache_ttl = min_t(int, cache_ttl, ce->ttl); - mod_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ); - } - spin_unlock(&cache_ttl_lock); + ttl = min_t(int, atomic_read(&dfs_cache_ttl), ce->ttl); + atomic_set(&dfs_cache_ttl, ttl); hlist_add_head(&ce->hlist, &cache_htable[hash]); dump_ce(ce); @@ -616,7 +610,6 @@ static struct cache_entry *lookup_cache_entry(const char *path) */ void dfs_cache_destroy(void) { - cancel_delayed_work_sync(&refresh_task); unload_nls(cache_cp); flush_cache_ents(); kmem_cache_destroy(cache_slab); @@ -1142,6 +1135,7 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c * target shares in @refs. */ static void mark_for_reconnect_if_needed(struct TCP_Server_Info *server, + const char *path, struct dfs_cache_tgt_list *old_tl, struct dfs_cache_tgt_list *new_tl) { @@ -1153,8 +1147,10 @@ static void mark_for_reconnect_if_needed(struct TCP_Server_Info *server, nit = dfs_cache_get_next_tgt(new_tl, nit)) { if (target_share_equal(server, dfs_cache_get_tgt_name(oit), - dfs_cache_get_tgt_name(nit))) + dfs_cache_get_tgt_name(nit))) { + dfs_cache_noreq_update_tgthint(path, nit); return; + } } } @@ -1162,13 +1158,28 @@ static void mark_for_reconnect_if_needed(struct TCP_Server_Info *server, cifs_signal_cifsd_for_reconnect(server, true); } +static bool is_ses_good(struct cifs_ses *ses) +{ + struct TCP_Server_Info *server = ses->server; + struct cifs_tcon *tcon = ses->tcon_ipc; + bool ret; + + spin_lock(&ses->ses_lock); + spin_lock(&ses->chan_lock); + ret = !cifs_chan_needs_reconnect(ses, server) && + ses->ses_status == SES_GOOD && + !tcon->need_reconnect; + spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); + return ret; +} + /* Refresh dfs referral of tcon and mark it for reconnect if needed */ -static int __refresh_tcon(const char *path, struct cifs_tcon *tcon, bool force_refresh) +static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_refresh) { struct dfs_cache_tgt_list old_tl = DFS_CACHE_TGT_LIST_INIT(old_tl); struct dfs_cache_tgt_list new_tl = DFS_CACHE_TGT_LIST_INIT(new_tl); - struct cifs_ses *ses = CIFS_DFS_ROOT_SES(tcon->ses); - struct cifs_tcon *ipc = ses->tcon_ipc; + struct TCP_Server_Info *server = ses->server; bool needs_refresh = false; struct cache_entry *ce; unsigned int xid; @@ -1190,20 +1201,19 @@ static int __refresh_tcon(const char *path, struct cifs_tcon *tcon, bool force_r goto out; } - spin_lock(&ipc->tc_lock); - if (ipc->status != TID_GOOD) { - spin_unlock(&ipc->tc_lock); - cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n", __func__); + ses = CIFS_DFS_ROOT_SES(ses); + if (!is_ses_good(ses)) { + cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n", + __func__); goto out; } - spin_unlock(&ipc->tc_lock); ce = cache_refresh_path(xid, ses, path, true); if (!IS_ERR(ce)) { rc = get_targets(ce, &new_tl); up_read(&htable_rw_lock); cifs_dbg(FYI, "%s: get_targets: %d\n", __func__, rc); - mark_for_reconnect_if_needed(tcon->ses->server, &old_tl, &new_tl); + mark_for_reconnect_if_needed(server, path, &old_tl, &new_tl); } out: @@ -1216,10 +1226,11 @@ out: static int refresh_tcon(struct cifs_tcon *tcon, bool force_refresh) { struct TCP_Server_Info *server = tcon->ses->server; + struct cifs_ses *ses = tcon->ses; mutex_lock(&server->refpath_lock); if (server->leaf_fullpath) - __refresh_tcon(server->leaf_fullpath + 1, tcon, force_refresh); + __refresh_tcon(server->leaf_fullpath + 1, ses, force_refresh); mutex_unlock(&server->refpath_lock); return 0; } @@ -1263,56 +1274,32 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb) return refresh_tcon(tcon, true); } -/* - * Worker that will refresh DFS cache from all active mounts based on lowest TTL value - * from a DFS referral. - */ -static void refresh_cache_worker(struct work_struct *work) +/* Refresh all DFS referrals related to DFS tcon */ +void dfs_cache_refresh(struct work_struct *work) { struct TCP_Server_Info *server; - struct cifs_tcon *tcon, *ntcon; - struct list_head tcons; + struct dfs_root_ses *rses; + struct cifs_tcon *tcon; struct cifs_ses *ses; - INIT_LIST_HEAD(&tcons); - - spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - if (!server->leaf_fullpath) - continue; - - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - if (ses->tcon_ipc) { - ses->ses_count++; - list_add_tail(&ses->tcon_ipc->ulist, &tcons); - } - list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { - if (!tcon->ipc) { - tcon->tc_count++; - list_add_tail(&tcon->ulist, &tcons); - } - } - } - } - spin_unlock(&cifs_tcp_ses_lock); + tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work); + ses = tcon->ses; + server = ses->server; - list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) { - struct TCP_Server_Info *server = tcon->ses->server; - - list_del_init(&tcon->ulist); + mutex_lock(&server->refpath_lock); + if (server->leaf_fullpath) + __refresh_tcon(server->leaf_fullpath + 1, ses, false); + mutex_unlock(&server->refpath_lock); + list_for_each_entry(rses, &tcon->dfs_ses_list, list) { + ses = rses->ses; + server = ses->server; mutex_lock(&server->refpath_lock); if (server->leaf_fullpath) - __refresh_tcon(server->leaf_fullpath + 1, tcon, false); + __refresh_tcon(server->leaf_fullpath + 1, ses, false); mutex_unlock(&server->refpath_lock); - - if (tcon->ipc) - cifs_put_smb_ses(tcon->ses); - else - cifs_put_tcon(tcon); } - spin_lock(&cache_ttl_lock); - queue_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ); - spin_unlock(&cache_ttl_lock); + queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, + atomic_read(&dfs_cache_ttl) * HZ); } diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h index e0d39393035a..c6d89cd6d4fd 100644 --- a/fs/cifs/dfs_cache.h +++ b/fs/cifs/dfs_cache.h @@ -13,6 +13,9 @@ #include <linux/uuid.h> #include "cifsglob.h" +extern struct workqueue_struct *dfscache_wq; +extern atomic_t dfs_cache_ttl; + #define DFS_CACHE_TGT_LIST_INIT(var) { .tl_numtgts = 0, .tl_list = LIST_HEAD_INIT((var).tl_list), } struct dfs_cache_tgt_list { @@ -42,6 +45,7 @@ int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **prefix); char *dfs_cache_canonical_path(const char *path, const struct nls_table *cp, int remap); int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb); +void dfs_cache_refresh(struct work_struct *work); static inline struct dfs_cache_tgt_iterator * dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl, @@ -89,4 +93,9 @@ dfs_cache_get_nr_tgts(const struct dfs_cache_tgt_list *tl) return tl ? tl->tl_numtgts : 0; } +static inline int dfs_cache_get_ttl(void) +{ + return atomic_read(&dfs_cache_ttl); +} + #endif /* _CIFS_DFS_CACHE_H */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6831a9949c43..c5fcefdfd797 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4010,7 +4010,6 @@ static void collect_uncached_read_data(struct cifs_aio_ctx *ctx) { struct cifs_readdata *rdata, *tmp; - struct iov_iter *to = &ctx->iter; struct cifs_sb_info *cifs_sb; int rc; @@ -4076,9 +4075,6 @@ again: kref_put(&rdata->refcount, cifs_readdata_release); } - if (!ctx->direct_io) - ctx->total_len = ctx->len - iov_iter_count(to); - /* mask nodata case */ if (rc == -ENODATA) rc = 0; @@ -4886,6 +4882,8 @@ void cifs_oplock_break(struct work_struct *work) struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; bool purge_cache = false; + struct cifs_deferred_close *dclose; + bool is_deferred = false; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, TASK_UNINTERRUPTIBLE); @@ -4922,6 +4920,20 @@ void cifs_oplock_break(struct work_struct *work) oplock_break_ack: /* + * When oplock break is received and there are no active + * file handles but cached, then schedule deferred close immediately. + * So, new open will not use cached handle. + */ + spin_lock(&CIFS_I(inode)->deferred_lock); + is_deferred = cifs_is_deferred_close(cfile, &dclose); + spin_unlock(&CIFS_I(inode)->deferred_lock); + + if (!CIFS_CACHE_HANDLE(cinode) && is_deferred && + cfile->deferred_close_scheduled && delayed_work_pending(&cfile->deferred)) { + cifs_close_deferred_file(cinode); + } + + /* * releasing stale oplock after recent reconnect of smb session using * a now incorrect file handle is not a data integrity issue but do * not bother sending an oplock release if session to server still is diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 6419ec47c2a8..cb3be58cd55e 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -239,7 +239,7 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug * section, we need to make sure it won't be released * so increment its refcount */ - ses->ses_count++; + cifs_smb_ses_inc_refcount(ses); found = true; goto search_end; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7f085ed2d866..cd914be905b2 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -749,7 +749,9 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) list_for_each_entry(cfile, &cifs_inode->openFileList, flist) { if (delayed_work_pending(&cfile->deferred)) { if (cancel_delayed_work(&cfile->deferred)) { + spin_lock(&cifs_inode->deferred_lock); cifs_del_deferred_close(cfile); + spin_unlock(&cifs_inode->deferred_lock); tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) @@ -762,7 +764,7 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) spin_unlock(&cifs_inode->open_file_lock); list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) { - _cifsFileInfo_put(tmp_list->cfile, true, false); + _cifsFileInfo_put(tmp_list->cfile, false, false); list_del(&tmp_list->list); kfree(tmp_list); } @@ -780,7 +782,9 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) list_for_each_entry(cfile, &tcon->openFileList, tlist) { if (delayed_work_pending(&cfile->deferred)) { if (cancel_delayed_work(&cfile->deferred)) { + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) @@ -815,7 +819,9 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) if (strstr(full_path, path)) { if (delayed_work_pending(&cfile->deferred)) { if (cancel_delayed_work(&cfile->deferred)) { + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d2cbae4b5d21..335c078c42fb 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -159,6 +159,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses, /* returns number of channels added */ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { + struct TCP_Server_Info *server = ses->server; int old_chan_count, new_chan_count; int left; int rc = 0; @@ -178,16 +179,16 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) return 0; } - if (ses->server->dialect < SMB30_PROT_ID) { + if (server->dialect < SMB30_PROT_ID) { spin_unlock(&ses->chan_lock); cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n"); return 0; } - if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + if (!(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { ses->chan_max = 1; spin_unlock(&ses->chan_lock); - cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname); + cifs_server_dbg(VFS, "no multichannel support\n"); return 0; } spin_unlock(&ses->chan_lock); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a81758225fcd..a295e4c2d54e 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1682,7 +1682,7 @@ smb2_copychunk_range(const unsigned int xid, pcchunk->SourceOffset = cpu_to_le64(src_off); pcchunk->TargetOffset = cpu_to_le64(dest_off); pcchunk->Length = - cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk)); + cpu_to_le32(min_t(u64, len, tcon->max_bytes_chunk)); /* Request server copy to target from src identified by key */ kfree(retbuf); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 4245249dbba8..9ed61b6f9b21 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -175,8 +175,17 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, } } spin_unlock(&tcon->tc_lock); - if ((!tcon->ses) || (tcon->ses->ses_status == SES_EXITING) || - (!tcon->ses->server) || !server) + + ses = tcon->ses; + if (!ses) + return -EIO; + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); + return -EIO; + } + spin_unlock(&ses->ses_lock); + if (!ses->server || !server) return -EIO; spin_lock(&server->srv_lock); @@ -204,8 +213,6 @@ again: if (rc) return rc; - ses = tcon->ses; - spin_lock(&ses->chan_lock); if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { spin_unlock(&ses->chan_lock); @@ -821,7 +828,6 @@ create_posix_buf(umode_t mode) static int add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; iov[num].iov_base = create_posix_buf(mode); @@ -830,11 +836,6 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) + - iov[num - 1].iov_len); - le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_posix)); *num_iovec = num + 1; return 0; } @@ -1946,6 +1947,9 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, init_copy_chunk_defaults(tcon); if (server->ops->validate_negotiate) rc = server->ops->validate_negotiate(xid, tcon); + if (rc == 0) /* See MS-SMB2 2.2.10 and 3.2.5.5 */ + if (tcon->share_flags & SMB2_SHAREFLAG_ISOLATED_TRANSPORT) + server->nosharesock = true; tcon_exit: free_rsp_buf(resp_buftype, rsp); @@ -2069,7 +2073,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid) static void parse_query_id_ctxt(struct create_context *cc, struct smb2_file_all_info *buf) { - struct create_on_disk_id *pdisk_id = (struct create_on_disk_id *)cc; + struct create_disk_id_rsp *pdisk_id = (struct create_disk_id_rsp *)cc; cifs_dbg(FYI, "parse query id context 0x%llx 0x%llx\n", pdisk_id->DiskFileId, pdisk_id->VolumeId); @@ -2172,10 +2176,11 @@ smb2_parse_contexts(struct TCP_Server_Info *server, } static int -add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, +add_lease_context(struct TCP_Server_Info *server, + struct smb2_create_req *req, + struct kvec *iov, unsigned int *num_iovec, u8 *lease_key, __u8 *oplock) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock); @@ -2183,12 +2188,6 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, return -ENOMEM; iov[num].iov_len = server->vals->create_lease_size; req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE; - if (!req->CreateContextsOffset) - req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) + - iov[num - 1].iov_len); - le32_add_cpu(&req->CreateContextsLength, - server->vals->create_lease_size); *num_iovec = num + 1; return 0; } @@ -2267,18 +2266,12 @@ static int add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec, struct cifs_open_parms *oparms) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; iov[num].iov_base = create_durable_v2_buf(oparms); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_durable_v2); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) + - iov[1].iov_len); - le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2)); *num_iovec = num + 1; return 0; } @@ -2287,7 +2280,6 @@ static int add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec, struct cifs_open_parms *oparms) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; /* indicate that we don't need to relock the file */ @@ -2297,12 +2289,6 @@ add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec, if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) + - iov[1].iov_len); - le32_add_cpu(&req->CreateContextsLength, - sizeof(struct create_durable_handle_reconnect_v2)); *num_iovec = num + 1; return 0; } @@ -2311,7 +2297,6 @@ static int add_durable_context(struct kvec *iov, unsigned int *num_iovec, struct cifs_open_parms *oparms, bool use_persistent) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; if (use_persistent) { @@ -2331,11 +2316,6 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec, if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_durable); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) + - iov[1].iov_len); - le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable)); *num_iovec = num + 1; return 0; } @@ -2369,18 +2349,12 @@ create_twarp_buf(__u64 timewarp) static int add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; iov[num].iov_base = create_twarp_buf(timewarp); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct crt_twarp_ctxt); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) + - iov[num - 1].iov_len); - le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_twarp_ctxt)); *num_iovec = num + 1; return 0; } @@ -2503,7 +2477,6 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) static int add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode, bool set_owner) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; unsigned int len = 0; @@ -2511,11 +2484,6 @@ add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode, bool set if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = len; - if (!req->CreateContextsOffset) - req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) + - iov[num - 1].iov_len); - le32_add_cpu(&req->CreateContextsLength, len); *num_iovec = num + 1; return 0; } @@ -2546,18 +2514,12 @@ create_query_id_buf(void) static int add_query_id_context(struct kvec *iov, unsigned int *num_iovec) { - struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; iov[num].iov_base = create_query_id_buf(); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct crt_query_id_ctxt); - if (!req->CreateContextsOffset) - req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) + - iov[num - 1].iov_len); - le32_add_cpu(&req->CreateContextsLength, sizeof(struct crt_query_id_ctxt)); *num_iovec = num + 1; return 0; } @@ -2720,6 +2682,9 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, rc = add_posix_context(iov, &n_iov, mode); if (rc) goto err_free_req; + req->CreateContextsOffset = cpu_to_le32( + sizeof(struct smb2_create_req) + + iov[1].iov_len); pc_buf = iov[n_iov-1].iov_base; } @@ -2857,21 +2822,13 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, (oparms->create_options & CREATE_NOT_FILE)) req->RequestedOplockLevel = *oplock; /* no srv lease support */ else { - rc = add_lease_context(server, iov, &n_iov, + rc = add_lease_context(server, req, iov, &n_iov, oparms->fid->lease_key, oplock); if (rc) return rc; } if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) { - /* need to set Next field of lease context if we request it */ - if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) { - struct create_context *ccontext = - (struct create_context *)iov[n_iov-1].iov_base; - ccontext->Next = - cpu_to_le32(server->vals->create_lease_size); - } - rc = add_durable_context(iov, &n_iov, oparms, tcon->use_persistent); if (rc) @@ -2879,13 +2836,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, } if (tcon->posix_extensions) { - if (n_iov > 2) { - struct create_context *ccontext = - (struct create_context *)iov[n_iov-1].iov_base; - ccontext->Next = - cpu_to_le32(iov[n_iov-1].iov_len); - } - rc = add_posix_context(iov, &n_iov, oparms->mode); if (rc) return rc; @@ -2893,13 +2843,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (tcon->snapshot_time) { cifs_dbg(FYI, "adding snapshot context\n"); - if (n_iov > 2) { - struct create_context *ccontext = - (struct create_context *)iov[n_iov-1].iov_base; - ccontext->Next = - cpu_to_le32(iov[n_iov-1].iov_len); - } - rc = add_twarp_context(iov, &n_iov, tcon->snapshot_time); if (rc) return rc; @@ -2923,12 +2866,6 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, set_owner = false; if (set_owner | set_mode) { - if (n_iov > 2) { - struct create_context *ccontext = - (struct create_context *)iov[n_iov-1].iov_base; - ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len); - } - cifs_dbg(FYI, "add sd with mode 0x%x\n", oparms->mode); rc = add_sd_context(iov, &n_iov, oparms->mode, set_owner); if (rc) @@ -2936,12 +2873,30 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, } } + add_query_id_context(iov, &n_iov); + if (n_iov > 2) { - struct create_context *ccontext = - (struct create_context *)iov[n_iov-1].iov_base; - ccontext->Next = cpu_to_le32(iov[n_iov-1].iov_len); + /* + * We have create contexts behind iov[1] (the file + * name), point at them from the main create request + */ + req->CreateContextsOffset = cpu_to_le32( + sizeof(struct smb2_create_req) + + iov[1].iov_len); + req->CreateContextsLength = 0; + + for (unsigned int i = 2; i < (n_iov-1); i++) { + struct kvec *v = &iov[i]; + size_t len = v->iov_len; + struct create_context *cctx = + (struct create_context *)v->iov_base; + + cctx->Next = cpu_to_le32(len); + le32_add_cpu(&req->CreateContextsLength, len); + } + le32_add_cpu(&req->CreateContextsLength, + iov[n_iov-1].iov_len); } - add_query_id_context(iov, &n_iov); rqst->rq_nvec = n_iov; return 0; @@ -3849,7 +3804,7 @@ void smb2_reconnect_server(struct work_struct *work) if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) { list_add_tail(&ses->tcon_ipc->rlist, &tmp_list); tcon_selected = tcon_exist = true; - ses->ses_count++; + cifs_smb_ses_inc_refcount(ses); } /* * handle the case where channel needs to reconnect @@ -3860,7 +3815,7 @@ void smb2_reconnect_server(struct work_struct *work) if (!tcon_selected && cifs_chan_needs_reconnect(ses, server)) { list_add_tail(&ses->rlist, &tmp_ses_list); ses_exist = true; - ses->ses_count++; + cifs_smb_ses_inc_refcount(ses); } spin_unlock(&ses->chan_lock); } @@ -4180,10 +4135,12 @@ smb2_readv_callback(struct mid_q_entry *mid) struct smb2_hdr *shdr = (struct smb2_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; - struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], - .rq_nvec = 1, - .rq_iter = rdata->iter, - .rq_iter_size = iov_iter_count(&rdata->iter), }; + struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], .rq_nvec = 1 }; + + if (rdata->got_bytes) { + rqst.rq_iter = rdata->iter; + rqst.rq_iter_size = iov_iter_count(&rdata->iter); + } WARN_ONCE(rdata->server != mid->server, "rdata server %p != mid server %p", diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 2114e8a0c63a..220994d0a0f7 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -132,17 +132,6 @@ struct share_redirect_error_context_rsp { #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 #define SMB2_LEASE_WRITE_CACHING_HE 0x04 -struct create_durable { - struct create_context ccontext; - __u8 Name[8]; - union { - __u8 Reserved[16]; - struct { - __u64 PersistentFileId; - __u64 VolatileFileId; - } Fid; - } Data; -} __packed; /* See MS-SMB2 2.2.13.2.11 */ /* Flags */ @@ -170,15 +159,6 @@ struct durable_reconnect_context_v2 { __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */ } __packed; -/* See MS-SMB2 2.2.14.2.9 */ -struct create_on_disk_id { - struct create_context ccontext; - __u8 Name[8]; - __le64 DiskFileId; - __le64 VolumeId; - __u32 Reserved[4]; -} __packed; - /* See MS-SMB2 2.2.14.2.12 */ struct durable_reconnect_context_v2_rsp { __le32 Timeout; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 50e762fa1a14..4ad5531686d8 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -487,9 +487,5 @@ const struct xattr_handler *cifs_xattr_handlers[] = { &smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */ &cifs_cifs_ntsd_full_xattr_handler, &smb3_ntsd_full_xattr_handler, /* alias for above since avoiding "cifs" */ -#ifdef CONFIG_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif NULL }; diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index b39580ad4ce5..3c3148588491 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -361,7 +361,7 @@ static int __init init_coda_psdev(void) __func__, CODA_PSDEV_MAJOR); return -EIO; } - coda_psdev_class = class_create(THIS_MODULE, "coda"); + coda_psdev_class = class_create("coda"); if (IS_ERR(coda_psdev_class)) { err = PTR_ERR(coda_psdev_class); goto out_chrdev; diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index fda3b702b1c5..a247c14aaab7 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -39,19 +39,10 @@ static struct ctl_table coda_table[] = { {} }; -static struct ctl_table fs_table[] = { - { - .procname = "coda", - .mode = 0555, - .child = coda_table - }, - {} -}; - void coda_sysctl_init(void) { if ( !fs_table_header ) - fs_table_header = register_sysctl_table(fs_table); + fs_table_header = register_sysctl("coda", coda_table); } void coda_sysctl_clean(void) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 4afcbbe63e68..18677cd4e62f 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1599,12 +1599,6 @@ static int configfs_dir_close(struct inode *inode, struct file *file) return 0; } -/* Relationship between s_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct configfs_dirent *sd) -{ - return (sd->s_mode >> 12) & 15; -} - static int configfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; @@ -1654,7 +1648,8 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx) name = configfs_get_name(next); len = strlen(name); - if (!dir_emit(ctx, name, len, ino, dt_type(next))) + if (!dir_emit(ctx, name, len, ino, + fs_umode_to_dtype(next->s_mode))) return 0; spin_lock(&configfs_dirent_lock); diff --git a/fs/coredump.c b/fs/coredump.c index 5df1e6e1eb2b..ece7badf701b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -882,6 +882,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) pos = file->f_pos; bvec_set_page(&bvec, page, PAGE_SIZE, 0); iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); + iov_iter_set_copy_mc(&iter); n = __kernel_write_iter(cprm->file, &iter, &pos); if (n != PAGE_SIZE) return 0; diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index d57d0a020f71..62e1a3dd8357 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -69,7 +69,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, pblk << (blockbits - SECTOR_SHIFT); } ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); - if (WARN_ON(ret != bytes_this_page)) { + if (WARN_ON_ONCE(ret != bytes_this_page)) { err = -EIO; goto out; } @@ -147,7 +147,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, break; } nr_pages = i; - if (WARN_ON(nr_pages <= 0)) + if (WARN_ON_ONCE(nr_pages <= 0)) return -EINVAL; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ @@ -170,7 +170,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, offset += blocksize; if (offset == PAGE_SIZE || len == 0) { ret = bio_add_page(bio, pages[i++], offset, 0); - if (WARN_ON(ret != offset)) { + if (WARN_ON_ONCE(ret != offset)) { err = -EIO; goto out; } diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index bf642479269a..6a837e4b80dc 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -308,19 +308,24 @@ EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); /** * fscrypt_initialize() - allocate major buffers for fs encryption. - * @cop_flags: fscrypt operations flags + * @sb: the filesystem superblock * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: 0 on success; -errno on failure */ -int fscrypt_initialize(unsigned int cop_flags) +int fscrypt_initialize(struct super_block *sb) { int err = 0; + mempool_t *pool; + + /* pairs with smp_store_release() below */ + if (likely(smp_load_acquire(&fscrypt_bounce_page_pool))) + return 0; /* No need to allocate a bounce page pool if this FS won't use it. */ - if (cop_flags & FS_CFLG_OWN_PAGES) + if (sb->s_cop->flags & FS_CFLG_OWN_PAGES) return 0; mutex_lock(&fscrypt_init_mutex); @@ -328,11 +333,11 @@ int fscrypt_initialize(unsigned int cop_flags) goto out_unlock; err = -ENOMEM; - fscrypt_bounce_page_pool = - mempool_create_page_pool(num_prealloc_crypto_pages, 0); - if (!fscrypt_bounce_page_pool) + pool = mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!pool) goto out_unlock; - + /* pairs with smp_load_acquire() above */ + smp_store_release(&fscrypt_bounce_page_pool, pool); err = 0; out_unlock: mutex_unlock(&fscrypt_init_mutex); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 12bd61d20f69..6eae3f12ad50 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -110,7 +110,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, * Copy the filename to the output buffer for encrypting in-place and * pad it with the needed number of NUL bytes. */ - if (WARN_ON(olen < iname->len)) + if (WARN_ON_ONCE(olen < iname->len)) return -ENOBUFS; memcpy(out, iname->name, iname->len); memset(out + iname->len, 0, olen - iname->len); @@ -570,7 +570,7 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { const struct fscrypt_info *ci = dir->i_crypt_info; - WARN_ON(!ci->ci_dirhash_key_initialized); + WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); return siphash(name->name, name->len, &ci->ci_dirhash_key); } diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 0fec2dfc36eb..7ab5a7b7eef8 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -101,7 +101,7 @@ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) case FSCRYPT_CONTEXT_V2: return ctx->v2.nonce; } - WARN_ON(1); + WARN_ON_ONCE(1); return NULL; } @@ -264,7 +264,7 @@ typedef enum { /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; -int fscrypt_initialize(unsigned int cop_flags); +int fscrypt_initialize(struct super_block *sb); int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, struct page *dest_page, unsigned int len, @@ -386,7 +386,7 @@ fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci) { - WARN_ON(1); + WARN_ON_ONCE(1); return -EOPNOTSUPP; } diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 7607d18b35fc..5a384dad2c72 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -79,7 +79,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, return PTR_ERR(hmac_tfm); } - if (WARN_ON(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { + if (WARN_ON_ONCE(crypto_shash_digestsize(hmac_tfm) != sizeof(prk))) { err = -EINVAL; goto err_free_tfm; } @@ -125,7 +125,7 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, u8 counter = 1; u8 tmp[HKDF_HASHLEN]; - if (WARN_ON(okmlen > 255 * HKDF_HASHLEN)) + if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN)) return -EINVAL; desc->tfm = hkdf->hmac_tfm; diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 7b8c5a1104b5..9e786ae66a13 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -111,6 +111,36 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); +/** + * fscrypt_prepare_lookup_partial() - prepare lookup without filename setup + * @dir: the encrypted directory being searched + * @dentry: the dentry being looked up in @dir + * + * This function should be used by the ->lookup and ->atomic_open methods of + * filesystems that handle filename encryption and no-key name encoding + * themselves and thus can't use fscrypt_prepare_lookup(). Like + * fscrypt_prepare_lookup(), this will try to set up the directory's encryption + * key and will set DCACHE_NOKEY_NAME on the dentry if the key is unavailable. + * However, this function doesn't set up a struct fscrypt_name for the filename. + * + * Return: 0 on success; -errno on error. Note that the encryption key being + * unavailable is not considered an error. It is also not an error if + * the encryption policy is unsupported by this kernel; that is treated + * like the key being unavailable, so that files can still be deleted. + */ +int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) +{ + int err = fscrypt_get_encryption_info(dir, true); + + if (!err && !fscrypt_has_encryption_key(dir)) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dentry->d_lock); + } + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial); + int __fscrypt_prepare_readdir(struct inode *dir) { return fscrypt_get_encryption_info(dir, true); @@ -315,7 +345,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, int err; /* This is for encrypted symlinks only */ - if (WARN_ON(!IS_ENCRYPTED(inode))) + if (WARN_ON_ONCE(!IS_ENCRYPTED(inode))) return ERR_PTR(-EINVAL); /* If the decrypted target is already cached, just return it. */ diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 13d336a6cc5d..7cbb1fd872ac 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -73,7 +73,7 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk) * fscrypt_master_key struct itself after an RCU grace period ensures * that concurrent keyring lookups can no longer find it. */ - WARN_ON(refcount_read(&mk->mk_active_refs) != 0); + WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 0); key_put(mk->mk_users); mk->mk_users = NULL; call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); @@ -92,7 +92,7 @@ void fscrypt_put_master_key_activeref(struct super_block *sb, * destroying any subkeys embedded in it. */ - if (WARN_ON(!sb->s_master_keys)) + if (WARN_ON_ONCE(!sb->s_master_keys)) return; spin_lock(&sb->s_master_keys->lock); hlist_del_rcu(&mk->mk_node); @@ -102,8 +102,8 @@ void fscrypt_put_master_key_activeref(struct super_block *sb, * ->mk_active_refs == 0 implies that ->mk_secret is not present and * that ->mk_decrypted_inodes is empty. */ - WARN_ON(is_master_key_secret_present(&mk->mk_secret)); - WARN_ON(!list_empty(&mk->mk_decrypted_inodes)); + WARN_ON_ONCE(is_master_key_secret_present(&mk->mk_secret)); + WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes)); for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { fscrypt_destroy_prepared_key( @@ -237,9 +237,9 @@ void fscrypt_destroy_keyring(struct super_block *sb) * with ->mk_secret. There should be no structural refs * beyond the one associated with the active ref. */ - WARN_ON(refcount_read(&mk->mk_active_refs) != 1); - WARN_ON(refcount_read(&mk->mk_struct_refs) != 1); - WARN_ON(!is_master_key_secret_present(&mk->mk_secret)); + WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1); + WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1); + WARN_ON_ONCE(!is_master_key_secret_present(&mk->mk_secret)); wipe_master_key_secret(&mk->mk_secret); fscrypt_put_master_key_activeref(sb, mk); } diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index aa94fba9d17e..361f41ef46c7 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -125,7 +125,7 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, pr_info("fscrypt: %s using implementation \"%s\"\n", mode->friendly_name, crypto_skcipher_driver_name(tfm)); } - if (WARN_ON(crypto_skcipher_ivsize(tfm) != mode->ivsize)) { + if (WARN_ON_ONCE(crypto_skcipher_ivsize(tfm) != mode->ivsize)) { err = -EINVAL; goto err_free_tfm; } @@ -199,7 +199,7 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, unsigned int hkdf_infolen = 0; int err; - if (WARN_ON(mode_num > FSCRYPT_MODE_MAX)) + if (WARN_ON_ONCE(mode_num > FSCRYPT_MODE_MAX)) return -EINVAL; prep_key = &keys[mode_num]; @@ -282,8 +282,8 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, void fscrypt_hash_inode_number(struct fscrypt_info *ci, const struct fscrypt_master_key *mk) { - WARN_ON(ci->ci_inode->i_ino == 0); - WARN_ON(!mk->mk_ino_hash_key_initialized); + WARN_ON_ONCE(ci->ci_inode->i_ino == 0); + WARN_ON_ONCE(!mk->mk_ino_hash_key_initialized); ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, &mk->mk_ino_hash_key); @@ -503,7 +503,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key); break; default: - WARN_ON(1); + WARN_ON_ONCE(1); err = -EINVAL; break; } @@ -560,7 +560,7 @@ fscrypt_setup_encryption_info(struct inode *inode, struct fscrypt_master_key *mk = NULL; int res; - res = fscrypt_initialize(inode->i_sb->s_cop->flags); + res = fscrypt_initialize(inode->i_sb); if (res) return res; @@ -577,7 +577,7 @@ fscrypt_setup_encryption_info(struct inode *inode, res = PTR_ERR(mode); goto out; } - WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE); + WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 3b5fcb6402ea..f4456ecb3f87 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -48,7 +48,7 @@ int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, FSCRYPT_KEY_IDENTIFIER_SIZE); return 0; default: - WARN_ON(1); + WARN_ON_ONCE(1); return -EINVAL; } } @@ -463,7 +463,7 @@ static int set_encryption_policy(struct inode *inode, current->comm, current->pid); break; default: - WARN_ON(1); + WARN_ON_ONCE(1); return -EINVAL; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 4f25015aa534..fe3db0eda8e4 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -72,24 +72,6 @@ static struct ctl_table pty_table[] = { {} }; -static struct ctl_table pty_kern_table[] = { - { - .procname = "pty", - .mode = 0555, - .child = pty_table, - }, - {} -}; - -static struct ctl_table pty_root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = pty_kern_table, - }, - {} -}; - struct pts_mount_opts { int setuid; int setgid; @@ -630,7 +612,7 @@ static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); if (!err) { - register_sysctl_table(pty_root_table); + register_sysctl("kernel/pty", pty_table); } return err; } diff --git a/fs/direct-io.c b/fs/direct-io.c index ab0d7ea89813..0b380bb8a81e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -86,7 +86,6 @@ struct dio_submit { sector_t final_block_in_request;/* doesn't change */ int boundary; /* prev block is at a boundary */ get_block_t *get_block; /* block mapping function */ - dio_submit_t *submit_io; /* IO submition function */ loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ @@ -431,10 +430,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->bio_disk = bio->bi_bdev->bd_disk; - if (sdio->submit_io) - sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); - else - submit_bio(bio); + submit_bio(bio); sdio->bio = NULL; sdio->boundary = 0; @@ -1098,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio) ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, int flags) + int flags) { unsigned i_blkbits = READ_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; @@ -1215,7 +1211,6 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, sdio.get_block = get_block; dio->end_io = end_io; - sdio.submit_io = submit_io; sdio.final_block_in_bio = -1; sdio.next_block_for_io = -1; diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index b3b86dbdc187..f82a4952769d 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -8,15 +8,6 @@ menuconfig DLM A general purpose distributed lock manager for kernel or userspace applications. -config DLM_DEPRECATED_API - bool "DLM deprecated API" - depends on DLM - help - Enables deprecated DLM timeout features that will be removed in - later Linux kernel releases. - - If you are unsure, say N. - config DLM_DEBUG bool "DLM debugging" depends on DLM diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index 71dab733cf9a..5a471af1d1fe 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -17,6 +17,5 @@ dlm-y := ast.o \ requestqueue.o \ user.o \ util.o -dlm-$(CONFIG_DLM_DEPRECATED_API) += netlink.o dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 26fef9945cc9..700ff2e0515a 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -45,7 +45,7 @@ void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb) kref_put(&cb->ref, dlm_release_callback); } - lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; + clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags); /* invalidate */ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); @@ -103,10 +103,9 @@ int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, cb->sb_status = status; cb->sb_flags = (sbflags & 0x000000FF); kref_init(&cb->ref); - if (!(lkb->lkb_flags & DLM_IFL_CB_PENDING)) { - lkb->lkb_flags |= DLM_IFL_CB_PENDING; + if (!test_and_set_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags)) rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED; - } + list_add_tail(&cb->list, &lkb->lkb_callbacks); if (flags & DLM_CB_CAST) @@ -140,7 +139,7 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, struct dlm_ls *ls = lkb->lkb_resource->res_ls; int rv; - if (lkb->lkb_flags & DLM_IFL_USER) { + if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) { dlm_user_add_ast(lkb, flags, mode, status, sbflags); return; } @@ -209,7 +208,7 @@ void dlm_callback_work(struct work_struct *work) spin_lock(&lkb->lkb_cb_lock); rv = dlm_dequeue_lkb_callback(lkb, &cb); if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) { - lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; + clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags); spin_unlock(&lkb->lkb_cb_lock); break; } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 20b60709eccf..d31319d08581 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -75,9 +75,6 @@ struct dlm_cluster { unsigned int cl_log_info; unsigned int cl_protocol; unsigned int cl_mark; -#ifdef CONFIG_DLM_DEPRECATED_API - unsigned int cl_timewarn_cs; -#endif unsigned int cl_new_rsb_count; unsigned int cl_recover_callbacks; char cl_cluster_name[DLM_LOCKSPACE_LEN]; @@ -103,9 +100,6 @@ enum { CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_MARK, -#ifdef CONFIG_DLM_DEPRECATED_API - CLUSTER_ATTR_TIMEWARN_CS, -#endif CLUSTER_ATTR_NEW_RSB_COUNT, CLUSTER_ATTR_RECOVER_CALLBACKS, CLUSTER_ATTR_CLUSTER_NAME, @@ -226,9 +220,6 @@ CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); -#ifdef CONFIG_DLM_DEPRECATED_API -CLUSTER_ATTR(timewarn_cs, dlm_check_zero); -#endif CLUSTER_ATTR(new_rsb_count, NULL); CLUSTER_ATTR(recover_callbacks, NULL); @@ -243,9 +234,6 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_MARK] = &cluster_attr_mark, -#ifdef CONFIG_DLM_DEPRECATED_API - [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, -#endif [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, @@ -436,9 +424,6 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_log_debug = dlm_config.ci_log_debug; cl->cl_log_info = dlm_config.ci_log_info; cl->cl_protocol = dlm_config.ci_protocol; -#ifdef CONFIG_DLM_DEPRECATED_API - cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; -#endif cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, @@ -959,9 +944,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 -#ifdef CONFIG_DLM_DEPRECATED_API -#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ -#endif #define DEFAULT_NEW_RSB_COUNT 128 #define DEFAULT_RECOVER_CALLBACKS 0 #define DEFAULT_CLUSTER_NAME "" @@ -977,9 +959,6 @@ struct dlm_config_info dlm_config = { .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_mark = DEFAULT_MARK, -#ifdef CONFIG_DLM_DEPRECATED_API - .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, -#endif .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, .ci_cluster_name = DEFAULT_CLUSTER_NAME diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 55c5f2c13ebd..4c91fcca0fd4 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -37,9 +37,6 @@ struct dlm_config_info { int ci_log_info; int ci_protocol; int ci_mark; -#ifdef CONFIG_DLM_DEPRECATED_API - int ci_timewarn_cs; -#endif int ci_new_rsb_count; int ci_recover_callbacks; char ci_cluster_name[DLM_LOCKSPACE_LEN]; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8a0e1b1f74ad..a1aca41c49d0 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -170,7 +170,7 @@ static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, u64 xid = 0; u64 us; - if (lkb->lkb_flags & DLM_IFL_USER) { + if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) { if (lkb->lkb_ua) xid = lkb->lkb_ua->xid; } @@ -188,7 +188,7 @@ static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, lkb->lkb_ownpid, (unsigned long long)xid, lkb->lkb_exflags, - lkb->lkb_flags, + dlm_iflags_val(lkb), lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, @@ -230,7 +230,7 @@ static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, { u64 xid = 0; - if (lkb->lkb_flags & DLM_IFL_USER) { + if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) { if (lkb->lkb_ua) xid = lkb->lkb_ua->xid; } @@ -242,7 +242,7 @@ static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, lkb->lkb_ownpid, (unsigned long long)xid, lkb->lkb_exflags, - lkb->lkb_flags, + dlm_iflags_val(lkb), lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index ab1a55337a6e..986a9d7b1f33 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -99,13 +99,13 @@ do { \ } -#define DLM_RTF_SHRINK 0x00000001 +#define DLM_RTF_SHRINK_BIT 0 struct dlm_rsbtable { struct rb_root keep; struct rb_root toss; spinlock_t lock; - uint32_t flags; + unsigned long flags; }; @@ -145,9 +145,6 @@ struct dlm_args { void (*bastfn) (void *astparam, int mode); int mode; struct dlm_lksb *lksb; -#ifdef CONFIG_DLM_DEPRECATED_API - unsigned long timeout; -#endif }; @@ -197,31 +194,25 @@ struct dlm_args { #define DLM_LKSTS_GRANTED 2 #define DLM_LKSTS_CONVERT 3 -/* lkb_flags */ - -#define DLM_IFL_MSTCPY 0x00010000 -#define DLM_IFL_RESEND 0x00020000 -#define DLM_IFL_DEAD 0x00040000 -#define DLM_IFL_OVERLAP_UNLOCK 0x00080000 -#define DLM_IFL_OVERLAP_CANCEL 0x00100000 -#define DLM_IFL_ENDOFLIFE 0x00200000 -#ifdef CONFIG_DLM_DEPRECATED_API -#define DLM_IFL_WATCH_TIMEWARN 0x00400000 -#define DLM_IFL_TIMEOUT_CANCEL 0x00800000 -#endif -#define DLM_IFL_DEADLOCK_CANCEL 0x01000000 -#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ -#define DLM_IFL_CB_PENDING 0x04000000 -/* least significant 2 bytes are message changed, they are full transmitted - * but at receive side only the 2 bytes LSB will be set. - * - * Even wireshark dlm dissector does only evaluate the lower bytes and note - * that they may not be used on transceiver side, we assume the higher bytes - * are for internal use or reserved so long they are not parsed on receiver - * side. - */ -#define DLM_IFL_USER 0x00000001 -#define DLM_IFL_ORPHAN 0x00000002 +/* lkb_iflags */ + +#define DLM_IFL_MSTCPY_BIT 16 +#define __DLM_IFL_MIN_BIT DLM_IFL_MSTCPY_BIT +#define DLM_IFL_RESEND_BIT 17 +#define DLM_IFL_DEAD_BIT 18 +#define DLM_IFL_OVERLAP_UNLOCK_BIT 19 +#define DLM_IFL_OVERLAP_CANCEL_BIT 20 +#define DLM_IFL_ENDOFLIFE_BIT 21 +#define DLM_IFL_DEADLOCK_CANCEL_BIT 24 +#define DLM_IFL_CB_PENDING_BIT 25 +#define __DLM_IFL_MAX_BIT DLM_IFL_CB_PENDING_BIT + +/* lkb_dflags */ + +#define DLM_DFL_USER_BIT 0 +#define __DLM_DFL_MIN_BIT DLM_DFL_USER_BIT +#define DLM_DFL_ORPHAN_BIT 1 +#define __DLM_DFL_MAX_BIT DLM_DFL_ORPHAN_BIT #define DLM_CB_CAST 0x00000001 #define DLM_CB_BAST 0x00000002 @@ -244,8 +235,9 @@ struct dlm_lkb { uint32_t lkb_id; /* our lock ID */ uint32_t lkb_remid; /* lock ID on remote partner */ uint32_t lkb_exflags; /* external flags from caller */ - uint32_t lkb_sbflags; /* lksb flags */ - uint32_t lkb_flags; /* internal flags */ + unsigned long lkb_sbflags; /* lksb flags */ + unsigned long lkb_dflags; /* distributed flags */ + unsigned long lkb_iflags; /* internal flags */ uint32_t lkb_lvbseq; /* lvb sequence number */ int8_t lkb_status; /* granted, waiting, convert */ @@ -263,11 +255,6 @@ struct dlm_lkb { struct list_head lkb_ownqueue; /* list of locks for a process */ ktime_t lkb_timestamp; -#ifdef CONFIG_DLM_DEPRECATED_API - struct list_head lkb_time_list; - unsigned long lkb_timeout_cs; -#endif - spinlock_t lkb_cb_lock; struct work_struct lkb_cb_work; struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */ @@ -583,11 +570,6 @@ struct dlm_ls { struct mutex ls_orphans_mutex; struct list_head ls_orphans; -#ifdef CONFIG_DLM_DEPRECATED_API - struct mutex ls_timeout_mutex; - struct list_head ls_timeout; -#endif - spinlock_t ls_new_rsb_spin; int ls_new_rsb_count; struct list_head ls_new_rsb; /* new rsb structs */ @@ -607,9 +589,9 @@ struct dlm_ls { int ls_slots_size; struct dlm_slot *ls_slots; - struct dlm_rsb ls_stub_rsb; /* for returning errors */ - struct dlm_lkb ls_stub_lkb; /* for returning errors */ - struct dlm_message ls_stub_ms; /* for faking a reply */ + struct dlm_rsb ls_local_rsb; /* for returning errors */ + struct dlm_lkb ls_local_lkb; /* for returning errors */ + struct dlm_message ls_local_ms; /* for faking a reply */ struct dentry *ls_debug_rsb_dentry; /* debugfs */ struct dentry *ls_debug_waiters_dentry; /* debugfs */ @@ -701,9 +683,6 @@ struct dlm_ls { #define LSFL_RCOM_READY 5 #define LSFL_RCOM_WAIT 6 #define LSFL_UEVENT_WAIT 7 -#ifdef CONFIG_DLM_DEPRECATED_API -#define LSFL_TIMEWARN 8 -#endif #define LSFL_CB_DELAY 9 #define LSFL_NODIR 10 @@ -756,15 +735,76 @@ static inline int dlm_no_directory(struct dlm_ls *ls) return test_bit(LSFL_NODIR, &ls->ls_flags); } -#ifdef CONFIG_DLM_DEPRECATED_API -int dlm_netlink_init(void); -void dlm_netlink_exit(void); -void dlm_timeout_warn(struct dlm_lkb *lkb); -#else -static inline int dlm_netlink_init(void) { return 0; } -static inline void dlm_netlink_exit(void) { }; -static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { }; -#endif +/* takes a snapshot from dlm atomic flags */ +static inline uint32_t dlm_flags_val(const unsigned long *addr, + uint32_t min, uint32_t max) +{ + uint32_t bit = min, val = 0; + + for_each_set_bit_from(bit, addr, max + 1) { + val |= BIT(bit); + } + + return val; +} + +static inline uint32_t dlm_iflags_val(const struct dlm_lkb *lkb) +{ + return dlm_flags_val(&lkb->lkb_iflags, __DLM_IFL_MIN_BIT, + __DLM_IFL_MAX_BIT); +} + +static inline uint32_t dlm_dflags_val(const struct dlm_lkb *lkb) +{ + return dlm_flags_val(&lkb->lkb_dflags, __DLM_DFL_MIN_BIT, + __DLM_DFL_MAX_BIT); +} + +/* coming from UAPI header + * + * TODO: + * Move this to UAPI header and let other values point to them and use BIT() + */ +#define DLM_SBF_DEMOTED_BIT 0 +#define __DLM_SBF_MIN_BIT DLM_SBF_DEMOTED_BIT +#define DLM_SBF_VALNOTVALID_BIT 1 +#define DLM_SBF_ALTMODE_BIT 2 +#define __DLM_SBF_MAX_BIT DLM_SBF_ALTMODE_BIT + +static inline uint32_t dlm_sbflags_val(const struct dlm_lkb *lkb) +{ + /* be sure the next person updates this */ + BUILD_BUG_ON(BIT(__DLM_SBF_MAX_BIT) != DLM_SBF_ALTMODE); + + return dlm_flags_val(&lkb->lkb_sbflags, __DLM_SBF_MIN_BIT, + __DLM_SBF_MAX_BIT); +} + +static inline void dlm_set_flags_val(unsigned long *addr, uint32_t val, + uint32_t min, uint32_t max) +{ + uint32_t bit; + + for (bit = min; bit < (max + 1); bit++) { + if (val & BIT(bit)) + set_bit(bit, addr); + else + clear_bit(bit, addr); + } +} + +static inline void dlm_set_dflags_val(struct dlm_lkb *lkb, uint32_t val) +{ + dlm_set_flags_val(&lkb->lkb_dflags, val, __DLM_DFL_MIN_BIT, + __DLM_DFL_MAX_BIT); +} + +static inline void dlm_set_sbflags_val(struct dlm_lkb *lkb, uint32_t val) +{ + dlm_set_flags_val(&lkb->lkb_sbflags, val, __DLM_SBF_MIN_BIT, + __DLM_SBF_MAX_BIT); +} + int dlm_plock_init(void); void dlm_plock_exit(void); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index e1adfa5aed05..debf8a55ad7d 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -86,10 +86,9 @@ static int send_remove(struct dlm_rsb *r); static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms); + struct dlm_message *ms, bool local); static int receive_extralen(struct dlm_message *ms); static void do_purge(struct dlm_ls *ls, int nodeid, int pid); -static void del_timeout(struct dlm_lkb *lkb); static void toss_rsb(struct kref *kref); /* @@ -164,7 +163,7 @@ void dlm_print_lkb(struct dlm_lkb *lkb) printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x " "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n", lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, - lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, + dlm_iflags_val(lkb), lkb->lkb_status, lkb->lkb_rqmode, lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid, (unsigned long long)lkb->lkb_recover_seq); } @@ -229,12 +228,12 @@ static inline int force_blocking_asts(struct dlm_lkb *lkb) static inline int is_demoted(struct dlm_lkb *lkb) { - return (lkb->lkb_sbflags & DLM_SBF_DEMOTED); + return test_bit(DLM_SBF_DEMOTED_BIT, &lkb->lkb_sbflags); } static inline int is_altmode(struct dlm_lkb *lkb) { - return (lkb->lkb_sbflags & DLM_SBF_ALTMODE); + return test_bit(DLM_SBF_ALTMODE_BIT, &lkb->lkb_sbflags); } static inline int is_granted(struct dlm_lkb *lkb) @@ -250,12 +249,13 @@ static inline int is_remote(struct dlm_rsb *r) static inline int is_process_copy(struct dlm_lkb *lkb) { - return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY)); + return lkb->lkb_nodeid && + !test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags); } static inline int is_master_copy(struct dlm_lkb *lkb) { - return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0; + return test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags); } static inline int middle_conversion(struct dlm_lkb *lkb) @@ -273,18 +273,18 @@ static inline int down_conversion(struct dlm_lkb *lkb) static inline int is_overlap_unlock(struct dlm_lkb *lkb) { - return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK; + return test_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); } static inline int is_overlap_cancel(struct dlm_lkb *lkb) { - return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL; + return test_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); } static inline int is_overlap(struct dlm_lkb *lkb) { - return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK | - DLM_IFL_OVERLAP_CANCEL)); + return test_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags) || + test_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); } static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) @@ -292,25 +292,13 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) if (is_master_copy(lkb)) return; - del_timeout(lkb); - DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); -#ifdef CONFIG_DLM_DEPRECATED_API - /* if the operation was a cancel, then return -DLM_ECANCEL, if a - timeout caused the cancel then return -ETIMEDOUT */ - if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { - lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; - rv = -ETIMEDOUT; - } -#endif - - if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { - lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; + if (rv == -DLM_ECANCEL && + test_and_clear_bit(DLM_IFL_DEADLOCK_CANCEL_BIT, &lkb->lkb_iflags)) rv = -EDEADLK; - } - dlm_add_cb(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags); + dlm_add_cb(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, dlm_sbflags_val(lkb)); } static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -1151,7 +1139,7 @@ static void toss_rsb(struct kref *kref) rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); r->res_toss_time = jiffies; - ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK; + set_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[r->res_bucket].flags); if (r->res_lvbptr) { dlm_free_lvb(r->res_lvbptr); r->res_lvbptr = NULL; @@ -1215,9 +1203,6 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, kref_init(&lkb->lkb_ref); INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); -#ifdef CONFIG_DLM_DEPRECATED_API - INIT_LIST_HEAD(&lkb->lkb_time_list); -#endif INIT_LIST_HEAD(&lkb->lkb_cb_list); INIT_LIST_HEAD(&lkb->lkb_callbacks); spin_lock_init(&lkb->lkb_cb_lock); @@ -1434,10 +1419,10 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) { switch (mstype) { case DLM_MSG_UNLOCK: - lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; + set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); break; case DLM_MSG_CANCEL: - lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; + set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); break; default: error = -EBUSY; @@ -1448,7 +1433,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) log_debug(ls, "addwait %x cur %d overlap %d count %d f %x", lkb->lkb_id, lkb->lkb_wait_type, mstype, - lkb->lkb_wait_count, lkb->lkb_flags); + lkb->lkb_wait_count, dlm_iflags_val(lkb)); goto out; } @@ -1464,7 +1449,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) out: if (error) log_error(ls, "addwait error %x %d flags %x %d %d %s", - lkb->lkb_id, error, lkb->lkb_flags, mstype, + lkb->lkb_id, error, dlm_iflags_val(lkb), mstype, lkb->lkb_wait_type, lkb->lkb_resource->res_name); mutex_unlock(&ls->ls_waiters_mutex); return error; @@ -1481,16 +1466,16 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, struct dlm_ls *ls = lkb->lkb_resource->res_ls; int overlap_done = 0; - if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) { + if (mstype == DLM_MSG_UNLOCK_REPLY && + test_and_clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags)) { log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id); - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; overlap_done = 1; goto out_del; } - if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) { + if (mstype == DLM_MSG_CANCEL_REPLY && + test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags)) { log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id); - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; overlap_done = 1; goto out_del; } @@ -1514,12 +1499,11 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, lingering state of the cancel and fail with -EBUSY. */ if ((mstype == DLM_MSG_CONVERT_REPLY) && - (lkb->lkb_wait_type == DLM_MSG_CONVERT) && - is_overlap_cancel(lkb) && ms && !ms->m_result) { + (lkb->lkb_wait_type == DLM_MSG_CONVERT) && ms && !ms->m_result && + test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags)) { log_debug(ls, "remwait %x convert_reply zap overlap_cancel", lkb->lkb_id); lkb->lkb_wait_type = 0; - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; lkb->lkb_wait_count--; unhold_lkb(lkb); goto out_del; @@ -1535,7 +1519,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait", lkb->lkb_id, ms ? le32_to_cpu(ms->m_header.h_nodeid) : 0, - lkb->lkb_remid, mstype, lkb->lkb_flags); + lkb->lkb_remid, mstype, dlm_iflags_val(lkb)); return -1; out_del: @@ -1554,7 +1538,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb);); - lkb->lkb_flags &= ~DLM_IFL_RESEND; + clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); lkb->lkb_wait_count--; if (!lkb->lkb_wait_count) list_del_init(&lkb->lkb_wait_reply); @@ -1573,18 +1557,19 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype) return error; } -/* Handles situations where we might be processing a "fake" or "stub" reply in +/* Handles situations where we might be processing a "fake" or "local" reply in which we can't try to take waiters_mutex again. */ -static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) +static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms, + bool local) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error; - if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS)) + if (!local) mutex_lock(&ls->ls_waiters_mutex); error = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms); - if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS)) + if (!local) mutex_unlock(&ls->ls_waiters_mutex); return error; } @@ -1603,7 +1588,7 @@ static void shrink_bucket(struct dlm_ls *ls, int b) spin_lock(&ls->ls_rsbtbl[b].lock); - if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) { + if (!test_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags)) { spin_unlock(&ls->ls_rsbtbl[b].lock); return; } @@ -1658,9 +1643,9 @@ static void shrink_bucket(struct dlm_ls *ls, int b) } if (need_shrink) - ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK; + set_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags); else - ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK; + clear_bit(DLM_RTF_SHRINK_BIT, &ls->ls_rsbtbl[b].flags); spin_unlock(&ls->ls_rsbtbl[b].lock); /* @@ -1735,133 +1720,6 @@ void dlm_scan_rsbs(struct dlm_ls *ls) } } -#ifdef CONFIG_DLM_DEPRECATED_API -static void add_timeout(struct dlm_lkb *lkb) -{ - struct dlm_ls *ls = lkb->lkb_resource->res_ls; - - if (is_master_copy(lkb)) - return; - - if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && - !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { - lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN; - goto add_it; - } - if (lkb->lkb_exflags & DLM_LKF_TIMEOUT) - goto add_it; - return; - - add_it: - DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); - mutex_lock(&ls->ls_timeout_mutex); - hold_lkb(lkb); - list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); - mutex_unlock(&ls->ls_timeout_mutex); -} - -static void del_timeout(struct dlm_lkb *lkb) -{ - struct dlm_ls *ls = lkb->lkb_resource->res_ls; - - mutex_lock(&ls->ls_timeout_mutex); - if (!list_empty(&lkb->lkb_time_list)) { - list_del_init(&lkb->lkb_time_list); - unhold_lkb(lkb); - } - mutex_unlock(&ls->ls_timeout_mutex); -} - -/* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and - lkb_lksb_timeout without lock_rsb? Note: we can't lock timeout_mutex - and then lock rsb because of lock ordering in add_timeout. We may need - to specify some special timeout-related bits in the lkb that are just to - be accessed under the timeout_mutex. */ - -void dlm_scan_timeout(struct dlm_ls *ls) -{ - struct dlm_rsb *r; - struct dlm_lkb *lkb = NULL, *iter; - int do_cancel, do_warn; - s64 wait_us; - - for (;;) { - if (dlm_locking_stopped(ls)) - break; - - do_cancel = 0; - do_warn = 0; - mutex_lock(&ls->ls_timeout_mutex); - list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) { - - wait_us = ktime_to_us(ktime_sub(ktime_get(), - iter->lkb_timestamp)); - - if ((iter->lkb_exflags & DLM_LKF_TIMEOUT) && - wait_us >= (iter->lkb_timeout_cs * 10000)) - do_cancel = 1; - - if ((iter->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && - wait_us >= dlm_config.ci_timewarn_cs * 10000) - do_warn = 1; - - if (!do_cancel && !do_warn) - continue; - hold_lkb(iter); - lkb = iter; - break; - } - mutex_unlock(&ls->ls_timeout_mutex); - - if (!lkb) - break; - - r = lkb->lkb_resource; - hold_rsb(r); - lock_rsb(r); - - if (do_warn) { - /* clear flag so we only warn once */ - lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; - if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT)) - del_timeout(lkb); - dlm_timeout_warn(lkb); - } - - if (do_cancel) { - log_debug(ls, "timeout cancel %x node %d %s", - lkb->lkb_id, lkb->lkb_nodeid, r->res_name); - lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN; - lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL; - del_timeout(lkb); - _cancel_lock(r, lkb); - } - - unlock_rsb(r); - unhold_rsb(r); - dlm_put_lkb(lkb); - } -} - -/* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping - dlm_recoverd before checking/setting ls_recover_begin. */ - -void dlm_adjust_timeouts(struct dlm_ls *ls) -{ - struct dlm_lkb *lkb; - u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin); - - ls->ls_recover_begin = 0; - mutex_lock(&ls->ls_timeout_mutex); - list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) - lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); - mutex_unlock(&ls->ls_timeout_mutex); -} -#else -static void add_timeout(struct dlm_lkb *lkb) { } -static void del_timeout(struct dlm_lkb *lkb) { } -#endif - /* lkb is master or local copy */ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -1912,7 +1770,7 @@ static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) } if (rsb_flag(r, RSB_VALNOTVALID)) - lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID; + set_bit(DLM_SBF_VALNOTVALID_BIT, &lkb->lkb_sbflags); } static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -2384,7 +2242,7 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, conversion_deadlock_detect(r, lkb)) { if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) { lkb->lkb_grmode = DLM_LOCK_NL; - lkb->lkb_sbflags |= DLM_SBF_DEMOTED; + set_bit(DLM_SBF_DEMOTED_BIT, &lkb->lkb_sbflags); } else if (err) { *err = -EDEADLK; } else { @@ -2411,7 +2269,7 @@ static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now, lkb->lkb_rqmode = alt; rv = _can_be_granted(r, lkb, now, 0); if (rv) - lkb->lkb_sbflags |= DLM_SBF_ALTMODE; + set_bit(DLM_SBF_ALTMODE_BIT, &lkb->lkb_sbflags); else lkb->lkb_rqmode = rqmode; } @@ -2723,20 +2581,11 @@ static void confirm_master(struct dlm_rsb *r, int error) } } -#ifdef CONFIG_DLM_DEPRECATED_API -static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, - int namelen, unsigned long timeout_cs, - void (*ast) (void *astparam), - void *astparam, - void (*bast) (void *astparam, int mode), - struct dlm_args *args) -#else static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, int namelen, void (*ast)(void *astparam), void *astparam, void (*bast)(void *astparam, int mode), struct dlm_args *args) -#endif { int rv = -EINVAL; @@ -2789,9 +2638,6 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, args->astfn = ast; args->astparam = astparam; args->bastfn = bast; -#ifdef CONFIG_DLM_DEPRECATED_API - args->timeout = timeout_cs; -#endif args->mode = mode; args->lksb = lksb; rv = 0; @@ -2830,7 +2676,7 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, goto out; rv = -EINVAL; - if (lkb->lkb_flags & DLM_IFL_MSTCPY) + if (test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) goto out; if (args->flags & DLM_LKF_QUECVT && @@ -2839,7 +2685,7 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, } lkb->lkb_exflags = args->flags; - lkb->lkb_sbflags = 0; + dlm_set_sbflags_val(lkb, 0); lkb->lkb_astfn = args->astfn; lkb->lkb_astparam = args->astparam; lkb->lkb_bastfn = args->bastfn; @@ -2847,9 +2693,6 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_lksb = args->lksb; lkb->lkb_lvbptr = args->lksb->sb_lvbptr; lkb->lkb_ownpid = (int) current->pid; -#ifdef CONFIG_DLM_DEPRECATED_API - lkb->lkb_timeout_cs = args->timeout; -#endif rv = 0; out: switch (rv) { @@ -2859,13 +2702,13 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, /* annoy the user because dlm usage is wrong */ WARN_ON(1); log_error(ls, "%s %d %x %x %x %d %d %s", __func__, - rv, lkb->lkb_id, lkb->lkb_flags, args->flags, + rv, lkb->lkb_id, dlm_iflags_val(lkb), args->flags, lkb->lkb_status, lkb->lkb_wait_type, lkb->lkb_resource->res_name); break; default: log_debug(ls, "%s %d %x %x %x %d %d %s", __func__, - rv, lkb->lkb_id, lkb->lkb_flags, args->flags, + rv, lkb->lkb_id, dlm_iflags_val(lkb), args->flags, lkb->lkb_status, lkb->lkb_wait_type, lkb->lkb_resource->res_name); break; @@ -2908,7 +2751,7 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) } rv = -EINVAL; - if (lkb->lkb_flags & DLM_IFL_MSTCPY) { + if (test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) { log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id); dlm_print_lkb(lkb); goto out; @@ -2919,7 +2762,7 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) * locks; return same error as if the lkid had not been found at all */ - if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + if (test_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags)) { log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id); rv = -ENOENT; goto out; @@ -2934,11 +2777,8 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) if (is_overlap(lkb)) goto out; - /* don't let scand try to do a cancel */ - del_timeout(lkb); - - if (lkb->lkb_flags & DLM_IFL_RESEND) { - lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; + if (test_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags)) { + set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); rv = -EBUSY; goto out; } @@ -2953,7 +2793,7 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) switch (lkb->lkb_wait_type) { case DLM_MSG_LOOKUP: case DLM_MSG_REQUEST: - lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL; + set_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); rv = -EBUSY; goto out; case DLM_MSG_UNLOCK: @@ -2975,11 +2815,8 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) if (is_overlap_unlock(lkb)) goto out; - /* don't let scand try to do a cancel */ - del_timeout(lkb); - - if (lkb->lkb_flags & DLM_IFL_RESEND) { - lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; + if (test_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags)) { + set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); rv = -EBUSY; goto out; } @@ -2987,7 +2824,7 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) switch (lkb->lkb_wait_type) { case DLM_MSG_LOOKUP: case DLM_MSG_REQUEST: - lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK; + set_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); rv = -EBUSY; goto out; case DLM_MSG_UNLOCK: @@ -2999,7 +2836,7 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) out_ok: /* an overlapping op shouldn't blow away exflags from other op */ lkb->lkb_exflags |= args->flags; - lkb->lkb_sbflags = 0; + dlm_set_sbflags_val(lkb, 0); lkb->lkb_astparam = args->astparam; rv = 0; out: @@ -3010,13 +2847,13 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) /* annoy the user because dlm usage is wrong */ WARN_ON(1); log_error(ls, "%s %d %x %x %x %x %d %s", __func__, rv, - lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, + lkb->lkb_id, dlm_iflags_val(lkb), lkb->lkb_exflags, args->flags, lkb->lkb_wait_type, lkb->lkb_resource->res_name); break; default: log_debug(ls, "%s %d %x %x %x %x %d %s", __func__, rv, - lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags, + lkb->lkb_id, dlm_iflags_val(lkb), lkb->lkb_exflags, args->flags, lkb->lkb_wait_type, lkb->lkb_resource->res_name); break; @@ -3045,7 +2882,6 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) if (can_be_queued(lkb)) { error = -EINPROGRESS; add_lkb(r, lkb, DLM_LKSTS_WAITING); - add_timeout(lkb); goto out; } @@ -3114,7 +2950,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) error = -EINPROGRESS; del_lkb(r, lkb); add_lkb(r, lkb, DLM_LKSTS_CONVERT); - add_timeout(lkb); goto out; } @@ -3401,13 +3236,8 @@ int dlm_lock(dlm_lockspace_t *lockspace, trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); -#ifdef CONFIG_DLM_DEPRECATED_API - error = set_lock_args(mode, lksb, flags, namelen, 0, ast, - astarg, bast, &args); -#else error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast, &args); -#endif if (error) goto out_put; @@ -3551,7 +3381,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, case DLM_MSG_REQUEST_REPLY: case DLM_MSG_CONVERT_REPLY: case DLM_MSG_GRANT: - if (lkb && lkb->lkb_lvbptr) + if (lkb && lkb->lkb_lvbptr && (lkb->lkb_exflags & DLM_LKF_VALBLK)) mb_len += r->res_ls->ls_lvblen; break; } @@ -3578,8 +3408,8 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb, ms->m_lkid = cpu_to_le32(lkb->lkb_id); ms->m_remid = cpu_to_le32(lkb->lkb_remid); ms->m_exflags = cpu_to_le32(lkb->lkb_exflags); - ms->m_sbflags = cpu_to_le32(lkb->lkb_sbflags); - ms->m_flags = cpu_to_le32(lkb->lkb_flags); + ms->m_sbflags = cpu_to_le32(dlm_sbflags_val(lkb)); + ms->m_flags = cpu_to_le32(dlm_dflags_val(lkb)); ms->m_lvbseq = cpu_to_le32(lkb->lkb_lvbseq); ms->m_status = cpu_to_le32(lkb->lkb_status); ms->m_grmode = cpu_to_le32(lkb->lkb_grmode); @@ -3656,10 +3486,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) /* down conversions go without a reply from the master */ if (!error && down_conversion(lkb)) { remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); - r->res_ls->ls_stub_ms.m_flags = cpu_to_le32(DLM_IFL_STUB_MS); - r->res_ls->ls_stub_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY); - r->res_ls->ls_stub_ms.m_result = 0; - __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); + r->res_ls->ls_local_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY); + r->res_ls->ls_local_ms.m_result = 0; + __receive_convert_reply(r, lkb, &r->res_ls->ls_local_ms, true); } return error; @@ -3818,7 +3647,7 @@ static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, int ret_nodeid, int rv) { - struct dlm_rsb *r = &ls->ls_stub_rsb; + struct dlm_rsb *r = &ls->ls_local_rsb; struct dlm_message *ms; struct dlm_mhandle *mh; int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid); @@ -3844,19 +3673,18 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) { lkb->lkb_exflags = le32_to_cpu(ms->m_exflags); - lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags); - lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | - (le32_to_cpu(ms->m_flags) & 0x0000FFFF); + dlm_set_sbflags_val(lkb, le32_to_cpu(ms->m_sbflags)); + dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags)); } -static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms, + bool local) { - if (ms->m_flags == cpu_to_le32(DLM_IFL_STUB_MS)) + if (local) return; - lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags); - lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | - (le32_to_cpu(ms->m_flags) & 0x0000FFFF); + dlm_set_sbflags_val(lkb, le32_to_cpu(ms->m_sbflags)); + dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags)); } static int receive_extralen(struct dlm_message *ms) @@ -3938,12 +3766,12 @@ static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, return 0; } -/* We fill in the stub-lkb fields with the info that send_xxxx_reply() +/* We fill in the local-lkb fields with the info that send_xxxx_reply() uses to send a reply and that the remote end uses to process the reply. */ -static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms) +static void setup_local_lkb(struct dlm_ls *ls, struct dlm_message *ms) { - struct dlm_lkb *lkb = &ls->ls_stub_lkb; + struct dlm_lkb *lkb = &ls->ls_local_lkb; lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); lkb->lkb_remid = le32_to_cpu(ms->m_lkid); } @@ -3957,8 +3785,8 @@ static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) int error = 0; /* currently mixing of user/kernel locks are not supported */ - if (ms->m_flags & cpu_to_le32(DLM_IFL_USER) && - ~lkb->lkb_flags & DLM_IFL_USER) { + if (ms->m_flags & cpu_to_le32(BIT(DLM_DFL_USER_BIT)) && + !test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) { log_error(lkb->lkb_resource->res_ls, "got user dlm message for a kernel lock"); error = -EINVAL; @@ -3998,7 +3826,8 @@ out: log_error(lkb->lkb_resource->res_ls, "ignore invalid message %d from %d %x %x %x %d", le32_to_cpu(ms->m_type), from, lkb->lkb_id, - lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_nodeid); + lkb->lkb_remid, dlm_iflags_val(lkb), + lkb->lkb_nodeid); return error; } @@ -4016,7 +3845,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) goto fail; receive_flags(lkb, ms); - lkb->lkb_flags |= DLM_IFL_MSTCPY; + set_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags); error = receive_request_args(ls, lkb, ms); if (error) { __put_lkb(ls, lkb); @@ -4076,8 +3905,8 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) le32_to_cpu(ms->m_lkid), from_nodeid, error); } - setup_stub_lkb(ls, ms); - send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + setup_local_lkb(ls, ms); + send_request_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error); return error; } @@ -4132,8 +3961,8 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) return 0; fail: - setup_stub_lkb(ls, ms); - send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + setup_local_lkb(ls, ms); + send_convert_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error); return error; } @@ -4184,8 +4013,8 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) return 0; fail: - setup_stub_lkb(ls, ms); - send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + setup_local_lkb(ls, ms); + send_unlock_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error); return error; } @@ -4220,8 +4049,8 @@ static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) return 0; fail: - setup_stub_lkb(ls, ms); - send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error); + setup_local_lkb(ls, ms); + send_cancel_reply(&ls->ls_local_rsb, &ls->ls_local_lkb, error); return error; } @@ -4244,7 +4073,7 @@ static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) if (error) goto out; - receive_flags_reply(lkb, ms); + receive_flags_reply(lkb, ms, false); if (is_altmode(lkb)) munge_altmode(lkb, ms); grant_lock_pc(r, lkb, ms); @@ -4448,13 +4277,12 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) case -EINPROGRESS: case 0: /* request was queued or granted on remote master */ - receive_flags_reply(lkb, ms); + receive_flags_reply(lkb, ms, false); lkb->lkb_remid = le32_to_cpu(ms->m_lkid); if (is_altmode(lkb)) munge_altmode(lkb, ms); if (result) { add_lkb(r, lkb, DLM_LKSTS_WAITING); - add_timeout(lkb); } else { grant_lock_pc(r, lkb, ms); queue_cast(r, lkb, 0); @@ -4496,20 +4324,21 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) lkb->lkb_id, result); } - if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) { + if ((result == 0 || result == -EINPROGRESS) && + test_and_clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags)) { log_debug(ls, "receive_request_reply %x result %d unlock", lkb->lkb_id, result); - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); send_unlock(r, lkb); - } else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) { + } else if ((result == -EINPROGRESS) && + test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, + &lkb->lkb_iflags)) { log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id); - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); send_cancel(r, lkb); } else { - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; + clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, &lkb->lkb_iflags); + clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, &lkb->lkb_iflags); } out: unlock_rsb(r); @@ -4519,7 +4348,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) } static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms) + struct dlm_message *ms, bool local) { /* this is the value returned from do_convert() on the master */ switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { @@ -4529,24 +4358,23 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, break; case -EDEADLK: - receive_flags_reply(lkb, ms); + receive_flags_reply(lkb, ms, local); revert_lock_pc(r, lkb); queue_cast(r, lkb, -EDEADLK); break; case -EINPROGRESS: /* convert was queued on remote master */ - receive_flags_reply(lkb, ms); + receive_flags_reply(lkb, ms, local); if (is_demoted(lkb)) munge_demoted(lkb); del_lkb(r, lkb); add_lkb(r, lkb, DLM_LKSTS_CONVERT); - add_timeout(lkb); break; case 0: /* convert was granted on remote master */ - receive_flags_reply(lkb, ms); + receive_flags_reply(lkb, ms, local); if (is_demoted(lkb)) munge_demoted(lkb); grant_lock_pc(r, lkb, ms); @@ -4563,7 +4391,8 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, } } -static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms, + bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4575,12 +4404,12 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms) if (error) goto out; - /* stub reply can happen with waiters_mutex held */ - error = remove_from_waiters_ms(lkb, ms); + /* local reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms, local); if (error) goto out; - __receive_convert_reply(r, lkb, ms); + __receive_convert_reply(r, lkb, ms, local); out: unlock_rsb(r); put_rsb(r); @@ -4595,12 +4424,13 @@ static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) if (error) return error; - _receive_convert_reply(lkb, ms); + _receive_convert_reply(lkb, ms, false); dlm_put_lkb(lkb); return 0; } -static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms, + bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4612,8 +4442,8 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) if (error) goto out; - /* stub reply can happen with waiters_mutex held */ - error = remove_from_waiters_ms(lkb, ms); + /* local reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms, local); if (error) goto out; @@ -4621,7 +4451,7 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms) switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { case -DLM_EUNLOCK: - receive_flags_reply(lkb, ms); + receive_flags_reply(lkb, ms, local); remove_lock_pc(r, lkb); queue_cast(r, lkb, -DLM_EUNLOCK); break; @@ -4645,12 +4475,13 @@ static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) if (error) return error; - _receive_unlock_reply(lkb, ms); + _receive_unlock_reply(lkb, ms, false); dlm_put_lkb(lkb); return 0; } -static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) +static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms, + bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4662,8 +4493,8 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) if (error) goto out; - /* stub reply can happen with waiters_mutex held */ - error = remove_from_waiters_ms(lkb, ms); + /* local reply can happen with waiters_mutex held */ + error = remove_from_waiters_ms(lkb, ms, local); if (error) goto out; @@ -4671,7 +4502,7 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms) switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { case -DLM_ECANCEL: - receive_flags_reply(lkb, ms); + receive_flags_reply(lkb, ms, local); revert_lock_pc(r, lkb); queue_cast(r, lkb, -DLM_ECANCEL); break; @@ -4696,7 +4527,7 @@ static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) if (error) return error; - _receive_cancel_reply(lkb, ms); + _receive_cancel_reply(lkb, ms, false); dlm_put_lkb(lkb); return 0; } @@ -4763,7 +4594,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) if (is_overlap(lkb)) { log_debug(ls, "receive_lookup_reply %x unlock %x", - lkb->lkb_id, lkb->lkb_flags); + lkb->lkb_id, dlm_iflags_val(lkb)); queue_cast_overlap(r, lkb); unhold_lkb(lkb); /* undoes create_lkb() */ goto out_list; @@ -5006,16 +4837,15 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) } static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms_stub) + struct dlm_message *ms_local) { if (middle_conversion(lkb)) { hold_lkb(lkb); - memset(ms_stub, 0, sizeof(struct dlm_message)); - ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS); - ms_stub->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY); - ms_stub->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS)); - ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); - _receive_convert_reply(lkb, ms_stub); + memset(ms_local, 0, sizeof(struct dlm_message)); + ms_local->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY); + ms_local->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS)); + ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); + _receive_convert_reply(lkb, ms_local, true); /* Same special case as in receive_rcom_lock_args() */ lkb->lkb_grmode = DLM_LOCK_IV; @@ -5023,7 +4853,7 @@ static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, unhold_lkb(lkb); } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) { - lkb->lkb_flags |= DLM_IFL_RESEND; + set_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); } /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down @@ -5054,12 +4884,12 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb, void dlm_recover_waiters_pre(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; - struct dlm_message *ms_stub; - int wait_type, stub_unlock_result, stub_cancel_result; + struct dlm_message *ms_local; + int wait_type, local_unlock_result, local_cancel_result; int dir_nodeid; - ms_stub = kmalloc(sizeof(*ms_stub), GFP_KERNEL); - if (!ms_stub) + ms_local = kmalloc(sizeof(*ms_local), GFP_KERNEL); + if (!ms_local) return; mutex_lock(&ls->ls_waiters_mutex); @@ -5087,7 +4917,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) resent after recovery is done */ if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) { - lkb->lkb_flags |= DLM_IFL_RESEND; + set_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); continue; } @@ -5095,8 +4925,8 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) continue; wait_type = lkb->lkb_wait_type; - stub_unlock_result = -DLM_EUNLOCK; - stub_cancel_result = -DLM_ECANCEL; + local_unlock_result = -DLM_EUNLOCK; + local_cancel_result = -DLM_ECANCEL; /* Main reply may have been received leaving a zero wait_type, but a reply for the overlapping op may not have been @@ -5107,48 +4937,46 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) if (is_overlap_cancel(lkb)) { wait_type = DLM_MSG_CANCEL; if (lkb->lkb_grmode == DLM_LOCK_IV) - stub_cancel_result = 0; + local_cancel_result = 0; } if (is_overlap_unlock(lkb)) { wait_type = DLM_MSG_UNLOCK; if (lkb->lkb_grmode == DLM_LOCK_IV) - stub_unlock_result = -ENOENT; + local_unlock_result = -ENOENT; } log_debug(ls, "rwpre overlap %x %x %d %d %d", - lkb->lkb_id, lkb->lkb_flags, wait_type, - stub_cancel_result, stub_unlock_result); + lkb->lkb_id, dlm_iflags_val(lkb), wait_type, + local_cancel_result, local_unlock_result); } switch (wait_type) { case DLM_MSG_REQUEST: - lkb->lkb_flags |= DLM_IFL_RESEND; + set_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); break; case DLM_MSG_CONVERT: - recover_convert_waiter(ls, lkb, ms_stub); + recover_convert_waiter(ls, lkb, ms_local); break; case DLM_MSG_UNLOCK: hold_lkb(lkb); - memset(ms_stub, 0, sizeof(struct dlm_message)); - ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS); - ms_stub->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY); - ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_unlock_result)); - ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); - _receive_unlock_reply(lkb, ms_stub); + memset(ms_local, 0, sizeof(struct dlm_message)); + ms_local->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY); + ms_local->m_result = cpu_to_le32(to_dlm_errno(local_unlock_result)); + ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); + _receive_unlock_reply(lkb, ms_local, true); dlm_put_lkb(lkb); break; case DLM_MSG_CANCEL: hold_lkb(lkb); - memset(ms_stub, 0, sizeof(struct dlm_message)); - ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS); - ms_stub->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY); - ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_cancel_result)); - ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); - _receive_cancel_reply(lkb, ms_stub); + memset(ms_local, 0, sizeof(struct dlm_message)); + ms_local->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY); + ms_local->m_result = cpu_to_le32(to_dlm_errno(local_cancel_result)); + ms_local->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid); + _receive_cancel_reply(lkb, ms_local, true); dlm_put_lkb(lkb); break; @@ -5159,7 +4987,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) schedule(); } mutex_unlock(&ls->ls_waiters_mutex); - kfree(ms_stub); + kfree(ms_local); } static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) @@ -5168,7 +4996,7 @@ static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) mutex_lock(&ls->ls_waiters_mutex); list_for_each_entry(iter, &ls->ls_waiters, lkb_wait_reply) { - if (iter->lkb_flags & DLM_IFL_RESEND) { + if (test_bit(DLM_IFL_RESEND_BIT, &iter->lkb_iflags)) { hold_lkb(iter); lkb = iter; break; @@ -5217,8 +5045,10 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) lock_rsb(r); mstype = lkb->lkb_wait_type; - oc = is_overlap_cancel(lkb); - ou = is_overlap_unlock(lkb); + oc = test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT, + &lkb->lkb_iflags); + ou = test_and_clear_bit(DLM_IFL_OVERLAP_UNLOCK_BIT, + &lkb->lkb_iflags); err = 0; log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d " @@ -5231,9 +5061,7 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) previous op or overlap op on this lock. First, do a big remove_from_waiters() for all previous ops. */ - lkb->lkb_flags &= ~DLM_IFL_RESEND; - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK; - lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL; + clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); lkb->lkb_wait_type = 0; /* drop all wait_count references we still * hold a reference for this iteration. @@ -5518,8 +5346,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid); lkb->lkb_remid = le32_to_cpu(rl->rl_lkid); lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags); - lkb->lkb_flags = le32_to_cpu(rl->rl_flags) & 0x0000FFFF; - lkb->lkb_flags |= DLM_IFL_MSTCPY; + dlm_set_dflags_val(lkb, le32_to_cpu(rl->rl_flags)); + set_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags); lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq); lkb->lkb_rqmode = rl->rl_rqmode; lkb->lkb_grmode = rl->rl_grmode; @@ -5708,14 +5536,8 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) return 0; } -#ifdef CONFIG_DLM_DEPRECATED_API -int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, - int mode, uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs) -#else int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen) -#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5740,13 +5562,8 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out_put; } } -#ifdef CONFIG_DLM_DEPRECATED_API - error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, - fake_astfn, ua, fake_bastfn, &args); -#else error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua, fake_bastfn, &args); -#endif if (error) { kfree(ua->lksb.sb_lvbptr); ua->lksb.sb_lvbptr = NULL; @@ -5755,9 +5572,9 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, } /* After ua is attached to lkb it will be freed by dlm_free_lkb(). - When DLM_IFL_USER is set, the dlm knows that this is a userspace + When DLM_DFL_USER_BIT is set, the dlm knows that this is a userspace lock and that lkb_astparam is the dlm_user_args structure. */ - lkb->lkb_flags |= DLM_IFL_USER; + set_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags); error = request_lock(ls, lkb, name, namelen, &args); switch (error) { @@ -5788,14 +5605,8 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, return error; } -#ifdef CONFIG_DLM_DEPRECATED_API -int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, - int mode, uint32_t flags, uint32_t lkid, char *lvb_in, - unsigned long timeout_cs) -#else int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in) -#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5832,13 +5643,8 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua->bastaddr = ua_tmp->bastaddr; ua->user_lksb = ua_tmp->user_lksb; -#ifdef CONFIG_DLM_DEPRECATED_API - error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, - fake_astfn, ua, fake_bastfn, &args); -#else error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua, fake_bastfn, &args); -#endif if (error) goto out_put; @@ -5883,7 +5689,7 @@ int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, lkb = iter; list_del_init(&iter->lkb_ownqueue); - iter->lkb_flags &= ~DLM_IFL_ORPHAN; + clear_bit(DLM_DFL_ORPHAN_BIT, &iter->lkb_dflags); *lkid = iter->lkb_id; break; } @@ -6050,7 +5856,7 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid) error = validate_unlock_args(lkb, &args); if (error) goto out_r; - lkb->lkb_flags |= DLM_IFL_DEADLOCK_CANCEL; + set_bit(DLM_IFL_DEADLOCK_CANCEL_BIT, &lkb->lkb_iflags); error = _cancel_lock(r, lkb); out_r: @@ -6127,9 +5933,9 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls, list_del_init(&lkb->lkb_ownqueue); if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) - lkb->lkb_flags |= DLM_IFL_ORPHAN; + set_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags); else - lkb->lkb_flags |= DLM_IFL_DEAD; + set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags); out: spin_unlock(&ls->ls_clear_proc_locks); return lkb; @@ -6155,7 +5961,6 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) lkb = del_proc_lock(ls, proc); if (!lkb) break; - del_timeout(lkb); if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) orphan_proc_lock(ls, lkb); else @@ -6173,7 +5978,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) /* in-progress unlocks */ list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { list_del_init(&lkb->lkb_ownqueue); - lkb->lkb_flags |= DLM_IFL_DEAD; + set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags); dlm_put_lkb(lkb); } @@ -6204,7 +6009,7 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) if (!lkb) break; - lkb->lkb_flags |= DLM_IFL_DEAD; + set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags); unlock_proc_lock(ls, lkb); dlm_put_lkb(lkb); /* ref from proc->locks list */ } @@ -6212,7 +6017,7 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc) spin_lock(&proc->locks_spin); list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) { list_del_init(&lkb->lkb_ownqueue); - lkb->lkb_flags |= DLM_IFL_DEAD; + set_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags); dlm_put_lkb(lkb); } spin_unlock(&proc->locks_spin); @@ -6279,7 +6084,7 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, /* debug functionality */ int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, - int lkb_nodeid, unsigned int lkb_flags, int lkb_status) + int lkb_nodeid, unsigned int lkb_dflags, int lkb_status) { struct dlm_lksb *lksb; struct dlm_lkb *lkb; @@ -6287,7 +6092,7 @@ int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, int error; /* we currently can't set a valid user lock */ - if (lkb_flags & DLM_IFL_USER) + if (lkb_dflags & BIT(DLM_DFL_USER_BIT)) return -EOPNOTSUPP; lksb = kzalloc(sizeof(*lksb), GFP_NOFS); @@ -6300,11 +6105,11 @@ int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len, return error; } - lkb->lkb_flags = lkb_flags; + dlm_set_dflags_val(lkb, lkb_dflags); lkb->lkb_nodeid = lkb_nodeid; lkb->lkb_lksb = lksb; /* user specific pointer, just don't have it NULL for kernel locks */ - if (~lkb_flags & DLM_IFL_USER) + if (~lkb_dflags & BIT(DLM_DFL_USER_BIT)) lkb->lkb_astparam = (void *)0xDEADBEEF; error = find_rsb(ls, name, len, 0, R_REQUEST, &r); diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 40c76b5544da..aa5ad44d902b 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -25,14 +25,6 @@ void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); -#ifdef CONFIG_DLM_DEPRECATED_API -void dlm_scan_timeout(struct dlm_ls *ls); -void dlm_adjust_timeouts(struct dlm_ls *ls); -#else -static inline void dlm_scan_timeout(struct dlm_ls *ls) { } -static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { } -#endif - int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, unsigned int flags, int *r_nodeid, int *result); @@ -47,19 +39,10 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls); int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); -#ifdef CONFIG_DLM_DEPRECATED_API -int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, - uint32_t flags, void *name, unsigned int namelen, - unsigned long timeout_cs); -int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, - int mode, uint32_t flags, uint32_t lkid, char *lvb_in, - unsigned long timeout_cs); -#else int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen); int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in); -#endif int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, uint32_t *lkid); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 7325acbd1af7..67261b7b1f0e 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -273,7 +273,6 @@ static int dlm_scand(void *data) if (dlm_lock_recovery_try(ls)) { ls->ls_scan_time = jiffies; dlm_scan_rsbs(ls); - dlm_scan_timeout(ls); dlm_unlock_recovery(ls); } else { ls->ls_scan_time += HZ; @@ -488,28 +487,10 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_ops_arg = ops_arg; } -#ifdef CONFIG_DLM_DEPRECATED_API - if (flags & DLM_LSFL_TIMEWARN) { - pr_warn_once("===============================================================\n" - "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n" - " will be removed in v6.2!\n" - " Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n" - "===============================================================\n"); - - set_bit(LSFL_TIMEWARN, &ls->ls_flags); - } - - /* ls_exflags are forced to match among nodes, and we don't - * need to require all nodes to have some flags set - */ - ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | - DLM_LSFL_NEWEXCL)); -#else /* ls_exflags are forced to match among nodes, and we don't * need to require all nodes to have some flags set */ ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); -#endif size = READ_ONCE(dlm_config.ci_rsbtbl_size); ls->ls_rsbtbl_size = size; @@ -537,10 +518,6 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_waiters_mutex); INIT_LIST_HEAD(&ls->ls_orphans); mutex_init(&ls->ls_orphans_mutex); -#ifdef CONFIG_DLM_DEPRECATED_API - INIT_LIST_HEAD(&ls->ls_timeout); - mutex_init(&ls->ls_timeout_mutex); -#endif INIT_LIST_HEAD(&ls->ls_new_rsb); spin_lock_init(&ls->ls_new_rsb_spin); @@ -552,8 +529,8 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_total_weight = 0; ls->ls_node_array = NULL; - memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb)); - ls->ls_stub_rsb.res_ls = ls; + memset(&ls->ls_local_rsb, 0, sizeof(struct dlm_rsb)); + ls->ls_local_rsb.res_ls = ls; ls->ls_debug_rsb_dentry = NULL; ls->ls_debug_waiters_dentry = NULL; @@ -764,7 +741,7 @@ static int lkb_idr_free(int id, void *p, void *data) { struct dlm_lkb *lkb = p; - if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) + if (lkb->lkb_lvbptr && test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) dlm_free_lvb(lkb->lkb_lvbptr); dlm_free_lkb(lkb); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index bd786b3be5ec..3d3802c47b8b 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1718,8 +1718,8 @@ static void work_stop(void) static int work_start(void) { - io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM, - 0); + io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM | + WQ_UNBOUND, 0); if (!io_workqueue) { log_print("can't start dlm_io"); return -ENOMEM; @@ -1815,7 +1815,7 @@ static int dlm_listen_for_all(void) sock->sk->sk_data_ready = lowcomms_listen_data_ready; release_sock(sock->sk); - result = sock->ops->listen(sock, 5); + result = sock->ops->listen(sock, 128); if (result < 0) { dlm_close_sock(&listen_con.sock); return result; diff --git a/fs/dlm/main.c b/fs/dlm/main.c index a77338be3237..93755f83a30d 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -46,20 +46,14 @@ static int __init init_dlm(void) if (error) goto out_debug; - error = dlm_netlink_init(); - if (error) - goto out_user; - error = dlm_plock_init(); if (error) - goto out_netlink; + goto out_user; printk("DLM installed\n"); return 0; - out_netlink: - dlm_netlink_exit(); out_user: dlm_user_exit(); out_debug: @@ -77,7 +71,6 @@ static int __init init_dlm(void) static void __exit exit_dlm(void) { dlm_plock_exit(); - dlm_netlink_exit(); dlm_user_exit(); dlm_config_exit(); dlm_memory_exit(); diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index cdbaa452fc05..64f212a066cf 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -118,7 +118,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) void dlm_free_lkb(struct dlm_lkb *lkb) { - if (lkb->lkb_flags & DLM_IFL_USER) { + if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) { struct dlm_user_args *ua; ua = lkb->lkb_ua; if (ua) { diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c deleted file mode 100644 index 4de4b8651c6c..000000000000 --- a/fs/dlm/netlink.c +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2007 Red Hat, Inc. All rights reserved. - */ - -#include <net/genetlink.h> -#include <linux/dlm.h> -#include <linux/dlm_netlink.h> -#include <linux/gfp.h> - -#include "dlm_internal.h" - -static uint32_t dlm_nl_seqnum; -static uint32_t listener_nlportid; - -static struct genl_family family; - -static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) -{ - struct sk_buff *skb; - void *data; - - skb = genlmsg_new(size, GFP_NOFS); - if (!skb) - return -ENOMEM; - - /* add the message headers */ - data = genlmsg_put(skb, 0, dlm_nl_seqnum++, &family, 0, cmd); - if (!data) { - nlmsg_free(skb); - return -EINVAL; - } - - *skbp = skb; - return 0; -} - -static struct dlm_lock_data *mk_data(struct sk_buff *skb) -{ - struct nlattr *ret; - - ret = nla_reserve(skb, DLM_TYPE_LOCK, sizeof(struct dlm_lock_data)); - if (!ret) - return NULL; - return nla_data(ret); -} - -static int send_data(struct sk_buff *skb) -{ - struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); - void *data = genlmsg_data(genlhdr); - - genlmsg_end(skb, data); - - return genlmsg_unicast(&init_net, skb, listener_nlportid); -} - -static int user_cmd(struct sk_buff *skb, struct genl_info *info) -{ - listener_nlportid = info->snd_portid; - printk("user_cmd nlpid %u\n", listener_nlportid); - return 0; -} - -static const struct genl_small_ops dlm_nl_ops[] = { - { - .cmd = DLM_CMD_HELLO, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = user_cmd, - }, -}; - -static struct genl_family family __ro_after_init = { - .name = DLM_GENL_NAME, - .version = DLM_GENL_VERSION, - .small_ops = dlm_nl_ops, - .n_small_ops = ARRAY_SIZE(dlm_nl_ops), - .resv_start_op = DLM_CMD_HELLO + 1, - .module = THIS_MODULE, -}; - -int __init dlm_netlink_init(void) -{ - return genl_register_family(&family); -} - -void dlm_netlink_exit(void) -{ - genl_unregister_family(&family); -} - -static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb) -{ - struct dlm_rsb *r = lkb->lkb_resource; - - memset(data, 0, sizeof(struct dlm_lock_data)); - - data->version = DLM_LOCK_DATA_VERSION; - data->nodeid = lkb->lkb_nodeid; - data->ownpid = lkb->lkb_ownpid; - data->id = lkb->lkb_id; - data->remid = lkb->lkb_remid; - data->status = lkb->lkb_status; - data->grmode = lkb->lkb_grmode; - data->rqmode = lkb->lkb_rqmode; - if (lkb->lkb_ua) - data->xid = lkb->lkb_ua->xid; - if (r) { - data->lockspace_id = r->res_ls->ls_global_id; - data->resource_namelen = r->res_length; - memcpy(data->resource_name, r->res_name, r->res_length); - } -} - -void dlm_timeout_warn(struct dlm_lkb *lkb) -{ - struct sk_buff *send_skb; - struct dlm_lock_data *data; - size_t size; - int rv; - - size = nla_total_size(sizeof(struct dlm_lock_data)) + - nla_total_size(0); /* why this? */ - - rv = prepare_data(DLM_CMD_TIMEOUT, &send_skb, size); - if (rv < 0) - return; - - data = mk_data(send_skb); - if (!data) { - nlmsg_free(send_skb); - return; - } - - fill_data(data, lkb); - - send_data(send_skb); -} - diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index b76d52e2f6bd..f4afdf892f78 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -415,7 +415,7 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, rl->rl_ownpid = cpu_to_le32(lkb->lkb_ownpid); rl->rl_lkid = cpu_to_le32(lkb->lkb_id); rl->rl_exflags = cpu_to_le32(lkb->lkb_exflags); - rl->rl_flags = cpu_to_le32(lkb->lkb_flags); + rl->rl_flags = cpu_to_le32(dlm_dflags_val(lkb)); rl->rl_lvbseq = cpu_to_le32(lkb->lkb_lvbseq); rl->rl_rqmode = lkb->lkb_rqmode; rl->rl_grmode = lkb->lkb_grmode; diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index ccff1791803f..29d71a5018d4 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -403,7 +403,7 @@ static void set_lock_master(struct list_head *queue, int nodeid) struct dlm_lkb *lkb; list_for_each_entry(lkb, queue, lkb_statequeue) { - if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) { + if (!test_bit(DLM_IFL_MSTCPY_BIT, &lkb->lkb_iflags)) { lkb->lkb_nodeid = nodeid; lkb->lkb_remid = 0; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index e15eb511b04b..19da816cfb09 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -214,8 +214,6 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_clear_members_gone(ls); - dlm_adjust_timeouts(ls); - dlm_callback_resume(ls); error = enable_locking(ls, rv->seq); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 35129505ddda..d9c09fc0aba1 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -183,7 +183,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, struct dlm_user_proc *proc; int rv; - if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) + if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) || + test_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags)) return; ls = lkb->lkb_resource->res_ls; @@ -195,7 +196,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, for cases where a completion ast is received for an operation that began before clear_proc_locks did its cancel/unlock. */ - if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) + if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) || + test_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags)) goto out; DLM_ASSERT(lkb->lkb_ua, dlm_print_lkb(lkb);); @@ -206,7 +208,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, goto out; if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status)) - lkb->lkb_flags |= DLM_IFL_ENDOFLIFE; + set_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags); spin_lock(&proc->asts_spin); @@ -229,7 +231,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, } spin_unlock(&proc->asts_spin); - if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) { + if (test_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags)) { /* N.B. spin_lock locks_spin, not asts_spin */ spin_lock(&proc->locks_spin); if (!list_empty(&lkb->lkb_ownqueue)) { @@ -259,14 +261,6 @@ static int device_user_lock(struct dlm_user_proc *proc, goto out; } -#ifdef CONFIG_DLM_DEPRECATED_API - if (params->timeout) - pr_warn_once("========================================================\n" - "WARNING: the lkb timeout feature is being deprecated and\n" - " will be removed in v6.2!\n" - "========================================================\n"); -#endif - ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; @@ -279,16 +273,9 @@ static int device_user_lock(struct dlm_user_proc *proc, ua->xid = params->xid; if (params->flags & DLM_LKF_CONVERT) { -#ifdef CONFIG_DLM_DEPRECATED_API - error = dlm_user_convert(ls, ua, - params->mode, params->flags, - params->lkid, params->lvb, - (unsigned long) params->timeout); -#else error = dlm_user_convert(ls, ua, params->mode, params->flags, params->lkid, params->lvb); -#endif } else if (params->flags & DLM_LKF_ORPHAN) { error = dlm_user_adopt_orphan(ls, ua, params->mode, params->flags, @@ -297,16 +284,9 @@ static int device_user_lock(struct dlm_user_proc *proc, if (!error) error = lkid; } else { -#ifdef CONFIG_DLM_DEPRECATED_API - error = dlm_user_request(ls, ua, - params->mode, params->flags, - params->name, params->namelen, - (unsigned long) params->timeout); -#else error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen); -#endif if (!error) error = ua->lksb.sb_lkid; } @@ -884,7 +864,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count, goto try_another; case DLM_DEQUEUE_CALLBACK_LAST: list_del_init(&lkb->lkb_cb_list); - lkb->lkb_flags &= ~DLM_IFL_CB_PENDING; + clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags); break; case DLM_DEQUEUE_CALLBACK_SUCCESS: break; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 144ace9e0dd9..83274915ba6d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1210,10 +1210,6 @@ static const struct xattr_handler ecryptfs_xattr_handler = { }; const struct xattr_handler *ecryptfs_xattr_handlers[] = { -#ifdef CONFIG_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &ecryptfs_xattr_handler, NULL }; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index c08c0f578bc6..6fe9a779fa91 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -27,11 +27,15 @@ void erofs_put_metabuf(struct erofs_buf *buf) buf->page = NULL; } -void *erofs_bread(struct erofs_buf *buf, struct inode *inode, - erofs_blk_t blkaddr, enum erofs_kmap_type type) +/* + * Derive the block size from inode->i_blkbits to make compatible with + * anonymous inode in fscache mode. + */ +void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, + enum erofs_kmap_type type) { - struct address_space *const mapping = inode->i_mapping; - erofs_off_t offset = blknr_to_addr(blkaddr); + struct inode *inode = buf->inode; + erofs_off_t offset = (erofs_off_t)blkaddr << inode->i_blkbits; pgoff_t index = offset >> PAGE_SHIFT; struct page *page = buf->page; struct folio *folio; @@ -41,7 +45,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, erofs_put_metabuf(buf); nofs_flag = memalloc_nofs_save(); - folio = read_cache_folio(mapping, index, NULL, NULL); + folio = read_cache_folio(inode->i_mapping, index, NULL, NULL); memalloc_nofs_restore(nofs_flag); if (IS_ERR(folio)) return folio; @@ -63,14 +67,19 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode, return buf->base + (offset & ~PAGE_MASK); } -void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_blk_t blkaddr, enum erofs_kmap_type type) +void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) { if (erofs_is_fscache_mode(sb)) - return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode, - blkaddr, type); + buf->inode = EROFS_SB(sb)->s_fscache->inode; + else + buf->inode = sb->s_bdev->bd_inode; +} - return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type); +void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, + erofs_blk_t blkaddr, enum erofs_kmap_type type) +{ + erofs_init_metabuf(buf, sb); + return erofs_bread(buf, blkaddr, type); } static int erofs_map_blocks_flatmode(struct inode *inode, @@ -79,33 +88,32 @@ static int erofs_map_blocks_flatmode(struct inode *inode, erofs_blk_t nblocks, lastblk; u64 offset = map->m_la; struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); - nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + nblocks = erofs_iblks(inode); lastblk = nblocks - tailendpacking; /* there is no hole in flatmode */ map->m_flags = EROFS_MAP_MAPPED; - if (offset < blknr_to_addr(lastblk)) { - map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; - map->m_plen = blknr_to_addr(lastblk) - offset; + if (offset < erofs_pos(sb, lastblk)) { + map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; + map->m_plen = erofs_pos(sb, lastblk) - offset; } else if (tailendpacking) { map->m_pa = erofs_iloc(inode) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(offset); + vi->xattr_isize + erofs_blkoff(sb, offset); map->m_plen = inode->i_size - offset; /* inline data should be located in the same meta block */ - if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) { - erofs_err(inode->i_sb, - "inline data cross block boundary @ nid %llu", + if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { + erofs_err(sb, "inline data cross block boundary @ nid %llu", vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } map->m_flags |= EROFS_MAP_META; } else { - erofs_err(inode->i_sb, - "internal error @ nid: %llu (size %llu), m_la 0x%llx", + erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx", vi->nid, inode->i_size, map->m_la); DBG_BUGON(1); return -EIO; @@ -148,29 +156,29 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out; } map->m_la = chunknr << vi->chunkbits; map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, - roundup(inode->i_size - map->m_la, EROFS_BLKSIZ)); + round_up(inode->i_size - map->m_la, sb->s_blocksize)); /* handle block map */ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr = kaddr + erofs_blkoff(pos); + __le32 *blkaddr = kaddr + erofs_blkoff(sb, pos); if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { map->m_flags = 0; } else { - map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr)); + map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr)); map->m_flags = EROFS_MAP_MAPPED; } goto out_unlock; } /* parse chunk indexes */ - idx = kaddr + erofs_blkoff(pos); + idx = kaddr + erofs_blkoff(sb, pos); switch (le32_to_cpu(idx->blkaddr)) { case EROFS_NULL_ADDR: map->m_flags = 0; @@ -178,7 +186,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) default: map->m_deviceid = le16_to_cpu(idx->device_id) & EROFS_SB(sb)->device_id_mask; - map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); + map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr)); map->m_flags = EROFS_MAP_MAPPED; break; } @@ -197,7 +205,6 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) struct erofs_device_info *dif; int id; - /* primary device by default */ map->m_bdev = sb->s_bdev; map->m_daxdev = EROFS_SB(sb)->dax_dev; map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; @@ -210,20 +217,25 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return -ENODEV; } + if (devs->flatdev) { + map->m_pa += erofs_pos(sb, dif->mapped_blkaddr); + up_read(&devs->rwsem); + return 0; + } map->m_bdev = dif->bdev; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; up_read(&devs->rwsem); - } else if (devs->extra_devices) { + } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { erofs_off_t startoff, length; if (!dif->mapped_blkaddr) continue; - startoff = blknr_to_addr(dif->mapped_blkaddr); - length = blknr_to_addr(dif->blocks); + startoff = erofs_pos(sb, dif->mapped_blkaddr); + length = erofs_pos(sb, dif->blocks); if (map->m_pa >= startoff && map->m_pa < startoff + length) { @@ -244,6 +256,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; + struct super_block *sb = inode->i_sb; struct erofs_map_blocks map; struct erofs_map_dev mdev; @@ -258,7 +271,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, .m_deviceid = map.m_deviceid, .m_pa = map.m_pa, }; - ret = erofs_map_dev(inode->i_sb, &mdev); + ret = erofs_map_dev(sb, &mdev); if (ret) return ret; @@ -284,11 +297,11 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ptr = erofs_read_metabuf(&buf, inode->i_sb, - erofs_blknr(mdev.m_pa), EROFS_KMAP); + ptr = erofs_read_metabuf(&buf, sb, + erofs_blknr(sb, mdev.m_pa), EROFS_KMAP); if (IS_ERR(ptr)) return PTR_ERR(ptr); - iomap->inline_data = ptr + erofs_blkoff(mdev.m_pa); + iomap->inline_data = ptr + erofs_blkoff(sb, mdev.m_pa); iomap->private = buf.base; } else { iomap->type = IOMAP_MAPPED; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 51b7ac7166d9..7021e2cf6146 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -42,7 +42,7 @@ int z_erofs_load_lz4_config(struct super_block *sb, if (!sbi->lz4.max_pclusterblks) { sbi->lz4.max_pclusterblks = 1; /* reserved case */ } else if (sbi->lz4.max_pclusterblks > - Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) { + erofs_blknr(sb, Z_EROFS_PCLUSTER_MAX_SIZE)) { erofs_err(sb, "too large lz4 pclusterblks %u", sbi->lz4.max_pclusterblks); return -EINVAL; @@ -221,13 +221,13 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, support_0padding = true; ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, min_t(unsigned int, rq->inputsize, - EROFS_BLKSIZ - rq->pageofs_in)); + rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { kunmap_atomic(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & - (EROFS_BLKSIZ - 1)); + (rq->sb->s_blocksize - 1)); } inputmargin = rq->pageofs_in; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index d38e19c11270..73091fbe3ea4 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -166,8 +166,8 @@ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, /* 1. get the exact LZMA compressed size */ kin = kmap(*rq->in); err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, - min_t(unsigned int, rq->inputsize, - EROFS_BLKSIZ - rq->pageofs_in)); + min_t(unsigned int, rq->inputsize, + rq->sb->s_blocksize - rq->pageofs_in)); if (err) { kunmap(*rq->in); return err; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 6970b09b8307..b80abec0531a 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -50,44 +50,43 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) { struct inode *dir = file_inode(f); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct super_block *sb = dir->i_sb; + unsigned long bsz = sb->s_blocksize; const size_t dirsize = i_size_read(dir); - unsigned int i = ctx->pos / EROFS_BLKSIZ; - unsigned int ofs = ctx->pos % EROFS_BLKSIZ; + unsigned int i = erofs_blknr(sb, ctx->pos); + unsigned int ofs = erofs_blkoff(sb, ctx->pos); int err = 0; bool initial = true; + buf.inode = dir; while (ctx->pos < dirsize) { struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, dir, i, EROFS_KMAP); + de = erofs_bread(&buf, i, EROFS_KMAP); if (IS_ERR(de)) { - erofs_err(dir->i_sb, - "fail to readdir of logical block %u of nid %llu", + erofs_err(sb, "fail to readdir of logical block %u of nid %llu", i, EROFS_I(dir)->nid); err = PTR_ERR(de); break; } nameoff = le16_to_cpu(de->nameoff); - if (nameoff < sizeof(struct erofs_dirent) || - nameoff >= EROFS_BLKSIZ) { - erofs_err(dir->i_sb, - "invalid de[0].nameoff %u @ nid %llu", + if (nameoff < sizeof(struct erofs_dirent) || nameoff >= bsz) { + erofs_err(sb, "invalid de[0].nameoff %u @ nid %llu", nameoff, EROFS_I(dir)->nid); err = -EFSCORRUPTED; break; } - maxsize = min_t(unsigned int, - dirsize - ctx->pos + ofs, EROFS_BLKSIZ); + maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz); /* search dirents at the arbitrary position */ if (initial) { initial = false; ofs = roundup(ofs, sizeof(struct erofs_dirent)); - ctx->pos = blknr_to_addr(i) + ofs; + ctx->pos = erofs_pos(sb, i) + ofs; if (ofs >= nameoff) goto skip_this; } @@ -97,7 +96,7 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) if (err) break; skip_this: - ctx->pos = blknr_to_addr(i) + maxsize; + ctx->pos = erofs_pos(sb, i) + maxsize; ++i; ofs = 0; } diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index dbcd24371002..2c7b16e340fe 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -27,6 +27,7 @@ #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010 #define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 +#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ @@ -36,7 +37,8 @@ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ - EROFS_FEATURE_INCOMPAT_DEDUPE) + EROFS_FEATURE_INCOMPAT_DEDUPE | \ + EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -53,7 +55,7 @@ struct erofs_super_block { __le32 magic; /* file system magic number */ __le32 checksum; /* crc32c(super_block) */ __le32 feature_compat; - __u8 blkszbits; /* support block_size == PAGE_SIZE only */ + __u8 blkszbits; /* filesystem block size in bit shift */ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ __le16 root_nid; /* nid of root directory */ @@ -75,49 +77,46 @@ struct erofs_super_block { } __packed u1; __le16 extra_devices; /* # of devices besides the primary device */ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ - __u8 reserved[6]; + __u8 dirblkbits; /* directory block size in bit shift */ + __u8 xattr_prefix_count; /* # of long xattr name prefixes */ + __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ __u8 reserved2[24]; }; /* - * erofs inode datalayout (i_format in on-disk inode): + * EROFS inode datalayout (i_format in on-disk inode): * 0 - uncompressed flat inode without tail-packing inline data: - * inode, [xattrs], ... | ... | no-holed data * 1 - compressed inode with non-compact indexes: - * inode, [xattrs], [map_header], extents ... | ... * 2 - uncompressed flat inode with tail-packing inline data: - * inode, [xattrs], tailpacking data, ... | ... | no-holed data * 3 - compressed inode with compact indexes: - * inode, [xattrs], map_header, extents ... | ... * 4 - chunk-based inode with (optional) multi-device support: - * inode, [xattrs], chunk indexes ... | ... * 5~7 - reserved */ enum { EROFS_INODE_FLAT_PLAIN = 0, - EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, + EROFS_INODE_COMPRESSED_FULL = 1, EROFS_INODE_FLAT_INLINE = 2, - EROFS_INODE_FLAT_COMPRESSION = 3, + EROFS_INODE_COMPRESSED_COMPACT = 3, EROFS_INODE_CHUNK_BASED = 4, EROFS_INODE_DATALAYOUT_MAX }; static inline bool erofs_inode_is_data_compressed(unsigned int datamode) { - return datamode == EROFS_INODE_FLAT_COMPRESSION || - datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY; + return datamode == EROFS_INODE_COMPRESSED_COMPACT || + datamode == EROFS_INODE_COMPRESSED_FULL; } /* bit definitions of inode i_format */ -#define EROFS_I_VERSION_BITS 1 -#define EROFS_I_DATALAYOUT_BITS 3 +#define EROFS_I_VERSION_MASK 0x01 +#define EROFS_I_DATALAYOUT_MASK 0x07 #define EROFS_I_VERSION_BIT 0 #define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_ALL_BIT 4 -#define EROFS_I_ALL \ - ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1) +#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1) /* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ #define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F @@ -127,11 +126,30 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_CHUNK_FORMAT_ALL \ (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) +/* 32-byte on-disk inode */ +#define EROFS_INODE_LAYOUT_COMPACT 0 +/* 64-byte on-disk inode */ +#define EROFS_INODE_LAYOUT_EXTENDED 1 + struct erofs_inode_chunk_info { __le16 format; /* chunk blkbits, etc. */ __le16 reserved; }; +union erofs_inode_i_u { + /* total compressed blocks for compressed inodes */ + __le32 compressed_blocks; + + /* block address for uncompressed flat inodes */ + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; +}; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ @@ -142,29 +160,14 @@ struct erofs_inode_compact { __le16 i_nlink; __le32 i_size; __le32 i_reserved; - union { - /* total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ - struct erofs_inode_chunk_info c; - } i_u; - __le32 i_ino; /* only used for 32-bit stat compatibility */ + union erofs_inode_i_u i_u; + + __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; __le16 i_gid; __le32 i_reserved2; }; -/* 32-byte on-disk inode */ -#define EROFS_INODE_LAYOUT_COMPACT 0 -/* 64-byte on-disk inode */ -#define EROFS_INODE_LAYOUT_EXTENDED 1 - /* 64-byte complete form of an ondisk inode */ struct erofs_inode_extended { __le16 i_format; /* inode format hints */ @@ -174,22 +177,9 @@ struct erofs_inode_extended { __le16 i_mode; __le16 i_reserved; __le64 i_size; - union { - /* total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ - struct erofs_inode_chunk_info c; - } i_u; - - /* only used for 32-bit stat compatibility */ - __le32 i_ino; + union erofs_inode_i_u i_u; + __le32 i_ino; /* only used for 32-bit stat compatibility */ __le32 i_uid; __le32 i_gid; __le64 i_mtime; @@ -198,10 +188,6 @@ struct erofs_inode_extended { __u8 i_reserved2[16]; }; -#define EROFS_MAX_SHARED_XATTRS (128) -/* h_shared_count between 129 ... 255 are special # */ -#define EROFS_SHARED_XATTR_EXTENT (255) - /* * inline xattrs (n == i_xattr_icount): * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes @@ -228,6 +214,13 @@ struct erofs_xattr_ibody_header { #define EROFS_XATTR_INDEX_LUSTRE 5 #define EROFS_XATTR_INDEX_SECURITY 6 +/* + * bit 7 of e_name_index is set when it refers to a long xattr name prefix, + * while the remained lower bits represent the index of the prefix. + */ +#define EROFS_XATTR_LONG_PREFIX 0x80 +#define EROFS_XATTR_LONG_PREFIX_MASK 0x7f + /* xattr entry (for both inline & shared xattrs) */ struct erofs_xattr_entry { __u8 e_name_len; /* length of name */ @@ -237,6 +230,12 @@ struct erofs_xattr_entry { char e_name[]; /* attribute name */ }; +/* long xattr name prefix */ +struct erofs_xattr_long_prefix { + __u8 base_index; /* short xattr name prefix index */ + char infix[]; /* infix apart from short prefix */ +}; + static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) { if (!i_xattr_icount) @@ -267,6 +266,22 @@ struct erofs_inode_chunk_index { __le32 blkaddr; /* start block address of this inode chunk */ }; +/* dirent sorts in alphabet order, thus we can do binary search */ +struct erofs_dirent { + __le64 nid; /* node number */ + __le16 nameoff; /* start offset of file name */ + __u8 file_type; /* file type */ + __u8 reserved; /* reserved */ +} __packed; + +/* + * EROFS file types should match generic FT_* types and + * it seems no need to add BUILD_BUG_ONs since potential + * unmatchness will break other fses as well... + */ + +#define EROFS_NAME_LEN 255 + /* maximum supported size of a physical compression cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) @@ -336,10 +351,8 @@ struct z_erofs_map_header { __u8 h_clusterbits; }; -#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 - /* - * Fixed-sized output compression on-disk logical cluster type: + * On-disk logical cluster type: * 0 - literal (uncompressed) lcluster * 1,3 - compressed lcluster (for HEAD lclusters) * 2 - compressed lcluster (for NONHEAD lclusters) @@ -363,27 +376,27 @@ struct z_erofs_map_header { * di_u.delta[1] = distance to the next HEAD lcluster */ enum { - Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1, - Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3, - Z_EROFS_VLE_CLUSTER_TYPE_MAX + Z_EROFS_LCLUSTER_TYPE_PLAIN = 0, + Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1, + Z_EROFS_LCLUSTER_TYPE_NONHEAD = 2, + Z_EROFS_LCLUSTER_TYPE_HEAD2 = 3, + Z_EROFS_LCLUSTER_TYPE_MAX }; -#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 -#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 +#define Z_EROFS_LI_LCLUSTER_TYPE_BITS 2 +#define Z_EROFS_LI_LCLUSTER_TYPE_BIT 0 /* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ -#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15) +#define Z_EROFS_LI_PARTIAL_REF (1 << 15) /* * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the * compressed block count of a compressed extent (in logical clusters, aka. * block count of a pcluster). */ -#define Z_EROFS_VLE_DI_D0_CBLKCNT (1 << 11) +#define Z_EROFS_LI_D0_CBLKCNT (1 << 11) -struct z_erofs_vle_decompressed_index { +struct z_erofs_lcluster_index { __le16 di_advise; /* where to decompress in the head lcluster */ __le16 di_clusterofs; @@ -400,25 +413,8 @@ struct z_erofs_vle_decompressed_index { } di_u; }; -#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \ - (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \ - sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING) - -/* dirent sorts in alphabet order, thus we can do binary search */ -struct erofs_dirent { - __le64 nid; /* node number */ - __le16 nameoff; /* start offset of file name */ - __u8 file_type; /* file type */ - __u8 reserved; /* reserved */ -} __packed; - -/* - * EROFS file types should match generic FT_* types and - * it seems no need to add BUILD_BUG_ONs since potential - * unmatchness will break other fses as well... - */ - -#define EROFS_NAME_LEN 255 +#define Z_EROFS_FULL_INDEX_ALIGN(end) \ + (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8) /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) @@ -435,15 +431,15 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4); BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); - BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); + BUILD_BUG_ON(sizeof(struct z_erofs_lcluster_index) != 8); BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); /* keep in sync between 2 index structures for better extendibility */ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != - sizeof(struct z_erofs_vle_decompressed_index)); + sizeof(struct z_erofs_lcluster_index)); BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); - BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < - Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); + BUILD_BUG_ON(BIT(Z_EROFS_LI_LCLUSTER_TYPE_BITS) < + Z_EROFS_LCLUSTER_TYPE_MAX - 1); /* exclude old compiler versions like gcc 7.5.0 */ BUILD_BUG_ON(__builtin_constant_p(fmh) ? fmh != cpu_to_le64(1ULL << 63) : 0); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 96a87c023128..87ff35bff8d5 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -209,8 +209,8 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) void *src; /* For tail packing layout, the offset may be non-zero. */ - offset = erofs_blkoff(map.m_pa); - blknr = erofs_blknr(map.m_pa); + offset = erofs_blkoff(sb, map.m_pa); + blknr = erofs_blknr(sb, map.m_pa); size = map.m_llen; src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP); @@ -460,6 +460,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &erofs_fscache_meta_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_blkbits = EROFS_SB(sb)->blkszbits; inode->i_private = ctx; ctx->cookie = cookie; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 4be7dda3cd24..d70b12b81507 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -23,11 +23,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, unsigned int ifmt; int err; - blkaddr = erofs_blknr(inode_loc); - *ofs = erofs_blkoff(inode_loc); - - erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", - __func__, vi->nid, *ofs, blkaddr); + blkaddr = erofs_blknr(sb, inode_loc); + *ofs = erofs_blkoff(sb, inode_loc); kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP); if (IS_ERR(kaddr)) { @@ -58,11 +55,11 @@ static void *erofs_read_inode(struct erofs_buf *buf, case EROFS_INODE_LAYOUT_EXTENDED: vi->inode_isize = sizeof(struct erofs_inode_extended); /* check if the extended inode acrosses block boundary */ - if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) { + if (*ofs + vi->inode_isize <= sb->s_blocksize) { *ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; } else { - const unsigned int gotten = EROFS_BLKSIZ - *ofs; + const unsigned int gotten = sb->s_blocksize - *ofs; copied = kmalloc(vi->inode_isize, GFP_NOFS); if (!copied) { @@ -176,7 +173,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, err = -EOPNOTSUPP; goto err_out; } - vi->chunkbits = LOG_BLOCK_SIZE + + vi->chunkbits = sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; @@ -188,11 +185,12 @@ static void *erofs_read_inode(struct erofs_buf *buf, if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && vi->datalayout == EROFS_INODE_FLAT_PLAIN) inode->i_flags |= S_DAX; + if (!nblks) /* measure inode.i_blocks as generic filesystems */ - inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; + inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; else - inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; + inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); return kaddr; bogusimode: @@ -210,11 +208,12 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, unsigned int m_pofs) { struct erofs_inode *vi = EROFS_I(inode); + unsigned int bsz = i_blocksize(inode); char *lnk; /* if it cannot be handled with fast symlink scheme */ if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) { + inode->i_size >= bsz || inode->i_size < 0) { inode->i_op = &erofs_symlink_iops; return 0; } @@ -225,7 +224,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, m_pofs += vi->xattr_isize; /* inline symlink data shouldn't cross block boundary */ - if (m_pofs + inode->i_size > EROFS_BLKSIZ) { + if (m_pofs + inode->i_size > bsz) { kfree(lnk); erofs_err(inode->i_sb, "inline data cross block boundary @ nid %llu", @@ -289,10 +288,15 @@ static int erofs_fill_inode(struct inode *inode) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - if (!erofs_is_fscache_mode(inode->i_sb)) - err = z_erofs_fill_inode(inode); - else - err = -EOPNOTSUPP; +#ifdef CONFIG_EROFS_FS_ZIP + if (!erofs_is_fscache_mode(inode->i_sb) && + inode->i_sb->s_blocksize_bits == PAGE_SHIFT) { + inode->i_mapping->a_ops = &z_erofs_aops; + err = 0; + goto out_unlock; + } +#endif + err = -EOPNOTSUPP; goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1db018f8c2e8..af0431a40647 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -31,10 +31,8 @@ __printf(3, 4) void _erofs_info(struct super_block *sb, #define erofs_info(sb, fmt, ...) \ _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__) #ifdef CONFIG_EROFS_FS_DEBUG -#define erofs_dbg(x, ...) pr_debug(x "\n", ##__VA_ARGS__) #define DBG_BUGON BUG_ON #else -#define erofs_dbg(x, ...) ((void)0) #define DBG_BUGON(x) ((void)(x)) #endif /* !CONFIG_EROFS_FS_DEBUG */ @@ -81,6 +79,7 @@ struct erofs_dev_context { struct rw_semaphore rwsem; unsigned int extra_devices; + bool flatdev; }; struct erofs_fs_context { @@ -116,6 +115,11 @@ struct erofs_fscache { char *name; }; +struct erofs_xattr_prefix_item { + struct erofs_xattr_long_prefix *prefix; + u8 infix_len; +}; + struct erofs_sb_info { struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP @@ -133,8 +137,8 @@ struct erofs_sb_info { struct inode *managed_cache; struct erofs_sb_lz4_info lz4; - struct inode *packed_inode; #endif /* CONFIG_EROFS_FS_ZIP */ + struct inode *packed_inode; struct erofs_dev_context *devs; struct dax_device *dax_dev; u64 dax_part_off; @@ -144,11 +148,14 @@ struct erofs_sb_info { u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR u32 xattr_blkaddr; + u32 xattr_prefix_start; + u8 xattr_prefix_count; + struct erofs_xattr_prefix_item *xattr_prefixes; #endif u16 device_id_mask; /* valid bits of device id to be used */ - /* inode slot unit size in bit shift */ - unsigned char islotbits; + unsigned char islotbits; /* inode slot unit size in bit shift */ + unsigned char blkszbits; /* filesystem block size in bit shift */ u32 sb_size; /* total superblock size */ u32 build_time_nsec; @@ -156,6 +163,7 @@ struct erofs_sb_info { /* what we really care is nid, rather than ino.. */ erofs_nid_t root_nid; + erofs_nid_t packed_nid; /* used for statfs, f_files - f_favail */ u64 inos; @@ -240,27 +248,13 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) VAL != EROFS_LOCKED_MAGIC); } -/* we strictly follow PAGE_SIZE and no buffer head yet */ -#define LOG_BLOCK_SIZE PAGE_SHIFT - -#undef LOG_SECTORS_PER_BLOCK -#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) - -#undef SECTORS_PER_BLOCK -#define SECTORS_PER_BLOCK (1 << SECTORS_PER_BLOCK) - -#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE) - -#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ) -#error erofs cannot be used in this platform -#endif - enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ EROFS_KMAP, /* use kmap_local_page() to map the buffer */ }; struct erofs_buf { + struct inode *inode; struct page *page; void *base; enum erofs_kmap_type kmap_type; @@ -269,9 +263,10 @@ struct erofs_buf { #define ROOT_NID(sb) ((sb)->root_nid) -#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) -#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) -#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) +#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits) +#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) +#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) +#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) #define EROFS_FEATURE_FUNCS(name, compat, feature) \ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ @@ -288,6 +283,7 @@ EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2) EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) +EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -306,7 +302,7 @@ struct erofs_inode { unsigned char datalayout; unsigned char inode_isize; - unsigned short xattr_isize; + unsigned int xattr_isize; unsigned int xattr_shared_count; unsigned int *xattr_shared_xattrs; @@ -343,28 +339,18 @@ static inline erofs_off_t erofs_iloc(struct inode *inode) { struct erofs_sb_info *sbi = EROFS_I_SB(inode); - return blknr_to_addr(sbi->meta_blkaddr) + + return erofs_pos(inode->i_sb, sbi->meta_blkaddr) + (EROFS_I(inode)->nid << sbi->islotbits); } -static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit, - unsigned int bits) -{ - - return (value >> bit) & ((1 << bits) - 1); -} - - -static inline unsigned int erofs_inode_version(unsigned int value) +static inline unsigned int erofs_inode_version(unsigned int ifmt) { - return erofs_bitrange(value, EROFS_I_VERSION_BIT, - EROFS_I_VERSION_BITS); + return (ifmt >> EROFS_I_VERSION_BIT) & EROFS_I_VERSION_MASK; } -static inline unsigned int erofs_inode_datalayout(unsigned int value) +static inline unsigned int erofs_inode_datalayout(unsigned int ifmt) { - return erofs_bitrange(value, EROFS_I_DATALAYOUT_BIT, - EROFS_I_DATALAYOUT_BITS); + return (ifmt >> EROFS_I_DATALAYOUT_BIT) & EROFS_I_DATALAYOUT_MASK; } /* @@ -451,10 +437,13 @@ extern const struct iomap_ops z_erofs_iomap_report_ops; #define EROFS_REG_COOKIE_SHARE 0x0001 #define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 +void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, + erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, struct inode *inode, - erofs_blk_t blkaddr, enum erofs_kmap_type type); +void *erofs_bread(struct erofs_buf *buf, erofs_blk_t blkaddr, + enum erofs_kmap_type type); +void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, erofs_blk_t blkaddr, enum erofs_kmap_type type); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); @@ -521,7 +510,6 @@ int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); -int z_erofs_fill_inode(struct inode *inode); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags); #else @@ -541,7 +529,6 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } return 0; } -static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 966eabc61c13..d4f631d39f0f 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -89,7 +89,8 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, static void *erofs_find_target_block(struct erofs_buf *target, struct inode *dir, struct erofs_qstr *name, int *_ndirents) { - int head = 0, back = DIV_ROUND_UP(dir->i_size, EROFS_BLKSIZ) - 1; + unsigned int bsz = i_blocksize(dir); + int head = 0, back = erofs_iblks(dir) - 1; unsigned int startprfx = 0, endprfx = 0; void *candidate = ERR_PTR(-ENOENT); @@ -98,10 +99,10 @@ static void *erofs_find_target_block(struct erofs_buf *target, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_dirent *de; - de = erofs_bread(&buf, dir, mid, EROFS_KMAP); + buf.inode = dir; + de = erofs_bread(&buf, mid, EROFS_KMAP); if (!IS_ERR(de)) { - const int nameoff = nameoff_from_disk(de->nameoff, - EROFS_BLKSIZ); + const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); int diff; unsigned int matched; @@ -121,11 +122,10 @@ static void *erofs_find_target_block(struct erofs_buf *target, dname.name = (u8 *)de + nameoff; if (ndirents == 1) - dname.end = (u8 *)de + EROFS_BLKSIZ; + dname.end = (u8 *)de + bsz; else dname.end = (u8 *)de + - nameoff_from_disk(de[1].nameoff, - EROFS_BLKSIZ); + nameoff_from_disk(de[1].nameoff, bsz); /* string comparison without already matched prefix */ diff = erofs_dirnamecmp(name, &dname, &matched); @@ -171,6 +171,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, qn.name = name->name; qn.end = name->name + name->len; + buf.inode = dir; ndirents = 0; de = erofs_find_target_block(&buf, dir, &qn, &ndirents); @@ -178,7 +179,8 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, return PTR_ERR(de); if (ndirents) - de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents); + de = find_target_dirent(&qn, (u8 *)de, i_blocksize(dir), + ndirents); if (!IS_ERR(de)) { *nid = le64_to_cpu(de->nid); @@ -203,16 +205,13 @@ static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry, err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); - if (err == -ENOENT) { + if (err == -ENOENT) /* negative dentry */ inode = NULL; - } else if (err) { + else if (err) inode = ERR_PTR(err); - } else { - erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__, - dentry, nid, d_type); + else inode = erofs_iget(dir->i_sb, nid); - } return d_splice_alias(inode, dentry); } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 19b1ae79cec4..811ab66d805e 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -52,18 +52,21 @@ void _erofs_info(struct super_block *sb, const char *function, static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) { + size_t len = 1 << EROFS_SB(sb)->blkszbits; struct erofs_super_block *dsb; u32 expected_crc, crc; - dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, - EROFS_BLKSIZ - EROFS_SUPER_OFFSET, GFP_KERNEL); + if (len > EROFS_SUPER_OFFSET) + len -= EROFS_SUPER_OFFSET; + + dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, len, GFP_KERNEL); if (!dsb) return -ENOMEM; expected_crc = le32_to_cpu(dsb->checksum); dsb->checksum = 0; /* to allow for x86 boot sectors and other oddities. */ - crc = crc32c(~0, dsb, EROFS_BLKSIZ - EROFS_SUPER_OFFSET); + crc = crc32c(~0, dsb, len); kfree(dsb); if (crc != expected_crc) { @@ -123,20 +126,19 @@ static bool check_layout_compatibility(struct super_block *sb, return true; } -#ifdef CONFIG_EROFS_FS_ZIP /* read variable-sized metadata, offset will be aligned by 4-byte */ -static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, - erofs_off_t *offset, int *lengthp) +void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, + erofs_off_t *offset, int *lengthp) { u8 *buffer, *ptr; int len, i, cnt; *offset = round_up(*offset, 4); - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), EROFS_KMAP); + ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); if (IS_ERR(ptr)) return ptr; - len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]); + len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(sb, *offset)]); if (!len) len = U16_MAX + 1; buffer = kmalloc(len, GFP_KERNEL); @@ -146,19 +148,20 @@ static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, *lengthp = len; for (i = 0; i < len; i += cnt) { - cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i); - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), - EROFS_KMAP); + cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), + len - i); + ptr = erofs_bread(buf, erofs_blknr(sb, *offset), EROFS_KMAP); if (IS_ERR(ptr)) { kfree(buffer); return ptr; } - memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt); + memcpy(buffer + i, ptr + erofs_blkoff(sb, *offset), cnt); *offset += cnt; } return buffer; } +#ifdef CONFIG_EROFS_FS_ZIP static int erofs_load_compr_cfgs(struct super_block *sb, struct erofs_super_block *dsb) { @@ -175,6 +178,7 @@ static int erofs_load_compr_cfgs(struct super_block *sb, return -EINVAL; } + erofs_init_metabuf(&buf, sb); offset = EROFS_SUPER_OFFSET + sbi->sb_size; alg = 0; for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { @@ -228,10 +232,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct block_device *bdev; void *ptr; - ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP); + ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); if (IS_ERR(ptr)) return PTR_ERR(ptr); - dis = ptr + erofs_blkoff(*pos); + dis = ptr + erofs_blkoff(sb, *pos); if (!dif->path) { if (!dis->tag[0]) { @@ -248,7 +252,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(fscache)) return PTR_ERR(fscache); dif->fscache = fscache; - } else { + } else if (!sbi->devs->flatdev) { bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, sb->s_type); if (IS_ERR(bdev)) @@ -290,6 +294,9 @@ static int erofs_scan_devices(struct super_block *sb, if (!ondisk_extradevs) return 0; + if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb)) + sbi->devs->flatdev = true; + sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; down_read(&sbi->devs->rwsem); @@ -329,7 +336,6 @@ static int erofs_read_superblock(struct super_block *sb) struct erofs_sb_info *sbi; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; - unsigned int blkszbits; void *data; int ret; @@ -348,6 +354,16 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } + sbi->blkszbits = dsb->blkszbits; + if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { + erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); + goto out; + } + if (dsb->dirblkbits) { + erofs_err(sb, "dirblkbits %u isn't supported", dsb->dirblkbits); + goto out; + } + sbi->feature_compat = le32_to_cpu(dsb->feature_compat); if (erofs_sb_has_sb_chksum(sbi)) { ret = erofs_superblock_csum_verify(sb, data); @@ -356,19 +372,11 @@ static int erofs_read_superblock(struct super_block *sb) } ret = -EINVAL; - blkszbits = dsb->blkszbits; - /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ - if (blkszbits != LOG_BLOCK_SIZE) { - erofs_err(sb, "blkszbits %u isn't supported on this platform", - blkszbits); - goto out; - } - if (!check_layout_compatibility(sb, dsb)) goto out; sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; - if (sbi->sb_size > EROFS_BLKSIZ) { + if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { erofs_err(sb, "invalid sb_extslots %u (more than a fs block)", sbi->sb_size); goto out; @@ -377,20 +385,12 @@ static int erofs_read_superblock(struct super_block *sb) sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); + sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); + sbi->xattr_prefix_count = dsb->xattr_prefix_count; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); -#ifdef CONFIG_EROFS_FS_ZIP - sbi->packed_inode = NULL; - if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) { - sbi->packed_inode = - erofs_iget(sb, le64_to_cpu(dsb->packed_nid)); - if (IS_ERR(sbi->packed_inode)) { - ret = PTR_ERR(sbi->packed_inode); - goto out; - } - } -#endif + sbi->packed_nid = le64_to_cpu(dsb->packed_nid); sbi->inos = le64_to_cpu(dsb->inos); sbi->build_time = le64_to_cpu(dsb->build_time); @@ -417,8 +417,6 @@ static int erofs_read_superblock(struct super_block *sb) /* handle multiple devices */ ret = erofs_scan_devices(sb, dsb); - if (erofs_sb_has_ztailpacking(sbi)) - erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); if (erofs_sb_has_fragments(sbi)) @@ -733,9 +731,10 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sbi->domain_id = ctx->domain_id; ctx->domain_id = NULL; + sbi->blkszbits = PAGE_SHIFT; if (erofs_is_fscache_mode(sb)) { - sb->s_blocksize = EROFS_BLKSIZ; - sb->s_blocksize_bits = LOG_BLOCK_SIZE; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; err = erofs_fscache_register_fs(sb); if (err) @@ -745,8 +744,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; } else { - if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { - erofs_err(sb, "failed to set erofs blksize"); + if (!sb_set_blocksize(sb, PAGE_SIZE)) { + errorfc(fc, "failed to set initial blksize"); return -EINVAL; } @@ -759,12 +758,24 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - if (test_opt(&sbi->opt, DAX_ALWAYS)) { - BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE); + if (sb->s_blocksize_bits != sbi->blkszbits) { + if (erofs_is_fscache_mode(sb)) { + errorfc(fc, "unsupported blksize for fscache mode"); + return -EINVAL; + } + if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) { + errorfc(fc, "failed to set erofs blksize"); + return -EINVAL; + } + } + if (test_opt(&sbi->opt, DAX_ALWAYS)) { if (!sbi->dax_dev) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); clear_opt(&sbi->opt, DAX_ALWAYS); + } else if (sbi->blkszbits != PAGE_SHIFT) { + errorfc(fc, "unsupported blocksize for DAX"); + clear_opt(&sbi->opt, DAX_ALWAYS); } } @@ -799,10 +810,22 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) erofs_shrinker_register(sb); /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ + if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { + sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); + if (IS_ERR(sbi->packed_inode)) { + err = PTR_ERR(sbi->packed_inode); + sbi->packed_inode = NULL; + return err; + } + } err = erofs_init_managed_cache(sb); if (err) return err; + err = erofs_xattr_prefixes_init(sb); + if (err) + return err; + err = erofs_register_sysfs(sb); if (err) return err; @@ -962,12 +985,13 @@ static void erofs_put_super(struct super_block *sb) erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); + erofs_xattr_prefixes_cleanup(sb); #ifdef CONFIG_EROFS_FS_ZIP iput(sbi->managed_cache); sbi->managed_cache = NULL; +#endif iput(sbi->packed_inode); sbi->packed_inode = NULL; -#endif erofs_free_dev_context(sbi->devs); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); @@ -1060,7 +1084,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; - buf->f_bsize = EROFS_BLKSIZ; + buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 60729b1220b6..cd80499351e0 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -7,6 +7,19 @@ #include <linux/security.h> #include "xattr.h" +static inline erofs_blk_t erofs_xattr_blkaddr(struct super_block *sb, + unsigned int xattr_id) +{ + return EROFS_SB(sb)->xattr_blkaddr + + erofs_blknr(sb, xattr_id * sizeof(__u32)); +} + +static inline unsigned int erofs_xattr_blkoff(struct super_block *sb, + unsigned int xattr_id) +{ + return erofs_blkoff(sb, xattr_id * sizeof(__u32)); +} + struct xattr_iter { struct super_block *sb; struct erofs_buf buf; @@ -16,7 +29,7 @@ struct xattr_iter { unsigned int ofs; }; -static int init_inode_xattrs(struct inode *inode) +static int erofs_init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct xattr_iter it; @@ -68,8 +81,8 @@ static int init_inode_xattrs(struct inode *inode) } it.buf = __EROFS_BUF_INITIALIZER; - it.blkaddr = erofs_blknr(erofs_iloc(inode) + vi->inode_isize); - it.ofs = erofs_blkoff(erofs_iloc(inode) + vi->inode_isize); + it.blkaddr = erofs_blknr(sb, erofs_iloc(inode) + vi->inode_isize); + it.ofs = erofs_blkoff(sb, erofs_iloc(inode) + vi->inode_isize); /* read in shared xattr array (non-atomic, see kmalloc below) */ it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); @@ -92,9 +105,9 @@ static int init_inode_xattrs(struct inode *inode) it.ofs += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - if (it.ofs >= EROFS_BLKSIZ) { + if (it.ofs >= sb->s_blocksize) { /* cannot be unaligned */ - DBG_BUGON(it.ofs != EROFS_BLKSIZ); + DBG_BUGON(it.ofs != sb->s_blocksize); it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr, EROFS_KMAP); @@ -139,15 +152,15 @@ struct xattr_iter_handlers { static inline int xattr_iter_fixup(struct xattr_iter *it) { - if (it->ofs < EROFS_BLKSIZ) + if (it->ofs < it->sb->s_blocksize) return 0; - it->blkaddr += erofs_blknr(it->ofs); + it->blkaddr += erofs_blknr(it->sb, it->ofs); it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, EROFS_KMAP); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); - it->ofs = erofs_blkoff(it->ofs); + it->ofs = erofs_blkoff(it->sb, it->ofs); return 0; } @@ -157,7 +170,8 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, struct erofs_inode *const vi = EROFS_I(inode); unsigned int xattr_header_sz, inline_xattr_ofs; - xattr_header_sz = inlinexattr_header_size(inode); + xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * vi->xattr_shared_count; if (xattr_header_sz >= vi->xattr_isize) { DBG_BUGON(xattr_header_sz > vi->xattr_isize); return -ENOATTR; @@ -165,8 +179,8 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, inline_xattr_ofs = vi->inode_isize + xattr_header_sz; - it->blkaddr = erofs_blknr(erofs_iloc(inode) + inline_xattr_ofs); - it->ofs = erofs_blkoff(erofs_iloc(inode) + inline_xattr_ofs); + it->blkaddr = erofs_blknr(it->sb, erofs_iloc(inode) + inline_xattr_ofs); + it->ofs = erofs_blkoff(it->sb, erofs_iloc(inode) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, EROFS_KMAP); if (IS_ERR(it->kaddr)) @@ -222,8 +236,8 @@ static int xattr_foreach(struct xattr_iter *it, processed = 0; while (processed < entry.e_name_len) { - if (it->ofs >= EROFS_BLKSIZ) { - DBG_BUGON(it->ofs > EROFS_BLKSIZ); + if (it->ofs >= it->sb->s_blocksize) { + DBG_BUGON(it->ofs > it->sb->s_blocksize); err = xattr_iter_fixup(it); if (err) @@ -231,7 +245,7 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, + slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs, entry.e_name_len - processed); /* handle name */ @@ -257,8 +271,8 @@ static int xattr_foreach(struct xattr_iter *it, } while (processed < value_sz) { - if (it->ofs >= EROFS_BLKSIZ) { - DBG_BUGON(it->ofs > EROFS_BLKSIZ); + if (it->ofs >= it->sb->s_blocksize) { + DBG_BUGON(it->ofs > it->sb->s_blocksize); err = xattr_iter_fixup(it); if (err) @@ -266,7 +280,7 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs, + slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs, value_sz - processed); op->value(it, processed, it->kaddr + it->ofs, slice); it->ofs += slice; @@ -283,17 +297,45 @@ struct getxattr_iter { struct xattr_iter it; char *buffer; - int buffer_size, index; + int buffer_size, index, infix_len; struct qstr name; }; +static int erofs_xattr_long_entrymatch(struct getxattr_iter *it, + struct erofs_xattr_entry *entry) +{ + struct erofs_sb_info *sbi = EROFS_SB(it->it.sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return -ENOATTR; + + if (it->index != pf->prefix->base_index || + it->name.len != entry->e_name_len + pf->infix_len) + return -ENOATTR; + + if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) + return -ENOATTR; + + it->infix_len = pf->infix_len; + return 0; +} + static int xattr_entrymatch(struct xattr_iter *_it, struct erofs_xattr_entry *entry) { struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - return (it->index != entry->e_name_index || - it->name.len != entry->e_name_len) ? -ENOATTR : 0; + /* should also match the infix for long name prefixes */ + if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) + return erofs_xattr_long_entrymatch(it, entry); + + if (it->index != entry->e_name_index || + it->name.len != entry->e_name_len) + return -ENOATTR; + it->infix_len = 0; + return 0; } static int xattr_namematch(struct xattr_iter *_it, @@ -301,7 +343,9 @@ static int xattr_namematch(struct xattr_iter *_it, { struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0; + if (memcmp(buf, it->name.name + it->infix_len + processed, len)) + return -ENOATTR; + return 0; } static int xattr_checkbuffer(struct xattr_iter *_it, @@ -351,21 +395,18 @@ static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) { struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = inode->i_sb; - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int i; + struct super_block *const sb = it->it.sb; + unsigned int i, xsid; int ret = -ENOATTR; for (i = 0; i < vi->xattr_shared_count; ++i) { - erofs_blk_t blkaddr = - xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); - - it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP); + xsid = vi->xattr_shared_xattrs[i]; + it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid); + it->it.ofs = erofs_xattr_blkoff(sb, xsid); + it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, + it->it.blkaddr, EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); - it->it.blkaddr = blkaddr; ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); if (ret != -ENOATTR) @@ -394,7 +435,7 @@ int erofs_getxattr(struct inode *inode, int index, if (!name) return -EINVAL; - ret = init_inode_xattrs(inode); + ret = erofs_init_inode_xattrs(inode); if (ret) return ret; @@ -421,20 +462,9 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - - switch (handler->flags) { - case EROFS_XATTR_INDEX_USER: - if (!test_opt(&sbi->opt, XATTR_USER)) - return -EOPNOTSUPP; - break; - case EROFS_XATTR_INDEX_TRUSTED: - break; - case EROFS_XATTR_INDEX_SECURITY: - break; - default: - return -EINVAL; - } + if (handler->flags == EROFS_XATTR_INDEX_USER && + !test_opt(&EROFS_I_SB(inode)->opt, XATTR_USER)) + return -EOPNOTSUPP; return erofs_getxattr(inode, handler->flags, name, buffer, size); } @@ -463,10 +493,6 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { const struct xattr_handler *erofs_xattr_handlers[] = { &erofs_xattr_user_handler, -#ifdef CONFIG_EROFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY &erofs_xattr_security_handler, @@ -487,29 +513,40 @@ static int xattr_entrylist(struct xattr_iter *_it, { struct listxattr_iter *it = container_of(_it, struct listxattr_iter, it); - unsigned int prefix_len; - const char *prefix; - - const struct xattr_handler *h = - erofs_xattr_handler(entry->e_name_index); + unsigned int base_index = entry->e_name_index; + unsigned int prefix_len, infix_len = 0; + const char *prefix, *infix = NULL; + + if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(_it->sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return 1; + infix = pf->prefix->infix; + infix_len = pf->infix_len; + base_index = pf->prefix->base_index; + } - if (!h || (h->list && !h->list(it->dentry))) + prefix = erofs_xattr_prefix(base_index, it->dentry); + if (!prefix) return 1; - - prefix = xattr_prefix(h); prefix_len = strlen(prefix); if (!it->buffer) { - it->buffer_ofs += prefix_len + entry->e_name_len + 1; + it->buffer_ofs += prefix_len + infix_len + + entry->e_name_len + 1; return 1; } - if (it->buffer_ofs + prefix_len + if (it->buffer_ofs + prefix_len + infix_len + + entry->e_name_len + 1 > it->buffer_size) return -ERANGE; memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); - it->buffer_ofs += prefix_len; + memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len); + it->buffer_ofs += prefix_len + infix_len; return 0; } @@ -563,21 +600,18 @@ static int shared_listxattr(struct listxattr_iter *it) { struct inode *const inode = d_inode(it->dentry); struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = inode->i_sb; - struct erofs_sb_info *const sbi = EROFS_SB(sb); - unsigned int i; + struct super_block *const sb = it->it.sb; + unsigned int i, xsid; int ret = 0; for (i = 0; i < vi->xattr_shared_count; ++i) { - erofs_blk_t blkaddr = - xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); - - it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr, - EROFS_KMAP); + xsid = vi->xattr_shared_xattrs[i]; + it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid); + it->it.ofs = erofs_xattr_blkoff(sb, xsid); + it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, + it->it.blkaddr, EROFS_KMAP); if (IS_ERR(it->it.kaddr)) return PTR_ERR(it->it.kaddr); - it->it.blkaddr = blkaddr; ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); if (ret) @@ -592,7 +626,7 @@ ssize_t erofs_listxattr(struct dentry *dentry, int ret; struct listxattr_iter it; - ret = init_inode_xattrs(d_inode(dentry)); + ret = erofs_init_inode_xattrs(d_inode(dentry)); if (ret == -ENOATTR) return 0; if (ret) @@ -613,6 +647,62 @@ ssize_t erofs_listxattr(struct dentry *dentry, return ret; } +void erofs_xattr_prefixes_cleanup(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + int i; + + if (sbi->xattr_prefixes) { + for (i = 0; i < sbi->xattr_prefix_count; i++) + kfree(sbi->xattr_prefixes[i].prefix); + kfree(sbi->xattr_prefixes); + sbi->xattr_prefixes = NULL; + } +} + +int erofs_xattr_prefixes_init(struct super_block *sb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + erofs_off_t pos = (erofs_off_t)sbi->xattr_prefix_start << 2; + struct erofs_xattr_prefix_item *pfs; + int ret = 0, i, len; + + if (!sbi->xattr_prefix_count) + return 0; + + pfs = kzalloc(sbi->xattr_prefix_count * sizeof(*pfs), GFP_KERNEL); + if (!pfs) + return -ENOMEM; + + if (erofs_sb_has_fragments(sbi)) + buf.inode = sbi->packed_inode; + else + erofs_init_metabuf(&buf, sb); + + for (i = 0; i < sbi->xattr_prefix_count; i++) { + void *ptr = erofs_read_metadata(sb, &buf, &pos, &len); + + if (IS_ERR(ptr)) { + ret = PTR_ERR(ptr); + break; + } else if (len < sizeof(*pfs->prefix) || + len > EROFS_NAME_LEN + sizeof(*pfs->prefix)) { + kfree(ptr); + ret = -EFSCORRUPTED; + break; + } + pfs[i].prefix = ptr; + pfs[i].infix_len = len - sizeof(struct erofs_xattr_long_prefix); + } + + erofs_put_metabuf(&buf); + sbi->xattr_prefixes = pfs; + if (ret) + erofs_xattr_prefixes_cleanup(sb); + return ret; +} + #ifdef CONFIG_EROFS_FS_POSIX_ACL struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu) { diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 0a43c9ee9f8f..f16283cb8c93 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -13,43 +13,21 @@ /* Attribute not found */ #define ENOATTR ENODATA -static inline unsigned int inlinexattr_header_size(struct inode *inode) -{ - return sizeof(struct erofs_xattr_ibody_header) + - sizeof(u32) * EROFS_I(inode)->xattr_shared_count; -} - -static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi, - unsigned int xattr_id) -{ -#ifdef CONFIG_EROFS_FS_XATTR - return sbi->xattr_blkaddr + - xattr_id * sizeof(__u32) / EROFS_BLKSIZ; -#else - return 0; -#endif -} - -static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, - unsigned int xattr_id) -{ - return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; -} - #ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; extern const struct xattr_handler erofs_xattr_security_handler; -static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) +static inline const char *erofs_xattr_prefix(unsigned int idx, + struct dentry *dentry) { + const struct xattr_handler *handler = NULL; + static const struct xattr_handler *xattr_handler_map[] = { [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL - [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = - &posix_acl_access_xattr_handler, - [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &posix_acl_default_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY @@ -57,15 +35,24 @@ static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) #endif }; - return idx && idx < ARRAY_SIZE(xattr_handler_map) ? - xattr_handler_map[idx] : NULL; + if (idx && idx < ARRAY_SIZE(xattr_handler_map)) + handler = xattr_handler_map[idx]; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } extern const struct xattr_handler *erofs_xattr_handlers[]; +int erofs_xattr_prefixes_init(struct super_block *sb); +void erofs_xattr_prefixes_cleanup(struct super_block *sb); int erofs_getxattr(struct inode *, int, const char *, void *, size_t); ssize_t erofs_listxattr(struct dentry *, char *, size_t); #else +static inline int erofs_xattr_prefixes_init(struct super_block *sb) { return 0; } +static inline void erofs_xattr_prefixes_cleanup(struct super_block *sb) {} static inline int erofs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f1708c77a991..45f21db2303a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -807,7 +807,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ - pcl->pageofs_in = erofs_blkoff(map->m_pa); + pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); pcl->tailpacking_size = map->m_plen; } else { pcl->obj.index = map->m_pa >> PAGE_SHIFT; @@ -930,6 +930,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, struct page *page, unsigned int pageofs, unsigned int len) { + struct super_block *sb = inode->i_sb; struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; u8 *src, *dst; @@ -938,19 +939,19 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, if (!packed_inode) return -EFSCORRUPTED; + buf.inode = packed_inode; pos += EROFS_I(inode)->z_fragmentoff; for (i = 0; i < len; i += cnt) { cnt = min_t(unsigned int, len - i, - EROFS_BLKSIZ - erofs_blkoff(pos)); - src = erofs_bread(&buf, packed_inode, - erofs_blknr(pos), EROFS_KMAP); + sb->s_blocksize - erofs_blkoff(sb, pos)); + src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt); + memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt); kunmap_local(dst); pos += cnt; } @@ -978,8 +979,6 @@ repeat: if (offset + cur < map->m_la || offset + cur >= map->m_la + map->m_llen) { - erofs_dbg("out-of-range map @ pos %llu", offset + cur); - if (z_erofs_collector_end(fe)) fe->backmost = false; map->m_la = offset + cur; @@ -1005,7 +1004,8 @@ repeat: void *mp; mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(map->m_pa), EROFS_NO_KMAP); + erofs_blknr(inode->i_sb, map->m_pa), + EROFS_NO_KMAP); if (IS_ERR(mp)) { err = PTR_ERR(mp); erofs_err(inode->i_sb, @@ -1103,9 +1103,6 @@ out: if (err) z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); - - erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", - __func__, page, spiltted, map->m_llen); return err; } @@ -1726,11 +1723,11 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { - .m_pa = blknr_to_addr(pcl->obj.index), + .m_pa = erofs_pos(sb, pcl->obj.index), }; (void)erofs_map_dev(sb, &mdev); - cur = erofs_blknr(mdev.m_pa); + cur = erofs_blknr(sb, mdev.m_pa); end = cur + pcl->pclusterpages; do { @@ -1764,7 +1761,7 @@ submit_bio_retry: last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << - LOG_SECTORS_PER_BLOCK; + (sb->s_blocksize_bits - 9); bio->bi_private = q[JQ_SUBMIT]; if (f->readahead) bio->bi_opf |= REQ_RAHEAD; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 655da4d739cb..d37c5c89c728 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -7,24 +7,6 @@ #include <asm/unaligned.h> #include <trace/events/erofs.h> -int z_erofs_fill_inode(struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - - if (!erofs_sb_has_big_pcluster(sbi) && - !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) && - vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { - vi->z_advise = 0; - vi->z_algorithmtype[0] = 0; - vi->z_algorithmtype[1] = 0; - vi->z_logical_clusterbits = LOG_BLOCK_SIZE; - set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); - } - inode->i_mapping->a_ops = &z_erofs_aops; - return 0; -} - struct z_erofs_maprecorder { struct inode *inode; struct erofs_map_blocks *map; @@ -45,47 +27,50 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t pos = - Z_EROFS_VLE_LEGACY_INDEX_ALIGN(erofs_iloc(inode) + - vi->inode_isize + vi->xattr_isize) + - lcn * sizeof(struct z_erofs_vle_decompressed_index); - struct z_erofs_vle_decompressed_index *di; + const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize) + + lcn * sizeof(struct z_erofs_lcluster_index); + struct z_erofs_lcluster_index *di; unsigned int advise, type; m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP); + erofs_blknr(inode->i_sb, pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); - m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index); + m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index); m->lcn = lcn; - di = m->kaddr + erofs_blkoff(pos); + di = m->kaddr + erofs_blkoff(inode->i_sb, pos); advise = le16_to_cpu(di->di_advise); - type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) & - ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1); + type = (advise >> Z_EROFS_LI_LCLUSTER_TYPE_BIT) & + ((1 << Z_EROFS_LI_LCLUSTER_TYPE_BITS) - 1); switch (type) { - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + case Z_EROFS_LCLUSTER_TYPE_NONHEAD: m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); - if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { DBG_BUGON(1); return -EFSCORRUPTED; } m->compressedblks = m->delta[0] & - ~Z_EROFS_VLE_DI_D0_CBLKCNT; + ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); break; - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: - if (advise & Z_EROFS_VLE_DI_PARTIAL_REF) + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: + if (advise & Z_EROFS_LI_PARTIAL_REF) m->partialref = true; m->clusterofs = le16_to_cpu(di->di_clusterofs); + if (m->clusterofs >= 1 << vi->z_logical_clusterbits) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } m->pblk = le32_to_cpu(di->di_u.blkaddr); break; default: @@ -121,13 +106,13 @@ static int get_compacted_la_distance(unsigned int lclusterbits, lo = decode_compactedbits(lclusterbits, lomask, in, encodebits * i, &type); - if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) return d1; ++d1; } while (++i < vcnt); - /* vcnt - 1 (Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) item */ - if (!(lo & Z_EROFS_VLE_DI_D0_CBLKCNT)) + /* vcnt - 1 (Z_EROFS_LCLUSTER_TYPE_NONHEAD) item */ + if (!(lo & Z_EROFS_LI_D0_CBLKCNT)) d1 += lo - 1; return d1; } @@ -156,7 +141,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; - eofs = erofs_blkoff(pos); + eofs = erofs_blkoff(m->inode->i_sb, pos); base = round_down(eofs, vcnt << amortizedshift); in = m->kaddr + base; @@ -165,19 +150,19 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, lo = decode_compactedbits(lclusterbits, lomask, in, encodebits * i, &type); m->type = type; - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) m->delta[1] = get_compacted_la_distance(lclusterbits, encodebits, vcnt, in, i); - if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (lo & Z_EROFS_LI_D0_CBLKCNT) { if (!big_pcluster) { DBG_BUGON(1); return -EFSCORRUPTED; } - m->compressedblks = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->compressedblks = lo & ~Z_EROFS_LI_D0_CBLKCNT; m->delta[0] = 1; return 0; } else if (i + 1 != (int)vcnt) { @@ -191,9 +176,9 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, */ lo = decode_compactedbits(lclusterbits, lomask, in, encodebits * (i - 1), &type); - if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) lo = 0; - else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) + else if (lo & Z_EROFS_LI_D0_CBLKCNT) lo = 1; m->delta[0] = lo + 1; return 0; @@ -207,7 +192,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, --i; lo = decode_compactedbits(lclusterbits, lomask, in, encodebits * i, &type); - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) i -= lo; if (i >= 0) @@ -219,10 +204,10 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, --i; lo = decode_compactedbits(lclusterbits, lomask, in, encodebits * i, &type); - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { - if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + if (lo & Z_EROFS_LI_D0_CBLKCNT) { --i; - nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + nblk += lo & ~Z_EROFS_LI_D0_CBLKCNT; continue; } /* bigpcluster shouldn't have plain d0 == 1 */ @@ -249,7 +234,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, const unsigned int lclusterbits = vi->z_logical_clusterbits; const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + unsigned int totalidx = erofs_iblks(inode); unsigned int compacted_4b_initial, compacted_2b; unsigned int amortizedshift; erofs_off_t pos; @@ -290,7 +275,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, out: pos += lcn * (1 << amortizedshift); m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb, - erofs_blknr(pos), EROFS_KMAP); + erofs_blknr(inode->i_sb, pos), EROFS_KMAP); if (IS_ERR(m->kaddr)) return PTR_ERR(m->kaddr); return unpack_compacted_index(m, amortizedshift, pos, lookahead); @@ -301,10 +286,10 @@ static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, { const unsigned int datamode = EROFS_I(m->inode)->datalayout; - if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + if (datamode == EROFS_INODE_COMPRESSED_FULL) return legacy_load_cluster_from_disk(m, lcn); - if (datamode == EROFS_INODE_FLAT_COMPRESSION) + if (datamode == EROFS_INODE_COMPRESSED_COMPACT) return compacted_load_cluster_from_disk(m, lcn, lookahead); return -EINVAL; @@ -326,7 +311,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, return err; switch (m->type) { - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + case Z_EROFS_LCLUSTER_TYPE_NONHEAD: if (!m->delta[0]) { erofs_err(m->inode->i_sb, "invalid lookback distance 0 @ nid %llu", @@ -336,9 +321,9 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, } lookback_distance = m->delta[0]; continue; - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: m->headtype = m->type; m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; @@ -360,21 +345,22 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, unsigned int initial_lcn) { + struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); struct erofs_map_blocks *const map = m->map; const unsigned int lclusterbits = vi->z_logical_clusterbits; unsigned long lcn; int err; - DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type != Z_EROFS_LCLUSTER_TYPE_PLAIN && + m->type != Z_EROFS_LCLUSTER_TYPE_HEAD1 && + m->type != Z_EROFS_LCLUSTER_TYPE_HEAD2); DBG_BUGON(m->type != m->headtype); - if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) && + if (m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || + ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1) && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || - ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) && + ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { map->m_plen = 1ULL << lclusterbits; return 0; @@ -396,19 +382,19 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, * BUG_ON in the debugging mode only for developers to notice that. */ DBG_BUGON(lcn == initial_lcn && - m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD); + m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); switch (m->type) { - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. */ - m->compressedblks = 1 << (lclusterbits - LOG_BLOCK_SIZE); + m->compressedblks = 1 << (lclusterbits - sb->s_blocksize_bits); break; - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + case Z_EROFS_LCLUSTER_TYPE_NONHEAD: if (m->delta[0] != 1) goto err_bonus_cblkcnt; if (m->compressedblks) @@ -422,7 +408,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, return -EFSCORRUPTED; } out: - map->m_plen = (u64)m->compressedblks << LOG_BLOCK_SIZE; + map->m_plen = erofs_pos(sb, m->compressedblks); return 0; err_bonus_cblkcnt: erofs_err(m->inode->i_sb, @@ -452,12 +438,12 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) if (err) return err; - if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { DBG_BUGON(!m->delta[1] && m->clusterofs != 1 << lclusterbits); - } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { + } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN || + m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 || + m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) { /* go on until the next HEAD lcluster */ if (lcn != headlcn) break; @@ -476,8 +462,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) } static int z_erofs_do_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, - int flags) + struct erofs_map_blocks *map, int flags) { struct erofs_inode *const vi = EROFS_I(inode); bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; @@ -507,9 +492,9 @@ static int z_erofs_do_map_blocks(struct inode *inode, end = (m.lcn + 1ULL) << lclusterbits; switch (m.type) { - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + case Z_EROFS_LCLUSTER_TYPE_PLAIN: + case Z_EROFS_LCLUSTER_TYPE_HEAD1: + case Z_EROFS_LCLUSTER_TYPE_HEAD2: if (endoff >= m.clusterofs) { m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; @@ -534,7 +519,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, map->m_flags |= EROFS_MAP_FULL_MAPPED; m.delta[0] = 1; fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + case Z_EROFS_LCLUSTER_TYPE_NONHEAD: /* get the corresponding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) @@ -555,7 +540,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, vi->z_tailextent_headlcn = m.lcn; /* for non-compact indexes, fragmentoff is 64 bits */ if (fragment && - vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + vi->datalayout == EROFS_INODE_COMPRESSED_FULL) vi->z_fragmentoff |= (u64)m.pblk << 32; } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { @@ -565,13 +550,13 @@ static int z_erofs_do_map_blocks(struct inode *inode, } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_FRAGMENT; } else { - map->m_pa = blknr_to_addr(m.pblk); + map->m_pa = erofs_pos(inode->i_sb, m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto unmap_out; } - if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) { + if (m.headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN) { if (map->m_llen > map->m_plen) { DBG_BUGON(1); err = -EFSCORRUPTED; @@ -583,7 +568,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, else map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { + } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) { map->m_algorithmformat = vi->z_algorithmtype[1]; } else { map->m_algorithmformat = vi->z_algorithmtype[0]; @@ -592,7 +577,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && - map->m_llen >= EROFS_BLKSIZ)) { + map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; @@ -600,9 +585,6 @@ static int z_erofs_do_map_blocks(struct inode *inode, unmap_out: erofs_unmap_metabuf(&m.map->buf); - erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", - __func__, map->m_la, map->m_pa, - map->m_llen, map->m_plen, map->m_flags); return err; } @@ -633,13 +615,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_unlock; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); + kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(kaddr)) { err = PTR_ERR(kaddr); goto out_unlock; } - h = kaddr + erofs_blkoff(pos); + h = kaddr + erofs_blkoff(sb, pos); /* * if the highest bit of the 8-byte map header is set, the whole file * is stored in the packed inode. The rest bits keeps z_fragmentoff. @@ -663,7 +645,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_put_metabuf; } - vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); + vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7); if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { @@ -672,7 +654,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) err = -EFSCORRUPTED; goto out_put_metabuf; } - if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && + if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT && !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", @@ -692,7 +674,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) erofs_put_metabuf(&map.buf); if (!map.m_plen || - erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) { + erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) { erofs_err(sb, "invalid tail-packing pclustersize %llu", map.m_plen); err = -EFSCORRUPTED; diff --git a/fs/eventfd.c b/fs/eventfd.c index 249ca6c0b784..95850a13ce8d 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -228,7 +228,6 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) struct file *file = iocb->ki_filp; struct eventfd_ctx *ctx = file->private_data; __u64 ucnt = 0; - DECLARE_WAITQUEUE(wait, current); if (iov_iter_count(to) < sizeof(ucnt)) return -EINVAL; @@ -239,23 +238,11 @@ static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) spin_unlock_irq(&ctx->wqh.lock); return -EAGAIN; } - __add_wait_queue(&ctx->wqh, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (ctx->count) - break; - if (signal_pending(current)) { - __remove_wait_queue(&ctx->wqh, &wait); - __set_current_state(TASK_RUNNING); - spin_unlock_irq(&ctx->wqh.lock); - return -ERESTARTSYS; - } + + if (wait_event_interruptible_locked_irq(ctx->wqh, ctx->count)) { spin_unlock_irq(&ctx->wqh.lock); - schedule(); - spin_lock_irq(&ctx->wqh.lock); + return -ERESTARTSYS; } - __remove_wait_queue(&ctx->wqh, &wait); - __set_current_state(TASK_RUNNING); } eventfd_ctx_do_read(ctx, &ucnt); current->in_eventfd = 1; @@ -275,7 +262,6 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c struct eventfd_ctx *ctx = file->private_data; ssize_t res; __u64 ucnt; - DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ucnt)) return -EINVAL; @@ -288,23 +274,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c if (ULLONG_MAX - ctx->count > ucnt) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { - __add_wait_queue(&ctx->wqh, &wait); - for (res = 0;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (ULLONG_MAX - ctx->count > ucnt) { - res = sizeof(ucnt); - break; - } - if (signal_pending(current)) { - res = -ERESTARTSYS; - break; - } - spin_unlock_irq(&ctx->wqh.lock); - schedule(); - spin_lock_irq(&ctx->wqh.lock); - } - __remove_wait_queue(&ctx->wqh, &wait); - __set_current_state(TASK_RUNNING); + res = wait_event_interruptible_locked_irq(ctx->wqh, + ULLONG_MAX - ctx->count > ucnt); + if (!res) + res = sizeof(ucnt); } if (likely(res > 0)) { ctx->count += ucnt; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 64659b110973..980483455cc0 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -43,7 +43,7 @@ * LOCKING: * There are three level of locking required by epoll : * - * 1) epmutex (mutex) + * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) * 3) ep->lock (rwlock) * @@ -57,14 +57,8 @@ * we need a lock that will allow us to sleep. This lock is a * mutex (ep->mtx). It is acquired during the event transfer loop, * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). - * Then we also need a global mutex to serialize eventpoll_release_file() - * and ep_free(). - * This mutex is acquired by ep_free() during the epoll file - * cleanup path and it is also acquired by eventpoll_release_file() - * if a file has been pushed inside an epoll set and it is then - * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). - * It is also acquired when inserting an epoll fd onto another epoll - * fd. We do this so that we walk the epoll tree and ensure that this + * The epnested_mutex is acquired when inserting an epoll fd onto another + * epoll fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which * could lead to deadlock. We need a global mutex to prevent two * simultaneous inserts (A into B and B into A) from racing and @@ -80,9 +74,9 @@ * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global - * mutex "epmutex" (together with "ep->lock") to have it working, + * mutex "epnested_mutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. - * Events that require holding "epmutex" are very rare, while for + * Events that require holding "epnested_mutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee * a better scalability. */ @@ -153,6 +147,13 @@ struct epitem { /* The file descriptor information this item refers to */ struct epoll_filefd ffd; + /* + * Protected by file->f_lock, true for to-be-released epitem already + * removed from the "struct file" items list; together with + * eventpoll->refcount orchestrates "struct eventpoll" disposal + */ + bool dying; + /* List containing poll wait queues */ struct eppoll_entry *pwqlist; @@ -217,6 +218,12 @@ struct eventpoll { u64 gen; struct hlist_head refs; + /* + * usage count, used together with epitem->dying to + * orchestrate the disposal of this struct + */ + refcount_t refcount; + #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; @@ -240,10 +247,8 @@ struct ep_pqueue { /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; -/* - * This mutex is used to serialize ep_free() and eventpoll_release_file(). - */ -static DEFINE_MUTEX(epmutex); +/* Used for cycles detection */ +static DEFINE_MUTEX(epnested_mutex); static u64 loop_check_gen = 0; @@ -258,7 +263,7 @@ static struct kmem_cache *pwq_cache __read_mostly; /* * List of files with newly added links, where we may need to limit the number - * of emanating paths. Protected by the epmutex. + * of emanating paths. Protected by the epnested_mutex. */ struct epitems_head { struct hlist_head epitems; @@ -483,8 +488,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * (efd1) notices that it may have some event ready, so it needs to wake up * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() * that ends up in another wake_up(), after having checked about the - * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to - * avoid stack blasting. + * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid + * stack blasting. * * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle * this special case of epoll. @@ -557,8 +562,7 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) /* * This function unregisters poll callbacks from the associated file - * descriptor. Must be called with "mtx" held (or "epmutex" if called from - * ep_free). + * descriptor. Must be called with "mtx" held. */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { @@ -681,11 +685,40 @@ static void epi_rcu_free(struct rcu_head *head) kmem_cache_free(epi_cache, epi); } +static void ep_get(struct eventpoll *ep) +{ + refcount_inc(&ep->refcount); +} + +/* + * Returns true if the event poll can be disposed + */ +static bool ep_refcount_dec_and_test(struct eventpoll *ep) +{ + if (!refcount_dec_and_test(&ep->refcount)) + return false; + + WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); + return true; +} + +static void ep_free(struct eventpoll *ep) +{ + mutex_destroy(&ep->mtx); + free_uid(ep->user); + wakeup_source_unregister(ep->ws); + kfree(ep); +} + /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. + * If the dying flag is set, do the removal only if force is true. + * This prevents ep_clear_and_put() from dropping all the ep references + * while running concurrently with eventpoll_release_file(). + * Returns true if the eventpoll can be disposed. */ -static int ep_remove(struct eventpoll *ep, struct epitem *epi) +static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) { struct file *file = epi->ffd.file; struct epitems_head *to_free; @@ -700,6 +733,11 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); + if (epi->dying && !force) { + spin_unlock(&file->f_lock); + return false; + } + to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { @@ -733,28 +771,28 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) call_rcu(&epi->rcu, epi_rcu_free); percpu_counter_dec(&ep->user->epoll_watches); + return ep_refcount_dec_and_test(ep); +} - return 0; +/* + * ep_remove variant for callers owing an additional reference to the ep + */ +static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) +{ + WARN_ON_ONCE(__ep_remove(ep, epi, false)); } -static void ep_free(struct eventpoll *ep) +static void ep_clear_and_put(struct eventpoll *ep) { - struct rb_node *rbp; + struct rb_node *rbp, *next; struct epitem *epi; + bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); - /* - * We need to lock this because we could be hit by - * eventpoll_release_file() while we're freeing the "struct eventpoll". - * We do not need to hold "ep->mtx" here because the epoll file - * is on the way to be removed and no one has references to it - * anymore. The only hit might come from eventpoll_release_file() but - * holding "epmutex" is sufficient here. - */ - mutex_lock(&epmutex); + mutex_lock(&ep->mtx); /* * Walks through the whole tree by unregistering poll callbacks. @@ -767,26 +805,25 @@ static void ep_free(struct eventpoll *ep) } /* - * Walks through the whole tree by freeing each "struct epitem". At this - * point we are sure no poll callbacks will be lingering around, and also by - * holding "epmutex" we can be sure that no file cleanup code will hit - * us during this operation. So we can avoid the lock on "ep->lock". - * We do not need to lock ep->mtx, either, we only do it to prevent - * a lockdep warning. + * Walks through the whole tree and try to free each "struct epitem". + * Note that ep_remove_safe() will not remove the epitem in case of a + * racing eventpoll_release_file(); the latter will do the removal. + * At this point we are sure no poll callbacks will be lingering around. + * Since we still own a reference to the eventpoll struct, the loop can't + * dispose it. */ - mutex_lock(&ep->mtx); - while ((rbp = rb_first_cached(&ep->rbr)) != NULL) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { + next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); - ep_remove(ep, epi); + ep_remove_safe(ep, epi); cond_resched(); } + + dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); - mutex_unlock(&epmutex); - mutex_destroy(&ep->mtx); - free_uid(ep->user); - wakeup_source_unregister(ep->ws); - kfree(ep); + if (dispose) + ep_free(ep); } static int ep_eventpoll_release(struct inode *inode, struct file *file) @@ -794,7 +831,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) struct eventpoll *ep = file->private_data; if (ep) - ep_free(ep); + ep_clear_and_put(ep); return 0; } @@ -906,33 +943,34 @@ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; - struct hlist_node *next; + bool dispose; /* - * We don't want to get "file->f_lock" because it is not - * necessary. It is not necessary because we're in the "struct file" - * cleanup path, and this means that no one is using this file anymore. - * So, for example, epoll_ctl() cannot hit here since if we reach this - * point, the file counter already went to zero and fget() would fail. - * The only hit might come from ep_free() but by holding the mutex - * will correctly serialize the operation. We do need to acquire - * "ep->mtx" after "epmutex" because ep_remove() requires it when called - * from anywhere but ep_free(). - * - * Besides, ep_remove() acquires the lock, so we can't hold it here. + * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from + * touching the epitems list before eventpoll_release_file() can access + * the ep->mtx. */ - mutex_lock(&epmutex); - if (unlikely(!file->f_ep)) { - mutex_unlock(&epmutex); - return; - } - hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) { +again: + spin_lock(&file->f_lock); + if (file->f_ep && file->f_ep->first) { + epi = hlist_entry(file->f_ep->first, struct epitem, fllink); + epi->dying = true; + spin_unlock(&file->f_lock); + + /* + * ep access is safe as we still own a reference to the ep + * struct + */ ep = epi->ep; - mutex_lock_nested(&ep->mtx, 0); - ep_remove(ep, epi); + mutex_lock(&ep->mtx); + dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); + + if (dispose) + ep_free(ep); + goto again; } - mutex_unlock(&epmutex); + spin_unlock(&file->f_lock); } static int ep_alloc(struct eventpoll **pep) @@ -955,6 +993,7 @@ static int ep_alloc(struct eventpoll **pep) ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = user; + refcount_set(&ep->refcount, 1); *pep = ep; @@ -1223,10 +1262,10 @@ out_unlock: */ list_del_init(&wait->entry); /* - * ->whead != NULL protects us from the race with ep_free() - * or ep_remove(), ep_remove_wait_queue() takes whead->lock - * held by the caller. Once we nullify it, nothing protects - * ep/epi or even wait. + * ->whead != NULL protects us from the race with + * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() + * takes whead->lock held by the caller. Once we nullify it, + * nothing protects ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } @@ -1298,7 +1337,7 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) * is connected to n file sources. In this case each file source has 1 path * of length 1. Thus, the numbers below should be more than sufficient. These * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify - * and delete can't add additional paths. Protected by the epmutex. + * and delete can't add additional paths. Protected by the epnested_mutex. */ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; static int path_count[PATH_ARR_SIZE]; @@ -1496,16 +1535,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, if (tep) mutex_unlock(&tep->mtx); + /* + * ep_remove_safe() calls in the later error paths can't lead to + * ep_free() as the ep file itself still holds an ep reference. + */ + ep_get(ep); + /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { - ep_remove(ep, epi); + ep_remove_safe(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { - ep_remove(ep, epi); + ep_remove_safe(ep, epi); return error; } } @@ -1529,7 +1574,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, * high memory pressure. */ if (unlikely(!epq.epi)) { - ep_remove(ep, epi); + ep_remove_safe(ep, epi); return -ENOMEM; } @@ -2025,7 +2070,7 @@ static int do_epoll_create(int flags) out_free_fd: put_unused_fd(fd); out_free_ep: - ep_free(ep); + ep_clear_and_put(ep); return error; } @@ -2042,6 +2087,19 @@ SYSCALL_DEFINE1(epoll_create, int, size) return do_epoll_create(0); } +#ifdef CONFIG_PM_SLEEP +static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) +{ + if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) + epev->events &= ~EPOLLWAKEUP; +} +#else +static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) +{ + epev->events &= ~EPOLLWAKEUP; +} +#endif + static inline int epoll_mutex_lock(struct mutex *mutex, int depth, bool nonblock) { @@ -2122,7 +2180,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the - * 'epmutex' on add is to prevent complex toplogies such as loops and + * 'epnested_mutex' on add is to prevent complex toplogies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ @@ -2133,7 +2191,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); - error = epoll_mutex_lock(&epmutex, 0, nonblock); + error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; @@ -2167,10 +2225,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, error = -EEXIST; break; case EPOLL_CTL_DEL: - if (epi) - error = ep_remove(ep, epi); - else + if (epi) { + /* + * The eventpoll itself is still alive: the refcount + * can't go to zero here. + */ + ep_remove_safe(ep, epi); + error = 0; + } else { error = -ENOENT; + } break; case EPOLL_CTL_MOD: if (epi) { @@ -2188,7 +2252,7 @@ error_tgt_fput: if (full_check) { clear_tfile_check_list(); loop_check_gen++; - mutex_unlock(&epmutex); + mutex_unlock(&epnested_mutex); } fdput(tf); diff --git a/fs/exec.c b/fs/exec.c index 7c44d0c65b1b..a466e797c8e2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,6 +65,7 @@ #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> #include <linux/time_namespace.h> +#include <linux/user_events.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -1034,7 +1035,7 @@ static int exec_mmap(struct mm_struct *mm) mmput(old_mm); return 0; } - mmdrop(active_mm); + mmdrop_lazy_tlb(active_mm); return 0; } @@ -1859,6 +1860,7 @@ static int bprm_execve(struct linux_binprm *bprm, current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); + user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index cb78d7dcfb95..8244366862e4 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -178,8 +178,9 @@ static inline struct ext2_sb_info *EXT2_SB(struct super_block *sb) * Macro-instructions used to manage several block sizes */ #define EXT2_MIN_BLOCK_SIZE 1024 -#define EXT2_MAX_BLOCK_SIZE 4096 +#define EXT2_MAX_BLOCK_SIZE 65536 #define EXT2_MIN_BLOCK_LOG_SIZE 10 +#define EXT2_MAX_BLOCK_LOG_SIZE 16 #define EXT2_BLOCK_SIZE(s) ((s)->s_blocksize) #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32)) #define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 69c88facfe90..f342f347a695 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -945,6 +945,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (le32_to_cpu(es->s_log_block_size) > + (EXT2_MAX_BLOCK_LOG_SIZE - BLOCK_SIZE_BITS)) { + ext2_msg(sb, KERN_ERR, + "Invalid log block size: %u", + le32_to_cpu(es->s_log_block_size)); + goto failed_mount; + } blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (test_opt(sb, DAX)) { diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 641abfa4b718..8906ba479aaf 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -101,8 +101,8 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *, static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL - [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_SECURITY @@ -113,10 +113,6 @@ static const struct xattr_handler *ext2_xattr_handler_map[] = { const struct xattr_handler *ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, -#ifdef CONFIG_EXT2_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif #ifdef CONFIG_EXT2_FS_SECURITY &ext2_xattr_security_handler, #endif @@ -125,14 +121,18 @@ const struct xattr_handler *ext2_xattr_handlers[] = { #define EA_BLOCK_CACHE(inode) (EXT2_SB(inode->i_sb)->s_ea_block_cache) -static inline const struct xattr_handler * -ext2_xattr_handler(int name_index) +static inline const char *ext2_xattr_prefix(int name_index, + struct dentry *dentry) { const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) handler = ext2_xattr_handler_map[name_index]; - return handler; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } static bool @@ -333,11 +333,10 @@ bad_block: /* list the attribute names */ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); entry = EXT2_XATTR_NEXT(entry)) { - const struct xattr_handler *handler = - ext2_xattr_handler(entry->e_name_index); + const char *prefix; - if (handler && (!handler->list || handler->list(dentry))) { - const char *prefix = handler->prefix ?: handler->name; + prefix = ext2_xattr_prefix(entry->e_name_index, dentry); + if (prefix) { size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; @@ -553,7 +552,6 @@ bad_block: error = -ENOMEM; if (header == NULL) goto cleanup; - end = (char *)header + sb->s_blocksize; header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); header->h_blocks = header->h_refcount = cpu_to_le32(1); last = here = ENTRY(header+1); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8ff4b9192a9f..c1edde817be8 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -80,32 +80,56 @@ static inline int ext4_block_in_group(struct super_block *sb, return (actual_group == block_group) ? 1 : 0; } -/* Return the number of clusters used for file system metadata; this +/* + * Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ static unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { - unsigned num_clusters; - int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; + unsigned base_clusters, num_clusters; + int block_cluster = -1, inode_cluster; + int itbl_cluster_start = -1, itbl_cluster_end = -1; ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); - ext4_fsblk_t itbl_blk; + ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; + ext4_fsblk_t itbl_blk_start, itbl_blk_end; struct ext4_sb_info *sbi = EXT4_SB(sb); /* This is the number of clusters used by the superblock, * block group descriptors, and reserved block group * descriptor blocks */ - num_clusters = ext4_num_base_meta_clusters(sb, block_group); + base_clusters = ext4_num_base_meta_clusters(sb, block_group); + num_clusters = base_clusters; + + /* + * Account and record inode table clusters if any cluster + * is in the block group, or inode table cluster range is + * [-1, -1] and won't overlap with block/inode bitmap cluster + * accounted below. + */ + itbl_blk_start = ext4_inode_table(sb, gdp); + itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; + if (itbl_blk_start <= end && itbl_blk_end >= start) { + itbl_blk_start = itbl_blk_start >= start ? + itbl_blk_start : start; + itbl_blk_end = itbl_blk_end <= end ? + itbl_blk_end : end; + + itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); + itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); + + num_clusters += itbl_cluster_end - itbl_cluster_start + 1; + /* check if border cluster is overlapped */ + if (itbl_cluster_start == base_clusters - 1) + num_clusters--; + } /* - * For the allocation bitmaps and inode table, we first need - * to check to see if the block is in the block group. If it - * is, then check to see if the cluster is already accounted - * for in the clusters used for the base metadata cluster, or - * if we can increment the base metadata cluster to include - * that block. Otherwise, we will have to track the cluster - * used for the allocation bitmap or inode table explicitly. + * For the allocation bitmaps, we first need to check to see + * if the block is in the block group. If it is, then check + * to see if the cluster is already accounted for in the clusters + * used for the base metadata cluster and inode tables cluster. * Normally all of these blocks are contiguous, so the special * case handling shouldn't be necessary except for *very* * unusual file system layouts. @@ -113,46 +137,26 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb, if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { block_cluster = EXT4_B2C(sbi, ext4_block_bitmap(sb, gdp) - start); - if (block_cluster < num_clusters) - block_cluster = -1; - else if (block_cluster == num_clusters) { + if (block_cluster >= base_clusters && + (block_cluster < itbl_cluster_start || + block_cluster > itbl_cluster_end)) num_clusters++; - block_cluster = -1; - } } if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, ext4_inode_bitmap(sb, gdp) - start); - if (inode_cluster < num_clusters) - inode_cluster = -1; - else if (inode_cluster == num_clusters) { + /* + * Additional check if inode bitmap is in just accounted + * block_cluster + */ + if (inode_cluster != block_cluster && + inode_cluster >= base_clusters && + (inode_cluster < itbl_cluster_start || + inode_cluster > itbl_cluster_end)) num_clusters++; - inode_cluster = -1; - } } - itbl_blk = ext4_inode_table(sb, gdp); - for (i = 0; i < sbi->s_itb_per_group; i++) { - if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { - c = EXT4_B2C(sbi, itbl_blk + i - start); - if ((c < num_clusters) || (c == inode_cluster) || - (c == block_cluster) || (c == itbl_cluster)) - continue; - if (c == num_clusters) { - num_clusters++; - continue; - } - num_clusters++; - itbl_cluster = c; - } - } - - if (block_cluster != -1) - num_clusters++; - if (inode_cluster != -1) - num_clusters++; - return num_clusters; } @@ -187,8 +191,6 @@ static int ext4_init_block_bitmap(struct super_block *sb, ASSERT(buffer_locked(bh)); - /* If checksum is bad mark all blocks used to prevent allocation - * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT | @@ -303,6 +305,38 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, return desc; } +static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_grpblk_t next_zero_bit; + unsigned long bitmap_size = sb->s_blocksize * 8; + unsigned int offset = num_clusters_in_group(sb, block_group); + + if (bitmap_size <= offset) + return 0; + + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); + + return (next_zero_bit < bitmap_size ? next_zero_bit : 0); +} + +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info **grp_info; + long indexv, indexh; + + if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) { + ext4_error(sb, "invalid group %u", group); + return NULL; + } + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); + return grp_info[indexh]; +} + /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. @@ -350,13 +384,13 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || - EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit) + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, - EXT4_B2C(sbi, offset + sbi->s_itb_per_group), + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, EXT4_B2C(sbi, offset)); if (next_zero_bit < - EXT4_B2C(sbi, offset + sbi->s_itb_per_group)) + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) /* bad bitmap for inode tables */ return blk; return 0; @@ -377,14 +411,13 @@ static int ext4_validate_block_bitmap(struct super_block *sb, if (buffer_verified(bh)) return 0; - if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; - if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, - desc, bh) || + if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); @@ -401,6 +434,15 @@ static int ext4_validate_block_bitmap(struct super_block *sb, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } + blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", + block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; + } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); @@ -474,17 +516,19 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, goto out; } err = ext4_init_block_bitmap(sb, bh, block_group, desc); - set_bitmap_uptodate(bh); - set_buffer_uptodate(bh); - set_buffer_verified(bh); - ext4_unlock_group(sb, block_group); - unlock_buffer(bh); if (err) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); ext4_error(sb, "Failed to init block bitmap for group " "%u: %d", block_group, err); goto out; } - goto verify; + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { @@ -842,10 +886,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, if (!ext4_bg_has_super(sb, group)) return 0; - if (ext4_has_feature_meta_bg(sb)) - return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); - else - return EXT4_SB(sb)->s_gdb_count; + return EXT4_SB(sb)->s_gdb_count; } /** @@ -887,11 +928,11 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { - num += ext4_bg_num_gdb(sb, block_group); + num += ext4_bg_num_gdb_nometa(sb, block_group); num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ - num += ext4_bg_num_gdb(sb, block_group); + num += ext4_bg_num_gdb_meta(sb, block_group); } return EXT4_NUM_B2C(sbi, num); } diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index f63e028c638c..cd725bebe69e 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -16,7 +16,7 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars) return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); } -int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { @@ -38,7 +38,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, return provided == calculated; } -void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { @@ -54,7 +54,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } -int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { @@ -74,13 +74,10 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, } else calculated &= 0xFFFF; - if (provided == calculated) - return 1; - - return 0; + return provided == calculated; } -void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 08b29c289da4..6948d673bba2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -40,6 +40,7 @@ #ifdef __KERNEL__ #include <linux/compat.h> #endif +#include <uapi/linux/ext4.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> @@ -591,17 +592,6 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(RESERVED); } -/* Used to pass group descriptor data when online resize is done */ -struct ext4_new_group_input { - __u32 group; /* Group number for this data */ - __u64 block_bitmap; /* Absolute block number of block bitmap */ - __u64 inode_bitmap; /* Absolute block number of inode bitmap */ - __u64 inode_table; /* Absolute block number of inode table start */ - __u32 blocks_count; /* Total number of blocks in this group */ - __u16 reserved_blocks; /* Number of reserved blocks in this group */ - __u16 unused; -}; - #if defined(__KERNEL__) && defined(CONFIG_COMPAT) struct compat_ext4_new_group_input { u32 group; @@ -698,70 +688,6 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 -/* - * ioctl commands - */ -#define EXT4_IOC_GETVERSION _IOR('f', 3, long) -#define EXT4_IOC_SETVERSION _IOW('f', 4, long) -#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION -#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION -#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) -#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) -#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) -#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) -#define EXT4_IOC_MIGRATE _IO('f', 9) - /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ - /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ -#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) -#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) -#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) -#define EXT4_IOC_SWAP_BOOT _IO('f', 17) -#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) -/* ioctl codes 19--39 are reserved for fscrypt */ -#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) -#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) -#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) -#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) -#define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid) -#define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid) - -#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) - -/* - * Flags for going down operation - */ -#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ -#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ -#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ - -/* - * Flags returned by EXT4_IOC_GETSTATE - * - * We only expose to userspace a subset of the state flags in - * i_state_flags - */ -#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 -#define EXT4_STATE_FLAG_NEW 0x00000002 -#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 -#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 - -/* flags for ioctl EXT4_IOC_CHECKPOINT */ -#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1 -#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2 -#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4 -#define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \ - EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \ - EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) - -/* - * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID - */ -struct fsuuid { - __u32 fsu_len; - __u32 fsu_flags; - __u8 fsu_uuid[]; -}; - #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -776,12 +702,6 @@ struct fsuuid { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif -/* - * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. - * It indicates that the entry in extent status cache is for a hole. - */ -#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 - /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF @@ -852,15 +772,6 @@ struct ext4_inode { __le32 i_projid; /* Project ID */ }; -struct move_extent { - __u32 reserved; /* should be zero */ - __u32 donor_fd; /* donor file descriptor */ - __u64 orig_start; /* logical start offset in block for orig */ - __u64 donor_start; /* logical start offset in block for donor */ - __u64 len; /* block length to be moved */ - __u64 moved_len; /* moved block length */ -}; - #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) @@ -1120,8 +1031,8 @@ struct ext4_inode_info { /* mballoc */ atomic_t i_prealloc_active; - struct list_head i_prealloc_list; - spinlock_t i_prealloc_lock; + struct rb_root i_prealloc_node; + rwlock_t i_prealloc_lock; /* extents status tree */ struct ext4_es_tree i_es_tree; @@ -1613,7 +1524,6 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; - unsigned int s_mb_max_inode_prealloc; unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; @@ -1774,6 +1684,30 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); } +static inline int ext4_writepages_down_read(struct super_block *sb) +{ + percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); +} + +static inline int ext4_writepages_down_write(struct super_block *sb) +{ + percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); + return memalloc_nofs_save(); +} + +static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) +{ + memalloc_nofs_restore(ctx); + percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); +} + static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -1887,7 +1821,6 @@ static inline void ext4_simulate_fail_bh(struct super_block *sb, * Inode dynamic state flags */ enum { - EXT4_STATE_JDATA, /* journaled data exists */ EXT4_STATE_NEW, /* inode is newly created */ EXT4_STATE_XATTR, /* has in-inode xattrs */ EXT4_STATE_NO_EXPAND, /* No space for expansion */ @@ -2676,16 +2609,16 @@ struct mmpd_data { /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); -void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); -int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); -void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); -int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); @@ -2716,6 +2649,8 @@ extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); +extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, @@ -3323,19 +3258,6 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) raw_inode->i_size_high = cpu_to_le32(i_size >> 32); } -static inline -struct ext4_group_info *ext4_get_group_info(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info **grp_info; - long indexv, indexh; - BUG_ON(group >= EXT4_SB(sb)->s_groups_count); - indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); - indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); - grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); - return grp_info[indexh]; -} - /* * Reading s_groups_count requires using smp_rmb() afterwards. See * the locking protocol documented in the comments of ext4_group_add() @@ -3550,7 +3472,7 @@ extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); -extern int ext4_readpage_inline(struct inode *inode, struct page *page); +int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, @@ -3647,7 +3569,7 @@ static inline void ext4_set_de_type(struct super_block *sb, /* readpages.c */ extern int ext4_mpage_readpages(struct inode *inode, - struct readahead_control *rac, struct page *page); + struct readahead_control *rac, struct folio *folio); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); @@ -3757,9 +3679,8 @@ extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); -extern int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len); +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, + size_t len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3559ea6b0781..35703dce23a3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4526,13 +4526,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); - /* Call ext4_force_commit to flush all data in case of data=journal. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - if (ret) - return ret; - } - /* * Round up offset. This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to @@ -4616,6 +4609,20 @@ static long ext4_zero_range(struct file *file, loff_t offset, filemap_invalidate_unlock(mapping); goto out_mutex; } + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsitent data on + * disk in case of crash before zeroing trans is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) { + filemap_invalidate_unlock(mapping); + goto out_mutex; + } + } + /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = current_time(inode); @@ -5290,13 +5297,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); - /* Call ext4_force_commit to flush all data in case of data=journal. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - if (ret) - return ret; - } - inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case @@ -5443,13 +5443,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); - /* Call ext4_force_commit to flush all data in case of data=journal */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - if (ret) - return ret; - } - inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { @@ -5802,7 +5795,8 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) * mapped - no physical clusters have been allocated, and the * file has no extents */ - if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) || + ext4_has_inline_data(inode)) return 0; /* search for the extent closest to the first block in the cluster */ diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 7bc221038c6c..595abb9e7d74 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -267,14 +267,12 @@ static void __es_find_extent_range(struct inode *inode, /* see if the extent has been cached */ es->es_lblk = es->es_len = es->es_pblk = 0; - if (tree->cache_es) { - es1 = tree->cache_es; - if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u) %llu %x\n", - lblk, es1->es_lblk, es1->es_len, - ext4_es_pblock(es1), ext4_es_status(es1)); - goto out; - } + es1 = READ_ONCE(tree->cache_es); + if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u) %llu %x\n", + lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); + goto out; } es1 = __es_tree_search(&tree->root, lblk); @@ -293,7 +291,7 @@ out: } if (es1 && matching_fn(es1)) { - tree->cache_es = es1; + WRITE_ONCE(tree->cache_es, es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; @@ -931,14 +929,12 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, /* find extent in cache firstly */ es->es_lblk = es->es_len = es->es_pblk = 0; - if (tree->cache_es) { - es1 = tree->cache_es; - if (in_range(lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u)\n", - lblk, es1->es_lblk, es1->es_len); - found = 1; - goto out; - } + es1 = READ_ONCE(tree->cache_es); + if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; } node = tree->root.rb_node; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0b8b4499e5ca..d101b3b0c7da 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -899,7 +899,8 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | + FMODE_DIO_PARALLEL_WRITE; return dquot_file_open(inode, filp); } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 027a7d7037a0..f65fdb27ce14 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -153,23 +153,12 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; /* - * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. * Metadata is in the journal, we wait for proper transaction to * commit here. - * - * data=journal: - * filemap_fdatawrite won't do anything (the buffers are clean). - * ext4_force_commit will write the file data into the journal and - * will wait on that. - * filemap_fdatawait() will encounter a ton of newly-dirtied pages - * (they were dirtied by commit). But that's OK - the blocks are - * safe in-journal, which is all fsync() needs to ensure. */ if (!sbi->s_journal) ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier); - else if (ext4_should_journal_data(inode)) - ret = ext4_force_commit(inode->i_sb); else ret = ext4_fsync_journal(inode, datasync, &needs_barrier); diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 147b5241dd94..46c3423ddfa1 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -277,7 +277,11 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, } default: hinfo->hash = 0; - return -1; + hinfo->minor_hash = 0; + ext4_warning(dir->i_sb, + "invalid/unsupported hash tree version %u", + hinfo->hash_version); + return -EINVAL; } hash = hash & ~1; if (hash == (EXT4_HTREE_EOF_32BIT << 1)) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 157663031f8c..754f961cd9fd 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -91,14 +91,14 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, if (buffer_verified(bh)) return 0; - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; blk = ext4_inode_bitmap(sb, desc); - if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, EXT4_INODES_PER_GROUP(sb) / 8) || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); @@ -293,7 +293,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, block_group); - if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { + if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { fatal = -EFSCORRUPTED; goto error_return; } @@ -327,7 +327,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (percpu_counter_initialized(&sbi->s_dirs_counter)) percpu_counter_dec(&sbi->s_dirs_counter); } - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); @@ -813,8 +813,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); - ext4_block_bitmap_csum_set(sb, group, gdp, - block_bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); @@ -852,7 +851,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } @@ -1047,7 +1046,7 @@ got_group: * Skip groups with already-known suspicious inode * tables */ - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; } @@ -1165,8 +1164,7 @@ got: gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); - ext4_block_bitmap_csum_set(sb, group, gdp, - block_bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); @@ -1185,6 +1183,10 @@ got: if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); + if (!grp) { + err = -EFSCORRUPTED; + goto out; + } down_read(&grp->alloc_sem); /* * protect vs itable * lazyinit @@ -1222,7 +1224,7 @@ got: } } if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } @@ -1528,7 +1530,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, } gdp = ext4_get_group_desc(sb, group, &group_desc_bh); - if (!gdp) + if (!gdp || !grp) goto out; /* diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1602d74b5eeb..5854bd5a3352 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -34,6 +34,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode, struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; + void *end; int free, min_offs; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) @@ -57,14 +58,23 @@ static int get_max_inline_xattr_value_size(struct inode *inode, raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); entry = IFIRST(header); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; /* Compute min_offs. */ - for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + while (!IS_LAST_ENTRY(entry)) { + void *next = EXT4_XATTR_NEXT(entry); + + if (next >= end) { + EXT4_ERROR_INODE(inode, + "corrupt xattr in inline inode"); + return 0; + } if (!entry->e_value_inum && entry->e_value_size) { size_t offs = le16_to_cpu(entry->e_value_offs); if (offs < min_offs) min_offs = offs; } + entry = next; } free = min_offs - ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); @@ -350,7 +360,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); - if (error == -ENODATA) + if (error < 0) goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); @@ -467,16 +477,16 @@ out: return error; } -static int ext4_read_inline_page(struct inode *inode, struct page *page) +static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) { void *kaddr; int ret = 0; size_t len; struct ext4_iloc iloc; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(!ext4_has_inline_data(inode)); - BUG_ON(page->index); + BUG_ON(folio->index); if (!EXT4_I(inode)->i_inline_off) { ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", @@ -489,19 +499,20 @@ static int ext4_read_inline_page(struct inode *inode, struct page *page) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); - kaddr = kmap_atomic(page); + BUG_ON(len > PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); - flush_dcache_page(page); - kunmap_atomic(kaddr); - zero_user_segment(page, len, PAGE_SIZE); - SetPageUptodate(page); + flush_dcache_folio(folio); + kunmap_local(kaddr); + folio_zero_segment(folio, len, folio_size(folio)); + folio_mark_uptodate(folio); brelse(iloc.bh); out: return ret; } -int ext4_readpage_inline(struct inode *inode, struct page *page) +int ext4_readpage_inline(struct inode *inode, struct folio *folio) { int ret = 0; @@ -515,16 +526,16 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. */ - if (!page->index) - ret = ext4_read_inline_page(inode, page); - else if (!PageUptodate(page)) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (!folio->index) + ret = ext4_read_inline_folio(inode, folio); + else if (!folio_test_uptodate(folio)) { + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); } up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); + folio_unlock(folio); return ret >= 0 ? 0 : ret; } @@ -534,8 +545,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; - struct page *page = NULL; - unsigned int flags; + struct folio *folio = NULL; unsigned from, to; struct ext4_iloc iloc; @@ -564,12 +574,11 @@ retry: /* We cannot recurse into the filesystem as the transaction is already * started */ - flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, 0); - memalloc_nofs_restore(flags); - if (!page) { - ret = -ENOMEM; - goto out; + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out_nofolio; } ext4_write_lock_xattr(inode, &no_expand); @@ -582,8 +591,8 @@ retry: from = 0; to = ext4_get_inline_size(inode); - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } @@ -593,21 +602,21 @@ retry: goto out; if (ext4_should_dioread_nolock(inode)) { - ret = __block_write_begin(page, from, to, + ret = __block_write_begin(&folio->page, from, to, ext4_get_block_unwritten); } else - ret = __block_write_begin(page, from, to, ext4_get_block); + ret = __block_write_begin(&folio->page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), - from, to, NULL, - do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, inode, + folio_buffers(folio), from, to, + NULL, do_journal_get_write_access); } if (ret) { - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); + folio = NULL; ext4_orphan_add(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; @@ -627,13 +636,14 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - if (page) - block_commit_write(page, from, to); + if (folio) + block_commit_write(&folio->page, from, to); out: - if (page) { - unlock_page(page); - put_page(page); + if (folio) { + folio_unlock(folio); + folio_put(folio); } +out_nofolio: if (sem_held) ext4_write_unlock_xattr(inode, &no_expand); if (handle) @@ -655,8 +665,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, { int ret; handle_t *handle; - unsigned int flags; - struct page *page; + struct folio *folio; struct ext4_iloc iloc; if (pos + len > ext4_get_max_inline_size(inode)) @@ -693,28 +702,27 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, if (ret) goto out; - flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, 0); - memalloc_nofs_restore(flags); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto out; } - *pagep = page; + *pagep = &folio->page; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out_up_read; } - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out_up_read; } } @@ -735,20 +743,21 @@ convert: int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; struct ext4_iloc iloc; int ret = 0, ret2; - if (unlikely(copied < len) && !PageUptodate(page)) + if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; if (likely(copied)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ext4_std_error(inode->i_sb, ret); goto out; } @@ -762,30 +771,30 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, */ (void) ext4_find_inline_data_nolock(inode); - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); + kunmap_local(kaddr); + folio_mark_uptodate(folio); + /* clear dirty flag so that writepages wouldn't work for us. */ + folio_clear_dirty(folio); ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); /* - * It's important to update i_size while still holding page + * It's important to update i_size while still holding folio * lock: page writeout could otherwise come in and zero * beyond i_size. */ ext4_update_inode_size(inode, pos + copied); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling + * Don't mark the inode dirty under folio lock. First, it unnecessarily + * makes the holding time of folio lock longer. Second, it forces lock + * ordering of folio lock and transaction start for journaling * filesystems. */ if (likely(copied)) @@ -852,11 +861,12 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, void **fsdata) { int ret = 0, inline_size; - struct page *page; + struct folio *folio; - page = grab_cache_page_write_begin(mapping, 0); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { @@ -866,32 +876,32 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, inline_size = ext4_get_inline_size(inode); - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } - ret = __block_write_begin(page, 0, inline_size, + ret = __block_write_begin(&folio->page, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ext4_truncate_failed_write(inode); return ret; } - SetPageDirty(page); - SetPageUptodate(page); + folio_mark_dirty(folio); + folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); *fsdata = (void *)CONVERT_INLINE_DATA; out: up_read(&EXT4_I(inode)->xattr_sem); - if (page) { - unlock_page(page); - put_page(page); + if (folio) { + folio_unlock(folio); + folio_put(folio); } return ret; } @@ -912,10 +922,9 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, { int ret; handle_t *handle; - struct page *page; + struct folio *folio; struct ext4_iloc iloc; int retries = 0; - unsigned int flags; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -947,11 +956,10 @@ retry_journal: * We cannot recurse into the filesystem as the transaction * is already started. */ - flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, 0); - memalloc_nofs_restore(flags); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto out_journal; } @@ -961,8 +969,8 @@ retry_journal: goto out_release_page; } - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out_release_page; } @@ -972,13 +980,13 @@ retry_journal: goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); - *pagep = page; + *pagep = &folio->page; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_journal: ext4_journal_stop(handle); out: @@ -1177,6 +1185,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, ext4_initialize_dirent_tail(dir_block, inode->i_sb->s_blocksize); set_buffer_uptodate(dir_block); + unlock_buffer(dir_block); err = ext4_handle_dirty_dirblock(handle, inode, dir_block); if (err) return err; @@ -1251,6 +1260,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, if (!S_ISDIR(inode->i_mode)) { memcpy(data_bh->b_data, buf, inline_size); set_buffer_uptodate(data_bh); + unlock_buffer(data_bh); error = ext4_handle_dirty_metadata(handle, inode, data_bh); } else { @@ -1258,7 +1268,6 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, buf, inline_size); } - unlock_buffer(data_bh); out_restore: if (error) ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf0b7dea4900..ce5f21b6c2b3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); @@ -180,33 +179,6 @@ void ext4_evict_inode(struct inode *inode) if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { - /* - * When journalling data dirty buffers are tracked only in the - * journal. So although mm thinks everything is clean and - * ready for reaping the inode might still have some pages to - * write in the running transaction or waiting to be - * checkpointed. Thus calling jbd2_journal_invalidate_folio() - * (via truncate_inode_pages()) to discard these buffers can - * cause data loss. Also even if we did not discard these - * buffers, we would have no way to find them after the inode - * is reaped and thus user could see stale data if he tries to - * read them before the transaction is checkpointed. So be - * careful and force everything to disk here... We use - * ei->i_datasync_tid to store the newest transaction - * containing inode's data. - * - * Note that directories do not have this problem because they - * don't use page cache. - */ - if (inode->i_ino != EXT4_JOURNAL_INO && - ext4_should_journal_data(inode) && - S_ISREG(inode->i_mode) && inode->i_data.nrpages) { - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - - jbd2_complete_transaction(journal, commit_tid); - filemap_write_and_wait(&inode->i_data); - } truncate_inode_pages_final(&inode->i_data); goto no_delete; @@ -1005,29 +977,17 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, } /* - * To preserve ordering, it is essential that the hole instantiation and - * the data write be encapsulated in a single transaction. We cannot - * close off a transaction and start a new one between the ext4_get_block() - * and the commit_write(). So doing the jbd2_journal_start at the start of - * prepare_write() is the right place. - * - * Also, this function can nest inside ext4_writepage(). In that case, we - * *know* that ext4_writepage() has generated enough buffer credits to do the - * whole page. So we won't block on the journal in that case, which is good, - * because the caller may be PF_MEMALLOC. - * - * By accident, ext4 can be reentered when a transaction is open via - * quota file writes. If we were to commit the transaction while thus - * reentered, there can be a deadlock - we would be holding a quota - * lock, and the commit would never complete if another thread had a - * transaction open and was blocking on the quota lock - a ranking - * violation. - * - * So what we do is to rely on the fact that jbd2_journal_stop/journal_start - * will _not_ run commit under these circumstances because handle->h_ref - * is elevated. We'll still have enough credits for the tiny quotafile - * write. + * Helper for handling dirtying of journalled data. We also mark the folio as + * dirty so that writeback code knows about this page (and inode) contains + * dirty data. ext4_writepages() then commits appropriate transaction to + * make data stable. */ +static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) +{ + folio_mark_dirty(bh->b_folio); + return ext4_handle_dirty_metadata(handle, NULL, bh); +} + int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { @@ -1050,17 +1010,17 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (!ret && dirty) - ret = ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_dirty_journalled_data(handle, bh); return ret; } #ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, +static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; @@ -1070,22 +1030,24 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, int nr_wait = 0; int i; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, blocksize, 0); + head = folio_buffers(folio); + } bbits = ilog2(blocksize); - block = (sector_t)page->index << (PAGE_SHIFT - bbits); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; @@ -1098,19 +1060,20 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) - zero_user_segments(page, to, block_end, - block_start, from); + folio_zero_segments(folio, to, + block_end, + block_start, from); continue; } } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } @@ -1130,14 +1093,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - page_zero_new_buffers(page, from, to); + page_zero_new_buffers(&folio->page, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; - err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page), - blocksize, - bh_offset(wait[i])); + err2 = fscrypt_decrypt_pagecache_blocks(folio, + blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; @@ -1149,6 +1111,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, } #endif +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the ext4_write_end(). So doing the jbd2_journal_start at the start of + * ext4_write_begin() is the right place. + */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) @@ -1157,7 +1126,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, int ret, needed_blocks; handle_t *handle; int retries = 0; - struct page *page; + struct folio *folio; pgoff_t index; unsigned from, to; @@ -1184,68 +1153,68 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, } /* - * grab_cache_page_write_begin() can take a long time if the - * system is thrashing due to memory pressure, or if the page + * __filemap_get_folio() can take a long time if the + * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate - * the page (if needed) without using GFP_NOFS. + * the folio (if needed) without using GFP_NOFS. */ retry_grab: - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); /* * The same as page allocation, we prealloc buffer heads before * starting the handle. */ - if (!page_has_buffers(page)) - create_empty_buffers(page, inode->i_sb->s_blocksize, 0); + if (!folio_buffers(folio)) + create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); - unlock_page(page); + folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { - put_page(page); + folio_put(folio); return PTR_ERR(handle); } - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - unlock_page(page); - put_page(page); + folio_lock(folio); + if (folio->mapping != mapping) { + /* The folio got truncated from under us */ + folio_unlock(folio); + folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } - /* In case writeback began while the page was unlocked */ - wait_for_stable_page(page); + /* In case writeback began while the folio was unlocked */ + folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(page, pos, len, + ret = ext4_block_write_begin(folio, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(page, pos, len, - ext4_get_block); + ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, pos, len, + ret = __block_write_begin(&folio->page, pos, len, ext4_get_block_unwritten); else - ret = __block_write_begin(page, pos, len, ext4_get_block); + ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, - page_buffers(page), from, to, NULL, - do_journal_get_write_access); + folio_buffers(folio), from, to, + NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); - unlock_page(page); + folio_unlock(folio); /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -1273,10 +1242,10 @@ retry_journal: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; - put_page(page); + folio_put(folio); return ret; } - *pagep = page; + *pagep = &folio->page; return ret; } @@ -1288,7 +1257,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - ret = ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); return ret; @@ -1306,6 +1275,7 @@ static int ext4_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1321,7 +1291,7 @@ static int ext4_write_end(struct file *file, copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* - * it's important to update i_size while still holding page lock: + * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree @@ -1329,15 +1299,15 @@ static int ext4_write_end(struct file *file, */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling + * Don't mark the inode dirty under folio lock. First, it unnecessarily + * makes the holding time of folio lock longer. Second, it forces lock + * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) @@ -1371,28 +1341,28 @@ static int ext4_write_end(struct file *file, /* * This is a private version of page_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need - * to call ext4_handle_dirty_metadata() instead. + * to call ext4_dirty_journalled_data() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, - struct page *page, + struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; - zero_user(page, start, size); + folio_zero_range(folio, start, size); write_end_fn(handle, inode, bh); } clear_buffer_new(bh); @@ -1408,6 +1378,7 @@ static int ext4_journalled_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1426,25 +1397,26 @@ static int ext4_journalled_write_end(struct file *file, if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, page); - if (unlikely(copied < len) && !PageUptodate(page)) { + if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; - ext4_journalled_zero_new_buffers(handle, inode, page, from, to); + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); } else { if (unlikely(copied < len)) - ext4_journalled_zero_new_buffers(handle, inode, page, + ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); - ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), + ret = ext4_walk_page_buffers(handle, inode, + folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); - ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); @@ -1568,6 +1540,7 @@ struct mpage_da_data { struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; + unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -1649,12 +1622,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); -} - /* * ext4_insert_delayed_block - adds a delayed block to the extents status * tree, incrementing the reserved cluster/block @@ -1887,249 +1854,41 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static int __ext4_journalled_writepage(struct page *page, - unsigned int len) +static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - handle_t *handle = NULL; - int ret = 0, err = 0; - int inline_data = ext4_has_inline_data(inode); - struct buffer_head *inode_bh = NULL; - loff_t size; - - ClearPageChecked(page); - - if (inline_data) { - BUG_ON(page->index != 0); - BUG_ON(len > ext4_get_max_inline_size(inode)); - inode_bh = ext4_journalled_write_inline_data(inode, len, page); - if (inode_bh == NULL) - goto out; - } - /* - * We need to release the page lock before we start the - * journal, so grab a reference so the page won't disappear - * out from under us. - */ - get_page(page); - unlock_page(page); - - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - put_page(page); - goto out_no_pagelock; - } - BUG_ON(!ext4_handle_valid(handle)); - - lock_page(page); - put_page(page); - size = i_size_read(inode); - if (page->mapping != mapping || page_offset(page) > size) { - /* The page got truncated from under us */ - ext4_journal_stop(handle); - ret = 0; - goto out; - } - - if (inline_data) { - ret = ext4_mark_inode_dirty(handle, inode); - } else { - struct buffer_head *page_bufs = page_buffers(page); - - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - - ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, do_journal_get_write_access); - - err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, write_end_fn); - } - if (ret == 0) - ret = err; - err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - - ext4_set_inode_state(inode, EXT4_STATE_JDATA); -out: - unlock_page(page); -out_no_pagelock: - brelse(inode_bh); - return ret; + mpd->first_page += folio_nr_pages(folio); + folio_unlock(folio); } -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), no one guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * This function can get called via... - * - ext4_writepages after taking page lock (have journal handle) - * - journal_submit_inode_data_buffers (no journal handle) - * - shrink_page_list via the kswapd/direct reclaim (no journal handle) - * - grab_page_cache when doing write_begin (have journal handle) - * - * We don't do any block allocation in this function. If we have page with - * multiple blocks we need to write those buffer_heads that are mapped. This - * is important for mmaped based write. So if we do with blocksize 1K - * truncate(f, 1024); - * a = mmap(f, 0, 4096); - * a[0] = 'a'; - * truncate(f, 4096); - * we have in the page first buffer_head mapped via page_mkwrite call back - * but other buffer_heads would be unmapped but dirty (dirty done via the - * do_wp_page). So writepage should write the first block. If we modify - * the mmap area beyond 1024 we will again get a page_fault and the - * page_mkwrite callback will do the block allocation and mark the - * buffer_heads mapped. - * - * We redirty the page if we have any buffer_heads that is either delay or - * unwritten in the page. - * - * We can get recursively called as show below. - * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * But since we don't do any block allocation we should not deadlock. - * Page also have the dirty flag cleared so we don't get recurive page_lock. - */ -static int ext4_writepage(struct page *page, - struct writeback_control *wbc) +static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { - struct folio *folio = page_folio(page); - int ret = 0; - loff_t size; - unsigned int len; - struct buffer_head *page_bufs = NULL; - struct inode *inode = page->mapping->host; - struct ext4_io_submit io_submit; - - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { - folio_invalidate(folio, 0, folio_size(folio)); - folio_unlock(folio); - return -EIO; - } - - trace_ext4_writepage(page); - size = i_size_read(inode); - if (page->index == size >> PAGE_SHIFT && - !ext4_verity_in_progress(inode)) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - - /* Should never happen but for bugs in other kernel subsystems */ - if (!page_has_buffers(page)) { - ext4_warning_inode(inode, - "page %lu does not have buffers attached", page->index); - ClearPageDirty(page); - unlock_page(page); - return 0; - } - - page_bufs = page_buffers(page); - /* - * We cannot do block allocation or other extent handling in this - * function. If there are buffers needing that, we have to redirty - * the page. But we may reach here when we do a journal commit via - * journal_submit_inode_data_buffers() and in that case we must write - * allocated buffers to achieve data=ordered mode guarantees. - * - * Also, if there is only one buffer per page (the fs block - * size == the page size), if one buffer needs block - * allocation or needs to modify the extent tree to clear the - * unwritten flag, we know that the page can't be written at - * all, so we might as well refuse the write immediately. - * Unfortunately if the block size != page size, we can't as - * easily detect this case using ext4_walk_page_buffers(), but - * for the extremely common case, this is an optimization that - * skips a useless round trip through ext4_bio_write_page(). - */ - if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - redirty_page_for_writepage(wbc, page); - if ((current->flags & PF_MEMALLOC) || - (inode->i_sb->s_blocksize == PAGE_SIZE)) { - /* - * For memory cleaning there's no point in writing only - * some buffers. So just bail out. Warn if we came here - * from direct reclaim. - */ - WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) - == PF_MEMALLOC); - unlock_page(page); - return 0; - } - } - - if (PageChecked(page) && ext4_should_journal_data(inode)) - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - return __ext4_journalled_writepage(page, len); - - ext4_io_submit_init(&io_submit, wbc); - io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_submit.io_end) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return -ENOMEM; - } - ret = ext4_bio_write_page(&io_submit, page, len); - ext4_io_submit(&io_submit); - /* Drop io_end reference we got from init */ - ext4_put_io_end_defer(io_submit.io_end); - return ret; -} - -static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) -{ - int len; + size_t len; loff_t size; int err; - BUG_ON(page->index != mpd->first_page); - clear_page_dirty_for_io(page); + BUG_ON(folio->index != mpd->first_page); + folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing - * data through mmap while writeback runs. clear_page_dirty_for_io() + * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get - * written to again until we release page lock. So only after - * clear_page_dirty_for_io() we are safe to sample i_size for - * ext4_bio_write_page() to zero-out tail of the written page. We rely - * on the barrier provided by TestClearPageDirty in - * clear_page_dirty_for_io() to make sure i_size is really sampled only + * written to again until we release folio lock. So only after + * folio_clear_dirty_for_io() we are safe to sample i_size for + * ext4_bio_write_folio() to zero-out tail of the written page. We rely + * on the barrier provided by folio_test_clear_dirty() in + * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); - if (page->index == size >> PAGE_SHIFT && + len = folio_size(folio); + if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - err = ext4_bio_write_page(&mpd->io_submit, page, len); + err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; - mpd->first_page++; return err; } @@ -2240,9 +1999,10 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { - err = mpage_submit_page(mpd, head->b_page); + err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; + mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; @@ -2252,21 +2012,22 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } /* - * mpage_process_page - update page buffers corresponding to changed extent and - * may submit fully mapped page for IO - * - * @mpd - description of extent to map, on return next extent to map - * @m_lblk - logical block mapping. - * @m_pblk - corresponding physical mapping. - * @map_bh - determines on return whether this page requires any further + * mpage_process_folio - update folio buffers corresponding to changed extent + * and may submit fully mapped page for IO + * @mpd: description of extent to map, on return next extent to map + * @folio: Contains these buffers. + * @m_lblk: logical block mapping. + * @m_pblk: corresponding physical mapping. + * @map_bh: determines on return whether this page requires any further * mapping or not. - * Scan given page buffers corresponding to changed extent and update buffer + * + * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. - * If the given page is not fully mapped, we update @map to the next extent in - * the given page that needs mapping & return @map_bh as true. + * If the given folio is not fully mapped, we update @mpd to the next extent in + * the given folio that needs mapping & return @map_bh as true. */ -static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, +static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { @@ -2279,14 +2040,14 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. - * Find next buffer in the page to map. + * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; @@ -2359,9 +2120,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (nr == 0) break; for (i = 0; i < nr; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; - err = mpage_process_page(mpd, page, &lblk, &pblock, + err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh @@ -2371,9 +2132,10 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ - err = mpage_submit_page(mpd, page); + err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; + mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } @@ -2559,17 +2321,45 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } -/* Return true if the page needs to be written as part of transaction commit */ -static bool ext4_page_nomap_can_writeout(struct page *page) +static int ext4_journal_page_buffers(handle_t *handle, struct page *page, + int len) { - struct buffer_head *bh, *head; + struct buffer_head *page_bufs = page_buffers(page); + struct inode *inode = page->mapping->host; + int ret, err; - bh = head = page_buffers(page); - do { - if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) - return true; - } while ((bh = bh->b_this_page) != head); - return false; + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, do_journal_get_write_access); + err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, write_end_fn); + if (ret == 0) + ret = err; + err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); + if (ret == 0) + ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + + return ret; +} + +static int mpage_journal_page_buffers(handle_t *handle, + struct mpage_da_data *mpd, + struct page *page) +{ + struct inode *inode = mpd->inode; + loff_t size = i_size_read(inode); + int len; + + ClearPageChecked(page); + mpd->wbc->nr_to_write--; + + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(inode)) + len = size & ~PAGE_MASK; + else + len = PAGE_SIZE; + + return ext4_journal_page_buffers(handle, page, len); } /* @@ -2597,7 +2387,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; - long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; @@ -2605,14 +2394,23 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; + handle_t *handle = NULL; + int bpp = ext4_journal_blocks_per_page(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; - folio_batch_init(&fbatch); + mpd->map.m_len = 0; mpd->next_page = index; + if (ext4_should_journal_data(mpd->inode)) { + handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, + bpp); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } + folio_batch_init(&fbatch); while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); @@ -2630,13 +2428,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ - if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) + if (mpd->wbc->sync_mode == WB_SYNC_NONE && + mpd->wbc->nr_to_write <= + mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; + if (handle) { + err = ext4_journal_ensure_credits(handle, bpp, + 0); + if (err < 0) + goto out; + } + folio_lock(folio); /* * If the page is no longer dirty, or its mapping no @@ -2676,18 +2483,28 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->first_page = folio->index; mpd->next_page = folio->index + folio_nr_pages(folio); /* - * Writeout for transaction commit where we cannot - * modify metadata is simple. Just submit the page. + * Writeout when we cannot modify metadata is simple. + * Just submit the page. For data=journal mode we + * first handle writeout of the page for checkpoint and + * only after that handle delayed page dirtying. This + * makes sure current data is checkpointed to the final + * location before possibly journalling it again which + * is desirable when the page is frequently dirtied + * through a pin. */ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(&folio->page)) { - err = mpage_submit_page(mpd, &folio->page); + err = mpage_submit_folio(mpd, folio); + if (err < 0) + goto out; + /* Pending dirtying of journalled data? */ + if (folio_test_checked(folio)) { + err = mpage_journal_page_buffers(handle, + mpd, &folio->page); if (err < 0) goto out; - } else { - folio_unlock(folio); - mpd->first_page += folio_nr_pages(folio); + mpd->journalled_more_data = 1; } + mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << @@ -2699,24 +2516,21 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; err = 0; } - left -= folio_nr_pages(folio); } folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; + if (handle) + ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); + if (handle) + ext4_journal_stop(handle); return err; } -static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc, - void *data) -{ - return ext4_writepage(&folio->page, wbc); -} - static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; @@ -2742,13 +2556,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; - if (ext4_should_journal_data(inode)) { - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); - blk_finish_plug(&plug); - goto out_writepages; - } - /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that @@ -2783,6 +2590,26 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) ext4_journal_stop(handle); } + /* + * data=journal mode does not do delalloc so we just need to writeout / + * journal already mapped buffers. On the other hand we need to commit + * transaction to make data stable. We expect all the data to be + * already in the journal (the only exception are DMA pinned pages + * dirtied behind our back) so we commit transaction here and run the + * writeback loop to checkpoint them. The checkpointing is not actually + * necessary to make data persistent *but* quite a few places (extent + * shifting operations, fsverity, ...) depend on being able to drop + * pagecache pages after calling filemap_write_and_wait() and for that + * checkpointing needs to happen. + */ + if (ext4_should_journal_data(inode)) { + mpd->can_map = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + ext4_fc_commit(sbi->s_journal, + EXT4_I(inode)->i_datasync_tid); + } + mpd->journalled_more_data = 0; + if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in @@ -2956,13 +2783,21 @@ static int ext4_writepages(struct address_space *mapping, .can_map = 1, }; int ret; + int alloc_ctx; if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return -EIO; - percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); - percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); + /* + * For data=journal writeback we could have come across pages marked + * for delayed dirtying (PageChecked) which were just added to the + * running transaction. Try once more to get them to stable storage. + */ + if (!ret && mpd.journalled_more_data) + ret = ext4_do_writepages(&mpd); + ext4_writepages_up_read(sb, alloc_ctx); return ret; } @@ -2990,17 +2825,18 @@ static int ext4_dax_writepages(struct address_space *mapping, long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + int alloc_ctx; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - percpu_down_read(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_writepages_rwsem); + ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } @@ -3043,7 +2879,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int ret, retries = 0; - struct page *page; + struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; @@ -3070,22 +2906,22 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - /* In case writeback began while the page was unlocked */ - wait_for_stable_page(page); + /* In case writeback began while the folio was unlocked */ + folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION - ret = ext4_block_write_begin(page, pos, len, - ext4_da_get_block_prep); + ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else - ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); + ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); #endif if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -3100,7 +2936,7 @@ retry: return ret; } - *pagep = page; + *pagep = &folio->page; return ret; } @@ -3148,6 +2984,9 @@ static int ext4_da_write_end(struct file *file, ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, page); + if (unlikely(copied < len) && !PageUptodate(page)) + copied = 0; + start = pos & (PAGE_SIZE - 1); end = start + copied - 1; @@ -3159,9 +2998,8 @@ static int ext4_da_write_end(struct file *file, * i_disksize since writeback will push i_disksize upto i_size * eventually. If the end of the current write is > i_size and * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as neither - * ext4_writepage() nor certain ext4_writepages() paths not - * allocating blocks update i_disksize. + * check), we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks update i_disksize. * * Note that we defer inode dirtying to generic_write_end() / * ext4_da_write_inline_data_end(). @@ -3235,9 +3073,7 @@ int ext4_alloc_da_blocks(struct inode *inode) static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - journal_t *journal; sector_t ret = 0; - int err; inode_lock_shared(inode); /* @@ -3247,45 +3083,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - test_opt(inode->i_sb, DELALLOC)) { + (test_opt(inode->i_sb, DELALLOC) || + ext4_should_journal_data(inode))) { /* - * With delalloc we want to sync the file - * so that we can make sure we allocate - * blocks for file + * With delalloc or journalled data we want to sync the file so + * that we can make sure we allocate blocks for file and data + * is in place for the user to see it */ filemap_write_and_wait(mapping); } - if (EXT4_JOURNAL(inode) && - ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { - /* - * This is a REALLY heavyweight approach, but the use of - * bmap on dirty files is expected to be extremely rare: - * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. - * - * (bmap requires CAP_SYS_RAWIO so this does not - * represent an unprivileged user DOS attack --- we'd be - * in trouble if mortal users could trigger this path at - * will.) - * - * NB. EXT4_STATE_JDATA is not set on files other than - * regular files. If somebody wants to bmap a directory - * or symlink and gets confused because the buffer - * hasn't yet been flushed to disk, they deserve - * everything they get. - */ - - ext4_clear_inode_state(inode, EXT4_STATE_JDATA); - journal = EXT4_JOURNAL(inode); - jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal, 0); - jbd2_journal_unlock_updates(journal); - - if (err) - goto out; - } - ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: @@ -3295,17 +3102,16 @@ out: static int ext4_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; int ret = -EAGAIN; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; - trace_ext4_readpage(page); + trace_ext4_readpage(&folio->page); if (ext4_has_inline_data(inode)) - ret = ext4_readpage_inline(inode, page); + ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) - return ext4_mpage_readpages(inode, NULL, page); + return ext4_mpage_readpages(inode, NULL, folio); return ret; } @@ -3571,7 +3377,7 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); - WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); + WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } @@ -3686,24 +3492,26 @@ const struct iomap_ops ext4_iomap_report_ops = { }; /* - * Whenever the folio is being dirtied, corresponding buffers should already - * be attached to the transaction (we take care of this in ext4_page_mkwrite() - * and ext4_write_begin()). However we cannot move buffers to dirty transaction - * lists here because ->dirty_folio is called under VFS locks and the folio - * is not necessarily locked. - * - * We cannot just dirty the folio and leave attached buffers clean, because the - * buffers' dirty state is "definitive". We cannot just set the buffers dirty - * or jbddirty because all the journalling code will explode. - * - * So what we do is to mark the folio "pending dirty" and next time writepage - * is called, propagate that into the buffers appropriately. + * For data=journal mode, folio should be marked dirty only when it was + * writeably mapped. When that happens, it was already attached to the + * transaction and marked as jbddirty (we take care of this in + * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings + * so we should have nothing to do here, except for the case when someone + * had the page pinned and dirtied the page through this pin (e.g. by doing + * direct IO to it). In that case we'd need to attach buffers here to the + * transaction but we cannot due to lock ordering. We cannot just dirty the + * folio and leave attached buffers clean, because the buffers' dirty state is + * "definitive". We cannot just set the buffers dirty or jbddirty because all + * the journalling code will explode. So what we do is to mark the folio + * "pending dirty" and next time ext4_writepages() is called, attach buffers + * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); - folio_set_checked(folio); + if (folio_maybe_dma_pinned(folio)) + folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } @@ -3809,23 +3617,26 @@ static int __ext4_block_zero_page_range(handle_t *handle, ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; - struct page *page; + struct folio *folio; int err = 0; - page = find_or_create_page(mapping, from >> PAGE_SHIFT, - mapping_gfp_constraint(mapping, ~__GFP_FS)); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (IS_ERR(folio)) + return PTR_ERR(folio); blocksize = inode->i_sb->s_blocksize; iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + bh = folio_buffers(folio); + if (!bh) { + create_empty_buffers(&folio->page, blocksize, 0); + bh = folio_buffers(folio); + } /* Find the buffer that contains "offset" */ - bh = page_buffers(page); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -3847,7 +3658,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, } /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { @@ -3857,7 +3668,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); - err = fscrypt_decrypt_pagecache_blocks(page_folio(page), + err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { @@ -3873,11 +3684,11 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (err) goto unlock; } - zero_user(page, offset, length); + folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); @@ -3887,8 +3698,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, } unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } @@ -5385,7 +5196,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then - * confuse e.g. concurrent ext4_writepage() seeing dirty folio without + * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. @@ -5395,7 +5206,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); - if (!folio) + if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); @@ -6119,7 +5930,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) journal_t *journal; handle_t *handle; int err; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int alloc_ctx; /* * We have to be very careful here: changing a data block's @@ -6157,7 +5968,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } } - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); /* @@ -6174,7 +5985,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); @@ -6182,7 +5993,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); if (val) filemap_invalidate_unlock(inode->i_mapping); @@ -6212,7 +6023,7 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); loff_t size; unsigned long len; int err; @@ -6256,19 +6067,18 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) goto out_ret; } - lock_page(page); + folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ - if (page->mapping != mapping || page_offset(page) > size) { - unlock_page(page); + if (folio->mapping != mapping || folio_pos(folio) > size) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time @@ -6276,17 +6086,17 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. */ - if (page_has_buffers(page)) { - if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), + if (folio_buffers(folio)) { + if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ - wait_for_stable_page(page); + folio_wait_stable(folio); ret = VM_FAULT_LOCKED; goto out; } } - unlock_page(page); + folio_unlock(folio); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; @@ -6307,36 +6117,25 @@ retry_alloc: if (!ext4_should_journal_data(inode)) { err = block_page_mkwrite(vma, vmf, get_block); } else { - lock_page(page); + folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ - if (page->mapping != mapping || page_offset(page) > size) { + if (folio->mapping != mapping || folio_pos(folio) > size) { ret = VM_FAULT_NOPAGE; goto out_error; } - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); - err = __block_write_begin(page, 0, len, ext4_get_block); + err = __block_write_begin(&folio->page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_walk_page_buffers(handle, inode, - page_buffers(page), 0, len, NULL, - do_journal_get_write_access)) + if (ext4_journal_page_buffers(handle, &folio->page, len)) goto out_error; - if (ext4_walk_page_buffers(handle, inode, - page_buffers(page), 0, len, NULL, - write_end_fn)) - goto out_error; - if (ext4_jbd2_inode_add_write(handle, inode, - page_offset(page), len)) - goto out_error; - ext4_set_inode_state(inode, EXT4_STATE_JDATA); } else { - unlock_page(page); + folio_unlock(folio); } } ext4_journal_stop(handle); @@ -6349,7 +6148,7 @@ out: sb_end_pagefault(inode->i_sb); return ret; out_error: - unlock_page(page); + folio_unlock(folio); ext4_journal_stop(handle); goto out; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5b2ae37a8b80..7b2e36d103cb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -745,6 +745,8 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); + if (!grp) + return NULL; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; @@ -1060,9 +1062,9 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, - void *buddy, void *bitmap, ext4_group_t group) + void *buddy, void *bitmap, ext4_group_t group, + struct ext4_group_info *grp) { - struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; @@ -1168,10 +1170,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, gfp); - if (bh == NULL) { - err = -ENOMEM; - goto out; - } + if (bh == NULL) + return -ENOMEM; } else bh = &bhs; @@ -1183,6 +1183,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) break; grinfo = ext4_get_group_info(sb, group); + if (!grinfo) + continue; /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so @@ -1248,6 +1250,10 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) group, page->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); + if (!grinfo) { + err = -EFSCORRUPTED; + goto out; + } grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * @@ -1258,7 +1264,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) ext4_lock_group(sb, group); /* init the buddy */ memset(data, 0xff, blocksize); - ext4_mb_generate_buddy(sb, data, incore, group); + ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ext4_unlock_group(sb, group); incore = NULL; } else { @@ -1372,6 +1378,9 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) might_sleep(); mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); + if (!this_grp) + return -EFSCORRUPTED; + /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already @@ -1446,6 +1455,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); + if (!grp) + return -EFSCORRUPTED; e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = grp; @@ -1489,7 +1500,13 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, put_page(page); page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { - BUG_ON(page->mapping != inode->i_mapping); + if (WARN_RATELIMIT(page->mapping != inode->i_mapping, + "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { + /* should never happen */ + unlock_page(page); + ret = -EINVAL; + goto err; + } if (!PageUptodate(page)) { ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) { @@ -1525,7 +1542,13 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, put_page(page); page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { - BUG_ON(page->mapping != inode->i_mapping); + if (WARN_RATELIMIT(page->mapping != inode->i_mapping, + "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { + /* should never happen */ + unlock_page(page); + ret = -EINVAL; + goto err; + } if (!PageUptodate(page)) { ret = ext4_mb_init_cache(page, e4b->bd_bitmap, gfp); @@ -1557,8 +1580,7 @@ err: put_page(page); if (e4b->bd_bitmap_page) put_page(e4b->bd_bitmap_page); - if (e4b->bd_buddy_page) - put_page(e4b->bd_buddy_page); + e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; return ret; @@ -1721,7 +1743,8 @@ static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) break; order++; - if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + buddy2 = mb_find_buddy(e4b, order, &max); + if (!buddy2) { mb_clear_bits(buddy, first, last - first + 1); e4b->bd_info->bb_counters[order - 1] += last - first + 1; break; @@ -2021,8 +2044,6 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; - struct ext4_free_extent ex; - int max; if (ac->ac_status == AC_STATUS_FOUND) return; @@ -2041,17 +2062,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, if (bex->fe_len < gex->fe_len) return; - if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) - && bex->fe_group == e4b->bd_group) { - /* recheck chunk's availability - we don't know - * when it was found (within this lock-unlock - * period or not) */ - max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); - if (max >= gex->fe_len) { - ext4_mb_use_best_found(ac, e4b); - return; - } - } + if (finish_group) + ext4_mb_use_best_found(ac, e4b); } /* @@ -2124,7 +2136,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, } static noinline_for_stack -int ext4_mb_try_best_found(struct ext4_allocation_context *ac, +void ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; @@ -2135,7 +2147,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, BUG_ON(ex.fe_len <= 0); err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) - return err; + return; ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); @@ -2147,8 +2159,6 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); - - return 0; } static noinline_for_stack @@ -2162,7 +2172,9 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; - if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) + if (!grp) + return -EFSCORRUPTED; + if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) return 0; if (grp->bb_free == 0) return 0; @@ -2236,7 +2248,9 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, continue; buddy = mb_find_buddy(e4b, i, &max); - BUG_ON(buddy == NULL); + if (WARN_RATELIMIT(buddy == NULL, + "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) + continue; k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { @@ -2386,7 +2400,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < 0 || cr >= 4); - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) return false; free = grp->bb_free; @@ -2455,6 +2469,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_grpblk_t free; int ret = 0; + if (!grp) + return -EFSCORRUPTED; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) { @@ -2535,7 +2551,7 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, * prefetch once, so we avoid getblk() call, which can * be expensive. */ - if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) && + if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 && !(ext4_has_group_desc_csum(sb) && @@ -2569,17 +2585,17 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr) { - while (nr-- > 0) { - struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, - NULL); - struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_group_desc *gdp; + struct ext4_group_info *grp; + while (nr-- > 0) { if (!group) group = ext4_get_groups_count(sb); group--; + gdp = ext4_get_group_desc(sb, group, NULL); grp = ext4_get_group_info(sb, group); - if (EXT4_MB_GRP_NEED_INIT(grp) && + if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 && !(ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { @@ -2838,6 +2854,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); + if (!grinfo) + return 0; /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); @@ -2848,7 +2866,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) buddy_loaded = 1; } - memcpy(&sg, ext4_get_group_info(sb, group), i); + memcpy(&sg, grinfo, i); if (buddy_loaded) ext4_mb_unload_buddy(&e4b); @@ -3084,7 +3102,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); - goto exit_meta_group_info; + return -ENOMEM; } rcu_read_lock(); rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; @@ -3138,7 +3156,6 @@ exit_group_info: group_info[idx] = NULL; rcu_read_unlock(); } -exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ @@ -3210,8 +3227,12 @@ static int ext4_mb_init_backend(struct super_block *sb) err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); - while (i-- > 0) - kmem_cache_free(cachep, ext4_get_group_info(sb, i)); + while (i-- > 0) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + + if (grp) + kmem_cache_free(cachep, grp); + } i = sbi->s_group_info_size; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); @@ -3419,7 +3440,6 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file @@ -3525,6 +3545,8 @@ int ext4_mb_release(struct super_block *sb) for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); + if (!grinfo) + continue; mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); count = ext4_mb_cleanup_pa(grinfo); @@ -3606,7 +3628,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, { struct ext4_buddy e4b; struct ext4_group_info *db; - int err, count = 0, count2 = 0; + int err, count = 0; mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); @@ -3622,7 +3644,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb, db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; - count2++; ext4_lock_group(sb, entry->efd_group); /* Take it out of per group rb tree */ rb_erase(&entry->efd_node, &(db->bb_free_root)); @@ -3647,8 +3668,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); - mb_debug(sb, "freed %d blocks in %d structures\n", count, - count2); + mb_debug(sb, "freed %d blocks in 1 structures\n", count); } /* @@ -3757,9 +3777,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto out_err; + return PTR_ERR(bitmap_bh); } BUFFER_TRACE(bitmap_bh, "getting write access"); @@ -3822,7 +3840,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_group_clusters_set(sb, gdp, len); - ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); @@ -3929,7 +3947,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, clen = ext4_free_group_clusters(sb, gdp) + clen_changed; ext4_free_group_clusters_set(sb, gdp, clen); - ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); @@ -3985,6 +4003,197 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) } /* + * This function returns the next element to look at during inode + * PA rbtree walk. We assume that we have held the inode PA rbtree lock + * (ei->i_prealloc_lock) + * + * new_start The start of the range we want to compare + * cur_start The existing start that we are comparing against + * node The node of the rb_tree + */ +static inline struct rb_node* +ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) +{ + if (new_start < cur_start) + return node->rb_left; + else + return node->rb_right; +} + +static inline void +ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, + ext4_lblk_t start, ext4_lblk_t end) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *tmp_pa; + ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct rb_node *iter; + + read_lock(&ei->i_prealloc_lock); + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) + BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); + spin_unlock(&tmp_pa->pa_lock); + } + read_unlock(&ei->i_prealloc_lock); +} + +/* + * Given an allocation context "ac" and a range "start", "end", check + * and adjust boundaries if the range overlaps with any of the existing + * preallocatoins stored in the corresponding inode of the allocation context. + * + * Parameters: + * ac allocation context + * start start of the new range + * end end of the new range + */ +static inline void +ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, + ext4_lblk_t *start, ext4_lblk_t *end) +{ + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; + struct rb_node *iter; + ext4_lblk_t new_start, new_end; + ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; + + new_start = *start; + new_end = *end; + + /* + * Adjust the normalized range so that it doesn't overlap with any + * existing preallocated blocks(PAs). Make sure to hold the rbtree lock + * so it doesn't change underneath us. + */ + read_lock(&ei->i_prealloc_lock); + + /* Step 1: find any one immediate neighboring PA of the normalized range */ + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, + tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + + /* PA must not overlap original request */ + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) + BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || + ac->ac_o_ex.fe_logical < tmp_pa_start)); + spin_unlock(&tmp_pa->pa_lock); + } + + /* + * Step 2: check if the found PA is left or right neighbor and + * get the other neighbor + */ + if (tmp_pa) { + if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { + struct rb_node *tmp; + + left_pa = tmp_pa; + tmp = rb_next(&left_pa->pa_node.inode_node); + if (tmp) { + right_pa = rb_entry(tmp, + struct ext4_prealloc_space, + pa_node.inode_node); + } + } else { + struct rb_node *tmp; + + right_pa = tmp_pa; + tmp = rb_prev(&right_pa->pa_node.inode_node); + if (tmp) { + left_pa = rb_entry(tmp, + struct ext4_prealloc_space, + pa_node.inode_node); + } + } + } + + /* Step 3: get the non deleted neighbors */ + if (left_pa) { + for (iter = &left_pa->pa_node.inode_node;; + iter = rb_prev(iter)) { + if (!iter) { + left_pa = NULL; + break; + } + + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + left_pa = tmp_pa; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + spin_unlock(&tmp_pa->pa_lock); + break; + } + spin_unlock(&tmp_pa->pa_lock); + } + } + + if (right_pa) { + for (iter = &right_pa->pa_node.inode_node;; + iter = rb_next(iter)) { + if (!iter) { + right_pa = NULL; + break; + } + + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + right_pa = tmp_pa; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + spin_unlock(&tmp_pa->pa_lock); + break; + } + spin_unlock(&tmp_pa->pa_lock); + } + } + + if (left_pa) { + left_pa_end = + left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); + BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); + } + + if (right_pa) { + right_pa_start = right_pa->pa_lstart; + BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); + } + + /* Step 4: trim our normalized range to not overlap with the neighbors */ + if (left_pa) { + if (left_pa_end > new_start) + new_start = left_pa_end; + } + + if (right_pa) { + if (right_pa_start < new_end) + new_end = right_pa_start; + } + read_unlock(&ei->i_prealloc_lock); + + /* XXX: extra loop to check we really don't overlap preallocations */ + ext4_mb_pa_assert_overlap(ac, new_start, new_end); + + *start = new_start; + *end = new_end; +} + +/* * Normalization means making request better in terms of * size and alignment */ @@ -3993,13 +4202,12 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_super_block *es = sbi->s_es; int bsbits, max; ext4_lblk_t end; loff_t size, start_off; loff_t orig_size __maybe_unused; ext4_lblk_t start; - struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); - struct ext4_prealloc_space *pa; /* do normalize only data requests, metadata requests do not need preallocation */ @@ -4068,7 +4276,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = 8 * 1024 * 1024; } else { start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; - size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), + size = (loff_t) EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; @@ -4100,61 +4308,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, end = start + size; - /* check we don't cross already preallocated blocks */ - rcu_read_lock(); - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - ext4_lblk_t pa_end; - - if (pa->pa_deleted) - continue; - spin_lock(&pa->pa_lock); - if (pa->pa_deleted) { - spin_unlock(&pa->pa_lock); - continue; - } - - pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), - pa->pa_len); - - /* PA must not overlap original request */ - BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || - ac->ac_o_ex.fe_logical < pa->pa_lstart)); + ext4_mb_pa_adjust_overlap(ac, &start, &end); - /* skip PAs this normalized request doesn't overlap with */ - if (pa->pa_lstart >= end || pa_end <= start) { - spin_unlock(&pa->pa_lock); - continue; - } - BUG_ON(pa->pa_lstart <= start && pa_end >= end); - - /* adjust start or end to be adjacent to this pa */ - if (pa_end <= ac->ac_o_ex.fe_logical) { - BUG_ON(pa_end < start); - start = pa_end; - } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { - BUG_ON(pa->pa_lstart > end); - end = pa->pa_lstart; - } - spin_unlock(&pa->pa_lock); - } - rcu_read_unlock(); size = end - start; - /* XXX: extra loop to check we really don't overlap preallocations */ - rcu_read_lock(); - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - ext4_lblk_t pa_end; - - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0) { - pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), - pa->pa_len); - BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); - } - spin_unlock(&pa->pa_lock); - } - rcu_read_unlock(); - /* * In this function "start" and "size" are normalized for better * alignment and length such that we could preallocate more blocks. @@ -4165,7 +4322,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, * provide gurantee on number of contiguous blocks allocation since that * depends upon free space left, etc). * In case of inode pa, later we use the allocated blocks - * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated + * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated * range of goal/best blocks [start, size] to put it at the * ac_o_ex.fe_logical extent of this inode. * (See ext4_mb_use_inode_pa() for more details) @@ -4188,18 +4345,21 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ - if (ar->pright && (ar->lright == (start + size))) { + if (ar->pright && (ar->lright == (start + size)) && + ar->pright >= size && + ar->pright - size >= le32_to_cpu(es->s_first_data_block)) { /* merge to the right */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, - &ac->ac_f_ex.fe_group, - &ac->ac_f_ex.fe_start); + &ac->ac_g_ex.fe_group, + &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - if (ar->pleft && (ar->lleft + 1 == start)) { + if (ar->pleft && (ar->lleft + 1 == start) && + ar->pleft + 1 < ext4_blocks_count(es)) { /* merge to the left */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, - &ac->ac_f_ex.fe_group, - &ac->ac_f_ex.fe_start); + &ac->ac_g_ex.fe_group, + &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } @@ -4247,15 +4407,14 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) if (ac->ac_f_ex.fe_len == 0) return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); - if (err) { + if (WARN_RATELIMIT(err, + "ext4: mb_load_buddy failed (%d)", err)) /* * This should never happen since we pin the * pages in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. */ - WARN(1, "mb_load_buddy failed (%d)", err); return; - } ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); @@ -4263,8 +4422,11 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) ext4_mb_unload_buddy(&e4b); return; } - if (pa->pa_type == MB_INODE_PA) + if (pa->pa_type == MB_INODE_PA) { + spin_lock(&pa->pa_lock); pa->pa_free += ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + } } /* @@ -4292,6 +4454,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(start < pa->pa_pstart); BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); + BUG_ON(ac->ac_b_ex.fe_len <= 0); pa->pa_free -= len; mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); @@ -4312,14 +4475,14 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; - /* we don't correct pa_pstart or pa_plen here to avoid + /* we don't correct pa_pstart or pa_len here to avoid * possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", - pa->pa_lstart-len, len, pa); + pa->pa_lstart, len, pa); } /* @@ -4361,7 +4524,9 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa, *cpa = NULL; + struct ext4_prealloc_space *tmp_pa, *cpa = NULL; + ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct rb_node *iter; ext4_fsblk_t goal_block; /* only data can be preallocated */ @@ -4369,35 +4534,47 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) return false; /* first, try per-file preallocation */ - rcu_read_lock(); - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + read_lock(&ei->i_prealloc_lock); + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, + tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); /* all fields in this condition don't change, * so we can skip locking for them */ - if (ac->ac_o_ex.fe_logical < pa->pa_lstart || - ac->ac_o_ex.fe_logical >= (pa->pa_lstart + - EXT4_C2B(sbi, pa->pa_len))) + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + + /* original request start doesn't lie in this PA */ + if (ac->ac_o_ex.fe_logical < tmp_pa_start || + ac->ac_o_ex.fe_logical >= tmp_pa_end) continue; /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && - (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > - EXT4_MAX_BLOCK_FILE_PHYS)) - continue; + (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) { + /* + * Since PAs don't overlap, we won't find any + * other PA to satisfy this. + */ + break; + } /* found preallocated blocks, use them */ - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0 && pa->pa_free) { - atomic_inc(&pa->pa_count); - ext4_mb_use_inode_pa(ac, pa); - spin_unlock(&pa->pa_lock); + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) { + atomic_inc(&tmp_pa->pa_count); + ext4_mb_use_inode_pa(ac, tmp_pa); + spin_unlock(&tmp_pa->pa_lock); ac->ac_criteria = 10; - rcu_read_unlock(); + read_unlock(&ei->i_prealloc_lock); return true; } - spin_unlock(&pa->pa_lock); + spin_unlock(&tmp_pa->pa_lock); } - rcu_read_unlock(); + read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) @@ -4419,16 +4596,16 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); - list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], - pa_inode_list) { - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0 && - pa->pa_free >= ac->ac_o_ex.fe_len) { + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], + pa_node.lg_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0 && + tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { cpa = ext4_mb_check_group_pa(goal_block, - pa, cpa); + tmp_pa, cpa); } - spin_unlock(&pa->pa_lock); + spin_unlock(&tmp_pa->pa_lock); } rcu_read_unlock(); } @@ -4454,6 +4631,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, struct ext4_free_data *entry; grp = ext4_get_group_info(sb, group); + if (!grp) + return; n = rb_first(&(grp->bb_free_root)); while (n) { @@ -4481,6 +4660,9 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, int preallocated = 0; int len; + if (!grp) + return; + /* all form of preallocation discards first load group, * so the only competing code is preallocation use. * we don't need any locking here @@ -4525,16 +4707,22 @@ static void ext4_mb_mark_pa_deleted(struct super_block *sb, } } -static void ext4_mb_pa_callback(struct rcu_head *head) +static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) { - struct ext4_prealloc_space *pa; - pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); - + BUG_ON(!pa); BUG_ON(atomic_read(&pa->pa_count)); BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } +static void ext4_mb_pa_callback(struct rcu_head *head) +{ + struct ext4_prealloc_space *pa; + + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + ext4_mb_pa_free(pa); +} + /* * drops a reference to preallocated space descriptor * if this was the last reference and the space is consumed @@ -4544,6 +4732,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, { ext4_group_t grp; ext4_fsblk_t grp_blk; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); @@ -4588,11 +4777,42 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); + if (pa->pa_type == MB_INODE_PA) { + write_lock(pa->pa_node_lock.inode_lock); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + write_unlock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_free(pa); + } else { + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) +{ + struct rb_node **iter = &root->rb_node, *parent = NULL; + struct ext4_prealloc_space *iter_pa, *new_pa; + ext4_lblk_t iter_start, new_start; + + while (*iter) { + iter_pa = rb_entry(*iter, struct ext4_prealloc_space, + pa_node.inode_node); + new_pa = rb_entry(new, struct ext4_prealloc_space, + pa_node.inode_node); + iter_start = iter_pa->pa_lstart; + new_start = new_pa->pa_lstart; + + parent = *iter; + if (new_start < iter_start) + iter = &((*iter)->rb_left); + else + iter = &((*iter)->rb_right); + } - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + rb_link_node(new, parent, iter); + rb_insert_color(new, root); } /* @@ -4616,10 +4836,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { - int winl; - int wins; - int win; - int offs; + int new_bex_start; + int new_bex_end; /* we can't allocate as much as normalizer wants. * so, found space must get proper lstart @@ -4627,38 +4845,47 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); - /* we're limited by original request in that - * logical block must be covered any way - * winl is window we can move our chunk within */ - winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; + /* + * Use the below logic for adjusting best extent as it keeps + * fragmentation in check while ensuring logical range of best + * extent doesn't overflow out of goal extent: + * + * 1. Check if best ex can be kept at end of goal and still + * cover original start + * 2. Else, check if best ex can be kept at start of goal and + * still cover original start + * 3. Else, keep the best ex at start of original request. + */ + new_bex_end = ac->ac_g_ex.fe_logical + + EXT4_C2B(sbi, ac->ac_g_ex.fe_len); + new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (ac->ac_o_ex.fe_logical >= new_bex_start) + goto adjust_bex; - /* also, we should cover whole original request */ - wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); + new_bex_start = ac->ac_g_ex.fe_logical; + new_bex_end = + new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (ac->ac_o_ex.fe_logical < new_bex_end) + goto adjust_bex; - /* the smallest one defines real window */ - win = min(winl, wins); + new_bex_start = ac->ac_o_ex.fe_logical; + new_bex_end = + new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - offs = ac->ac_o_ex.fe_logical % - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (offs && offs < win) - win = offs; +adjust_bex: + ac->ac_b_ex.fe_logical = new_bex_start; - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - - EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); + BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + + EXT4_C2B(sbi, ac->ac_g_ex.fe_len))); } - /* preallocation can change ac_b_ex, thus we store actually - * allocated blocks for history */ - ac->ac_f_ex = ac->ac_b_ex; - pa->pa_lstart = ac->ac_b_ex.fe_logical; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); - INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; @@ -4667,20 +4894,22 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); - ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &sbi->s_mb_preallocated); + ext4_mb_use_inode_pa(ac, pa); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + if (!grp) + return; - pa->pa_obj_lock = &ei->i_prealloc_lock; + pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - spin_lock(pa->pa_obj_lock); - list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_obj_lock); + write_lock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); + write_unlock(pa->pa_node_lock.inode_lock); atomic_inc(&ei->i_prealloc_active); } @@ -4703,16 +4932,12 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; - /* preallocation can change ac_b_ex, thus we store actually - * allocated blocks for history */ - ac->ac_f_ex = ac->ac_b_ex; - pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); - INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_node.lg_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; @@ -4725,10 +4950,12 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + if (!grp) + return; lg = ac->ac_lg; BUG_ON(lg == NULL); - pa->pa_obj_lock = &lg->lg_prealloc_lock; + pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); @@ -4820,7 +5047,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { + ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", + e4b->bd_group, group, pa->pa_pstart); + return 0; + } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); @@ -4846,9 +5077,12 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_prealloc_space *pa, *tmp; struct list_head list; struct ext4_buddy e4b; + struct ext4_inode_info *ei; int err; int free = 0; + if (!grp) + return 0; mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) goto out_dbg; @@ -4904,17 +5138,26 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { /* remove from object (inode or locality group) */ - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); + if (pa->pa_type == MB_GROUP_PA) { + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); + } else { + write_lock(pa->pa_node_lock.inode_lock); + ei = EXT4_I(pa->pa_inode); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + write_unlock(pa->pa_node_lock.inode_lock); + } - if (pa->pa_type == MB_GROUP_PA) + list_del(&pa->u.pa_tmp_list); + + if (pa->pa_type == MB_GROUP_PA) { ext4_mb_release_group_pa(&e4b, pa); - else + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } else { ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); - - list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + ext4_mb_pa_free(pa); + } } ext4_unlock_group(sb, group); @@ -4944,10 +5187,10 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; + struct rb_node *iter; int err; if (!S_ISREG(inode->i_mode)) { - /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ return; } @@ -4966,17 +5209,19 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) repeat: /* first, collect all pa's in the inode */ - spin_lock(&ei->i_prealloc_lock); - while (!list_empty(&ei->i_prealloc_list) && needed) { - pa = list_entry(ei->i_prealloc_list.prev, - struct ext4_prealloc_space, pa_inode_list); - BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); + write_lock(&ei->i_prealloc_lock); + for (iter = rb_first(&ei->i_prealloc_node); iter && needed; + iter = rb_next(iter)) { + pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); + spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! used pa while discarding"); WARN_ON(1); @@ -4987,7 +5232,7 @@ repeat: if (pa->pa_deleted == 0) { ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); - list_del_rcu(&pa->pa_inode_list); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); needed--; continue; @@ -4995,7 +5240,7 @@ repeat: /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from @@ -5012,7 +5257,7 @@ repeat: schedule_timeout_uninterruptible(HZ); goto repeat; } - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); @@ -5044,7 +5289,7 @@ repeat: put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + ext4_mb_pa_free(pa); } } @@ -5061,14 +5306,20 @@ static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) return 0; } -static void ext4_mb_pa_free(struct ext4_allocation_context *ac) +static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; BUG_ON(!pa); ac->ac_pa = NULL; WARN_ON(!atomic_dec_and_test(&pa->pa_count)); - kmem_cache_free(ext4_pspace_cachep, pa); + /* + * current function is only called due to an error or due to + * len of found blocks < len of requested blocks hence the PA has not + * been added to grp->bb_prealloc_list. So we don't need to lock it + */ + pa->pa_deleted = 1; + ext4_mb_pa_free(pa); } #ifdef CONFIG_EXT4_DEBUG @@ -5086,6 +5337,9 @@ static inline void ext4_mb_show_pa(struct super_block *sb) struct ext4_prealloc_space *pa; ext4_grpblk_t start; struct list_head *cur; + + if (!grp) + continue; ext4_lock_group(sb, i); list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, @@ -5271,7 +5525,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], - pa_inode_list, + pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { @@ -5294,7 +5548,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); - list_del_rcu(&pa->pa_inode_list); + list_del_rcu(&pa->pa_node.lg_list); list_add(&pa->u.pa_tmp_list, &discard_list); total_entries--; @@ -5355,7 +5609,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], - pa_inode_list, + pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { @@ -5364,8 +5618,8 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) } if (!added && pa->pa_free < tmp_pa->pa_free) { /* Add to the tail of the previous entry */ - list_add_tail_rcu(&pa->pa_inode_list, - &tmp_pa->pa_inode_list); + list_add_tail_rcu(&pa->pa_node.lg_list, + &tmp_pa->pa_node.lg_list); added = 1; /* * we want to count the total @@ -5376,7 +5630,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) lg_prealloc_count++; } if (!added) - list_add_tail_rcu(&pa->pa_inode_list, + list_add_tail_rcu(&pa->pa_node.lg_list, &lg->lg_prealloc_list[order]); spin_unlock(&lg->lg_prealloc_lock); @@ -5390,29 +5644,10 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) } /* - * if per-inode prealloc list is too long, trim some PA - */ -static void ext4_mb_trim_inode_pa(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int count, delta; - - count = atomic_read(&ei->i_prealloc_active); - delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; - if (count > sbi->s_mb_max_inode_prealloc + delta) { - count -= sbi->s_mb_max_inode_prealloc; - ext4_discard_preallocations(inode, count); - } -} - -/* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { - struct inode *inode = ac->ac_inode; - struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { @@ -5432,23 +5667,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) * doesn't grow big. */ if (likely(pa->pa_free)) { - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); ext4_mb_add_n_trim(ac); } } - if (pa->pa_type == MB_INODE_PA) { - /* - * treat per-inode prealloc list as a lru list, then try - * to trim the least recently used PA. - */ - spin_lock(pa->pa_obj_lock); - list_move(&pa->pa_inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_obj_lock); - } - ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) @@ -5458,7 +5683,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); - ext4_mb_trim_inode_pa(inode); return 0; } @@ -5611,13 +5835,13 @@ repeat: * So we have to free this pa here itself. */ if (*errp) { - ext4_mb_pa_free(ac); + ext4_mb_pa_put_free(ac); ext4_discard_allocated_blocks(ac); goto errout; } if (ac->ac_status == AC_STATUS_FOUND && ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) - ext4_mb_pa_free(ac); + ext4_mb_pa_put_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); @@ -5636,20 +5860,19 @@ repeat: * If block allocation fails then the pa allocated above * needs to be freed here itself. */ - ext4_mb_pa_free(ac); + ext4_mb_pa_put_free(ac); *errp = -ENOSPC; } -errout: if (*errp) { +errout: ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } ext4_mb_release_context(ac); + kmem_cache_free(ext4_ac_cachep, ac); out: - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { @@ -5693,7 +5916,7 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, kmem_cache_free(ext4_free_data_cachep, entry); } -static noinline_for_stack int +static noinline_for_stack void ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { @@ -5736,7 +5959,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); kmem_cache_free(ext4_free_data_cachep, new_entry); - return 0; + return; } } @@ -5762,7 +5985,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); - return 0; } /* @@ -5797,9 +6019,6 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, return 0; } - ext4_get_group_no_and_offset(sb, - max(ext4_group_first_block_no(sb, group), goal), - NULL, &blkoff); while (1) { i = mb_find_next_zero_bit(bitmap_bh->b_data, max, blkoff); @@ -5814,6 +6033,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, brelse(bitmap_bh); if (i < max) break; + + blkoff = 0; } if (group >= ext4_get_groups_count(sb) || i >= max) { @@ -5842,13 +6063,12 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, ext4_get_group_no_and_offset(sb, block, &group, &blkoff); bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); pr_warn("Failed to read block bitmap\n"); return; } gdp = ext4_get_group_desc(sb, group, &gdp_bh); if (!gdp) - return; + goto err_out; for (i = 0; i < count; i++) { if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) @@ -5857,15 +6077,17 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, mb_clear_bits(bitmap_bh->b_data, blkoff, count); err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); if (err) - return; + goto err_out; ext4_free_group_clusters_set( sb, gdp, ext4_free_group_clusters(sb, gdp) + count - already_freed); - ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); sync_dirty_buffer(bitmap_bh); sync_dirty_buffer(gdp_bh); + +err_out: brelse(bitmap_bh); } @@ -5885,6 +6107,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; + struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -5910,8 +6133,8 @@ do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( - ext4_get_group_info(sb, block_group)))) + grp = ext4_get_group_info(sb, block_group); + if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return; /* @@ -6023,7 +6246,7 @@ do_more: ret = ext4_free_group_clusters(sb, gdp) + count_clusters; ext4_free_group_clusters_set(sb, gdp, ret); - ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); @@ -6280,7 +6503,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, free_clusters_count = clusters_freed + ext4_free_group_clusters(sb, desc); ext4_free_group_clusters_set(sb, desc, free_clusters_count); - ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); + ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, @@ -6513,6 +6736,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); + if (!grp) + continue; /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { ret = ext4_mb_init_group(sb, group, GFP_NOFS); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dcda2a943cee..6d85ee8674a6 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -74,11 +74,6 @@ #define MB_DEFAULT_GROUP_PREALLOC 512 /* - * maximum length of inode prealloc list - */ -#define MB_DEFAULT_MAX_INODE_PREALLOC 512 - -/* * Number of groups to search linearly before performing group scanning * optimization. */ @@ -114,7 +109,10 @@ struct ext4_free_data { }; struct ext4_prealloc_space { - struct list_head pa_inode_list; + union { + struct rb_node inode_node; /* for inode PA rbtree */ + struct list_head lg_list; /* for lg PAs */ + } pa_node; struct list_head pa_group_list; union { struct list_head pa_tmp_list; @@ -128,8 +126,11 @@ struct ext4_prealloc_space { ext4_grpblk_t pa_len; /* len of preallocated chunk */ ext4_grpblk_t pa_free; /* how many blocks are free */ unsigned short pa_type; /* pa type. inode or group */ - spinlock_t *pa_obj_lock; - struct inode *pa_inode; /* hack, for history only */ + union { + rwlock_t *inode_lock; /* locks the rbtree holding this PA */ + spinlock_t *lg_lock; /* locks the lg list holding this PA */ + } pa_node_lock; + struct inode *pa_inode; /* used to get the inode during group discard */ }; enum { diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a19a9661646e..d98ac2af8199 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -408,7 +408,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode) int ext4_ext_migrate(struct inode *inode) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; int retval = 0, i; __le32 *i_data; @@ -418,6 +417,7 @@ int ext4_ext_migrate(struct inode *inode) unsigned long max_entries; __u32 goal, tmp_csum_seed; uid_t owner[2]; + int alloc_ctx; /* * If the filesystem does not support extents, or the inode @@ -434,7 +434,7 @@ int ext4_ext_migrate(struct inode *inode) */ return retval; - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); /* * Worst case we can touch the allocation bitmaps and a block @@ -586,7 +586,7 @@ out_tmp_inode: unlock_new_inode(tmp_inode); iput(tmp_inode); out_unlock: - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return retval; } @@ -605,6 +605,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_fsblk_t blk; handle_t *handle; int ret, ret2 = 0; + int alloc_ctx; if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) @@ -621,7 +622,7 @@ int ext4_ind_migrate(struct inode *inode) if (test_opt(inode->i_sb, DELALLOC)) ext4_alloc_da_blocks(inode); - percpu_down_write(&sbi->s_writepages_rwsem); + alloc_ctx = ext4_writepages_down_write(inode->i_sb); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { @@ -665,6 +666,6 @@ errout: ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); out_unlock: - percpu_up_write(&sbi->s_writepages_rwsem); + ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 4681fff6665f..0aaf38ffcb6e 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -39,28 +39,36 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) * Write the MMP block using REQ_SYNC to try to get the block on-disk * faster. */ -static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) +static int write_mmp_block_thawed(struct super_block *sb, + struct buffer_head *bh) { struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); - /* - * We protect against freezing so that we don't create dirty buffers - * on frozen filesystem. - */ - sb_start_write(sb); ext4_mmp_csum_set(sb, mmp); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); - sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) return -EIO; - return 0; } +static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) +{ + int err; + + /* + * We protect against freezing so that we don't create dirty buffers + * on frozen filesystem. + */ + sb_start_write(sb); + err = write_mmp_block_thawed(sb, bh); + sb_end_write(sb); + return err; +} + /* * Read the MMP block. It _must_ be read from disk and hence we clear the * uptodate flag on the buffer. @@ -282,6 +290,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (mmp_block < le32_to_cpu(es->s_first_data_block) || mmp_block >= ext4_blocks_count(es)) { ext4_warning(sb, "Invalid MMP block in superblock"); + retval = -EINVAL; goto failed; } @@ -307,6 +316,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (seq == EXT4_MMP_SEQ_FSCK) { dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + retval = -EBUSY; goto failed; } @@ -320,6 +330,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + retval = -ETIMEDOUT; goto failed; } @@ -330,6 +341,7 @@ int ext4_multi_mount_protect(struct super_block *sb, if (seq != le32_to_cpu(mmp->mmp_seq)) { dump_mmp_msg(sb, mmp, "Device is already active on another node."); + retval = -EBUSY; goto failed; } @@ -340,7 +352,11 @@ skip: seq = mmp_new_seq(); mmp->mmp_seq = cpu_to_le32(seq); - retval = write_mmp_block(sb, bh); + /* + * On mount / remount we are protected against fs freezing (by s_umount + * semaphore) and grabbing freeze protection upsets lockdep + */ + retval = write_mmp_block_thawed(sb, bh); if (retval) goto failed; @@ -349,6 +365,7 @@ skip: */ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { ext4_warning(sb, "MMP startup interrupted, failing mount"); + retval = -ETIMEDOUT; goto failed; } @@ -359,6 +376,7 @@ skip: if (seq != le32_to_cpu(mmp->mmp_seq)) { dump_mmp_msg(sb, mmp, "Device is already active on another node."); + retval = -EBUSY; goto failed; } @@ -378,6 +396,7 @@ skip: EXT4_SB(sb)->s_mmp_tsk = NULL; ext4_warning(sb, "Unable to create kmmpd thread for %s.", sb->s_id); + retval = -ENOMEM; goto failed; } @@ -385,5 +404,5 @@ skip: failed: brelse(bh); - return 1; + return retval; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2de9829aed63..b5af2fc03b2f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -126,7 +126,6 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, { struct address_space *mapping[2]; unsigned int flags; - unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { @@ -139,20 +138,20 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, } flags = memalloc_nofs_save(); - folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, + folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[0])); - if (!folio[0]) { + if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); - return -ENOMEM; + return PTR_ERR(folio[0]); } - folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, + folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); - if (!folio[1]) { + if (IS_ERR(folio[1])) { folio_unlock(folio[0]); folio_put(folio[0]); - return -ENOMEM; + return PTR_ERR(folio[1]); } /* * __filemap_get_folio() may not wait on folio's writeback if @@ -169,25 +168,27 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, /* Force page buffers uptodate w/o dropping page's lock */ static int -mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) +mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; sector_t block; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize, block_start, block_end; int i, err, nr = 0, partial = 0; - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; blocksize = i_blocksize(inode); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, blocksize, 0); + head = folio_buffers(folio); + } - head = page_buffers(page); - block = (sector_t)page->index << (PAGE_SHIFT - inode->i_blkbits); + block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; @@ -201,11 +202,11 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) if (!buffer_mapped(bh)) { err = ext4_get_block(inode, block, bh, 0); if (err) { - SetPageError(page); + folio_set_error(folio); return err; } if (!buffer_mapped(bh)) { - zero_user(page, block_start, blocksize); + folio_zero_range(folio, block_start, blocksize); set_buffer_uptodate(bh); continue; } @@ -227,7 +228,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) } out: if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } @@ -355,7 +356,7 @@ again: goto unlock_folios; } data_copy: - *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); + *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a5010b5b8a8c..45b579805c95 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -674,7 +674,7 @@ static struct stats dx_show_leaf(struct inode *dir, len = de->name_len; if (!IS_ENCRYPTED(dir)) { /* Directory is not encrypted */ - ext4fs_dirhash(dir, de->name, + (void) ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(U)%x.%u ", len, name, h.hash, @@ -709,8 +709,9 @@ static struct stats dx_show_leaf(struct inode *dir, if (IS_CASEFOLDED(dir)) h.hash = EXT4_DIRENT_HASH(de); else - ext4fs_dirhash(dir, de->name, - de->name_len, &h); + (void) ext4fs_dirhash(dir, + de->name, + de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); @@ -720,7 +721,8 @@ static struct stats dx_show_leaf(struct inode *dir, #else int len = de->name_len; char *name = de->name; - ext4fs_dirhash(dir, de->name, de->name_len, &h); + (void) ext4fs_dirhash(dir, de->name, + de->name_len, &h); printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); #endif @@ -849,8 +851,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* hash is already computed for encrypted casefolded directory */ if (fname && fname_name(fname) && - !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) - ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); + !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) { + int ret = ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), hinfo); + if (ret < 0) { + ret_err = ERR_PTR(ret); + goto fail; + } + } hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -1111,7 +1119,12 @@ static int htree_dirblock_to_tree(struct file *dir_file, hinfo->minor_hash = 0; } } else { - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + err = ext4fs_dirhash(dir, de->name, + de->name_len, hinfo); + if (err < 0) { + count = err; + goto errout; + } } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && @@ -1313,8 +1326,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, if (de->name_len && de->inode) { if (ext4_hash_in_dirent(dir)) h.hash = EXT4_DIRENT_HASH(de); - else - ext4fs_dirhash(dir, de->name, de->name_len, &h); + else { + int err = ext4fs_dirhash(dir, de->name, + de->name_len, &h); + if (err < 0) + return err; + } map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; @@ -1452,10 +1469,9 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, hinfo->hash_version = DX_HASH_SIPHASH; hinfo->seed = NULL; if (cf_name->name) - ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); + return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); else - ext4fs_dirhash(dir, iname->name, iname->len, hinfo); - return 0; + return ext4fs_dirhash(dir, iname->name, iname->len, hinfo); } #endif @@ -2298,10 +2314,15 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; /* casefolded encrypted hashes are computed on fname setup */ - if (!ext4_hash_in_dirent(dir)) - ext4fs_dirhash(dir, fname_name(fname), - fname_len(fname), &fname->hinfo); - + if (!ext4_hash_in_dirent(dir)) { + int err = ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), &fname->hinfo); + if (err < 0) { + brelse(bh2); + brelse(bh); + return err; + } + } memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1e4db96a04e6..3621f29ec671 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -99,30 +99,30 @@ static void buffer_io_error(struct buffer_head *bh) static void ext4_finish_bio(struct bio *bio) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct page *bounce_page = NULL; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct folio *io_folio = NULL; struct buffer_head *bh, *head; - unsigned bio_start = bvec->bv_offset; - unsigned bio_end = bio_start + bvec->bv_len; + size_t bio_start = fi.offset; + size_t bio_end = bio_start + fi.length; unsigned under_io = 0; unsigned long flags; - if (fscrypt_is_bounce_page(page)) { - bounce_page = page; - page = fscrypt_pagecache_page(bounce_page); + if (fscrypt_is_bounce_folio(folio)) { + io_folio = folio; + folio = fscrypt_pagecache_folio(folio); } if (bio->bi_status) { - SetPageError(page); - mapping_set_error(page->mapping, -EIO); + int err = blk_status_to_errno(bio->bi_status); + folio_set_error(folio); + mapping_set_error(folio->mapping, err); } - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); /* - * We check all buffers in the page under b_uptodate_lock + * We check all buffers in the folio under b_uptodate_lock * to avoid races with other end io clearing async_write flags */ spin_lock_irqsave(&head->b_uptodate_lock, flags); @@ -141,8 +141,8 @@ static void ext4_finish_bio(struct bio *bio) } while ((bh = bh->b_this_page) != head); spin_unlock_irqrestore(&head->b_uptodate_lock, flags); if (!under_io) { - fscrypt_free_bounce_page(bounce_page); - end_page_writeback(page); + fscrypt_free_bounce_page(&io_folio->page); + folio_end_writeback(folio); } } } @@ -409,12 +409,10 @@ static void io_submit_init_bio(struct ext4_io_submit *io, static void io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, - struct page *pagecache_page, - struct page *bounce_page, + struct folio *folio, + struct folio *io_folio, struct buffer_head *bh) { - int ret; - if (io->io_bio && (bh->b_blocknr != io->io_next_block || !fscrypt_mergeable_bio_bh(io->io_bio, bh))) { submit_and_retry: @@ -422,20 +420,17 @@ submit_and_retry: } if (io->io_bio == NULL) io_submit_init_bio(io, bh); - ret = bio_add_page(io->io_bio, bounce_page ?: pagecache_page, - bh->b_size, bh_offset(bh)); - if (ret != bh->b_size) + if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, pagecache_page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); io->io_next_block++; } -int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len) +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, + size_t len) { - struct page *bounce_page = NULL; - struct inode *inode = page->mapping->host; + struct folio *io_folio = folio; + struct inode *inode = folio->mapping->host; unsigned block_start; struct buffer_head *bh, *head; int ret = 0; @@ -443,30 +438,30 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct writeback_control *wbc = io->io_wbc; bool keep_towrite = false; - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); - ClearPageError(page); + folio_clear_error(folio); /* * Comments copied from block_write_full_page: * - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); + if (len < folio_size(folio)) + folio_zero_segment(folio, len, folio_size(folio)); /* * In the first loop we prepare and mark buffers to submit. We have to - * mark all buffers in the page before submitting so that - * end_page_writeback() cannot be called from ext4_end_bio() when IO + * mark all buffers in the folio before submitting so that + * folio_end_writeback() cannot be called from ext4_end_bio() when IO * on the first buffer finishes and we are still working on submitting * the second buffer. */ - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { block_start = bh_offset(bh); if (block_start >= len) { @@ -481,14 +476,16 @@ int ext4_bio_write_page(struct ext4_io_submit *io, clear_buffer_dirty(bh); /* * Keeping dirty some buffer we cannot write? Make sure - * to redirty the page and keep TOWRITE tag so that - * racing WB_SYNC_ALL writeback does not skip the page. + * to redirty the folio and keep TOWRITE tag so that + * racing WB_SYNC_ALL writeback does not skip the folio. * This happens e.g. when doing writeout for - * transaction commit. + * transaction commit or when journalled data is not + * yet committed. */ - if (buffer_dirty(bh)) { - if (!PageDirty(page)) - redirty_page_for_writepage(wbc, page); + if (buffer_dirty(bh) || + (buffer_jbd(bh) && buffer_jbddirty(bh))) { + if (!folio_test_dirty(folio)) + folio_redirty_for_writepage(wbc, folio); keep_towrite = true; } continue; @@ -500,11 +497,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io, nr_to_submit++; } while ((bh = bh->b_this_page) != head); - /* Nothing to submit? Just unlock the page... */ + /* Nothing to submit? Just unlock the folio... */ if (!nr_to_submit) - goto unlock; + return 0; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); /* * If any blocks are being written to an encrypted file, encrypt them @@ -513,9 +510,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * (e.g. holes) to be unnecessarily encrypted, but this is rare and * can't happen in the common case of blocksize == PAGE_SIZE. */ - if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) { + if (fscrypt_inode_uses_fs_layer_crypto(inode)) { gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); + struct page *bounce_page; /* * Since bounce page allocation uses a mempool, we can only use @@ -525,8 +523,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (io->io_bio) gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: - bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, - 0, gfp_flags); + bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, + enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); if (ret == -ENOMEM && @@ -542,7 +540,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); do { if (buffer_async_write(bh)) { clear_buffer_async_write(bh); @@ -550,22 +548,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } bh = bh->b_this_page; } while (bh != head); - goto unlock; + + return ret; } + io_folio = page_folio(bounce_page); } - if (keep_towrite) - set_page_writeback_keepwrite(page); - else - set_page_writeback(page); + __folio_start_writeback(folio, keep_towrite); /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; - io_submit_add_bh(io, inode, page, bounce_page, bh); + io_submit_add_bh(io, inode, folio, io_folio, bh); } while ((bh = bh->b_this_page) != head); -unlock: - unlock_page(page); - return ret; + + return 0; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c61dc8a7c014..6f46823fba61 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -68,18 +68,16 @@ struct bio_post_read_ctx { static void __read_end_io(struct bio *bio) { - struct page *page; - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bv, bio, iter_all) { - page = bv->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; if (bio->bi_status) - ClearPageUptodate(page); + folio_clear_uptodate(folio); else - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); } if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); @@ -218,7 +216,7 @@ static inline loff_t ext4_readpage_limit(struct inode *inode) } int ext4_mpage_readpages(struct inode *inode, - struct readahead_control *rac, struct page *page) + struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -247,16 +245,15 @@ int ext4_mpage_readpages(struct inode *inode, int fully_mapped = 1; unsigned first_hole = blocks_per_page; - if (rac) { - page = readahead_page(rac); - prefetchw(&page->flags); - } + if (rac) + folio = readahead_folio(rac); + prefetchw(&folio->flags); - if (page_has_buffers(page)) + if (folio_buffers(folio)) goto confused; block_in_file = next_block = - (sector_t)page->index << (PAGE_SHIFT - blkbits); + (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; @@ -290,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode, /* * Then do more ext4_map_blocks() calls until we are - * done with this page. + * done with this folio. */ while (page_block < blocks_per_page) { if (block_in_file < last_block) { @@ -299,10 +296,10 @@ int ext4_mpage_readpages(struct inode *inode, if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: - SetPageError(page); - zero_user_segment(page, 0, - PAGE_SIZE); - unlock_page(page); + folio_set_error(folio); + folio_zero_segment(folio, 0, + folio_size(folio)); + folio_unlock(folio); goto next_page; } } @@ -333,22 +330,22 @@ int ext4_mpage_readpages(struct inode *inode, } } if (first_hole != blocks_per_page) { - zero_user_segment(page, first_hole << blkbits, - PAGE_SIZE); + folio_zero_segment(folio, first_hole << blkbits, + folio_size(folio)); if (first_hole == 0) { - if (ext4_need_verity(inode, page->index) && - !fsverity_verify_page(page)) + if (ext4_need_verity(inode, folio->index) && + !fsverity_verify_page(&folio->page)) goto set_error_page; - SetPageUptodate(page); - unlock_page(page); - goto next_page; + folio_mark_uptodate(folio); + folio_unlock(folio); + continue; } } else if (fully_mapped) { - SetPageMappedToDisk(page); + folio_set_mappedtodisk(folio); } /* - * This page will go to BIO. Do we need to send this + * This folio will go to BIO. Do we need to send this * BIO off first? */ if (bio && (last_block_in_bio != blocks[0] - 1 || @@ -366,7 +363,7 @@ int ext4_mpage_readpages(struct inode *inode, REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); - ext4_set_bio_post_read_ctx(bio, inode, page->index); + ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) @@ -374,7 +371,7 @@ int ext4_mpage_readpages(struct inode *inode, } length = first_hole << blkbits; - if (bio_add_page(bio, page, length, 0) < length) + if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && @@ -384,19 +381,18 @@ int ext4_mpage_readpages(struct inode *inode, bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; - goto next_page; + continue; confused: if (bio) { submit_bio(bio); bio = NULL; } - if (!PageUptodate(page)) - block_read_full_folio(page_folio(page), ext4_get_block); + if (!folio_test_uptodate(folio)) + block_read_full_folio(folio, ext4_get_block); else - unlock_page(page); - next_page: - if (rac) - put_page(page); + folio_unlock(folio); +next_page: + ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6b91443d6bf3..0361c20910de 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1306,7 +1306,6 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) } static int ext4_set_bitmap_checksums(struct super_block *sb, - ext4_group_t group, struct ext4_group_desc *gdp, struct ext4_new_group_data *group_data) { @@ -1318,14 +1317,14 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, bh = ext4_get_bitmap(sb, group_data->inode_bitmap); if (!bh) return -EIO; - ext4_inode_bitmap_csum_set(sb, group, gdp, bh, + ext4_inode_bitmap_csum_set(sb, gdp, bh, EXT4_INODES_PER_GROUP(sb) / 8); brelse(bh); bh = ext4_get_bitmap(sb, group_data->block_bitmap); if (!bh) return -EIO; - ext4_block_bitmap_csum_set(sb, group, gdp, bh); + ext4_block_bitmap_csum_set(sb, gdp, bh); brelse(bh); return 0; @@ -1363,7 +1362,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, memset(gdp, 0, EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); - err = ext4_set_bitmap_checksums(sb, group, gdp, group_data); + err = ext4_set_bitmap_checksums(sb, gdp, group_data); if (err) { ext4_std_error(sb, err); break; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f43e526112ae..9680fe753e59 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1048,6 +1048,8 @@ void ext4_mark_group_bitmap_corrupted(struct super_block *sb, struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; + if (!grp || !gdp) + return; if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); @@ -1183,14 +1185,83 @@ static inline void ext4_quota_off_umount(struct super_block *sb) } #endif +static int ext4_percpu_param_init(struct ext4_sb_info *sbi) +{ + ext4_fsblk_t block; + int err; + + block = ext4_count_free_clusters(sbi->s_sb); + ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); + if (!err) { + unsigned long freei = ext4_count_free_inodes(sbi->s_sb); + sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); + } + if (!err) + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sbi->s_sb), GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, + GFP_KERNEL); + if (!err) + err = percpu_init_rwsem(&sbi->s_writepages_rwsem); + + if (err) + ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); + + return err; +} + +static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) +{ + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); + percpu_free_rwsem(&sbi->s_writepages_rwsem); +} + +static void ext4_group_desc_free(struct ext4_sb_info *sbi) +{ + struct buffer_head **group_desc; + int i; + + rcu_read_lock(); + group_desc = rcu_dereference(sbi->s_group_desc); + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(group_desc[i]); + kvfree(group_desc); + rcu_read_unlock(); +} + +static void ext4_flex_groups_free(struct ext4_sb_info *sbi) +{ + struct flex_groups **flex_groups; + int i; + + rcu_read_lock(); + flex_groups = rcu_dereference(sbi->s_flex_groups); + if (flex_groups) { + for (i = 0; i < sbi->s_flex_groups_allocated; i++) + kvfree(flex_groups[i]); + kvfree(flex_groups); + } + rcu_read_unlock(); +} + static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - struct buffer_head **group_desc; - struct flex_groups **flex_groups; int aborted = 0; - int i, err; + int err; /* * Unregister sysfs before destroying jbd2 journal. @@ -1238,26 +1309,11 @@ static void ext4_put_super(struct super_block *sb) if (!sb_rdonly(sb)) ext4_commit_super(sb); - rcu_read_lock(); - group_desc = rcu_dereference(sbi->s_group_desc); - for (i = 0; i < sbi->s_gdb_count; i++) - brelse(group_desc[i]); - kvfree(group_desc); - flex_groups = rcu_dereference(sbi->s_flex_groups); - if (flex_groups) { - for (i = 0; i < sbi->s_flex_groups_allocated; i++) - kvfree(flex_groups[i]); - kvfree(flex_groups); - } - rcu_read_unlock(); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); - percpu_free_rwsem(&sbi->s_writepages_rwsem); + ext4_group_desc_free(sbi); + ext4_flex_groups_free(sbi); + ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA - for (i = 0; i < EXT4_MAXQUOTAS; i++) + for (int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif @@ -1325,9 +1381,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); - INIT_LIST_HEAD(&ei->i_prealloc_list); + ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); - spin_lock_init(&ei->i_prealloc_lock); + rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); @@ -2500,7 +2556,7 @@ static void ext4_apply_quota_options(struct fs_context *fc, qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) - kfree_rcu(qname); + kfree_rcu_mightsleep(qname); } } @@ -3184,11 +3240,9 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ - if (ext4_has_feature_64bit(sb) && - offset < le16_to_cpu(sbi->s_es->s_desc_size)) + if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, - le16_to_cpu(sbi->s_es->s_desc_size) - - offset); + sbi->s_desc_size - offset); out: return cpu_to_le16(crc); @@ -4587,6 +4641,8 @@ static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || @@ -4656,14 +4712,59 @@ static int ext4_check_feature_compatibility(struct super_block *sb, if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; + if (sbi->s_daxdev) { + if (sb->s_blocksize == PAGE_SIZE) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + else + ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); + } + + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { + if (ext4_has_feature_inline_data(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" + " that may contain inline data"); + return -EINVAL; + } + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { + ext4_msg(sb, KERN_ERR, + "DAX unsupported by block device."); + return -EINVAL; + } + } + + if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { + ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", + es->s_encryption_level); + return -EINVAL; + } + return 0; } -static int ext4_geometry_check(struct super_block *sb, +static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; + int err; + + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { + ext4_msg(sb, KERN_ERR, + "Number of reserved GDT blocks insanely large: %d", + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); + return -EINVAL; + } + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + err = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (err) { + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely on this system"); + return err; + } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); @@ -4719,19 +4820,6 @@ static int ext4_geometry_check(struct super_block *sb, return 0; } -static void ext4_group_desc_free(struct ext4_sb_info *sbi) -{ - struct buffer_head **group_desc; - int i; - - rcu_read_lock(); - group_desc = rcu_dereference(sbi->s_group_desc); - for (i = 0; i < sbi->s_gdb_count; i++) - brelse(group_desc[i]); - kvfree(group_desc); - rcu_read_unlock(); -} - static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, @@ -4881,7 +4969,7 @@ out: return -EINVAL; } -static int ext4_journal_data_mode_check(struct super_block *sb) +static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " @@ -5024,18 +5112,92 @@ out: return ret; } +static void ext4_hash_info_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + unsigned int i; + + for (i = 0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + + sbi->s_def_hash_version = es->s_def_hash_version; + if (ext4_has_feature_dir_index(sb)) { + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + if (!sb_rdonly(sb)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + if (!sb_rdonly(sb)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } + } +} + +static int ext4_block_group_meta_init(struct super_block *sb, int silent) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int has_huge_files; + + has_huge_files = ext4_has_feature_huge_file(sb); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); + + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); + if (ext4_has_feature_64bit(sb)) { + if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || + sbi->s_desc_size > EXT4_MAX_DESC_SIZE || + !is_power_of_2(sbi->s_desc_size)) { + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", + sbi->s_desc_size); + return -EINVAL; + } + } else + sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + + sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + return -EINVAL; + } + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_inodes_per_group); + return -EINVAL; + } + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); + sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; + sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + + return 0; +} + static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct flex_groups **flex_groups; - ext4_fsblk_t block; ext4_fsblk_t logical_sb_block; struct inode *root; - int ret = -ENOMEM; - unsigned int i; - int needs_recovery, has_huge_files; - int err = 0; + int needs_recovery; + int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; @@ -5048,8 +5210,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); - /* -EINVAL is default */ - ret = -EINVAL; err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; @@ -5075,7 +5235,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (ext4_inode_info_init(sb, es)) + err = ext4_inode_info_init(sb, es); + if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); @@ -5091,10 +5252,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_apply_options(fc, sb); - if (ext4_encoding_init(sb, es)) + err = ext4_encoding_init(sb, es); + if (err) goto failed_mount; - if (ext4_journal_data_mode_check(sb)) + err = ext4_check_journal_data_mode(sb); + if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -5103,119 +5266,22 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; - if (ext4_check_feature_compatibility(sb, es, silent)) - goto failed_mount; - - if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { - ext4_msg(sb, KERN_ERR, - "Number of reserved GDT blocks insanely large: %d", - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); - goto failed_mount; - } - - if (sbi->s_daxdev) { - if (sb->s_blocksize == PAGE_SIZE) - set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); - else - ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); - } - - if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { - if (ext4_has_feature_inline_data(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" - " that may contain inline data"); - goto failed_mount; - } - if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { - ext4_msg(sb, KERN_ERR, - "DAX unsupported by block device."); - goto failed_mount; - } - } - - if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { - ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", - es->s_encryption_level); + err = ext4_check_feature_compatibility(sb, es, silent); + if (err) goto failed_mount; - } - - has_huge_files = ext4_has_feature_huge_file(sb); - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, - has_huge_files); - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); - - sbi->s_desc_size = le16_to_cpu(es->s_desc_size); - if (ext4_has_feature_64bit(sb)) { - if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || - sbi->s_desc_size > EXT4_MAX_DESC_SIZE || - !is_power_of_2(sbi->s_desc_size)) { - ext4_msg(sb, KERN_ERR, - "unsupported descriptor size %lu", - sbi->s_desc_size); - goto failed_mount; - } - } else - sbi->s_desc_size = EXT4_MIN_DESC_SIZE; - sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); - sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - - sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { - if (!silent) - ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); - goto failed_mount; - } - if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || - sbi->s_inodes_per_group > sb->s_blocksize * 8) { - ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", - sbi->s_inodes_per_group); + err = ext4_block_group_meta_init(sb, silent); + if (err) goto failed_mount; - } - sbi->s_itb_per_group = sbi->s_inodes_per_group / - sbi->s_inodes_per_block; - sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); - sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; - sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); - for (i = 0; i < 4; i++) - sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); - sbi->s_def_hash_version = es->s_def_hash_version; - if (ext4_has_feature_dir_index(sb)) { - i = le32_to_cpu(es->s_flags); - if (i & EXT2_FLAGS_UNSIGNED_HASH) - sbi->s_hash_unsigned = 3; - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { -#ifdef __CHAR_UNSIGNED__ - if (!sb_rdonly(sb)) - es->s_flags |= - cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); - sbi->s_hash_unsigned = 3; -#else - if (!sb_rdonly(sb)) - es->s_flags |= - cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); -#endif - } - } + ext4_hash_info_init(sb); - if (ext4_handle_clustersize(sb)) - goto failed_mount; - - /* - * Test whether we have more sectors than will fit in sector_t, - * and whether the max offset is addressable by the page cache. - */ - err = generic_check_addressable(sb->s_blocksize_bits, - ext4_blocks_count(es)); - if (err) { - ext4_msg(sb, KERN_ERR, "filesystem" - " too large to mount safely on this system"); + err = ext4_handle_clustersize(sb); + if (err) goto failed_mount; - } - if (ext4_geometry_check(sb, es)) + err = ext4_check_geometry(sb, es); + if (err) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); @@ -5226,8 +5292,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err) goto failed_mount3; - /* Register extent status tree shrinker */ - if (ext4_es_register_shrinker(sbi)) + err = ext4_es_register_shrinker(sbi); + if (err) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); @@ -5266,10 +5332,13 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); - if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) - if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { + err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); + if (err) goto failed_mount3a; + } + err = -EINVAL; /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! @@ -5321,6 +5390,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); + err = -EINVAL; goto failed_mount_wq; } @@ -5329,6 +5399,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (!sbi->s_ea_inode_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_inode_cache"); + err = -EINVAL; goto failed_mount_wq; } } @@ -5363,7 +5434,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->rsv_conversion_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); - ret = -ENOMEM; + err = -ENOMEM; goto failed_mount4; } @@ -5375,28 +5446,28 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); - ret = PTR_ERR(root); + err = PTR_ERR(root); root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); + err = -EFSCORRUPTED; goto failed_mount4; } sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); - ret = -ENOMEM; + err = -ENOMEM; goto failed_mount4; } - ret = ext4_setup_super(sb, es, sb_rdonly(sb)); - if (ret == -EROFS) { + err = ext4_setup_super(sb, es, sb_rdonly(sb)); + if (err == -EROFS) { sb->s_flags |= SB_RDONLY; - ret = 0; - } else if (ret) + } else if (err) goto failed_mount4a; ext4_set_resv_clusters(sb); @@ -5440,40 +5511,16 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; - block = ext4_count_free_clusters(sb); - ext4_free_blocks_count_set(sbi->s_es, - EXT4_C2B(sbi, block)); - err = percpu_counter_init(&sbi->s_freeclusters_counter, block, - GFP_KERNEL); - if (!err) { - unsigned long freei = ext4_count_free_inodes(sb); - sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, - GFP_KERNEL); - } - if (!err) - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb), GFP_KERNEL); - if (!err) - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, - GFP_KERNEL); - if (!err) - err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, - GFP_KERNEL); - if (!err) - err = percpu_init_rwsem(&sbi->s_writepages_rwsem); - - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); + err = ext4_percpu_param_init(sbi); + if (err) goto failed_mount6; - } if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - ret = -ENOMEM; + err = -ENOMEM; goto failed_mount6; } @@ -5548,20 +5595,8 @@ failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); - rcu_read_lock(); - flex_groups = rcu_dereference(sbi->s_flex_groups); - if (flex_groups) { - for (i = 0; i < sbi->s_flex_groups_allocated; i++) - kvfree(flex_groups[i]); - kvfree(flex_groups); - } - rcu_read_unlock(); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); - percpu_free_rwsem(&sbi->s_writepages_rwsem); + ext4_flex_groups_free(sbi); + ext4_percpu_param_destroy(sbi); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); @@ -5602,7 +5637,7 @@ failed_mount: #endif #ifdef CONFIG_QUOTA - for (i = 0; i < EXT4_MAXQUOTAS; i++) + for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); @@ -5611,7 +5646,7 @@ failed_mount: ext4_blkdev_remove(sbi); out_fail: sb->s_fs_info = NULL; - return err ? err : ret; + return err; } static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) @@ -5649,8 +5684,9 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) - ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. " - "Quota mode: %s.", &sb->s_uuid, descr, + ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. " + "Quota mode: %s.", &sb->s_uuid, + sb_rdonly(sb) ? "ro" : "r/w", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ @@ -6352,6 +6388,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; + int enable_rw = 0; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; @@ -6538,13 +6575,13 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (err) goto restore_opts; - sb->s_flags &= ~SB_RDONLY; - if (ext4_has_feature_mmp(sb)) - if (ext4_multi_mount_protect(sb, - le64_to_cpu(es->s_mmp_block))) { - err = -EROFS; + enable_rw = 1; + if (ext4_has_feature_mmp(sb)) { + err = ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block)); + if (err) goto restore_opts; - } + } #ifdef CONFIG_QUOTA enable_quota = 1; #endif @@ -6581,9 +6618,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } #ifdef CONFIG_QUOTA - /* Release old quota file names */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) - kfree(old_opts.s_qf_names[i]); if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); @@ -6593,16 +6627,29 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } } + /* Release old quota file names */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(old_opts.s_qf_names[i]); #endif if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); + if (enable_rw) + sb->s_flags &= ~SB_RDONLY; + if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); return 0; restore_opts: + /* + * If there was a failing r/w to ro transition, we may need to + * re-enable quota + */ + if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && + sb_any_quota_suspended(sb)) + dquot_resume(sb, -1); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; @@ -6643,8 +6690,9 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.", - &sb->s_uuid, ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", + &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", + ext4_quota_mode(sb)); return 0; } @@ -6870,23 +6918,6 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } - /* - * When we journal data on quota file, we have to flush journal to see - * all updates to the file when we bypass pagecache... - */ - if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(d_inode(path->dentry))) { - /* - * We don't need to lock updates but journal_flush() could - * otherwise be livelocked... - */ - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - if (err) - return err; - } - lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); if (!err) { diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 12d6252e3e22..3042bc605bbf 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -214,7 +214,6 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); @@ -264,7 +263,6 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), - ATTR_LIST(mb_max_inode_prealloc), ATTR_LIST(mb_max_linear_groups), ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index e4da1704438e..2f37e1ea3955 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -42,18 +42,16 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, loff_t pos) { while (count) { - size_t n = min_t(size_t, count, - PAGE_SIZE - offset_in_page(pos)); - struct page *page; + struct folio *folio; + size_t n; - page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); - - memcpy_from_page(buf, page, offset_in_page(pos), n); + if (IS_ERR(folio)) + return PTR_ERR(folio); - put_page(page); + n = memcpy_from_file_folio(buf, folio, pos, count); + folio_put(folio); buf += n; pos += n; @@ -363,21 +361,23 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - struct page *page; + struct folio *folio; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; - page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); - if (!page || !PageUptodate(page)) { + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); - if (page) - put_page(page); + if (!IS_ERR(folio)) + folio_put(folio); else if (num_ra_pages > 1) page_cache_ra_unbounded(&ractl, num_ra_pages, 0); - page = read_mapping_page(inode->i_mapping, index, NULL); + folio = read_mapping_folio(inode->i_mapping, index, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); } - return page; + return folio_file_page(folio, index); } static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 767454d74cd6..dfc2e223bd10 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -88,8 +88,8 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -101,10 +101,6 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = { const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif @@ -173,14 +169,18 @@ static void ext4_xattr_block_csum_set(struct inode *inode, bh->b_blocknr, BHDR(bh)); } -static inline const struct xattr_handler * -ext4_xattr_handler(int name_index) +static inline const char *ext4_xattr_prefix(int name_index, + struct dentry *dentry) { const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; - return handler; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } static int @@ -740,11 +740,10 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - const struct xattr_handler *handler = - ext4_xattr_handler(entry->e_name_index); + const char *prefix; - if (handler && (!handler->list || handler->list(dentry))) { - const char *prefix = handler->prefix ?: handler->name; + prefix = ext4_xattr_prefix(entry->e_name_index, dentry); + if (prefix) { size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; @@ -2615,6 +2614,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); + int needs_kvfree = 0; int error; is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); @@ -2637,7 +2637,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, error = -ENOMEM; goto out; } - + needs_kvfree = 1; error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; @@ -2676,7 +2676,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, out: kfree(b_entry_name); - if (entry->e_value_inum && buffer) + if (needs_kvfree && buffer) kvfree(buffer); if (is) brelse(is->iloc.bh); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c3e058e0a018..64b3860f50ee 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -152,6 +152,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, se = get_seg_entry(sbi, segno); exist = f2fs_test_bit(offset, se->cur_valid_map); + + /* skip data, if we already have an error in checkpoint. */ + if (unlikely(f2fs_cp_error(sbi))) + return exist; + if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); @@ -202,6 +207,11 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, case DATA_GENERIC_ENHANCE_UPDATE: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) { + + /* Skip to emit an error message. */ + if (unlikely(f2fs_cp_error(sbi))) + return false; + f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -325,8 +335,15 @@ static int __f2fs_write_meta_page(struct page *page, trace_f2fs_writepage(page, META); - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_META); + unlock_page(page); + return 0; + } goto redirty_out; + } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) @@ -508,6 +525,7 @@ retry: if (!e) { if (!new) { spin_unlock(&im->ino_lock); + radix_tree_preload_end(); goto retry; } e = new; @@ -706,32 +724,18 @@ err_out: int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; - unsigned int s_flags = sbi->sb->s_flags; int err = 0; -#ifdef CONFIG_QUOTA - int quota_enabled; -#endif if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; - if (bdev_read_only(sbi->sb->s_bdev)) { + if (f2fs_hw_is_readonly(sbi)) { f2fs_info(sbi, "write access unavailable, skipping orphan cleanup"); return 0; } - if (s_flags & SB_RDONLY) { + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "orphan cleanup on readonly fs"); - sbi->sb->s_flags &= ~SB_RDONLY; - } - -#ifdef CONFIG_QUOTA - /* - * Turn on quotas which were not enabled for read-only mounts if - * filesystem has quota feature, so that they are updated correctly. - */ - quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); -#endif start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -765,13 +769,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) out: set_sbi_flag(sbi, SBI_IS_RECOVERED); -#ifdef CONFIG_QUOTA - /* Turn quotas off */ - if (quota_enabled) - f2fs_quota_off_umount(sbi->sb); -#endif - sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ - return err; } @@ -982,7 +979,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); if (cur_page == cp2) - cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg)); for (i = 1; i < cp_blks; i++) { void *sit_bitmap_ptr; @@ -1133,7 +1130,7 @@ retry: goto retry; } -int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +static int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) { struct list_head *head = &sbi->inode_list[DIRTY_META]; struct inode *inode; @@ -1306,7 +1303,8 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) if (!get_pages(sbi, type)) break; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi) && + !is_sbi_flag_set(sbi, SBI_IS_CLOSE))) break; if (type == F2FS_DIRTY_META) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index b40dec3d7f79..11653fa79289 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -264,35 +264,21 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) cc->private = NULL; } -#ifdef CONFIG_F2FS_FS_LZ4HC -static int lz4hc_compress_pages(struct compress_ctx *cc) +static int lz4_compress_pages(struct compress_ctx *cc) { + int len = -EINVAL; unsigned char level = F2FS_I(cc->inode)->i_compress_level; - int len; - if (level) - len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, - cc->clen, level, cc->private); - else + if (!level) len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); - if (!len) - return -EAGAIN; - - cc->clen = len; - return 0; -} -#endif - -static int lz4_compress_pages(struct compress_ctx *cc) -{ - int len; - #ifdef CONFIG_F2FS_FS_LZ4HC - return lz4hc_compress_pages(cc); + else + len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); #endif - len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, - cc->clen, cc->private); + if (len < 0) + return len; if (!len) return -EAGAIN; @@ -670,7 +656,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->cbuf->clen = cpu_to_le32(cc->clen); - if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM) + if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM)) chksum = f2fs_crc32(F2FS_I_SB(cc->inode), cc->cbuf->cdata, cc->clen); cc->cbuf->chksum = cpu_to_le32(chksum); @@ -755,13 +741,18 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; - f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); + + /* Avoid f2fs_commit_super in irq context */ + if (in_task) + f2fs_save_errors(sbi, ERROR_FAIL_DECOMPRESSION); + else + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; } ret = cops->decompress_pages(dic); - if (!ret && (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)) { + if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) { u32 provided = le32_to_cpu(dic->cbuf->chksum); u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); @@ -1456,6 +1447,12 @@ continue_unlock: if (!PageDirty(cc->rpages[i])) goto continue_unlock; + if (PageWriteback(cc->rpages[i])) { + if (wbc->sync_mode == WB_SYNC_NONE) + goto continue_unlock; + f2fs_wait_on_page_writeback(cc->rpages[i], DATA, true, true); + } + if (!clear_page_dirty_for_io(cc->rpages[i])) goto continue_unlock; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 06b552a0aba2..7165b1202f53 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -93,17 +93,17 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { #ifdef CONFIG_FS_ENCRYPTION - STEP_DECRYPT = 1 << 0, + STEP_DECRYPT = BIT(0), #else STEP_DECRYPT = 0, /* compile out the decryption-related code */ #endif #ifdef CONFIG_F2FS_FS_COMPRESSION - STEP_DECOMPRESS = 1 << 1, + STEP_DECOMPRESS = BIT(1), #else STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ #endif #ifdef CONFIG_FS_VERITY - STEP_VERITY = 1 << 2, + STEP_VERITY = BIT(2), #else STEP_VERITY = 0, /* compile out the verity-related code */ #endif @@ -420,7 +420,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { - unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; + unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; @@ -442,9 +442,9 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) * 5 | 4 | 3 | 2 | 1 | 0 | * Cold | Warm | Hot | Cold | Warm | Hot | */ - if ((1 << fio->temp) & meta_flag) + if (BIT(fio->temp) & meta_flag) op_flags |= REQ_META; - if ((1 << fio->temp) & fua_flag) + if (BIT(fio->temp) & fua_flag) op_flags |= REQ_FUA; return op_flags; } @@ -874,6 +874,8 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, bool found = false; struct bio *target = bio ? *bio : NULL; + f2fs_bug_on(sbi, !target && !page); + for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct list_head *head = &io->bio_list; @@ -2235,6 +2237,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (ret) goto out; + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto out_put_dnode; + } f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); skip_reading_dnode: @@ -2798,7 +2804,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, * don't drop any dirty dentry pages for keeping lastest * directory structure. */ - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) && + !is_sbi_flag_set(sbi, SBI_IS_CLOSE)) goto redirty_out; goto out; } @@ -2898,7 +2905,8 @@ out: if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_write(sbi, DATA); - f2fs_submit_merged_ipu_write(sbi, bio, NULL); + if (bio && *bio) + f2fs_submit_merged_ipu_write(sbi, bio, NULL); submitted = NULL; } @@ -3123,12 +3131,9 @@ continue_unlock: } if (folio_test_writeback(folio)) { - if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback( - &folio->page, - DATA, true, true); - else + if (wbc->sync_mode == WB_SYNC_NONE) goto continue_unlock; + f2fs_wait_on_page_writeback(&folio->page, DATA, true, true); } if (!folio_clear_dirty_for_io(folio)) @@ -3486,7 +3491,7 @@ unlock_out: static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, struct page *page, loff_t pos, unsigned int len, - block_t *blk_addr, bool *node_changed) + block_t *blk_addr, bool *node_changed, bool *use_cow) { struct inode *inode = page->mapping->host; struct inode *cow_inode = F2FS_I(inode)->cow_inode; @@ -3500,10 +3505,12 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, /* Look for the block in COW inode first */ err = __find_data_block(cow_inode, index, blk_addr); - if (err) + if (err) { return err; - else if (*blk_addr != NULL_ADDR) + } else if (*blk_addr != NULL_ADDR) { + *use_cow = true; return 0; + } if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) goto reserve_block; @@ -3533,6 +3540,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; bool need_balance = false; + bool use_cow = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -3592,7 +3600,7 @@ repeat: if (f2fs_is_atomic_file(inode)) err = prepare_atomic_write_begin(sbi, page, pos, len, - &blkaddr, &need_balance); + &blkaddr, &need_balance, &use_cow); else err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); @@ -3632,7 +3640,9 @@ repeat: f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); goto fail; } - err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); + err = f2fs_submit_page_read(use_cow ? + F2FS_I(inode)->cow_inode : inode, page, + blkaddr, 0, true); if (err) goto fail; @@ -3725,37 +3735,16 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) f2fs_remove_dirty_inode(inode); } } - - clear_page_private_reference(&folio->page); - clear_page_private_gcing(&folio->page); - - if (test_opt(sbi, COMPRESS_CACHE) && - inode->i_ino == F2FS_COMPRESS_INO(sbi)) - clear_page_private_data(&folio->page); - - folio_detach_private(folio); + clear_page_private_all(&folio->page); } bool f2fs_release_folio(struct folio *folio, gfp_t wait) { - struct f2fs_sb_info *sbi; - /* If this is dirty folio, keep private data */ if (folio_test_dirty(folio)) return false; - sbi = F2FS_M_SB(folio->mapping); - if (test_opt(sbi, COMPRESS_CACHE)) { - struct inode *inode = folio->mapping->host; - - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) - clear_page_private_data(&folio->page); - } - - clear_page_private_reference(&folio->page); - clear_page_private_gcing(&folio->page); - - folio_detach_private(folio); + clear_page_private_all(&folio->page); return true; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 30a77936e3c5..61c35b59126e 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -336,22 +336,23 @@ get_cache: #endif } -static char *s_flag[] = { - [SBI_IS_DIRTY] = " fs_dirty", - [SBI_IS_CLOSE] = " closing", - [SBI_NEED_FSCK] = " need_fsck", - [SBI_POR_DOING] = " recovering", - [SBI_NEED_SB_WRITE] = " sb_dirty", - [SBI_NEED_CP] = " need_cp", - [SBI_IS_SHUTDOWN] = " shutdown", - [SBI_IS_RECOVERED] = " recovered", - [SBI_CP_DISABLED] = " cp_disabled", - [SBI_CP_DISABLED_QUICK] = " cp_disabled_quick", - [SBI_QUOTA_NEED_FLUSH] = " quota_need_flush", - [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush", - [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair", - [SBI_IS_RESIZEFS] = " resizefs", - [SBI_IS_FREEZING] = " freezefs", +static const char *s_flag[MAX_SBI_FLAG] = { + [SBI_IS_DIRTY] = "fs_dirty", + [SBI_IS_CLOSE] = "closing", + [SBI_NEED_FSCK] = "need_fsck", + [SBI_POR_DOING] = "recovering", + [SBI_NEED_SB_WRITE] = "sb_dirty", + [SBI_NEED_CP] = "need_cp", + [SBI_IS_SHUTDOWN] = "shutdown", + [SBI_IS_RECOVERED] = "recovered", + [SBI_CP_DISABLED] = "cp_disabled", + [SBI_CP_DISABLED_QUICK] = "cp_disabled_quick", + [SBI_QUOTA_NEED_FLUSH] = "quota_need_flush", + [SBI_QUOTA_SKIP_FLUSH] = "quota_skip_flush", + [SBI_QUOTA_NEED_REPAIR] = "quota_need_repair", + [SBI_IS_RESIZEFS] = "resizefs", + [SBI_IS_FREEZING] = "freezefs", + [SBI_IS_WRITABLE] = "writable", }; static const char *ipu_mode_names[F2FS_IPU_MAX] = { @@ -384,8 +385,8 @@ static int stat_show(struct seq_file *s, void *v) "Disabled" : (f2fs_cp_error(sbi) ? "Error" : "Good")); if (sbi->s_flag) { seq_puts(s, "[SBI:"); - for_each_set_bit(j, &sbi->s_flag, 32) - seq_puts(s, s_flag[j]); + for_each_set_bit(j, &sbi->s_flag, MAX_SBI_FLAG) + seq_printf(s, " %s", s_flag[j]); seq_puts(s, "]\n"); } seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 9ccdbe120425..887e55988450 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -29,7 +29,7 @@ static unsigned long dir_blocks(struct inode *inode) static unsigned int dir_buckets(unsigned int level, int dir_level) { if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) - return 1 << (level + dir_level); + return BIT(level + dir_level); else return MAX_DIR_BUCKETS; } @@ -42,39 +42,6 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } -static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { - [F2FS_FT_UNKNOWN] = DT_UNKNOWN, - [F2FS_FT_REG_FILE] = DT_REG, - [F2FS_FT_DIR] = DT_DIR, - [F2FS_FT_CHRDEV] = DT_CHR, - [F2FS_FT_BLKDEV] = DT_BLK, - [F2FS_FT_FIFO] = DT_FIFO, - [F2FS_FT_SOCK] = DT_SOCK, - [F2FS_FT_SYMLINK] = DT_LNK, -}; - -static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, - [S_IFCHR >> S_SHIFT] = F2FS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = F2FS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = F2FS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = F2FS_FT_SOCK, - [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, -}; - -static void set_de_type(struct f2fs_dir_entry *de, umode_t mode) -{ - de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; -} - -unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de) -{ - if (de->file_type < F2FS_FT_MAX) - return f2fs_filetype_table[de->file_type]; - return DT_UNKNOWN; -} - /* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname. */ int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) @@ -485,7 +452,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, lock_page(page); f2fs_wait_on_page_writeback(page, type, true, true); de->ino = cpu_to_le32(inode->i_ino); - set_de_type(de, inode->i_mode); + de->file_type = fs_umode_to_ftype(inode->i_mode); set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); @@ -699,7 +666,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, de->name_len = cpu_to_le16(name->len); memcpy(d->filename[bit_pos], name->name, name->len); de->ino = cpu_to_le32(ino); - set_de_type(de, mode); + de->file_type = fs_umode_to_ftype(mode); for (i = 0; i < slots; i++) { __set_bit_le(bit_pos + i, (void *)d->bitmap); /* avoid wrong garbage data for readdir */ @@ -938,14 +905,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); ClearPageUptodate(page); - - clear_page_private_gcing(page); + clear_page_private_all(page); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); - - detach_page_private(page); - set_page_private(page, 0); } f2fs_put_page(page, 1); @@ -1036,7 +999,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, continue; } - d_type = f2fs_get_de_type(de); + d_type = fs_ftype_to_dtype(de->file_type); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 28b12553f2b3..0e2d49140c07 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -23,18 +23,26 @@ bool sanity_check_extent_cache(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); + struct extent_tree *et = fi->extent_tree[EX_READ]; struct extent_info *ei; - if (!fi->extent_tree[EX_READ]) + if (!et) + return true; + + ei = &et->largest; + if (!ei->len) return true; - ei = &fi->extent_tree[EX_READ]->largest; + /* Let's drop, if checkpoint got corrupted. */ + if (is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) { + ei->len = 0; + et->largest_updated = true; + return true; + } - if (ei->len && - (!f2fs_is_valid_blkaddr(sbi, ei->blk, - DATA_GENERIC_ENHANCE) || - !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, - DATA_GENERIC_ENHANCE))) { + if (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE) || + !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + DATA_GENERIC_ENHANCE)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", __func__, inode->i_ino, @@ -86,7 +94,6 @@ static bool __may_age_extent_tree(struct inode *inode) if (!test_opt(sbi, AGE_EXTENT_CACHE)) return false; - /* don't cache block age info for cold file */ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) return false; if (file_is_cold(inode)) @@ -161,118 +168,52 @@ static bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front, type); } -static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, - unsigned int ofs) -{ - if (cached_re) { - if (cached_re->ofs <= ofs && - cached_re->ofs + cached_re->len > ofs) { - return cached_re; - } - } - return NULL; -} - -static struct rb_entry *__lookup_rb_tree_slow(struct rb_root_cached *root, - unsigned int ofs) +static struct extent_node *__lookup_extent_node(struct rb_root_cached *root, + struct extent_node *cached_en, unsigned int fofs) { struct rb_node *node = root->rb_root.rb_node; - struct rb_entry *re; + struct extent_node *en; + /* check a cached entry */ + if (cached_en && cached_en->ei.fofs <= fofs && + cached_en->ei.fofs + cached_en->ei.len > fofs) + return cached_en; + + /* check rb_tree */ while (node) { - re = rb_entry(node, struct rb_entry, rb_node); + en = rb_entry(node, struct extent_node, rb_node); - if (ofs < re->ofs) + if (fofs < en->ei.fofs) node = node->rb_left; - else if (ofs >= re->ofs + re->len) + else if (fofs >= en->ei.fofs + en->ei.len) node = node->rb_right; else - return re; + return en; } return NULL; } -struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, - struct rb_entry *cached_re, unsigned int ofs) -{ - struct rb_entry *re; - - re = __lookup_rb_tree_fast(cached_re, ofs); - if (!re) - return __lookup_rb_tree_slow(root, ofs); - - return re; -} - -struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, - struct rb_node **parent, - unsigned long long key, bool *leftmost) -{ - struct rb_node **p = &root->rb_root.rb_node; - struct rb_entry *re; - - while (*p) { - *parent = *p; - re = rb_entry(*parent, struct rb_entry, rb_node); - - if (key < re->key) { - p = &(*p)->rb_left; - } else { - p = &(*p)->rb_right; - *leftmost = false; - } - } - - return p; -} - -struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, - struct rb_node **parent, - unsigned int ofs, bool *leftmost) -{ - struct rb_node **p = &root->rb_root.rb_node; - struct rb_entry *re; - - while (*p) { - *parent = *p; - re = rb_entry(*parent, struct rb_entry, rb_node); - - if (ofs < re->ofs) { - p = &(*p)->rb_left; - } else if (ofs >= re->ofs + re->len) { - p = &(*p)->rb_right; - *leftmost = false; - } else { - f2fs_bug_on(sbi, 1); - } - } - - return p; -} - /* - * lookup rb entry in position of @ofs in rb-tree, + * lookup rb entry in position of @fofs in rb-tree, * if hit, return the entry, otherwise, return NULL - * @prev_ex: extent before ofs - * @next_ex: extent after ofs - * @insert_p: insert point for new extent at ofs + * @prev_ex: extent before fofs + * @next_ex: extent after fofs + * @insert_p: insert point for new extent at fofs * in order to simplify the insertion after. * tree must stay unchanged between lookup and insertion. */ -struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, - struct rb_entry *cached_re, - unsigned int ofs, - struct rb_entry **prev_entry, - struct rb_entry **next_entry, +static struct extent_node *__lookup_extent_node_ret(struct rb_root_cached *root, + struct extent_node *cached_en, + unsigned int fofs, + struct extent_node **prev_entry, + struct extent_node **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, - bool force, bool *leftmost) + bool *leftmost) { struct rb_node **pnode = &root->rb_root.rb_node; struct rb_node *parent = NULL, *tmp_node; - struct rb_entry *re = cached_re; + struct extent_node *en = cached_en; *insert_p = NULL; *insert_parent = NULL; @@ -282,24 +223,20 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, if (RB_EMPTY_ROOT(&root->rb_root)) return NULL; - if (re) { - if (re->ofs <= ofs && re->ofs + re->len > ofs) - goto lookup_neighbors; - } + if (en && en->ei.fofs <= fofs && en->ei.fofs + en->ei.len > fofs) + goto lookup_neighbors; - if (leftmost) - *leftmost = true; + *leftmost = true; while (*pnode) { parent = *pnode; - re = rb_entry(*pnode, struct rb_entry, rb_node); + en = rb_entry(*pnode, struct extent_node, rb_node); - if (ofs < re->ofs) { + if (fofs < en->ei.fofs) { pnode = &(*pnode)->rb_left; - } else if (ofs >= re->ofs + re->len) { + } else if (fofs >= en->ei.fofs + en->ei.len) { pnode = &(*pnode)->rb_right; - if (leftmost) - *leftmost = false; + *leftmost = false; } else { goto lookup_neighbors; } @@ -308,71 +245,32 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, *insert_p = pnode; *insert_parent = parent; - re = rb_entry(parent, struct rb_entry, rb_node); + en = rb_entry(parent, struct extent_node, rb_node); tmp_node = parent; - if (parent && ofs > re->ofs) + if (parent && fofs > en->ei.fofs) tmp_node = rb_next(parent); - *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + *next_entry = rb_entry_safe(tmp_node, struct extent_node, rb_node); tmp_node = parent; - if (parent && ofs < re->ofs) + if (parent && fofs < en->ei.fofs) tmp_node = rb_prev(parent); - *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct extent_node, rb_node); return NULL; lookup_neighbors: - if (ofs == re->ofs || force) { + if (fofs == en->ei.fofs) { /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&re->rb_node); - *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + tmp_node = rb_prev(&en->rb_node); + *prev_entry = rb_entry_safe(tmp_node, + struct extent_node, rb_node); } - if (ofs == re->ofs + re->len - 1 || force) { + if (fofs == en->ei.fofs + en->ei.len - 1) { /* lookup next node for merging frontward later */ - tmp_node = rb_next(&re->rb_node); - *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); - } - return re; -} - -bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, bool check_key) -{ -#ifdef CONFIG_F2FS_CHECK_FS - struct rb_node *cur = rb_first_cached(root), *next; - struct rb_entry *cur_re, *next_re; - - if (!cur) - return true; - - while (cur) { - next = rb_next(cur); - if (!next) - return true; - - cur_re = rb_entry(cur, struct rb_entry, rb_node); - next_re = rb_entry(next, struct rb_entry, rb_node); - - if (check_key) { - if (cur_re->key > next_re->key) { - f2fs_info(sbi, "inconsistent rbtree, " - "cur(%llu) next(%llu)", - cur_re->key, next_re->key); - return false; - } - goto next; - } - - if (cur_re->ofs + cur_re->len > next_re->ofs) { - f2fs_info(sbi, "inconsistent rbtree, cur(%u, %u) next(%u, %u)", - cur_re->ofs, cur_re->len, - next_re->ofs, next_re->len); - return false; - } -next: - cur = next; + tmp_node = rb_next(&en->rb_node); + *next_entry = rb_entry_safe(tmp_node, + struct extent_node, rb_node); } -#endif - return true; + return en; } static struct kmem_cache *extent_tree_slab; @@ -587,8 +485,7 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = (struct extent_node *)f2fs_lookup_rb_tree(&et->root, - (struct rb_entry *)et->cached_en, pgofs); + en = __lookup_extent_node(&et->root, et->cached_en, pgofs); if (!en) goto out; @@ -662,7 +559,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, bool leftmost) { struct extent_tree_info *eti = &sbi->extent_tree[et->type]; - struct rb_node **p; + struct rb_node **p = &et->root.rb_root.rb_node; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -674,8 +571,21 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, leftmost = true; - p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, - ei->fofs, &leftmost); + /* look up extent_node in the rb tree */ + while (*p) { + parent = *p; + en = rb_entry(parent, struct extent_node, rb_node); + + if (ei->fofs < en->ei.fofs) { + p = &(*p)->rb_left; + } else if (ei->fofs >= en->ei.fofs + en->ei.len) { + p = &(*p)->rb_right; + leftmost = false; + } else { + f2fs_bug_on(sbi, 1); + } + } + do_insert: en = __attach_extent_node(sbi, et, ei, parent, p, leftmost); if (!en) @@ -734,11 +644,10 @@ static void __update_extent_tree_range(struct inode *inode, } /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, - (struct rb_entry *)et->cached_en, fofs, - (struct rb_entry **)&prev_en, - (struct rb_entry **)&next_en, - &insert_p, &insert_parent, false, + en = __lookup_extent_node_ret(&et->root, + et->cached_en, fofs, + &prev_en, &next_en, + &insert_p, &insert_parent, &leftmost); if (!en) en = next_en; @@ -876,12 +785,11 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, write_lock(&et->lock); - en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, - (struct rb_entry *)et->cached_en, fofs, - (struct rb_entry **)&prev_en, - (struct rb_entry **)&next_en, - &insert_p, &insert_parent, false, - &leftmost); + en = __lookup_extent_node_ret(&et->root, + et->cached_en, fofs, + &prev_en, &next_en, + &insert_p, &insert_parent, + &leftmost); if (en) goto unlock_out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b0ab2062038a..d211ee89c158 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -65,7 +65,7 @@ enum { }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1) +#define F2FS_ALL_FAULT_TYPE (GENMASK(FAULT_MAX - 1, 0)) struct f2fs_fault_info { atomic_t inject_ops; @@ -74,7 +74,7 @@ struct f2fs_fault_info { }; extern const char *f2fs_fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type)) #endif /* @@ -353,15 +353,7 @@ struct discard_info { struct discard_cmd { struct rb_node rb_node; /* rb node located in rb-tree */ - union { - struct { - block_t lstart; /* logical start address */ - block_t len; /* length */ - block_t start; /* actual start address in dev */ - }; - struct discard_info di; /* discard info */ - - }; + struct discard_info di; /* discard info */ struct list_head list; /* command list */ struct completion wait; /* compleation */ struct block_device *bdev; /* bdev */ @@ -628,17 +620,6 @@ enum extent_type { NR_EXTENT_CACHES, }; -struct rb_entry { - struct rb_node rb_node; /* rb node located in rb-tree */ - union { - struct { - unsigned int ofs; /* start offset of the entry */ - unsigned int len; /* length of the entry */ - }; - unsigned long long key; /* 64-bits key */ - } __packed; -}; - struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ @@ -862,7 +843,7 @@ struct f2fs_inode_info { kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ struct timespec64 i_crtime; /* inode creation time */ - struct timespec64 i_disk_time[4];/* inode disk times */ + struct timespec64 i_disk_time[3];/* inode disk times */ /* for file compress */ atomic_t i_compr_blocks; /* # of compressed blocks */ @@ -1293,7 +1274,10 @@ struct f2fs_gc_control { unsigned int nr_free_secs; /* # of free sections to do GC */ }; -/* For s_flag in struct f2fs_sb_info */ +/* + * For s_flag in struct f2fs_sb_info + * Modification on enum should be synchronized with s_flag array + */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ SBI_IS_CLOSE, /* specify unmounting */ @@ -1310,6 +1294,8 @@ enum { SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ SBI_IS_RESIZEFS, /* resizefs is in process */ SBI_IS_FREEZING, /* freezefs is in process */ + SBI_IS_WRITABLE, /* remove ro mountoption transiently */ + MAX_SBI_FLAG, }; enum { @@ -1412,86 +1398,6 @@ enum { PAGE_PRIVATE_MAX }; -#define PAGE_PRIVATE_GET_FUNC(name, flagname) \ -static inline bool page_private_##name(struct page *page) \ -{ \ - return PagePrivate(page) && \ - test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ - test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ -} - -#define PAGE_PRIVATE_SET_FUNC(name, flagname) \ -static inline void set_page_private_##name(struct page *page) \ -{ \ - if (!PagePrivate(page)) { \ - get_page(page); \ - SetPagePrivate(page); \ - set_page_private(page, 0); \ - } \ - set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ - set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ -} - -#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ -static inline void clear_page_private_##name(struct page *page) \ -{ \ - clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ - if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \ - set_page_private(page, 0); \ - if (PagePrivate(page)) { \ - ClearPagePrivate(page); \ - put_page(page); \ - }\ - } \ -} - -PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); -PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); -PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); - -PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); -PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); -PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); - -PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); -PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); -PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); -PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); - -static inline unsigned long get_page_private_data(struct page *page) -{ - unsigned long data = page_private(page); - - if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) - return 0; - return data >> PAGE_PRIVATE_MAX; -} - -static inline void set_page_private_data(struct page *page, unsigned long data) -{ - if (!PagePrivate(page)) { - get_page(page); - SetPagePrivate(page); - set_page_private(page, 0); - } - set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); - page_private(page) |= data << PAGE_PRIVATE_MAX; -} - -static inline void clear_page_private_data(struct page *page) -{ - page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1; - if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { - set_page_private(page, 0); - if (PagePrivate(page)) { - ClearPagePrivate(page); - put_page(page); - } - } -} - /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, @@ -1617,7 +1523,6 @@ struct f2fs_sb_info { #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ - unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ #endif /* for node-related operations */ @@ -2386,6 +2291,80 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...); #define f2fs_debug(sbi, fmt, ...) \ f2fs_printk(sbi, KERN_DEBUG fmt, ##__VA_ARGS__) +#define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool page_private_##name(struct page *page) \ +{ \ + return PagePrivate(page) && \ + test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ + test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void set_page_private_##name(struct page *page) \ +{ \ + if (!PagePrivate(page)) \ + attach_page_private(page, (void *)0); \ + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ + set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void clear_page_private_##name(struct page *page) \ +{ \ + clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) \ + detach_page_private(page); \ +} + +PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); +PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); + +static inline unsigned long get_page_private_data(struct page *page) +{ + unsigned long data = page_private(page); + + if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) + return 0; + return data >> PAGE_PRIVATE_MAX; +} + +static inline void set_page_private_data(struct page *page, unsigned long data) +{ + if (!PagePrivate(page)) + attach_page_private(page, (void *)0); + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); + page_private(page) |= data << PAGE_PRIVATE_MAX; +} + +static inline void clear_page_private_data(struct page *page) +{ + page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0); + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) + detach_page_private(page); +} + +static inline void clear_page_private_all(struct page *page) +{ + clear_page_private_data(page); + clear_page_private_reference(page); + clear_page_private_gcing(page); + clear_page_private_inline(page); + + f2fs_bug_on(F2FS_P_SB(page), page_private(page)); +} + static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, block_t count) @@ -2892,7 +2871,7 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); return mask & *addr; } @@ -2901,7 +2880,7 @@ static inline void f2fs_set_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); *addr |= mask; } @@ -2910,7 +2889,7 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); *addr &= ~mask; } @@ -2920,7 +2899,7 @@ static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) int ret; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr |= mask; return ret; @@ -2932,7 +2911,7 @@ static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) int ret; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr &= ~mask; return ret; @@ -2943,7 +2922,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); *addr ^= mask; } @@ -3307,9 +3286,6 @@ static inline bool f2fs_is_time_consistent(struct inode *inode) return false; if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, - &F2FS_I(inode)->i_crtime)) - return false; return true; } @@ -3370,6 +3346,19 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } +static inline void *f2fs_getname(struct f2fs_sb_info *sbi) +{ + if (time_to_inject(sbi, FAULT_KMALLOC)) + return NULL; + + return __getname(); +} + +static inline void f2fs_putname(char *buf) +{ + __putname(buf); +} + static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { @@ -3489,7 +3478,6 @@ int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir, /* * dir.c */ -unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de); int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname); int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, @@ -3554,6 +3542,7 @@ int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); +void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); @@ -3737,7 +3726,6 @@ void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type); -int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi); void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi); void f2fs_add_orphan_inode(struct inode *inode); @@ -3830,6 +3818,10 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); void f2fs_destroy_garbage_collection_cache(void); +/* victim selection function for cleaning and SSR */ +int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, + int gc_type, int type, char alloc_mode, + unsigned long long age); /* * recovery.c @@ -4138,23 +4130,6 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); * extent_cache.c */ bool sanity_check_extent_cache(struct inode *inode); -struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, - struct rb_entry *cached_re, unsigned int ofs); -struct rb_node **f2fs_lookup_rb_tree_ext(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, - struct rb_node **parent, - unsigned long long key, bool *left_most); -struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, - struct rb_node **parent, - unsigned int ofs, bool *leftmost); -struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, - struct rb_entry *cached_re, unsigned int ofs, - struct rb_entry **prev_entry, struct rb_entry **next_entry, - struct rb_node ***insert_p, struct rb_node **insert_parent, - bool force, bool *leftmost); -bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root_cached *root, bool check_key); void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); @@ -4354,9 +4329,9 @@ static inline int set_compress_context(struct inode *inode) F2FS_OPTION(sbi).compress_log_size; F2FS_I(inode)->i_compress_flag = F2FS_OPTION(sbi).compress_chksum ? - 1 << COMPRESS_CHKSUM : 0; + BIT(COMPRESS_CHKSUM) : 0; F2FS_I(inode)->i_cluster_size = - 1 << F2FS_I(inode)->i_log_cluster_size; + BIT(F2FS_I(inode)->i_log_cluster_size); if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) @@ -4414,7 +4389,7 @@ F2FS_FEATURE_FUNCS(readonly, RO); static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) { - unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + unsigned int zno = blkaddr / sbi->blocks_per_blkz; return test_bit(zno, FDEV(devi).blkz_seq); } @@ -4462,6 +4437,11 @@ static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi) return false; } +static inline bool f2fs_dev_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_hw_is_readonly(sbi); +} + static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) { return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 15dabeac4690..5ac53d2627d2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2113,7 +2113,11 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); } else { /* Reuse the already created COW inode */ - f2fs_do_truncate_blocks(fi->cow_inode, 0, true); + ret = f2fs_do_truncate_blocks(fi->cow_inode, 0, true); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } } f2fs_write_inode(inode, NULL); @@ -3009,15 +3013,16 @@ int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) struct dquot *transfer_to[MAXQUOTAS] = {}; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; - int err = 0; + int err; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); - if (!IS_ERR(transfer_to[PRJQUOTA])) { - err = __dquot_transfer(inode, transfer_to); - if (err) - set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - dqput(transfer_to[PRJQUOTA]); - } + if (IS_ERR(transfer_to[PRJQUOTA])) + return PTR_ERR(transfer_to[PRJQUOTA]); + + err = __dquot_transfer(inode, transfer_to); + if (err) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + dqput(transfer_to[PRJQUOTA]); return err; } @@ -3964,7 +3969,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) F2FS_I(inode)->i_compress_algorithm = option.algorithm; F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; - F2FS_I(inode)->i_cluster_size = 1 << option.log_cluster_size; + F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size); f2fs_mark_inode_dirty_sync(inode, true); if (!f2fs_is_compress_backend_ready(inode)) @@ -4062,8 +4067,11 @@ static int f2fs_ioc_decompress_file(struct file *filp) if (ret < 0) break; - if (get_dirty_pages(inode) >= blk_per_seg) - filemap_fdatawrite(inode->i_mapping); + if (get_dirty_pages(inode) >= blk_per_seg) { + ret = filemap_fdatawrite(inode->i_mapping); + if (ret < 0) + break; + } count -= len; page_idx += len; @@ -4133,8 +4141,11 @@ static int f2fs_ioc_compress_file(struct file *filp) if (ret < 0) break; - if (get_dirty_pages(inode) >= blk_per_seg) - filemap_fdatawrite(inode->i_mapping); + if (get_dirty_pages(inode) >= blk_per_seg) { + ret = filemap_fdatawrite(inode->i_mapping); + if (ret < 0) + break; + } count -= len; page_idx += len; @@ -4361,7 +4372,7 @@ static void f2fs_trace_rw_file_path(struct kiocb *iocb, size_t count, int rw) struct inode *inode = file_inode(iocb->ki_filp); char *buf, *path; - buf = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + buf = f2fs_getname(F2FS_I_SB(inode)); if (!buf) return; path = dentry_path_raw(file_dentry(iocb->ki_filp), buf, PATH_MAX); @@ -4374,7 +4385,7 @@ static void f2fs_trace_rw_file_path(struct kiocb *iocb, size_t count, int rw) trace_f2fs_dataread_start(inode, iocb->ki_pos, count, current->pid, path, current->comm); free_buf: - kfree(buf); + f2fs_putname(buf); } static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -4534,6 +4545,19 @@ static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { .end_io = f2fs_dio_write_end_io, }; +static void f2fs_flush_buffered_write(struct address_space *mapping, + loff_t start_pos, loff_t end_pos) +{ + int ret; + + ret = filemap_write_and_wait_range(mapping, start_pos, end_pos); + if (ret < 0) + return; + invalidate_mapping_pages(mapping, + start_pos >> PAGE_SHIFT, + end_pos >> PAGE_SHIFT); +} + static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, bool *may_need_sync) { @@ -4633,14 +4657,9 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, ret += ret2; - ret2 = filemap_write_and_wait_range(file->f_mapping, - bufio_start_pos, - bufio_end_pos); - if (ret2 < 0) - goto out; - invalidate_mapping_pages(file->f_mapping, - bufio_start_pos >> PAGE_SHIFT, - bufio_end_pos >> PAGE_SHIFT); + f2fs_flush_buffered_write(file->f_mapping, + bufio_start_pos, + bufio_end_pos); } } else { /* iomap_dio_rw() already handled the generic_write_sync(). */ @@ -4723,8 +4742,18 @@ out_unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); + if (ret > 0 && may_need_sync) ret = generic_write_sync(iocb, ret); + + /* If buffered IO was forced, flush and drop the data from + * the page cache to preserve O_DIRECT semantics + */ + if (ret > 0 && !dio && (iocb->ki_flags & IOCB_DIRECT)) + f2fs_flush_buffered_write(iocb->ki_filp->f_mapping, + orig_pos, + orig_pos + ret - 1); + return ret; } @@ -4879,6 +4908,7 @@ const struct file_operations f2fs_file_operations = { .llseek = f2fs_llseek, .read_iter = f2fs_file_read_iter, .write_iter = f2fs_file_write_iter, + .iopoll = iocb_bio_iopoll, .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0a9dfa459860..61c5f9d26018 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -390,40 +390,95 @@ static unsigned int count_bits(const unsigned long *addr, return sum; } -static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi, - unsigned long long mtime, unsigned int segno, - struct rb_node *parent, struct rb_node **p, - bool left_most) +static bool f2fs_check_victim_tree(struct f2fs_sb_info *sbi, + struct rb_root_cached *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first_cached(root), *next; + struct victim_entry *cur_ve, *next_ve; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_ve = rb_entry(cur, struct victim_entry, rb_node); + next_ve = rb_entry(next, struct victim_entry, rb_node); + + if (cur_ve->mtime > next_ve->mtime) { + f2fs_info(sbi, "broken victim_rbtree, " + "cur_mtime(%llu) next_mtime(%llu)", + cur_ve->mtime, next_ve->mtime); + return false; + } + cur = next; + } +#endif + return true; +} + +static struct victim_entry *__lookup_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime) +{ + struct atgc_management *am = &sbi->am; + struct rb_node *node = am->root.rb_root.rb_node; + struct victim_entry *ve = NULL; + + while (node) { + ve = rb_entry(node, struct victim_entry, rb_node); + + if (mtime < ve->mtime) + node = node->rb_left; + else + node = node->rb_right; + } + return ve; +} + +static struct victim_entry *__create_victim_entry(struct f2fs_sb_info *sbi, + unsigned long long mtime, unsigned int segno) { struct atgc_management *am = &sbi->am; struct victim_entry *ve; - ve = f2fs_kmem_cache_alloc(victim_entry_slab, - GFP_NOFS, true, NULL); + ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS, true, NULL); ve->mtime = mtime; ve->segno = segno; - rb_link_node(&ve->rb_node, parent, p); - rb_insert_color_cached(&ve->rb_node, &am->root, left_most); - list_add_tail(&ve->list, &am->victim_list); - am->victim_count++; return ve; } -static void insert_victim_entry(struct f2fs_sb_info *sbi, +static void __insert_victim_entry(struct f2fs_sb_info *sbi, unsigned long long mtime, unsigned int segno) { struct atgc_management *am = &sbi->am; - struct rb_node **p; + struct rb_root_cached *root = &am->root; + struct rb_node **p = &root->rb_root.rb_node; struct rb_node *parent = NULL; + struct victim_entry *ve; bool left_most = true; - p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, mtime, &left_most); - attach_victim_entry(sbi, mtime, segno, parent, p, left_most); + /* look up rb tree to find parent node */ + while (*p) { + parent = *p; + ve = rb_entry(parent, struct victim_entry, rb_node); + + if (mtime < ve->mtime) { + p = &(*p)->rb_left; + } else { + p = &(*p)->rb_right; + left_most = false; + } + } + + ve = __create_victim_entry(sbi, mtime, segno); + + rb_link_node(&ve->rb_node, parent, p); + rb_insert_color_cached(&ve->rb_node, root, left_most); } static void add_victim_entry(struct f2fs_sb_info *sbi, @@ -459,19 +514,7 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, if (sit_i->dirty_max_mtime - mtime < p->age_threshold) return; - insert_victim_entry(sbi, mtime, segno); -} - -static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi, - struct victim_sel_policy *p) -{ - struct atgc_management *am = &sbi->am; - struct rb_node *parent = NULL; - bool left_most; - - f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most); - - return parent; + __insert_victim_entry(sbi, mtime, segno); } static void atgc_lookup_victim(struct f2fs_sb_info *sbi, @@ -481,7 +524,6 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi, struct atgc_management *am = &sbi->am; struct rb_root_cached *root = &am->root; struct rb_node *node; - struct rb_entry *re; struct victim_entry *ve; unsigned long long total_time; unsigned long long age, u, accu; @@ -508,12 +550,10 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi, node = rb_first_cached(root); next: - re = rb_entry_safe(node, struct rb_entry, rb_node); - if (!re) + ve = rb_entry_safe(node, struct victim_entry, rb_node); + if (!ve) return; - ve = (struct victim_entry *)re; - if (ve->mtime >= max_mtime || ve->mtime < min_mtime) goto skip; @@ -555,8 +595,6 @@ static void atssr_lookup_victim(struct f2fs_sb_info *sbi, { struct sit_info *sit_i = SIT_I(sbi); struct atgc_management *am = &sbi->am; - struct rb_node *node; - struct rb_entry *re; struct victim_entry *ve; unsigned long long age; unsigned long long max_mtime = sit_i->dirty_max_mtime; @@ -566,25 +604,22 @@ static void atssr_lookup_victim(struct f2fs_sb_info *sbi, unsigned int dirty_threshold = max(am->max_candidate_count, am->candidate_ratio * am->victim_count / 100); - unsigned int cost; - unsigned int iter = 0; + unsigned int cost, iter; int stage = 0; if (max_mtime < min_mtime) return; max_mtime += 1; next_stage: - node = lookup_central_victim(sbi, p); + iter = 0; + ve = __lookup_victim_entry(sbi, p->age); next_node: - re = rb_entry_safe(node, struct rb_entry, rb_node); - if (!re) { - if (stage == 0) - goto skip_stage; + if (!ve) { + if (stage++ == 0) + goto next_stage; return; } - ve = (struct victim_entry *)re; - if (ve->mtime >= max_mtime || ve->mtime < min_mtime) goto skip_node; @@ -610,24 +645,20 @@ next_node: } skip_node: if (iter < dirty_threshold) { - if (stage == 0) - node = rb_prev(node); - else if (stage == 1) - node = rb_next(node); + ve = rb_entry(stage == 0 ? rb_prev(&ve->rb_node) : + rb_next(&ve->rb_node), + struct victim_entry, rb_node); goto next_node; } -skip_stage: - if (stage < 1) { - stage++; - iter = 0; + + if (stage++ == 0) goto next_stage; - } } + static void lookup_victim_by_age(struct f2fs_sb_info *sbi, struct victim_sel_policy *p) { - f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &sbi->am.root, true)); + f2fs_bug_on(sbi, !f2fs_check_victim_tree(sbi, &sbi->am.root)); if (p->gc_mode == GC_AT) atgc_lookup_victim(sbi, p); @@ -710,9 +741,9 @@ static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, * When it is called from SSR segment selection, it finds a segment * which has minimum valid blocks and removes it from dirty seglist. */ -static int get_victim_by_default(struct f2fs_sb_info *sbi, - unsigned int *result, int gc_type, int type, - char alloc_mode, unsigned long long age) +int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, + int gc_type, int type, char alloc_mode, + unsigned long long age) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct sit_info *sm = SIT_I(sbi); @@ -906,10 +937,6 @@ out: return ret; } -static const struct victim_selection default_v_ops = { - .get_victim = get_victim_by_default, -}; - static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino) { struct inode_entry *ie; @@ -1589,14 +1616,14 @@ next_step: int err; if (S_ISREG(inode->i_mode)) { - if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) { + if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[WRITE])) { sbi->skipped_gc_rwsem++; continue; } if (!f2fs_down_write_trylock( - &fi->i_gc_rwsem[WRITE])) { + &fi->i_gc_rwsem[READ])) { sbi->skipped_gc_rwsem++; - f2fs_up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); continue; } locked = true; @@ -1619,8 +1646,8 @@ next_step: submitted++; if (locked) { - f2fs_up_write(&fi->i_gc_rwsem[WRITE]); f2fs_up_write(&fi->i_gc_rwsem[READ]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); } stat_inc_data_blk_count(sbi, 1, gc_type); @@ -1640,8 +1667,7 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, int ret; down_write(&sit_i->sentry_lock); - ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, - NO_CHECK_TYPE, LFS, 0); + ret = f2fs_get_victim(sbi, victim, gc_type, NO_CHECK_TYPE, LFS, 0); up_write(&sit_i->sentry_lock); return ret; } @@ -1779,6 +1805,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; unsigned int skipped_round = 0, round = 0; + unsigned int upper_secs; trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, gc_control->nr_free_secs, @@ -1791,8 +1818,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) prefree_segments(sbi)); cpc.reason = __get_cp_reason(sbi); - sbi->skipped_gc_rwsem = 0; gc_more: + sbi->skipped_gc_rwsem = 0; if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; goto stop; @@ -1802,7 +1829,10 @@ gc_more: goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { + /* Let's run FG_GC, if we don't have enough space. */ + if (has_not_enough_free_secs(sbi, 0, 0)) { + gc_type = FG_GC; + /* * For example, if there are many prefree_segments below given * threshold, we can make them free by checkpoint. Then, we @@ -1813,8 +1843,6 @@ gc_more: if (ret) goto stop; } - if (has_not_enough_free_secs(sbi, 0, 0)) - gc_type = FG_GC; } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ @@ -1841,19 +1869,15 @@ retry: if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) sec_freed++; - if (gc_type == FG_GC) + if (gc_type == FG_GC) { sbi->cur_victim_sec = NULL_SEGNO; - if (gc_control->init_gc_type == FG_GC || - !has_not_enough_free_secs(sbi, - (gc_type == FG_GC) ? sec_freed : 0, 0)) { - if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) - goto go_gc_more; - goto stop; - } - - /* FG_GC stops GC by skip_count */ - if (gc_type == FG_GC) { + if (has_enough_free_secs(sbi, sec_freed, 0)) { + if (!gc_control->no_bg_gc && + sec_freed < gc_control->nr_free_secs) + goto go_gc_more; + goto stop; + } if (sbi->skipped_gc_rwsem) skipped_round++; round++; @@ -1862,10 +1886,17 @@ retry: ret = f2fs_write_checkpoint(sbi, &cpc); goto stop; } + } else if (has_enough_free_secs(sbi, 0, 0)) { + goto stop; } - /* Write checkpoint to reclaim prefree segments */ - if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + __get_secs_required(sbi, NULL, &upper_secs, NULL); + + /* + * Write checkpoint to reclaim prefree segments. + * We need more three extra sections for writer's data/node/dentry. + */ + if (free_sections(sbi) <= upper_secs + NR_GC_CHECKPOINT_SECS && prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) @@ -1932,8 +1963,6 @@ static void init_atgc_management(struct f2fs_sb_info *sbi) void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) { - DIRTY_I(sbi)->v_ops = &default_v_ops; - sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ @@ -2064,8 +2093,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) FDEV(last_dev).end_blk = (long long)FDEV(last_dev).end_blk + blks; #ifdef CONFIG_BLK_DEV_ZONED - FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz + - (int)(blks >> sbi->log_blocks_per_blkz); + FDEV(last_dev).nr_blkz = FDEV(last_dev).nr_blkz + + div_u64(blks, sbi->blocks_per_blkz); #endif } } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 15bd1d680f67..28a00942802c 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -30,6 +30,8 @@ /* Search max. number of dirty segments to select a victim segment */ #define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ +#define NR_GC_CHECKPOINT_SECS (3) /* data/node/dentry sections */ + struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; wait_queue_head_t gc_wait_queue_head; @@ -55,20 +57,10 @@ struct gc_inode_list { struct radix_tree_root iroot; }; -struct victim_info { - unsigned long long mtime; /* mtime of section */ - unsigned int segno; /* section No. */ -}; - struct victim_entry { struct rb_node rb_node; /* rb node located in rb-tree */ - union { - struct { - unsigned long long mtime; /* mtime of section */ - unsigned int segno; /* segment No. */ - }; - struct victim_info vi; /* victim info */ - }; + unsigned long long mtime; /* mtime of section */ + unsigned int segno; /* segment No. */ struct list_head list; }; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 72269e7efd26..4638fee16a91 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -497,7 +497,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) fname.hash = de->hash_code; ino = le32_to_cpu(de->ino); - fake_mode = f2fs_get_de_type(de) << S_SHIFT; + fake_mode = fs_ftype_to_dtype(de->file_type) << S_DT_SHIFT; err = f2fs_add_regular_entry(dir, &fname, NULL, ino, fake_mode); if (err) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7d2e2c0dba65..cf4327ad106c 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -326,7 +326,6 @@ static void init_idisk_time(struct inode *inode) fi->i_disk_time[0] = inode->i_atime; fi->i_disk_time[1] = inode->i_ctime; fi->i_disk_time[2] = inode->i_mtime; - fi->i_disk_time[3] = fi->i_crtime; } static int do_read_inode(struct inode *inode) @@ -454,8 +453,8 @@ static int do_read_inode(struct inode *inode) fi->i_compress_level = compress_flag >> COMPRESS_LEVEL_OFFSET; fi->i_compress_flag = compress_flag & - (BIT(COMPRESS_LEVEL_OFFSET) - 1); - fi->i_cluster_size = 1 << fi->i_log_cluster_size; + GENMASK(COMPRESS_LEVEL_OFFSET - 1, 0); + fi->i_cluster_size = BIT(fi->i_log_cluster_size); set_inode_flag(inode, FI_COMPRESSED_FILE); } } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 11fc4c8036a9..77a71276ecb1 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,7 +22,7 @@ #include "acl.h" #include <trace/events/f2fs.h> -static inline int is_extension_exist(const unsigned char *s, const char *sub, +static inline bool is_extension_exist(const unsigned char *s, const char *sub, bool tmp_ext) { size_t slen = strlen(s); @@ -30,19 +30,19 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub, int i; if (sublen == 1 && *sub == '*') - return 1; + return true; /* * filename format of multimedia file should be defined as: * "filename + '.' + extension + (optional: '.' + temp extension)". */ if (slen < sublen + 2) - return 0; + return false; if (!tmp_ext) { /* file has no temp extension */ if (s[slen - sublen - 1] != '.') - return 0; + return false; return !strncasecmp(s + slen - sublen, sub, sublen); } @@ -50,10 +50,10 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub, if (s[i] != '.') continue; if (!strncasecmp(s + i + 1, sub, sublen)) - return 1; + return true; } - return 0; + return false; } int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, @@ -995,12 +995,20 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out; } + /* + * Copied from ext4_rename: we need to protect against old.inode + * directory getting converted from inline directory format into + * a normal one. + */ + if (S_ISDIR(old_inode->i_mode)) + inode_lock_nested(old_inode, I_MUTEX_NONDIR2); + err = -ENOENT; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) err = PTR_ERR(old_page); - goto out; + goto out_unlock_old; } if (S_ISDIR(old_inode->i_mode)) { @@ -1108,6 +1116,9 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_unlock_op(sbi); + if (S_ISDIR(old_inode->i_mode)) + inode_unlock(old_inode); + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); @@ -1122,6 +1133,9 @@ out_dir: f2fs_put_page(old_dir_page, 0); out_old: f2fs_put_page(old_page, 0); +out_unlock_old: + if (S_ISDIR(old_inode->i_mode)) + inode_unlock(old_inode); out: iput(whiteout); return err; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 99454d46a939..906fb67a99da 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -93,17 +93,15 @@ static inline void copy_node_info(struct node_info *dst, static inline void set_nat_flag(struct nat_entry *ne, unsigned int type, bool set) { - unsigned char mask = 0x01 << type; if (set) - ne->ni.flag |= mask; + ne->ni.flag |= BIT(type); else - ne->ni.flag &= ~mask; + ne->ni.flag &= ~BIT(type); } static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) { - unsigned char mask = 0x01 << type; - return ne->ni.flag & mask; + return ne->ni.flag & BIT(type); } static inline void nat_reset_flag(struct nat_entry *ne) @@ -225,7 +223,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - block_addr ^= 1 << sbi->log_blocks_per_seg; + block_addr ^= BIT(sbi->log_blocks_per_seg); return block_addr + nm_i->nat_blkaddr; } @@ -395,7 +393,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i) static inline int is_node(struct page *page, int type) { struct f2fs_node *rn = F2FS_NODE(page); - return le32_to_cpu(rn->footer.flag) & (1 << type); + return le32_to_cpu(rn->footer.flag) & BIT(type); } #define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) @@ -408,9 +406,9 @@ static inline void set_cold_node(struct page *page, bool is_dir) unsigned int flag = le32_to_cpu(rn->footer.flag); if (is_dir) - flag &= ~(0x1 << COLD_BIT_SHIFT); + flag &= ~BIT(COLD_BIT_SHIFT); else - flag |= (0x1 << COLD_BIT_SHIFT); + flag |= BIT(COLD_BIT_SHIFT); rn->footer.flag = cpu_to_le32(flag); } @@ -419,9 +417,9 @@ static inline void set_mark(struct page *page, int mark, int type) struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) - flag |= (0x1 << type); + flag |= BIT(type); else - flag &= ~(0x1 << type); + flag &= ~BIT(type); rn->footer.flag = cpu_to_le32(flag); #ifdef CONFIG_F2FS_CHECK_FS diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index dfd41908b12d..58c1a0096f7d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -825,19 +825,9 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; bool fix_curseg_write_pointer = false; -#ifdef CONFIG_QUOTA - int quota_enabled; -#endif - if (s_flags & SB_RDONLY) { + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "recover fsync data on readonly fs"); - sbi->sb->s_flags &= ~SB_RDONLY; - } - -#ifdef CONFIG_QUOTA - /* Turn on quotas so that they are updated correctly */ - quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); -#endif INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&tmp_inode_list); @@ -909,11 +899,6 @@ skip: } } -#ifdef CONFIG_QUOTA - /* Turn quotas off */ - if (quota_enabled) - f2fs_quota_off_umount(sbi->sb); -#endif sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return ret ? ret : err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 227e25836173..6db410f1bb8c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -217,7 +217,7 @@ static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + err = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) { if (err == -ENOMEM) { f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); @@ -246,10 +246,16 @@ retry: } else { blkcnt_t count = 1; + err = inc_valid_block_count(sbi, inode, &count); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + *old_addr = dn.data_blkaddr; f2fs_truncate_data_blocks_range(&dn, 1); dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); - inc_valid_block_count(sbi, inode, &count); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, ni.version, true, false); } @@ -257,7 +263,7 @@ retry: f2fs_put_dnode(&dn); trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, - index, *old_addr, new_addr, recover); + index, old_addr ? *old_addr : 0, new_addr, recover); return 0; } @@ -406,27 +412,28 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ - if (has_not_enough_free_secs(sbi, 0, 0)) { - if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && - sbi->gc_thread->f2fs_gc_task) { - DEFINE_WAIT(wait); - - prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, - TASK_UNINTERRUPTIBLE); - wake_up(&sbi->gc_thread->gc_wait_queue_head); - io_schedule(); - finish_wait(&sbi->gc_thread->fggc_wq, &wait); - } else { - struct f2fs_gc_control gc_control = { - .victim_segno = NULL_SEGNO, - .init_gc_type = BG_GC, - .no_bg_gc = true, - .should_migrate_blocks = false, - .err_gc_skipped = false, - .nr_free_secs = 1 }; - f2fs_down_write(&sbi->gc_lock); - f2fs_gc(sbi, &gc_control); - } + if (has_enough_free_secs(sbi, 0, 0)) + return; + + if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && + sbi->gc_thread->f2fs_gc_task) { + DEFINE_WAIT(wait); + + prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sbi->gc_thread->gc_wait_queue_head); + io_schedule(); + finish_wait(&sbi->gc_thread->fggc_wq, &wait); + } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false, + .nr_free_secs = 1 }; + f2fs_down_write(&sbi->gc_lock); + f2fs_gc(sbi, &gc_control); } } @@ -933,9 +940,9 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; - dc->lstart = lstart; - dc->start = start; - dc->len = len; + dc->di.lstart = lstart; + dc->di.start = start; + dc->di.len = len; dc->ref = 0; dc->state = D_PREP; dc->queued = 0; @@ -950,20 +957,108 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, return dc; } -static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, - struct block_device *bdev, block_t lstart, - block_t start, block_t len, - struct rb_node *parent, struct rb_node **p, - bool leftmost) +static bool f2fs_check_discard_tree(struct f2fs_sb_info *sbi) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node *cur = rb_first_cached(&dcc->root), *next; + struct discard_cmd *cur_dc, *next_dc; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_dc = rb_entry(cur, struct discard_cmd, rb_node); + next_dc = rb_entry(next, struct discard_cmd, rb_node); + + if (cur_dc->di.lstart + cur_dc->di.len > next_dc->di.lstart) { + f2fs_info(sbi, "broken discard_rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_dc->di.lstart, cur_dc->di.len, + next_dc->di.lstart, next_dc->di.len); + return false; + } + cur = next; + } +#endif + return true; +} + +static struct discard_cmd *__lookup_discard_cmd(struct f2fs_sb_info *sbi, + block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node *node = dcc->root.rb_root.rb_node; struct discard_cmd *dc; - dc = __create_discard_cmd(sbi, bdev, lstart, start, len); + while (node) { + dc = rb_entry(node, struct discard_cmd, rb_node); - rb_link_node(&dc->rb_node, parent, p); - rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); + if (blkaddr < dc->di.lstart) + node = node->rb_left; + else if (blkaddr >= dc->di.lstart + dc->di.len) + node = node->rb_right; + else + return dc; + } + return NULL; +} + +static struct discard_cmd *__lookup_discard_cmd_ret(struct rb_root_cached *root, + block_t blkaddr, + struct discard_cmd **prev_entry, + struct discard_cmd **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent) +{ + struct rb_node **pnode = &root->rb_root.rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct discard_cmd *dc; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(&root->rb_root)) + return NULL; + + while (*pnode) { + parent = *pnode; + dc = rb_entry(*pnode, struct discard_cmd, rb_node); + + if (blkaddr < dc->di.lstart) + pnode = &(*pnode)->rb_left; + else if (blkaddr >= dc->di.lstart + dc->di.len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + dc = rb_entry(parent, struct discard_cmd, rb_node); + tmp_node = parent; + if (parent && blkaddr > dc->di.lstart) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); + + tmp_node = parent; + if (parent && blkaddr < dc->di.lstart) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); + return NULL; +lookup_neighbors: + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&dc->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); + + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&dc->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); return dc; } @@ -975,7 +1070,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, list_del(&dc->list); rb_erase_cached(&dc->rb_node, &dcc->root); - dcc->undiscard_blks -= dc->len; + dcc->undiscard_blks -= dc->di.len; kmem_cache_free(discard_cmd_slab, dc); @@ -988,7 +1083,7 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; unsigned long flags; - trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + trace_f2fs_remove_discard(dc->bdev, dc->di.start, dc->di.len); spin_lock_irqsave(&dc->lock, flags); if (dc->bio_ref) { @@ -1006,7 +1101,7 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, printk_ratelimited( "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d", KERN_INFO, sbi->sb->s_id, - dc->lstart, dc->start, dc->len, dc->error); + dc->di.lstart, dc->di.start, dc->di.len, dc->error); __detach_discard_cmd(dcc, dc); } @@ -1122,14 +1217,14 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return 0; - trace_f2fs_issue_discard(bdev, dc->start, dc->len); + trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); - lstart = dc->lstart; - start = dc->start; - len = dc->len; + lstart = dc->di.lstart; + start = dc->di.start; + len = dc->di.len; total_len = len; - dc->len = 0; + dc->di.len = 0; while (total_len && *issued < dpolicy->max_requests && !err) { struct bio *bio = NULL; @@ -1145,7 +1240,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (*issued == dpolicy->max_requests) last = true; - dc->len += len; + dc->di.len += len; if (time_to_inject(sbi, FAULT_DISCARD)) { err = -EIO; @@ -1207,34 +1302,41 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, return err; } -static void __insert_discard_tree(struct f2fs_sb_info *sbi, +static void __insert_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, - block_t start, block_t len, - struct rb_node **insert_p, - struct rb_node *insert_parent) + block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct rb_node **p; + struct rb_node **p = &dcc->root.rb_root.rb_node; struct rb_node *parent = NULL; + struct discard_cmd *dc; bool leftmost = true; - if (insert_p && insert_parent) { - parent = insert_parent; - p = insert_p; - goto do_insert; + /* look up rb tree to find parent node */ + while (*p) { + parent = *p; + dc = rb_entry(parent, struct discard_cmd, rb_node); + + if (lstart < dc->di.lstart) { + p = &(*p)->rb_left; + } else if (lstart >= dc->di.lstart + dc->di.len) { + p = &(*p)->rb_right; + leftmost = false; + } else { + f2fs_bug_on(sbi, 1); + } } - p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, - lstart, &leftmost); -do_insert: - __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, - p, leftmost); + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); + + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); } static void __relocate_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { - list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->di.len)]); } static void __punch_discard_cmd(struct f2fs_sb_info *sbi, @@ -1244,7 +1346,7 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, struct discard_info di = dc->di; bool modified = false; - if (dc->state == D_DONE || dc->len == 1) { + if (dc->state == D_DONE || dc->di.len == 1) { __remove_discard_cmd(sbi, dc); return; } @@ -1252,23 +1354,22 @@ static void __punch_discard_cmd(struct f2fs_sb_info *sbi, dcc->undiscard_blks -= di.len; if (blkaddr > di.lstart) { - dc->len = blkaddr - dc->lstart; - dcc->undiscard_blks += dc->len; + dc->di.len = blkaddr - dc->di.lstart; + dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); modified = true; } if (blkaddr < di.lstart + di.len - 1) { if (modified) { - __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + __insert_discard_cmd(sbi, dc->bdev, blkaddr + 1, di.start + blkaddr + 1 - di.lstart, - di.lstart + di.len - 1 - blkaddr, - NULL, NULL); + di.lstart + di.len - 1 - blkaddr); } else { - dc->lstart++; - dc->len--; - dc->start++; - dcc->undiscard_blks += dc->len; + dc->di.lstart++; + dc->di.len--; + dc->di.start++; + dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); } } @@ -1287,17 +1388,14 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); block_t end = lstart + len; - dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, - NULL, lstart, - (struct rb_entry **)&prev_dc, - (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true, NULL); + dc = __lookup_discard_cmd_ret(&dcc->root, lstart, + &prev_dc, &next_dc, &insert_p, &insert_parent); if (dc) prev_dc = dc; if (!prev_dc) { di.lstart = lstart; - di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = next_dc ? next_dc->di.lstart - lstart : len; di.len = min(di.len, len); di.start = start; } @@ -1308,16 +1406,16 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct discard_cmd *tdc = NULL; if (prev_dc) { - di.lstart = prev_dc->lstart + prev_dc->len; + di.lstart = prev_dc->di.lstart + prev_dc->di.len; if (di.lstart < lstart) di.lstart = lstart; if (di.lstart >= end) break; - if (!next_dc || next_dc->lstart > end) + if (!next_dc || next_dc->di.lstart > end) di.len = end - di.lstart; else - di.len = next_dc->lstart - di.lstart; + di.len = next_dc->di.lstart - di.lstart; di.start = start + di.lstart - lstart; } @@ -1350,10 +1448,9 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, merged = true; } - if (!merged) { - __insert_discard_tree(sbi, bdev, di.lstart, di.start, - di.len, NULL, NULL); - } + if (!merged) + __insert_discard_cmd(sbi, bdev, + di.lstart, di.start, di.len); next: prev_dc = next_dc; if (!prev_dc) @@ -1392,15 +1489,11 @@ static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; - unsigned int pos = dcc->next_pos; bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, - NULL, pos, - (struct rb_entry **)&prev_dc, - (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true, NULL); + dc = __lookup_discard_cmd_ret(&dcc->root, dcc->next_pos, + &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; @@ -1418,7 +1511,7 @@ static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, break; } - dcc->next_pos = dc->lstart + dc->len; + dcc->next_pos = dc->di.lstart + dc->di.len; err = __submit_discard_cmd(sbi, dpolicy, dc, issued); if (*issued >= dpolicy->max_requests) @@ -1477,8 +1570,7 @@ retry: if (list_empty(pend_list)) goto next; if (unlikely(dcc->rbtree_check)) - f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root, false)); + f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1556,7 +1648,7 @@ static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, dc->ref--; if (!dc->ref) { if (!dc->error) - len = dc->len; + len = dc->di.len; __remove_discard_cmd(sbi, dc); } mutex_unlock(&dcc->cmd_lock); @@ -1579,14 +1671,15 @@ next: mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(iter, tmp, wait_list, list) { - if (iter->lstart + iter->len <= start || end <= iter->lstart) + if (iter->di.lstart + iter->di.len <= start || + end <= iter->di.lstart) continue; - if (iter->len < dpolicy->granularity) + if (iter->di.len < dpolicy->granularity) continue; if (iter->state == D_DONE && !iter->ref) { wait_for_completion_io(&iter->wait); if (!iter->error) - trimmed += iter->len; + trimmed += iter->di.len; __remove_discard_cmd(sbi, iter); } else { iter->ref++; @@ -1630,8 +1723,7 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) bool need_wait = false; mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)f2fs_lookup_rb_tree(&dcc->root, - NULL, blkaddr); + dc = __lookup_discard_cmd(sbi, blkaddr); if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); @@ -1760,6 +1852,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, sector_t sector, nr_sects; block_t lblkstart = blkstart; int devi = 0; + u64 remainder = 0; if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkstart); @@ -1775,9 +1868,9 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, if (f2fs_blkz_is_seq(sbi, devi, blkstart)) { sector = SECTOR_FROM_BLOCK(blkstart); nr_sects = SECTOR_FROM_BLOCK(blklen); + div64_u64_rem(sector, bdev_zone_sectors(bdev), &remainder); - if (sector & (bdev_zone_sectors(bdev) - 1) || - nr_sects != bdev_zone_sectors(bdev)) { + if (remainder || nr_sects != bdev_zone_sectors(bdev)) { f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path : "", blkstart, blklen); @@ -1982,9 +2075,11 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, if (force && start >= cpc->trim_start && (end - 1) <= cpc->trim_end) - continue; + continue; - if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) { + /* Should cover 2MB zoned device for zone-based reset */ + if (!f2fs_sb_has_blkzoned(sbi) && + (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); continue; @@ -2787,7 +2882,6 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); - const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; unsigned segno = NULL_SEGNO; unsigned short seg_type = curseg->seg_type; int i, cnt; @@ -2796,7 +2890,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, sanity_check_seg_type(sbi, seg_type); /* f2fs_need_SSR() already forces to do this */ - if (!v_ops->get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) { + if (!f2fs_get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) { curseg->next_segno = segno; return 1; } @@ -2823,7 +2917,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, for (; cnt-- > 0; reversed ? i-- : i++) { if (i == seg_type) continue; - if (!v_ops->get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) { + if (!f2fs_get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) { curseg->next_segno = segno; return 1; } @@ -2964,24 +3058,20 @@ next: mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) - f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, - &dcc->root, false)); - - dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, - NULL, start, - (struct rb_entry **)&prev_dc, - (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true, NULL); + f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); + + dc = __lookup_discard_cmd_ret(&dcc->root, start, + &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; blk_start_plug(&plug); - while (dc && dc->lstart <= end) { + while (dc && dc->di.lstart <= end) { struct rb_node *node; int err = 0; - if (dc->len < dpolicy->granularity) + if (dc->di.len < dpolicy->granularity) goto skip; if (dc->state != D_PREP) { @@ -2992,7 +3082,7 @@ next: err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) { - start = dc->lstart + dc->len; + start = dc->di.lstart + dc->di.len; if (err) __remove_discard_cmd(sbi, dc); @@ -4859,9 +4949,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) "New zone for curseg[%d] is not yet discarded. " "Reset the zone: curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); - err = __f2fs_issue_discard_zone(sbi, zbd->bdev, - zone_sector >> log_sectors_per_block, - zone.len >> log_sectors_per_block); + err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block, + zone.len >> log_sectors_per_block); if (err) { f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", zbd->path, err); @@ -4920,48 +5009,6 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) return 0; } -static bool is_conv_zone(struct f2fs_sb_info *sbi, unsigned int zone_idx, - unsigned int dev_idx) -{ - if (!bdev_is_zoned(FDEV(dev_idx).bdev)) - return true; - return !test_bit(zone_idx, FDEV(dev_idx).blkz_seq); -} - -/* Return the zone index in the given device */ -static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno, - int dev_idx) -{ - block_t sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); - - return (sec_start_blkaddr - FDEV(dev_idx).start_blk) >> - sbi->log_blocks_per_blkz; -} - -/* - * Return the usable segments in a section based on the zone's - * corresponding zone capacity. Zone is equal to a section. - */ -static inline unsigned int f2fs_usable_zone_segs_in_sec( - struct f2fs_sb_info *sbi, unsigned int segno) -{ - unsigned int dev_idx, zone_idx; - - dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno)); - zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx); - - /* Conventional zone's capacity is always equal to zone size */ - if (is_conv_zone(sbi, zone_idx, dev_idx)) - return sbi->segs_per_sec; - - if (!sbi->unusable_blocks_per_sec) - return sbi->segs_per_sec; - - /* Get the segment count beyond zone capacity block */ - return sbi->segs_per_sec - (sbi->unusable_blocks_per_sec >> - sbi->log_blocks_per_seg); -} - /* * Return the number of usable blocks in a segment. The number of blocks * returned is always equal to the number of blocks in a segment for @@ -4974,23 +5021,13 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg( struct f2fs_sb_info *sbi, unsigned int segno) { block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; - unsigned int zone_idx, dev_idx, secno; - - secno = GET_SEC_FROM_SEG(sbi, segno); - seg_start = START_BLOCK(sbi, segno); - dev_idx = f2fs_target_device_index(sbi, seg_start); - zone_idx = get_zone_idx(sbi, secno, dev_idx); - - /* - * Conventional zone's capacity is always equal to zone size, - * so, blocks per segment is unchanged. - */ - if (is_conv_zone(sbi, zone_idx, dev_idx)) - return sbi->blocks_per_seg; + unsigned int secno; if (!sbi->unusable_blocks_per_sec) return sbi->blocks_per_seg; + secno = GET_SEC_FROM_SEG(sbi, segno); + seg_start = START_BLOCK(sbi, segno); sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi); @@ -5024,11 +5061,6 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi return 0; } -static inline unsigned int f2fs_usable_zone_segs_in_sec(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - return 0; -} #endif unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno) @@ -5043,7 +5075,7 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int segno) { if (f2fs_sb_has_blkzoned(sbi)) - return f2fs_usable_zone_segs_in_sec(sbi, segno); + return CAP_SEGS_PER_SEC(sbi); return sbi->segs_per_sec; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index efdb7fc3b797..2ca8fb5d0dc4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -104,6 +104,9 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define CAP_BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \ (sbi)->unusable_blocks_per_sec) +#define CAP_SEGS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\ + (sbi)->log_blocks_per_seg)) #define GET_SEC_FROM_SEG(sbi, segno) \ (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ @@ -286,7 +289,6 @@ enum dirty_type { }; struct dirty_seglist_info { - const struct victim_selection *v_ops; /* victim selction operation */ unsigned long *dirty_segmap[NR_DIRTY_TYPE]; unsigned long *dirty_secmap; struct mutex seglist_lock; /* lock for segment bitmaps */ @@ -297,12 +299,6 @@ struct dirty_seglist_info { bool enable_pin_section; /* enable pinning section */ }; -/* victim selection function for cleaning and SSR */ -struct victim_selection { - int (*get_victim)(struct f2fs_sb_info *, unsigned int *, - int, int, char, unsigned long long); -}; - /* for active log information */ struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ @@ -599,8 +595,12 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, return true; } -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, - int freed, int needed) +/* + * calculate needed sections for dirty node/dentry + * and call has_curseg_enough_space + */ +static inline void __get_secs_required(struct f2fs_sb_info *sbi, + unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) { unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + get_pages(sbi, F2FS_DIRTY_DENTS) + @@ -610,27 +610,50 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); - unsigned int free, need_lower, need_upper; + + if (lower_p) + *lower_p = node_secs + dent_secs; + if (upper_p) + *upper_p = node_secs + dent_secs + + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + if (curseg_p) + *curseg_p = has_curseg_enough_space(sbi, + node_blocks, dent_blocks); +} + +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) +{ + unsigned int free_secs, lower_secs, upper_secs; + bool curseg_space; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - free = free_sections(sbi) + freed; - need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed; - need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + __get_secs_required(sbi, &lower_secs, &upper_secs, &curseg_space); + + free_secs = free_sections(sbi) + freed; + lower_secs += needed + reserved_sections(sbi); + upper_secs += needed + reserved_sections(sbi); - if (free > need_upper) + if (free_secs > upper_secs) return false; - else if (free <= need_lower) + else if (free_secs <= lower_secs) return true; - return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); + return !curseg_space; +} + +static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) +{ + return !has_not_enough_free_secs(sbi, freed, needed); } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) { if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; - if (likely(!has_not_enough_free_secs(sbi, 0, 0))) + if (likely(has_enough_free_secs(sbi, 0, 0))) return true; return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fbaaabbcd6de..9f15b03037db 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -880,8 +880,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (args->from && match_int(args, &arg)) return -EINVAL; if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { - f2fs_warn(sbi, "Not support %d, larger than %d", - 1 << arg, BIO_MAX_VECS); + f2fs_warn(sbi, "Not support %ld, larger than %d", + BIT(arg), BIO_MAX_VECS); return -EINVAL; } F2FS_OPTION(sbi).write_io_size_bits = arg; @@ -1179,9 +1179,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) kfree(name); break; case Opt_compress_chksum: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } F2FS_OPTION(sbi).compress_chksum = true; break; case Opt_compress_mode: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } name = match_strdup(&args[0]); if (!name) return -ENOMEM; @@ -1196,6 +1204,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) kfree(name); break; case Opt_compress_cache: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } set_opt(sbi, COMPRESS_CACHE); break; #else @@ -1310,7 +1322,7 @@ default_check: #endif if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", + f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO", F2FS_IO_SIZE_KB(sbi)); return -EINVAL; } @@ -2060,10 +2072,12 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); - F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; - F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; - F2FS_OPTION(sbi).compress_ext_cnt = 0; - F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + if (f2fs_sb_has_compression(sbi)) { + F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; + F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; + F2FS_OPTION(sbi).compress_ext_cnt = 0; + F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; + } F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; @@ -2274,7 +2288,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; - if (f2fs_sb_has_readonly(sbi) && !(*flags & SB_RDONLY)) { + if (f2fs_dev_is_readonly(sbi) && !(*flags & SB_RDONLY)) { err = -EROFS; goto restore_opts; } @@ -2487,6 +2501,54 @@ restore_opts: } #ifdef CONFIG_QUOTA +static bool f2fs_need_recovery(struct f2fs_sb_info *sbi) +{ + /* need to recovery orphan */ + if (is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) + return true; + /* need to recovery data */ + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) + return false; + if (test_opt(sbi, NORECOVERY)) + return false; + return !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG); +} + +static bool f2fs_recover_quota_begin(struct f2fs_sb_info *sbi) +{ + bool readonly = f2fs_readonly(sbi->sb); + + if (!f2fs_need_recovery(sbi)) + return false; + + /* it doesn't need to check f2fs_sb_has_readonly() */ + if (f2fs_hw_is_readonly(sbi)) + return false; + + if (readonly) { + sbi->sb->s_flags &= ~SB_RDONLY; + set_sbi_flag(sbi, SBI_IS_WRITABLE); + } + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + return f2fs_enable_quota_files(sbi, readonly); +} + +static void f2fs_recover_quota_end(struct f2fs_sb_info *sbi, + bool quota_enabled) +{ + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); + + if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) { + clear_sbi_flag(sbi, SBI_IS_WRITABLE); + sbi->sb->s_flags |= SB_RDONLY; + } +} + /* Read data from quotafile */ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) @@ -3260,7 +3322,7 @@ static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, raw_super->segment_count = cpu_to_le32((main_end_blkaddr - segment0_blkaddr) >> log_blocks_per_seg); - if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + if (f2fs_readonly(sb) || f2fs_hw_is_readonly(sbi)) { set_sbi_flag(sbi, SBI_NEED_SB_WRITE); res = "internally"; } else { @@ -3348,7 +3410,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, total_sections = le32_to_cpu(raw_super->section_count); /* blocks_per_seg should be 512, given the above check */ - blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg); + blocks_per_seg = BIT(le32_to_cpu(raw_super->log_blocks_per_seg)); if (segment_count > F2FS_MAX_SEGMENT || segment_count < F2FS_MIN_SEGMENTS) { @@ -3617,9 +3679,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize); - sbi->blocksize = 1 << sbi->log_blocksize; + sbi->blocksize = BIT(sbi->log_blocksize); sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); - sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg; + sbi->blocks_per_seg = BIT(sbi->log_blocks_per_seg); sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); sbi->total_sections = le32_to_cpu(raw_super->section_count); @@ -3744,12 +3806,8 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) SECTOR_TO_BLOCK(zone_sectors)) return -EINVAL; sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); - if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != - __ilog2_u32(sbi->blocks_per_blkz)) - return -EINVAL; - sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); - FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> - sbi->log_blocks_per_blkz; + FDEV(devi).nr_blkz = div_u64(SECTOR_TO_BLOCK(nr_sectors), + sbi->blocks_per_blkz); if (nr_sectors & (zone_sectors - 1)) FDEV(devi).nr_blkz++; @@ -3836,7 +3894,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) int err; if ((recover && f2fs_readonly(sbi->sb)) || - bdev_read_only(sbi->sb->s_bdev)) { + f2fs_hw_is_readonly(sbi)) { set_sbi_flag(sbi, SBI_NEED_SB_WRITE); return -EROFS; } @@ -3875,7 +3933,7 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) f2fs_down_write(&sbi->sb_lock); - if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) + if (raw_super->s_stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) raw_super->s_stop_reason[reason]++; err = f2fs_commit_super(sbi, false); @@ -3885,7 +3943,7 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) f2fs_up_write(&sbi->sb_lock); } -static void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) +void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) { spin_lock(&sbi->error_lock); if (!test_bit(flag, (unsigned long *)sbi->errors)) { @@ -4025,7 +4083,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).start_blk, FDEV(i).end_blk); } f2fs_info(sbi, - "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); + "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi)); return 0; } @@ -4102,6 +4160,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) int recovery, i, valid_super_block; struct curseg_info *seg_i; int retry_cnt = 1; +#ifdef CONFIG_QUOTA + bool quota_enabled = false; +#endif try_onemore: err = -EINVAL; @@ -4395,6 +4456,8 @@ try_onemore: if (err) f2fs_err(sbi, "Cannot turn on quotas: error %d", err); } + + quota_enabled = f2fs_recover_quota_begin(sbi); #endif /* if there are any orphan inodes, free them */ err = f2fs_recover_orphan_inodes(sbi); @@ -4452,6 +4515,10 @@ try_onemore: } } +#ifdef CONFIG_QUOTA + f2fs_recover_quota_end(sbi, quota_enabled); +#endif + /* * If the f2fs is not readonly and fsync data recovery succeeds, * check zoned block devices' write pointer consistency. diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 0b19163c90d4..8ea05340bad9 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -312,19 +312,14 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { struct ckpt_req_control *cprc = &sbi->cprc_info; - int len = 0; int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio); - if (class == IOPRIO_CLASS_RT) - len += scnprintf(buf + len, PAGE_SIZE - len, "rt,"); - else if (class == IOPRIO_CLASS_BE) - len += scnprintf(buf + len, PAGE_SIZE - len, "be,"); - else + if (class != IOPRIO_CLASS_RT && class != IOPRIO_CLASS_BE) return -EINVAL; - len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data); - return len; + return sysfs_emit(buf, "%s,%d\n", + class == IOPRIO_CLASS_RT ? "rt" : "be", data); } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -452,7 +447,7 @@ out: if (ret < 0) return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX)) return -EINVAL; if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) return -EINVAL; @@ -575,9 +570,9 @@ out: if (!strcmp(a->attr.name, "iostat_period_ms")) { if (t < MIN_IOSTAT_PERIOD_MS || t > MAX_IOSTAT_PERIOD_MS) return -EINVAL; - spin_lock(&sbi->iostat_lock); + spin_lock_irq(&sbi->iostat_lock); sbi->iostat_period_ms = (unsigned int)t; - spin_unlock(&sbi->iostat_lock); + spin_unlock_irq(&sbi->iostat_lock); return count; } #endif @@ -598,6 +593,20 @@ out: sbi->compr_new_inode = 0; return count; } + + if (!strcmp(a->attr.name, "compress_percent")) { + if (t == 0 || t > 100) + return -EINVAL; + *ui = t; + return count; + } + + if (!strcmp(a->attr.name, "compress_watermark")) { + if (t == 0 || t > 100) + return -EINVAL; + *ui = t; + return count; + } #endif if (!strcmp(a->attr.name, "atgc_candidate_ratio")) { @@ -950,6 +959,8 @@ F2FS_FEATURE_RO_ATTR(compression); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compress_percent, compress_percent); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compress_watermark, compress_watermark); #endif F2FS_FEATURE_RO_ATTR(pin_file); @@ -1057,6 +1068,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(compr_written_block), ATTR_LIST(compr_saved_block), ATTR_LIST(compr_new_inode), + ATTR_LIST(compress_percent), + ATTR_LIST(compress_watermark), #endif /* For ATGC */ ATTR_LIST(atgc_candidate_ratio), @@ -1449,25 +1462,14 @@ put_sb_kobj: void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { - if (sbi->s_proc) { -#ifdef CONFIG_F2FS_IOSTAT - remove_proc_entry("iostat_info", sbi->s_proc); -#endif - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry("victim_bits", sbi->s_proc); - remove_proc_entry("discard_plist_info", sbi->s_proc); - remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); - } + if (sbi->s_proc) + remove_proc_subtree(sbi->sb->s_id, f2fs_proc_root); - kobject_del(&sbi->s_stat_kobj); kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); - kobject_del(&sbi->s_feature_list_kobj); kobject_put(&sbi->s_feature_list_kobj); wait_for_completion(&sbi->s_feature_list_kobj_unregister); - kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index d92edbbdc30e..213805d3592c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -192,8 +192,8 @@ const struct xattr_handler f2fs_xattr_security_handler = { static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -204,10 +204,6 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { const struct xattr_handler *f2fs_xattr_handlers[] = { &f2fs_xattr_user_handler, -#ifdef CONFIG_F2FS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY &f2fs_xattr_security_handler, @@ -216,13 +212,18 @@ const struct xattr_handler *f2fs_xattr_handlers[] = { NULL, }; -static inline const struct xattr_handler *f2fs_xattr_handler(int index) +static inline const char *f2fs_xattr_prefix(int index, + struct dentry *dentry) { const struct xattr_handler *handler = NULL; if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map)) handler = f2fs_xattr_handler_map[index]; - return handler; + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); } static struct f2fs_xattr_entry *__find_xattr(void *base_addr, @@ -573,12 +574,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) last_base_addr = (void *)base_addr + XATTR_SIZE(inode); list_for_each_xattr(entry, base_addr) { - const struct xattr_handler *handler = - f2fs_xattr_handler(entry->e_name_index); const char *prefix; size_t prefix_len; size_t size; + prefix = f2fs_xattr_prefix(entry->e_name_index, dentry); + if ((void *)(entry) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", @@ -590,10 +591,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) goto cleanup; } - if (!handler || (handler->list && !handler->list(dentry))) + if (!prefix) continue; - prefix = xattr_prefix(handler); prefix_len = strlen(prefix); size = prefix_len + entry->e_name_len + 1; if (buffer) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1db3e3c24b43..ae4e51e91ee3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -829,7 +829,7 @@ void wbc_detach_inode(struct writeback_control *wbc) * is okay. The main goal is avoiding keeping an inode on * the wrong wb for an extended period of time. */ - if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) + if (hweight16(history) > WB_FRN_HIST_THR_SLOTS) inode_switch_wbs(inode, max_id); } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 179a5c5e28fd..91e89e68177e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -623,7 +623,7 @@ static int __init cuse_init(void) /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */ cuse_channel_fops.unlocked_ioctl = NULL; - cuse_class = class_create(THIS_MODULE, "cuse"); + cuse_class = class_create("cuse"); if (IS_ERR(cuse_class)) return PTR_ERR(cuse_class); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index eb4f88e3dc97..1a8f82f478cb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2257,30 +2257,31 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, int res; int oldfd; struct fuse_dev *fud = NULL; + struct fd f; switch (cmd) { case FUSE_DEV_IOC_CLONE: - res = -EFAULT; - if (!get_user(oldfd, (__u32 __user *)arg)) { - struct file *old = fget(oldfd); - - res = -EINVAL; - if (old) { - /* - * Check against file->f_op because CUSE - * uses the same ioctl handler. - */ - if (old->f_op == file->f_op) - fud = fuse_get_dev(old); - - if (fud) { - mutex_lock(&fuse_mutex); - res = fuse_device_clone(fud->fc, file); - mutex_unlock(&fuse_mutex); - } - fput(old); - } + if (get_user(oldfd, (__u32 __user *)arg)) + return -EFAULT; + + f = fdget(oldfd); + if (!f.file) + return -EINVAL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (f.file->f_op == file->f_op) + fud = fuse_get_dev(f.file); + + res = -EINVAL; + if (fud) { + mutex_lock(&fuse_mutex); + res = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); } + fdput(f); break; default: res = -ENOTTY; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index de37a3a06a71..89d97f6188e0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1419,7 +1419,7 @@ out: static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) { - return (unsigned long)ii->iov->iov_base + ii->iov_offset; + return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset; } static inline size_t fuse_get_frag_size(const struct iov_iter *ii, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index eedf6926c652..c739b258a2d9 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2035,14 +2035,6 @@ static int do_shrink(struct inode *inode, u64 newsize) return error; } -void gfs2_trim_blocks(struct inode *inode) -{ - int ret; - - ret = do_shrink(inode, inode->i_size); - WARN_ON(ret != 0); -} - /** * do_grow - Touch and update inode size * @inode: The inode diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 53cce6c08e81..e5b7d17131ed 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -58,7 +58,6 @@ extern int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, extern int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, unsigned *extlen, bool *new); extern int gfs2_setattr_size(struct inode *inode, u64 size); -extern void gfs2_trim_blocks(struct inode *inode); extern int gfs2_truncatei_resume(struct gfs2_inode *ip); extern int gfs2_file_dealloc(struct gfs2_inode *ip); extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 4d99cc77a29b..01d433ed6ce7 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -90,7 +90,7 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_trans tr; unsigned int revokes; - int ret; + int ret = 0; revokes = atomic_read(&gl->gl_ail_count); @@ -124,15 +124,18 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl) memset(&tr, 0, sizeof(tr)); set_bit(TR_ONSTACK, &tr.tr_flags); ret = __gfs2_trans_begin(&tr, sdp, 0, revokes, _RET_IP_); - if (ret) + if (ret) { + fs_err(sdp, "Transaction error %d: Unable to write revokes.", ret); goto flush; + } __gfs2_ail_flush(gl, 0, revokes); gfs2_trans_end(sdp); flush: - gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | - GFS2_LFC_AIL_EMPTY_GL); - return 0; + if (!ret) + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_AIL_EMPTY_GL); + return ret; } void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) @@ -326,7 +329,9 @@ static int inode_go_sync(struct gfs2_glock *gl) ret = gfs2_inode_metasync(gl); if (!error) error = ret; - gfs2_ail_empty_gl(gl); + ret = gfs2_ail_empty_gl(gl); + if (!error) + error = ret; /* * Writeback of the data mapping may cause the dirty flag to be set * so we have to clear it again here. @@ -396,6 +401,7 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl) static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); const struct gfs2_dinode *str = buf; struct timespec64 atime; u16 height, depth; @@ -442,7 +448,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) /* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */ gfs2_set_inode_flags(inode); height = be16_to_cpu(str->di_height); - if (unlikely(height > GFS2_MAX_META_HEIGHT)) + if (unlikely(height > sdp->sd_max_height)) goto corrupt; ip->i_height = (u8)height; @@ -534,12 +540,13 @@ static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl, const char *fs_id_buf) { struct gfs2_inode *ip = gl->gl_object; - struct inode *inode = &ip->i_inode; + struct inode *inode; unsigned long nrpages; if (ip == NULL) return; + inode = &ip->i_inode; xa_lock_irq(&inode->i_data.i_pages); nrpages = inode->i_data.nrpages; xa_unlock_irq(&inode->i_data.i_pages); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1291b5ee3584..17c994a0c0d0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -941,7 +941,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, struct gfs2_sbd *sdp = GFS2_SB(dir); struct inode *inode = d_inode(old_dentry); struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder ghs[2]; + struct gfs2_holder d_gh, gh; struct buffer_head *dibh; struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; int error; @@ -953,14 +953,14 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) return error; - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - error = gfs2_glock_nq(ghs); /* parent */ + error = gfs2_glock_nq(&d_gh); if (error) goto out_parent; - error = gfs2_glock_nq(ghs + 1); /* child */ + error = gfs2_glock_nq(&gh); if (error) goto out_child; @@ -992,9 +992,6 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto out_gunlock; - error = -EINVAL; - if (!ip->i_inode.i_nlink) - goto out_gunlock; error = -EMLINK; if (ip->i_inode.i_nlink == (u32)-1) goto out_gunlock; @@ -1049,13 +1046,13 @@ out_gunlock_q: gfs2_quota_unlock(dip); out_gunlock: gfs2_dir_no_add(&da); - gfs2_glock_dq(ghs + 1); + gfs2_glock_dq(&gh); out_child: - gfs2_glock_dq(ghs); + gfs2_glock_dq(&d_gh); out_parent: gfs2_qa_put(dip); - gfs2_holder_uninit(ghs); - gfs2_holder_uninit(ghs + 1); + gfs2_holder_uninit(&d_gh); + gfs2_holder_uninit(&gh); return error; } @@ -1146,7 +1143,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct gfs2_sbd *sdp = GFS2_SB(dir); struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder ghs[3]; + struct gfs2_holder d_gh, r_gh, gh; struct gfs2_rgrpd *rgd; int error; @@ -1156,21 +1153,21 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) error = -EROFS; - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); if (!rgd) goto out_inodes; - gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE, ghs + 2); + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE, &r_gh); - error = gfs2_glock_nq(ghs); /* parent */ + error = gfs2_glock_nq(&d_gh); if (error) goto out_parent; - error = gfs2_glock_nq(ghs + 1); /* child */ + error = gfs2_glock_nq(&gh); if (error) goto out_child; @@ -1184,7 +1181,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) goto out_rgrp; } - error = gfs2_glock_nq(ghs + 2); /* rgrp */ + error = gfs2_glock_nq(&r_gh); /* rgrp */ if (error) goto out_rgrp; @@ -1200,16 +1197,16 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) gfs2_trans_end(sdp); out_gunlock: - gfs2_glock_dq(ghs + 2); + gfs2_glock_dq(&r_gh); out_rgrp: - gfs2_glock_dq(ghs + 1); + gfs2_glock_dq(&gh); out_child: - gfs2_glock_dq(ghs); + gfs2_glock_dq(&d_gh); out_parent: - gfs2_holder_uninit(ghs + 2); + gfs2_holder_uninit(&r_gh); out_inodes: - gfs2_holder_uninit(ghs + 1); - gfs2_holder_uninit(ghs); + gfs2_holder_uninit(&gh); + gfs2_holder_uninit(&d_gh); return error; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 6de901c3b89b..9af9ddb61ca0 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -734,13 +734,11 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) struct inode *master = d_inode(sdp->sd_master_dir); struct gfs2_holder ji_gh; struct gfs2_inode *ip; - int jindex = 1; int error = 0; - if (undo) { - jindex = 0; + gfs2_holder_mark_uninitialized(&ji_gh); + if (undo) goto fail_statfs; - } sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); if (IS_ERR(sdp->sd_jindex)) { @@ -852,7 +850,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) sdp->sd_log_idle = 1; set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); gfs2_glock_dq_uninit(&ji_gh); - jindex = 0; INIT_WORK(&sdp->sd_freeze_work, gfs2_freeze_func); return 0; @@ -869,7 +866,7 @@ fail_journal_gh: gfs2_glock_dq_uninit(&sdp->sd_journal_gh); fail_jindex: gfs2_jindex_free(sdp); - if (jindex) + if (gfs2_holder_initialized(&ji_gh)) gfs2_glock_dq_uninit(&ji_gh); fail: iput(sdp->sd_jindex); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index a83fa62106f0..a84bf6444bba 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -552,6 +552,15 @@ void gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); + /* We do two log flushes here. The first one commits dirty inodes + * and rgrps to the journal, but queues up revokes to the ail list. + * The second flush writes out and removes the revokes. + * + * The first must be done before the FLUSH_SHUTDOWN code + * clears the LIVE flag, otherwise it will not be able to start + * a transaction to write its revokes, and the error will cause + * a withdraw of the file system. */ + gfs2_log_flush(sdp, NULL, GFS2_LFC_MAKE_FS_RO); gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN | GFS2_LFC_MAKE_FS_RO); wait_event_timeout(sdp->sd_log_waitq, @@ -1410,6 +1419,14 @@ static void gfs2_evict_inode(struct inode *inode) if (inode->i_nlink || sb_rdonly(sb) || !ip->i_no_addr) goto out; + /* + * In case of an incomplete mount, gfs2_evict_inode() may be called for + * system files without having an active journal to write to. In that + * case, skip the filesystem evict. + */ + if (!sdp->sd_jdesc) + goto out; + gfs2_holder_mark_uninitialized(&gh); ret = evict_should_delete(inode, &gh); if (ret == SHOULD_DEFER_EVICTION) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index adf6d17cf033..93b36d026bb4 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1501,8 +1501,6 @@ const struct xattr_handler *gfs2_xattr_handlers_max[] = { /* GFS2_FS_FORMAT_MIN */ &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, NULL, }; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index abb91f5fae92..b21660475ac1 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -511,7 +511,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) if (type == HFSPLUS_FOLDER) { struct hfsplus_cat_folder *folder = &entry.folder; - WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_folder)); + if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) { + pr_err("bad catalog folder entry\n"); + res = -EIO; + goto out; + } hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_folder)); hfsplus_get_perms(inode, &folder->permissions, 1); @@ -531,7 +535,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) } else if (type == HFSPLUS_FILE) { struct hfsplus_cat_file *file = &entry.file; - WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_file)); + if (fd->entrylength < sizeof(struct hfsplus_cat_file)) { + pr_err("bad catalog file entry\n"); + res = -EIO; + goto out; + } hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_file)); @@ -562,6 +570,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) pr_err("bad catalog entry used to create inode\n"); res = -EIO; } +out: return res; } @@ -570,6 +579,7 @@ int hfsplus_cat_write_inode(struct inode *inode) struct inode *main_inode = inode; struct hfs_find_data fd; hfsplus_cat_entry entry; + int res = 0; if (HFSPLUS_IS_RSRC(inode)) main_inode = HFSPLUS_I(inode)->rsrc_inode; @@ -588,7 +598,11 @@ int hfsplus_cat_write_inode(struct inode *inode) if (S_ISDIR(main_inode->i_mode)) { struct hfsplus_cat_folder *folder = &entry.folder; - WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_folder)); + if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { + pr_err("bad catalog folder entry\n"); + res = -EIO; + goto out; + } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_folder)); /* simple node checks? */ @@ -613,7 +627,11 @@ int hfsplus_cat_write_inode(struct inode *inode) } else { struct hfsplus_cat_file *file = &entry.file; - WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_file)); + if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { + pr_err("bad catalog file entry\n"); + res = -EIO; + goto out; + } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_write_fork(inode, &file->data_fork); @@ -634,7 +652,7 @@ int hfsplus_cat_write_inode(struct inode *inode) set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags); out: hfs_find_exit(&fd); - return 0; + return res; } int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) diff --git a/fs/hostfs/Makefile b/fs/hostfs/Makefile index 587bcd6e50a3..16be592e8085 100644 --- a/fs/hostfs/Makefile +++ b/fs/hostfs/Makefile @@ -3,9 +3,11 @@ # Licensed under the GPL # -hostfs-objs := hostfs_kern.o hostfs_user.o +hostfs-objs := hostfs_kern.o -obj-y := +hostfs-builtin-$(CONFIG_HOSTFS) += hostfs_user.o hostfs_user_exp.o + +obj-y := $(hostfs-builtin-y) $(hostfs-builtin-m) obj-$(CONFIG_HOSTFS) += hostfs.o include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/fs/hostfs/hostfs_user_exp.c b/fs/hostfs/hostfs_user_exp.c new file mode 100644 index 000000000000..250c91c55c46 --- /dev/null +++ b/fs/hostfs/hostfs_user_exp.c @@ -0,0 +1,28 @@ +#include <linux/module.h> +#include "hostfs.h" + +EXPORT_SYMBOL_GPL(stat_file); +EXPORT_SYMBOL_GPL(access_file); +EXPORT_SYMBOL_GPL(open_file); +EXPORT_SYMBOL_GPL(open_dir); +EXPORT_SYMBOL_GPL(seek_dir); +EXPORT_SYMBOL_GPL(read_dir); +EXPORT_SYMBOL_GPL(read_file); +EXPORT_SYMBOL_GPL(write_file); +EXPORT_SYMBOL_GPL(lseek_file); +EXPORT_SYMBOL_GPL(fsync_file); +EXPORT_SYMBOL_GPL(replace_file); +EXPORT_SYMBOL_GPL(close_file); +EXPORT_SYMBOL_GPL(close_dir); +EXPORT_SYMBOL_GPL(file_create); +EXPORT_SYMBOL_GPL(set_attr); +EXPORT_SYMBOL_GPL(make_symlink); +EXPORT_SYMBOL_GPL(unlink_file); +EXPORT_SYMBOL_GPL(do_mkdir); +EXPORT_SYMBOL_GPL(hostfs_do_rmdir); +EXPORT_SYMBOL_GPL(do_mknod); +EXPORT_SYMBOL_GPL(link_file); +EXPORT_SYMBOL_GPL(hostfs_do_readlink); +EXPORT_SYMBOL_GPL(rename_file); +EXPORT_SYMBOL_GPL(rename2_file); +EXPORT_SYMBOL_GPL(do_statfs); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9062da6da567..ecfdfb2529a3 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -208,7 +208,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; @@ -697,7 +697,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h, struct folio *folio; folio = filemap_lock_folio(mapping, idx); - if (!folio) + if (IS_ERR(folio)) return; start = start & ~huge_page_mask(h); diff --git a/fs/inode.c b/fs/inode.c index 4558dc2f1355..577799b7855f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += reap; + mm_account_reclaimed_pages(reap); } iput(inode); spin_lock(lru_lock); @@ -1804,8 +1803,8 @@ EXPORT_SYMBOL(bmap); /* * With relative atime, only update atime if the previous atime is - * earlier than either the ctime or mtime or if at least a day has - * passed since the last atime update. + * earlier than or equal to either the ctime or mtime, + * or if at least a day has passed since the last atime update. */ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) @@ -1814,12 +1813,12 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, if (!(mnt->mnt_flags & MNT_RELATIME)) return 1; /* - * Is mtime younger than atime? If yes, update atime: + * Is mtime younger than or equal to atime? If yes, update atime: */ if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0) return 1; /* - * Is ctime younger than atime? If yes, update atime: + * Is ctime younger than or equal to atime? If yes, update atime: */ if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) return 1; diff --git a/fs/internal.h b/fs/internal.h index dc4eb91a577a..bd3b2810a36b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -59,8 +59,6 @@ extern int finish_clean_context(struct fs_context *fc); */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root); -extern int vfs_path_lookup(struct dentry *, struct vfsmount *, - const char *, unsigned int, struct path *); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); int may_linkat(struct mnt_idmap *idmap, const struct path *link); @@ -259,8 +257,6 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po /* * fs/attr.c */ -int setattr_should_drop_sgid(struct mnt_idmap *idmap, - const struct inode *inode); struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 6f4c97a6d7e9..063133ec77f4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -467,20 +467,13 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); */ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) { - unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; - struct folio *folio; + unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS; if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; - folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, + return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); - if (folio) - return folio; - - if (iter->flags & IOMAP_NOWAIT) - return ERR_PTR(-EAGAIN); - return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(iomap_get_folio); @@ -911,7 +904,7 @@ static int iomap_write_delalloc_scan(struct inode *inode, /* grab locked page */ folio = filemap_lock_folio(inode->i_mapping, start_byte >> PAGE_SHIFT); - if (!folio) { + if (IS_ERR(folio)) { start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + PAGE_SIZE; continue; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f771001574d0..019cc87d0fb3 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -130,6 +130,7 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) if (ret > 0) ret += dio->done_before; + trace_iomap_dio_complete(iocb, dio->error, ret); kfree(dio); return ret; @@ -493,6 +494,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct blk_plug plug; struct iomap_dio *dio; + trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before); + if (!iomi.len) return NULL; @@ -541,7 +544,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } /* for data sync or sync, we need sync completion processing */ - if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) { + if (iocb_is_dsync(iocb)) { dio->flags |= IOMAP_DIO_NEED_SYNC; /* @@ -650,8 +653,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, */ dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { - if (!wait_for_completion) + if (!wait_for_completion) { + trace_iomap_dio_rw_queued(inode, iomi.pos, iomi.len); return ERR_PTR(-EIOCBQUEUED); + } for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c index da217246b1a9..728d5443daf5 100644 --- a/fs/iomap/trace.c +++ b/fs/iomap/trace.c @@ -3,6 +3,7 @@ * Copyright (c) 2019 Christoph Hellwig */ #include <linux/iomap.h> +#include <linux/uio.h> /* * We include this last to have the helpers above available for the trace diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index f6ea9540d082..c16fd55f5595 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -83,6 +83,7 @@ DEFINE_RANGE_EVENT(iomap_writepage); DEFINE_RANGE_EVENT(iomap_release_folio); DEFINE_RANGE_EVENT(iomap_invalidate_folio); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); +DEFINE_RANGE_EVENT(iomap_dio_rw_queued); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ @@ -107,6 +108,11 @@ DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); { IOMAP_F_BUFFER_HEAD, "BH" }, \ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" } +#define IOMAP_DIO_STRINGS \ + {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ + {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \ + {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" } + DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), TP_ARGS(inode, iomap), @@ -183,6 +189,78 @@ TRACE_EVENT(iomap_iter, (void *)__entry->caller) ); +TRACE_EVENT(iomap_dio_rw_begin, + TP_PROTO(struct kiocb *iocb, struct iov_iter *iter, + unsigned int dio_flags, size_t done_before), + TP_ARGS(iocb, iter, dio_flags, done_before), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, isize) + __field(loff_t, pos) + __field(size_t, count) + __field(size_t, done_before) + __field(int, ki_flags) + __field(unsigned int, dio_flags) + __field(bool, aio) + ), + TP_fast_assign( + __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; + __entry->ino = file_inode(iocb->ki_filp)->i_ino; + __entry->isize = file_inode(iocb->ki_filp)->i_size; + __entry->pos = iocb->ki_pos; + __entry->count = iov_iter_count(iter); + __entry->done_before = done_before; + __entry->ki_flags = iocb->ki_flags; + __entry->dio_flags = dio_flags; + __entry->aio = !is_sync_kiocb(iocb); + ), + TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx length 0x%zx done_before 0x%zx flags %s dio_flags %s aio %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->isize, + __entry->pos, + __entry->count, + __entry->done_before, + __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), + __print_flags(__entry->dio_flags, "|", IOMAP_DIO_STRINGS), + __entry->aio) +); + +TRACE_EVENT(iomap_dio_complete, + TP_PROTO(struct kiocb *iocb, int error, ssize_t ret), + TP_ARGS(iocb, error, ret), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, isize) + __field(loff_t, pos) + __field(int, ki_flags) + __field(bool, aio) + __field(int, error) + __field(ssize_t, ret) + ), + TP_fast_assign( + __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; + __entry->ino = file_inode(iocb->ki_filp)->i_ino; + __entry->isize = file_inode(iocb->ki_filp)->i_size; + __entry->pos = iocb->ki_pos; + __entry->ki_flags = iocb->ki_flags; + __entry->aio = !is_sync_kiocb(iocb); + __entry->error = error; + __entry->ret = ret; + ), + TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx flags %s aio %d error %d ret %zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->isize, + __entry->pos, + __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), + __entry->aio, + __entry->error, + __entry->ret) +); + #endif /* _IOMAP_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 15de1385012e..18611241f451 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2387,6 +2387,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); jbd2_journal_put_journal_head(jh); + /* Already zapped buffer? Nothing to do... */ + if (!bh->b_bdev) + return 0; return -EBUSY; } /* diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index da3e18503c65..aa4048a27f31 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -920,16 +920,13 @@ const struct xattr_handler *jffs2_xattr_handlers[] = { #ifdef CONFIG_JFFS2_FS_SECURITY &jffs2_security_xattr_handler, #endif -#ifdef CONFIG_JFFS2_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &jffs2_trusted_xattr_handler, NULL }; -static const struct xattr_handler *xprefix_to_handler(int xprefix) { - const struct xattr_handler *ret; +static const char *jffs2_xattr_prefix(int xprefix, struct dentry *dentry) +{ + const struct xattr_handler *ret = NULL; switch (xprefix) { case JFFS2_XPREFIX_USER: @@ -942,20 +939,23 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) { #endif #ifdef CONFIG_JFFS2_FS_POSIX_ACL case JFFS2_XPREFIX_ACL_ACCESS: - ret = &posix_acl_access_xattr_handler; + ret = &nop_posix_acl_access; break; case JFFS2_XPREFIX_ACL_DEFAULT: - ret = &posix_acl_default_xattr_handler; + ret = &nop_posix_acl_default; break; #endif case JFFS2_XPREFIX_TRUSTED: ret = &jffs2_trusted_xattr_handler; break; default: - ret = NULL; - break; + return NULL; } - return ret; + + if (!xattr_handler_can_list(ret, dentry)) + return NULL; + + return xattr_prefix(ret); } ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -966,7 +966,6 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) struct jffs2_inode_cache *ic = f->inocache; struct jffs2_xattr_ref *ref, **pref; struct jffs2_xattr_datum *xd; - const struct xattr_handler *xhandle; const char *prefix; ssize_t prefix_len, len, rc; int retry = 0; @@ -998,10 +997,10 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) goto out; } } - xhandle = xprefix_to_handler(xd->xprefix); - if (!xhandle || (xhandle->list && !xhandle->list(dentry))) + + prefix = jffs2_xattr_prefix(xd->xprefix, dentry); + if (!prefix) continue; - prefix = xhandle->prefix ?: xhandle->name; prefix_len = strlen(prefix); rc = prefix_len + xd->name_len + 1; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 2e8461ce74de..961569c11159 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -691,6 +691,35 @@ void grab_metapage(struct metapage * mp) unlock_page(mp->page); } +static int metapage_write_one(struct page *page) +{ + struct folio *folio = page_folio(page); + struct address_space *mapping = folio->mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = folio_nr_pages(folio), + }; + int ret = 0; + + BUG_ON(!folio_test_locked(folio)); + + folio_wait_writeback(folio); + + if (folio_clear_dirty_for_io(folio)) { + folio_get(folio); + ret = metapage_writepage(page, &wbc); + if (ret == 0) + folio_wait_writeback(folio); + folio_put(folio); + } else { + folio_unlock(folio); + } + + if (!ret) + ret = filemap_check_errors(mapping); + return ret; +} + void force_metapage(struct metapage *mp) { struct page *page = mp->page; @@ -700,8 +729,8 @@ void force_metapage(struct metapage *mp) get_page(page); lock_page(page); set_page_dirty(page); - if (write_one_page(page)) - jfs_error(mp->sb, "write_one_page() failed\n"); + if (metapage_write_one(page)) + jfs_error(mp->sb, "metapage_write_one() failed\n"); clear_bit(META_forcewrite, &mp->flag); put_page(page); } @@ -746,9 +775,9 @@ void release_metapage(struct metapage * mp) set_page_dirty(page); if (test_bit(META_sync, &mp->flag)) { clear_bit(META_sync, &mp->flag); - if (write_one_page(page)) - jfs_error(mp->sb, "write_one_page() failed\n"); - lock_page(page); /* write_one_page unlocks the page */ + if (metapage_write_one(page)) + jfs_error(mp->sb, "metapage_write_one() failed\n"); + lock_page(page); } } else if (mp->lsn) /* discard_metapage doesn't remove it */ remove_from_logsync(mp); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index f817798fa1eb..931e50018f88 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -986,10 +986,6 @@ static const struct xattr_handler jfs_trusted_xattr_handler = { }; const struct xattr_handler *jfs_xattr_handlers[] = { -#ifdef CONFIG_JFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &jfs_os2_xattr_handler, &jfs_user_xattr_handler, &jfs_security_xattr_handler, diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index ef00b5fe8cee..45b6919903e6 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,7 @@ #include "kernfs-internal.h" -static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ +static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() @@ -196,9 +196,9 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) unsigned long flags; int ret; - spin_lock_irqsave(&kernfs_rename_lock, flags); + read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_name_locked(kn, buf, buflen); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } @@ -224,9 +224,9 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, unsigned long flags; int ret; - spin_lock_irqsave(&kernfs_rename_lock, flags); + read_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_path_from_node_locked(to, from, buf, buflen); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } EXPORT_SYMBOL_GPL(kernfs_path_from_node); @@ -294,10 +294,10 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) struct kernfs_node *parent; unsigned long flags; - spin_lock_irqsave(&kernfs_rename_lock, flags); + read_lock_irqsave(&kernfs_rename_lock, flags); parent = kn->parent; kernfs_get(parent); - spin_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&kernfs_rename_lock, flags); return parent; } @@ -770,12 +770,15 @@ int kernfs_add_one(struct kernfs_node *kn) goto out_unlock; /* Update timestamps on the parent */ + down_write(&root->kernfs_iattr_rwsem); + ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } + up_write(&root->kernfs_iattr_rwsem); up_write(&root->kernfs_rwsem); /* @@ -940,6 +943,8 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); init_rwsem(&root->kernfs_rwsem); + init_rwsem(&root->kernfs_iattr_rwsem); + init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); /* @@ -1462,11 +1467,14 @@ static void __kernfs_remove(struct kernfs_node *kn) pos->parent ? pos->parent->iattr : NULL; /* update timestamps on the parent */ + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); + if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_put(pos); } @@ -1723,7 +1731,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kernfs_get(new_parent); /* rename_lock protects ->parent and ->name accessors */ - spin_lock_irq(&kernfs_rename_lock); + write_lock_irq(&kernfs_rename_lock); old_parent = kn->parent; kn->parent = new_parent; @@ -1734,7 +1742,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kn->name = new_name; } - spin_unlock_irq(&kernfs_rename_lock); + write_unlock_irq(&kernfs_rename_lock); kn->hash = kernfs_name_hash(kn->name, kn->ns); kernfs_link_sibling(kn); @@ -1748,12 +1756,6 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, return error; } -/* Relationship between mode and the DT_xxx types */ -static inline unsigned char dt_type(struct kernfs_node *kn) -{ - return (kn->mode >> 12) & 15; -} - static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); @@ -1831,7 +1833,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = pos->name; - unsigned int type = dt_type(pos); + unsigned int type = fs_umode_to_dtype(pos->mode); int len = strlen(name); ino_t ino = kernfs_ino(pos); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e4a50e4ff0d2..40c4661f15b7 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -922,8 +922,8 @@ repeat: root = kernfs_root(kn); /* kick fsnotify */ - down_write(&root->kernfs_rwsem); + down_read(&root->kernfs_supers_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; @@ -960,7 +960,7 @@ repeat: iput(inode); } - up_write(&root->kernfs_rwsem); + up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 30494dcb0df3..b22b74d1a115 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int ret; struct kernfs_root *root = kernfs_root(kn); - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_iattr_rwsem); ret = __kernfs_setattr(kn, iattr); - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_iattr_rwsem); return ret; } @@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return -EINVAL; root = kernfs_root(kn); - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_iattr_rwsem); error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) goto out; @@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, setattr_copy(&nop_mnt_idmap, inode, iattr); out: - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_iattr_rwsem); return error; } @@ -189,10 +189,10 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap, struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); - down_read(&root->kernfs_rwsem); + down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); generic_fillattr(&nop_mnt_idmap, inode, stat); - up_read(&root->kernfs_rwsem); + up_read(&root->kernfs_iattr_rwsem); return 0; } @@ -285,10 +285,10 @@ int kernfs_iop_permission(struct mnt_idmap *idmap, kn = inode->i_private; root = kernfs_root(kn); - down_read(&root->kernfs_rwsem); + down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); ret = generic_permission(&nop_mnt_idmap, inode, mask); - up_read(&root->kernfs_rwsem); + up_read(&root->kernfs_iattr_rwsem); return ret; } diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 236c3a6113f1..a9b854cdfdb5 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -47,6 +47,8 @@ struct kernfs_root { wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; + struct rw_semaphore kernfs_iattr_rwsem; + struct rw_semaphore kernfs_supers_rwsem; }; /* +1 to avoid triggering overflow warning when negating it */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e08e8d999807..d49606accb07 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -351,9 +351,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_supers_rwsem); } fc->root = dget(sb->s_root); @@ -380,9 +380,9 @@ void kernfs_kill_sb(struct super_block *sb) struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = info->root; - down_write(&root->kernfs_rwsem); + down_write(&root->kernfs_supers_rwsem); list_del(&info->node); - up_write(&root->kernfs_rwsem); + up_write(&root->kernfs_supers_rwsem); /* * Remove the superblock from fs_supers/s_instances diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index cead696b656a..df8fb076f6f1 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -221,22 +221,22 @@ int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess, { char ntlmv2_hash[CIFS_ENCPWD_SIZE]; char ntlmv2_rsp[CIFS_HMAC_MD5_HASH_SIZE]; - struct ksmbd_crypto_ctx *ctx; + struct ksmbd_crypto_ctx *ctx = NULL; char *construct = NULL; int rc, len; - ctx = ksmbd_crypto_ctx_find_hmacmd5(); - if (!ctx) { - ksmbd_debug(AUTH, "could not crypto alloc hmacmd5\n"); - return -ENOMEM; - } - rc = calc_ntlmv2_hash(conn, sess, ntlmv2_hash, domain_name); if (rc) { ksmbd_debug(AUTH, "could not get v2 hash rc %d\n", rc); goto out; } + ctx = ksmbd_crypto_ctx_find_hmacmd5(); + if (!ctx) { + ksmbd_debug(AUTH, "could not crypto alloc hmacmd5\n"); + return -ENOMEM; + } + rc = crypto_shash_setkey(CRYPTO_HMACMD5_TFM(ctx), ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); @@ -272,6 +272,8 @@ int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess, ksmbd_debug(AUTH, "Could not generate md5 hash\n"); goto out; } + ksmbd_release_crypto_ctx(ctx); + ctx = NULL; rc = ksmbd_gen_sess_key(sess, ntlmv2_hash, ntlmv2_rsp); if (rc) { @@ -282,7 +284,8 @@ int ksmbd_auth_ntlmv2(struct ksmbd_conn *conn, struct ksmbd_session *sess, if (memcmp(ntlmv2->ntlmv2_hash, ntlmv2_rsp, CIFS_HMAC_MD5_HASH_SIZE) != 0) rc = -EINVAL; out: - ksmbd_release_crypto_ctx(ctx); + if (ctx) + ksmbd_release_crypto_ctx(ctx); kfree(construct); return rc; } diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 365ac32af505..4ed379f9b1aa 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -20,7 +20,7 @@ static DEFINE_MUTEX(init_lock); static struct ksmbd_conn_ops default_conn_ops; LIST_HEAD(conn_list); -DEFINE_RWLOCK(conn_list_lock); +DECLARE_RWSEM(conn_list_lock); /** * ksmbd_conn_free() - free resources of the connection instance @@ -32,9 +32,9 @@ DEFINE_RWLOCK(conn_list_lock); */ void ksmbd_conn_free(struct ksmbd_conn *conn) { - write_lock(&conn_list_lock); + down_write(&conn_list_lock); list_del(&conn->conns_list); - write_unlock(&conn_list_lock); + up_write(&conn_list_lock); xa_destroy(&conn->sessions); kvfree(conn->request_buf); @@ -56,7 +56,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) return NULL; conn->need_neg = true; - conn->status = KSMBD_SESS_NEW; + ksmbd_conn_set_new(conn); conn->local_nls = load_nls("utf8"); if (!conn->local_nls) conn->local_nls = load_nls_default(); @@ -84,9 +84,9 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) spin_lock_init(&conn->llist_lock); INIT_LIST_HEAD(&conn->lock_list); - write_lock(&conn_list_lock); + down_write(&conn_list_lock); list_add(&conn->conns_list, &conn_list); - write_unlock(&conn_list_lock); + up_write(&conn_list_lock); return conn; } @@ -95,7 +95,7 @@ bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c) struct ksmbd_conn *t; bool ret = false; - read_lock(&conn_list_lock); + down_read(&conn_list_lock); list_for_each_entry(t, &conn_list, conns_list) { if (memcmp(t->ClientGUID, c->ClientGUID, SMB2_CLIENT_GUID_SIZE)) continue; @@ -103,7 +103,7 @@ bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c) ret = true; break; } - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); return ret; } @@ -147,19 +147,47 @@ int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) return ret; } -static void ksmbd_conn_lock(struct ksmbd_conn *conn) +void ksmbd_conn_lock(struct ksmbd_conn *conn) { mutex_lock(&conn->srv_mutex); } -static void ksmbd_conn_unlock(struct ksmbd_conn *conn) +void ksmbd_conn_unlock(struct ksmbd_conn *conn) { mutex_unlock(&conn->srv_mutex); } -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn) +void ksmbd_all_conn_set_status(u64 sess_id, u32 status) { + struct ksmbd_conn *conn; + + down_read(&conn_list_lock); + list_for_each_entry(conn, &conn_list, conns_list) { + if (conn->binding || xa_load(&conn->sessions, sess_id)) + WRITE_ONCE(conn->status, status); + } + up_read(&conn_list_lock); +} + +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id) +{ + struct ksmbd_conn *bind_conn; + wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2); + + down_read(&conn_list_lock); + list_for_each_entry(bind_conn, &conn_list, conns_list) { + if (bind_conn == conn) + continue; + + if ((bind_conn->binding || xa_load(&bind_conn->sessions, sess_id)) && + !ksmbd_conn_releasing(bind_conn) && + atomic_read(&bind_conn->req_running)) { + wait_event(bind_conn->req_running_q, + atomic_read(&bind_conn->req_running) == 0); + } + } + up_read(&conn_list_lock); } int ksmbd_conn_write(struct ksmbd_work *work) @@ -243,7 +271,7 @@ bool ksmbd_conn_alive(struct ksmbd_conn *conn) if (!ksmbd_server_running()) return false; - if (conn->status == KSMBD_SESS_EXITING) + if (ksmbd_conn_exiting(conn)) return false; if (kthread_should_stop()) @@ -303,7 +331,7 @@ int ksmbd_conn_handler_loop(void *p) pdu_size = get_rfc1002_len(hdr_buf); ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size); - if (conn->status == KSMBD_SESS_GOOD) + if (ksmbd_conn_good(conn)) max_allowed_pdu_size = SMB3_MAX_MSGSIZE + conn->vals->max_write_size; else @@ -312,7 +340,7 @@ int ksmbd_conn_handler_loop(void *p) if (pdu_size > max_allowed_pdu_size) { pr_err_ratelimited("PDU length(%u) exceeded maximum allowed pdu size(%u) on connection(%d)\n", pdu_size, max_allowed_pdu_size, - conn->status); + READ_ONCE(conn->status)); break; } @@ -360,10 +388,10 @@ int ksmbd_conn_handler_loop(void *p) } out: + ksmbd_conn_set_releasing(conn); /* Wait till all reference dropped to the Server object*/ wait_event(conn->r_count_q, atomic_read(&conn->r_count) == 0); - if (IS_ENABLED(CONFIG_UNICODE)) utf8_unload(conn->um); unload_nls(conn->local_nls); @@ -407,7 +435,7 @@ static void stop_sessions(void) struct ksmbd_transport *t; again: - read_lock(&conn_list_lock); + down_read(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { struct task_struct *task; @@ -416,14 +444,14 @@ again: if (task) ksmbd_debug(CONN, "Stop session handler %s/%d\n", task->comm, task_pid_nr(task)); - conn->status = KSMBD_SESS_EXITING; + ksmbd_conn_set_exiting(conn); if (t->ops->shutdown) { - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); t->ops->shutdown(t); - read_lock(&conn_list_lock); + down_read(&conn_list_lock); } } - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); if (!list_empty(&conn_list)) { schedule_timeout_interruptible(HZ / 10); /* 100ms */ diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 0e3a848defaf..ad8dfaa48ffb 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -26,7 +26,8 @@ enum { KSMBD_SESS_GOOD, KSMBD_SESS_EXITING, KSMBD_SESS_NEED_RECONNECT, - KSMBD_SESS_NEED_NEGOTIATE + KSMBD_SESS_NEED_NEGOTIATE, + KSMBD_SESS_RELEASING }; struct ksmbd_stats { @@ -140,10 +141,10 @@ struct ksmbd_transport { #define KSMBD_TCP_PEER_SOCKADDR(c) ((struct sockaddr *)&((c)->peer_addr)) extern struct list_head conn_list; -extern rwlock_t conn_list_lock; +extern struct rw_semaphore conn_list_lock; bool ksmbd_conn_alive(struct ksmbd_conn *conn); -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn); +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id); struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); @@ -162,6 +163,8 @@ void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); int ksmbd_conn_handler_loop(void *p); int ksmbd_conn_transport_init(void); void ksmbd_conn_transport_destroy(void); +void ksmbd_conn_lock(struct ksmbd_conn *conn); +void ksmbd_conn_unlock(struct ksmbd_conn *conn); /* * WARNING @@ -169,43 +172,60 @@ void ksmbd_conn_transport_destroy(void); * This is a hack. We will move status to a proper place once we land * a multi-sessions support. */ -static inline bool ksmbd_conn_good(struct ksmbd_work *work) +static inline bool ksmbd_conn_good(struct ksmbd_conn *conn) { - return work->conn->status == KSMBD_SESS_GOOD; + return READ_ONCE(conn->status) == KSMBD_SESS_GOOD; } -static inline bool ksmbd_conn_need_negotiate(struct ksmbd_work *work) +static inline bool ksmbd_conn_need_negotiate(struct ksmbd_conn *conn) { - return work->conn->status == KSMBD_SESS_NEED_NEGOTIATE; + return READ_ONCE(conn->status) == KSMBD_SESS_NEED_NEGOTIATE; } -static inline bool ksmbd_conn_need_reconnect(struct ksmbd_work *work) +static inline bool ksmbd_conn_need_reconnect(struct ksmbd_conn *conn) { - return work->conn->status == KSMBD_SESS_NEED_RECONNECT; + return READ_ONCE(conn->status) == KSMBD_SESS_NEED_RECONNECT; } -static inline bool ksmbd_conn_exiting(struct ksmbd_work *work) +static inline bool ksmbd_conn_exiting(struct ksmbd_conn *conn) { - return work->conn->status == KSMBD_SESS_EXITING; + return READ_ONCE(conn->status) == KSMBD_SESS_EXITING; } -static inline void ksmbd_conn_set_good(struct ksmbd_work *work) +static inline bool ksmbd_conn_releasing(struct ksmbd_conn *conn) { - work->conn->status = KSMBD_SESS_GOOD; + return READ_ONCE(conn->status) == KSMBD_SESS_RELEASING; } -static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_work *work) +static inline void ksmbd_conn_set_new(struct ksmbd_conn *conn) { - work->conn->status = KSMBD_SESS_NEED_NEGOTIATE; + WRITE_ONCE(conn->status, KSMBD_SESS_NEW); } -static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_work *work) +static inline void ksmbd_conn_set_good(struct ksmbd_conn *conn) { - work->conn->status = KSMBD_SESS_NEED_RECONNECT; + WRITE_ONCE(conn->status, KSMBD_SESS_GOOD); } -static inline void ksmbd_conn_set_exiting(struct ksmbd_work *work) +static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_conn *conn) { - work->conn->status = KSMBD_SESS_EXITING; + WRITE_ONCE(conn->status, KSMBD_SESS_NEED_NEGOTIATE); } + +static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_conn *conn) +{ + WRITE_ONCE(conn->status, KSMBD_SESS_NEED_RECONNECT); +} + +static inline void ksmbd_conn_set_exiting(struct ksmbd_conn *conn) +{ + WRITE_ONCE(conn->status, KSMBD_SESS_EXITING); +} + +static inline void ksmbd_conn_set_releasing(struct ksmbd_conn *conn) +{ + WRITE_ONCE(conn->status, KSMBD_SESS_RELEASING); +} + +void ksmbd_all_conn_set_status(u64 sess_id, u32 status); #endif /* __CONNECTION_H__ */ diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c index 8ce17b3fb8da..f07a05f37651 100644 --- a/fs/ksmbd/mgmt/tree_connect.c +++ b/fs/ksmbd/mgmt/tree_connect.c @@ -109,7 +109,15 @@ int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess, unsigned int id) { - return xa_load(&sess->tree_conns, id); + struct ksmbd_tree_connect *tcon; + + tcon = xa_load(&sess->tree_conns, id); + if (tcon) { + if (test_bit(TREE_CONN_EXPIRE, &tcon->status)) + tcon = NULL; + } + + return tcon; } struct ksmbd_share_config *ksmbd_tree_conn_share(struct ksmbd_session *sess, @@ -129,6 +137,9 @@ int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess) struct ksmbd_tree_connect *tc; unsigned long id; + if (!sess) + return -EINVAL; + xa_for_each(&sess->tree_conns, id, tc) ret |= ksmbd_tree_conn_disconnect(sess, tc); xa_destroy(&sess->tree_conns); diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/ksmbd/mgmt/tree_connect.h index 0f97ddc1e39c..700df36cf3e3 100644 --- a/fs/ksmbd/mgmt/tree_connect.h +++ b/fs/ksmbd/mgmt/tree_connect.h @@ -14,6 +14,8 @@ struct ksmbd_share_config; struct ksmbd_user; struct ksmbd_conn; +#define TREE_CONN_EXPIRE 1 + struct ksmbd_tree_connect { int id; @@ -25,6 +27,7 @@ struct ksmbd_tree_connect { int maximal_access; bool posix_extensions; + unsigned long status; }; struct ksmbd_tree_conn_status { diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c index 1ca2aae4c299..8a5dcab05614 100644 --- a/fs/ksmbd/mgmt/user_session.c +++ b/fs/ksmbd/mgmt/user_session.c @@ -144,10 +144,6 @@ void ksmbd_session_destroy(struct ksmbd_session *sess) if (!sess) return; - down_write(&sessions_table_lock); - hash_del(&sess->hlist); - up_write(&sessions_table_lock); - if (sess->user) ksmbd_free_user(sess->user); @@ -165,17 +161,39 @@ static struct ksmbd_session *__session_lookup(unsigned long long id) struct ksmbd_session *sess; hash_for_each_possible(sessions_table, sess, hlist, id) { - if (id == sess->id) + if (id == sess->id) { + sess->last_active = jiffies; return sess; + } } return NULL; } +static void ksmbd_expire_session(struct ksmbd_conn *conn) +{ + unsigned long id; + struct ksmbd_session *sess; + + down_write(&sessions_table_lock); + xa_for_each(&conn->sessions, id, sess) { + if (sess->state != SMB2_SESSION_VALID || + time_after(jiffies, + sess->last_active + SMB2_SESSION_TIMEOUT)) { + xa_erase(&conn->sessions, sess->id); + hash_del(&sess->hlist); + ksmbd_session_destroy(sess); + continue; + } + } + up_write(&sessions_table_lock); +} + int ksmbd_session_register(struct ksmbd_conn *conn, struct ksmbd_session *sess) { sess->dialect = conn->dialect; memcpy(sess->ClientGUID, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE); + ksmbd_expire_session(conn); return xa_err(xa_store(&conn->sessions, sess->id, sess, GFP_KERNEL)); } @@ -188,47 +206,56 @@ static int ksmbd_chann_del(struct ksmbd_conn *conn, struct ksmbd_session *sess) return -ENOENT; kfree(chann); - return 0; } void ksmbd_sessions_deregister(struct ksmbd_conn *conn) { struct ksmbd_session *sess; + unsigned long id; + down_write(&sessions_table_lock); if (conn->binding) { int bkt; + struct hlist_node *tmp; - down_write(&sessions_table_lock); - hash_for_each(sessions_table, bkt, sess, hlist) { - if (!ksmbd_chann_del(conn, sess)) { - up_write(&sessions_table_lock); - goto sess_destroy; + hash_for_each_safe(sessions_table, bkt, tmp, sess, hlist) { + if (!ksmbd_chann_del(conn, sess) && + xa_empty(&sess->ksmbd_chann_list)) { + hash_del(&sess->hlist); + ksmbd_session_destroy(sess); } } - up_write(&sessions_table_lock); - } else { - unsigned long id; - - xa_for_each(&conn->sessions, id, sess) { - if (!ksmbd_chann_del(conn, sess)) - goto sess_destroy; - } } - return; + xa_for_each(&conn->sessions, id, sess) { + unsigned long chann_id; + struct channel *chann; + + xa_for_each(&sess->ksmbd_chann_list, chann_id, chann) { + if (chann->conn != conn) + ksmbd_conn_set_exiting(chann->conn); + } -sess_destroy: - if (xa_empty(&sess->ksmbd_chann_list)) { - xa_erase(&conn->sessions, sess->id); - ksmbd_session_destroy(sess); + ksmbd_chann_del(conn, sess); + if (xa_empty(&sess->ksmbd_chann_list)) { + xa_erase(&conn->sessions, sess->id); + hash_del(&sess->hlist); + ksmbd_session_destroy(sess); + } } + up_write(&sessions_table_lock); } struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, unsigned long long id) { - return xa_load(&conn->sessions, id); + struct ksmbd_session *sess; + + sess = xa_load(&conn->sessions, id); + if (sess) + sess->last_active = jiffies; + return sess; } struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) @@ -237,6 +264,8 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) down_read(&sessions_table_lock); sess = __session_lookup(id); + if (sess) + sess->last_active = jiffies; up_read(&sessions_table_lock); return sess; @@ -315,6 +344,8 @@ static struct ksmbd_session *__session_create(int protocol) if (ksmbd_init_file_table(&sess->file_table)) goto error; + sess->last_active = jiffies; + sess->state = SMB2_SESSION_IN_PROGRESS; set_session_flag(sess, protocol); xa_init(&sess->tree_conns); xa_init(&sess->ksmbd_chann_list); diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h index b6a9e7a6aae4..f99d475b28db 100644 --- a/fs/ksmbd/mgmt/user_session.h +++ b/fs/ksmbd/mgmt/user_session.h @@ -59,6 +59,7 @@ struct ksmbd_session { __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; struct ksmbd_file_table file_table; + unsigned long last_active; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index 0d8242789dc8..f9b2e0f19b03 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -93,7 +93,8 @@ static inline int check_conn_state(struct ksmbd_work *work) { struct smb_hdr *rsp_hdr; - if (ksmbd_conn_exiting(work) || ksmbd_conn_need_reconnect(work)) { + if (ksmbd_conn_exiting(work->conn) || + ksmbd_conn_need_reconnect(work->conn)) { rsp_hdr = work->response_buf; rsp_hdr->Status.CifsError = STATUS_CONNECTION_DISCONNECTED; return 1; @@ -415,7 +416,7 @@ int server_queue_ctrl_reset_work(void) return __queue_ctrl_work(SERVER_CTRL_TYPE_RESET); } -static ssize_t stats_show(struct class *class, struct class_attribute *attr, +static ssize_t stats_show(const struct class *class, const struct class_attribute *attr, char *buf) { /* @@ -434,8 +435,8 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr, server_conf.ipc_last_active / HZ); } -static ssize_t kill_server_store(struct class *class, - struct class_attribute *attr, const char *buf, +static ssize_t kill_server_store(const struct class *class, + const struct class_attribute *attr, const char *buf, size_t len) { if (!sysfs_streq(buf, "hard")) @@ -455,7 +456,7 @@ static const char * const debug_type_strings[] = {"smb", "auth", "vfs", "oplock", "ipc", "conn", "rdma"}; -static ssize_t debug_show(struct class *class, struct class_attribute *attr, +static ssize_t debug_show(const struct class *class, const struct class_attribute *attr, char *buf) { ssize_t sz = 0; @@ -473,7 +474,7 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr, return sz; } -static ssize_t debug_store(struct class *class, struct class_attribute *attr, +static ssize_t debug_store(const struct class *class, const struct class_attribute *attr, const char *buf, size_t len) { int i; @@ -513,7 +514,6 @@ ATTRIBUTE_GROUPS(ksmbd_control_class); static struct class ksmbd_control_class = { .name = "ksmbd-control", - .owner = THIS_MODULE, .class_groups = ksmbd_control_class_groups, }; @@ -606,6 +606,7 @@ err_unregister: static void __exit ksmbd_server_exit(void) { ksmbd_server_shutdown(); + rcu_barrier(); ksmbd_release_inode_hash(); } diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 67b7e766a06b..cb93fd231f4e 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -248,7 +248,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) rsp = smb2_get_msg(work->response_buf); - WARN_ON(ksmbd_conn_good(work)); + WARN_ON(ksmbd_conn_good(conn)); rsp->StructureSize = cpu_to_le16(65); ksmbd_debug(SMB, "conn->dialect 0x%x\n", conn->dialect); @@ -277,7 +277,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE; conn->use_spnego = true; - ksmbd_conn_set_need_negotiate(work); + ksmbd_conn_set_need_negotiate(conn); return 0; } @@ -561,7 +561,7 @@ int smb2_check_user_session(struct ksmbd_work *work) cmd == SMB2_SESSION_SETUP_HE) return 0; - if (!ksmbd_conn_good(work)) + if (!ksmbd_conn_good(conn)) return -EINVAL; sess_id = le64_to_cpu(req_hdr->SessionId); @@ -594,7 +594,7 @@ static void destroy_previous_session(struct ksmbd_conn *conn, prev_sess->state = SMB2_SESSION_EXPIRED; xa_for_each(&prev_sess->ksmbd_chann_list, index, chann) - chann->conn->status = KSMBD_SESS_EXITING; + ksmbd_conn_set_exiting(chann->conn); } /** @@ -756,19 +756,6 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt, pneg_ctxt->Ciphers[0] = cipher_type; } -static void build_compression_ctxt(struct smb2_compression_capabilities_context *pneg_ctxt, - __le16 comp_algo) -{ - pneg_ctxt->ContextType = SMB2_COMPRESSION_CAPABILITIES; - pneg_ctxt->DataLength = - cpu_to_le16(sizeof(struct smb2_compression_capabilities_context) - - sizeof(struct smb2_neg_context)); - pneg_ctxt->Reserved = cpu_to_le32(0); - pneg_ctxt->CompressionAlgorithmCount = cpu_to_le16(1); - pneg_ctxt->Flags = cpu_to_le32(0); - pneg_ctxt->CompressionAlgorithms[0] = comp_algo; -} - static void build_sign_cap_ctxt(struct smb2_signing_capabilities *pneg_ctxt, __le16 sign_algo) { @@ -808,7 +795,7 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, struct smb2_negotiate_rsp *rsp, void *smb2_buf_len) { - char *pneg_ctxt = (char *)rsp + + char * const pneg_ctxt = (char *)rsp + le32_to_cpu(rsp->NegotiateContextOffset); int neg_ctxt_cnt = 1; int ctxt_size; @@ -817,61 +804,46 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, "assemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n"); build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt, conn->preauth_info->Preauth_HashId); - rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt); inc_rfc1001_len(smb2_buf_len, AUTH_GSS_PADDING); ctxt_size = sizeof(struct smb2_preauth_neg_context); - /* Round to 8 byte boundary */ - pneg_ctxt += round_up(sizeof(struct smb2_preauth_neg_context), 8); if (conn->cipher_type) { + /* Round to 8 byte boundary */ ctxt_size = round_up(ctxt_size, 8); ksmbd_debug(SMB, "assemble SMB2_ENCRYPTION_CAPABILITIES context\n"); - build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt, + build_encrypt_ctxt((struct smb2_encryption_neg_context *) + (pneg_ctxt + ctxt_size), conn->cipher_type); - rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); + neg_ctxt_cnt++; ctxt_size += sizeof(struct smb2_encryption_neg_context) + 2; - /* Round to 8 byte boundary */ - pneg_ctxt += - round_up(sizeof(struct smb2_encryption_neg_context) + 2, - 8); } - if (conn->compress_algorithm) { - ctxt_size = round_up(ctxt_size, 8); - ksmbd_debug(SMB, - "assemble SMB2_COMPRESSION_CAPABILITIES context\n"); - /* Temporarily set to SMB3_COMPRESS_NONE */ - build_compression_ctxt((struct smb2_compression_capabilities_context *)pneg_ctxt, - conn->compress_algorithm); - rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); - ctxt_size += sizeof(struct smb2_compression_capabilities_context) + 2; - /* Round to 8 byte boundary */ - pneg_ctxt += round_up(sizeof(struct smb2_compression_capabilities_context) + 2, - 8); - } + /* compression context not yet supported */ + WARN_ON(conn->compress_algorithm != SMB3_COMPRESS_NONE); if (conn->posix_ext_supported) { ctxt_size = round_up(ctxt_size, 8); ksmbd_debug(SMB, "assemble SMB2_POSIX_EXTENSIONS_AVAILABLE context\n"); - build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); - rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); + build_posix_ctxt((struct smb2_posix_neg_context *) + (pneg_ctxt + ctxt_size)); + neg_ctxt_cnt++; ctxt_size += sizeof(struct smb2_posix_neg_context); - /* Round to 8 byte boundary */ - pneg_ctxt += round_up(sizeof(struct smb2_posix_neg_context), 8); } if (conn->signing_negotiated) { ctxt_size = round_up(ctxt_size, 8); ksmbd_debug(SMB, "assemble SMB2_SIGNING_CAPABILITIES context\n"); - build_sign_cap_ctxt((struct smb2_signing_capabilities *)pneg_ctxt, + build_sign_cap_ctxt((struct smb2_signing_capabilities *) + (pneg_ctxt + ctxt_size), conn->signing_algorithm); - rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); + neg_ctxt_cnt++; ctxt_size += sizeof(struct smb2_signing_capabilities) + 2; } + rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt); inc_rfc1001_len(smb2_buf_len, ctxt_size); } @@ -1079,7 +1051,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_debug(SMB, "Received negotiate request\n"); conn->need_neg = false; - if (ksmbd_conn_good(work)) { + if (ksmbd_conn_good(conn)) { pr_err("conn->tcp_status is already in CifsGood State\n"); work->send_no_response = 1; return rc; @@ -1233,7 +1205,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) } conn->srv_sec_mode = le16_to_cpu(rsp->SecurityMode); - ksmbd_conn_set_need_negotiate(work); + ksmbd_conn_set_need_negotiate(conn); err_out: if (rc < 0) @@ -1459,7 +1431,7 @@ static int ntlm_authenticate(struct ksmbd_work *work) * Reuse session if anonymous try to connect * on reauthetication. */ - if (ksmbd_anonymous_user(user)) { + if (conn->binding == false && ksmbd_anonymous_user(user)) { ksmbd_free_user(user); return 0; } @@ -1473,7 +1445,7 @@ static int ntlm_authenticate(struct ksmbd_work *work) sess->user = user; } - if (user_guest(sess->user)) { + if (conn->binding == false && user_guest(sess->user)) { rsp->SessionFlags = SMB2_SESSION_FLAG_IS_GUEST_LE; } else { struct authenticate_message *authblob; @@ -1656,6 +1628,7 @@ int smb2_sess_setup(struct ksmbd_work *work) rsp->SecurityBufferLength = 0; inc_rfc1001_len(work->response_buf, 9); + ksmbd_conn_lock(conn); if (!req->hdr.SessionId) { sess = ksmbd_smb2_session_create(); if (!sess) { @@ -1703,11 +1676,22 @@ int smb2_sess_setup(struct ksmbd_work *work) goto out_err; } + if (ksmbd_conn_need_reconnect(conn)) { + rc = -EFAULT; + sess = NULL; + goto out_err; + } + if (ksmbd_session_lookup(conn, sess_id)) { rc = -EACCES; goto out_err; } + if (user_guest(sess->user)) { + rc = -EOPNOTSUPP; + goto out_err; + } + conn->binding = true; } else if ((conn->dialect < SMB30_PROT_ID || server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && @@ -1722,12 +1706,20 @@ int smb2_sess_setup(struct ksmbd_work *work) rc = -ENOENT; goto out_err; } + + if (sess->state == SMB2_SESSION_EXPIRED) { + rc = -EFAULT; + goto out_err; + } + + if (ksmbd_conn_need_reconnect(conn)) { + rc = -EFAULT; + sess = NULL; + goto out_err; + } } work->sess = sess; - if (sess->state == SMB2_SESSION_EXPIRED) - sess->state = SMB2_SESSION_IN_PROGRESS; - negblob_off = le16_to_cpu(req->SecurityBufferOffset); negblob_len = le16_to_cpu(req->SecurityBufferLength); if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) || @@ -1757,8 +1749,10 @@ int smb2_sess_setup(struct ksmbd_work *work) goto out_err; } - ksmbd_conn_set_good(work); - sess->state = SMB2_SESSION_VALID; + if (!ksmbd_conn_need_reconnect(conn)) { + ksmbd_conn_set_good(conn); + sess->state = SMB2_SESSION_VALID; + } kfree(sess->Preauth_HashValue); sess->Preauth_HashValue = NULL; } else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) { @@ -1780,8 +1774,10 @@ int smb2_sess_setup(struct ksmbd_work *work) if (rc) goto out_err; - ksmbd_conn_set_good(work); - sess->state = SMB2_SESSION_VALID; + if (!ksmbd_conn_need_reconnect(conn)) { + ksmbd_conn_set_good(conn); + sess->state = SMB2_SESSION_VALID; + } if (conn->binding) { struct preauth_session *preauth_sess; @@ -1794,6 +1790,10 @@ int smb2_sess_setup(struct ksmbd_work *work) } kfree(sess->Preauth_HashValue); sess->Preauth_HashValue = NULL; + } else { + pr_info_ratelimited("Unknown NTLMSSP message type : 0x%x\n", + le32_to_cpu(negblob->MessageType)); + rc = -EINVAL; } } else { /* TODO: need one more negotiation */ @@ -1816,6 +1816,8 @@ out_err: rsp->hdr.Status = STATUS_NETWORK_SESSION_EXPIRED; else if (rc == -ENOMEM) rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + else if (rc == -EOPNOTSUPP) + rsp->hdr.Status = STATUS_NOT_SUPPORTED; else if (rc) rsp->hdr.Status = STATUS_LOGON_FAILURE; @@ -1843,14 +1845,17 @@ out_err: if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION) try_delay = true; - xa_erase(&conn->sessions, sess->id); - ksmbd_session_destroy(sess); - work->sess = NULL; - if (try_delay) + sess->last_active = jiffies; + sess->state = SMB2_SESSION_EXPIRED; + if (try_delay) { + ksmbd_conn_set_need_reconnect(conn); ssleep(5); + ksmbd_conn_set_need_negotiate(conn); + } } } + ksmbd_conn_unlock(conn); return rc; } @@ -2048,11 +2053,12 @@ int smb2_tree_disconnect(struct ksmbd_work *work) ksmbd_debug(SMB, "request\n"); - if (!tcon) { + if (!tcon || test_and_set_bit(TREE_CONN_EXPIRE, &tcon->status)) { struct smb2_tree_disconnect_req *req = smb2_get_msg(work->request_buf); ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; smb2_set_err_rsp(work); return 0; @@ -2074,21 +2080,25 @@ int smb2_session_logoff(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; struct smb2_logoff_rsp *rsp = smb2_get_msg(work->response_buf); - struct ksmbd_session *sess = work->sess; + struct ksmbd_session *sess; + struct smb2_logoff_req *req = smb2_get_msg(work->request_buf); + u64 sess_id = le64_to_cpu(req->hdr.SessionId); rsp->StructureSize = cpu_to_le16(4); inc_rfc1001_len(work->response_buf, 4); ksmbd_debug(SMB, "request\n"); - /* setting CifsExiting here may race with start_tcp_sess */ - ksmbd_conn_set_need_reconnect(work); + ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_RECONNECT); ksmbd_close_session_fds(work); - ksmbd_conn_wait_idle(conn); + ksmbd_conn_wait_idle(conn, sess_id); + /* + * Re-lookup session to validate if session is deleted + * while waiting request complete + */ + sess = ksmbd_session_lookup_all(conn, sess_id); if (ksmbd_tree_conn_session_logoff(sess)) { - struct smb2_logoff_req *req = smb2_get_msg(work->request_buf); - ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; smb2_set_err_rsp(work); @@ -2100,9 +2110,7 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_free_user(sess->user); sess->user = NULL; - - /* let start_tcp_sess free connection info now */ - ksmbd_conn_set_need_negotiate(work); + ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE); return 0; } @@ -2436,7 +2444,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, return rc; } - rc = ksmbd_vfs_kern_path(work, name, 0, path, 0); + rc = ksmbd_vfs_kern_path_locked(work, name, 0, path, 0); if (rc) { pr_err("cannot get linux path (%s), err = %d\n", name, rc); @@ -2727,8 +2735,10 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, 1); + rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, &path, 1); if (!rc) { + file_present = true; + if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) { /* * If file exists with under flags, return access @@ -2737,7 +2747,6 @@ int smb2_open(struct ksmbd_work *work) if (req->CreateDisposition == FILE_OVERWRITE_IF_LE || req->CreateDisposition == FILE_OPEN_IF_LE) { rc = -EACCES; - path_put(&path); goto err_out; } @@ -2745,26 +2754,23 @@ int smb2_open(struct ksmbd_work *work) ksmbd_debug(SMB, "User does not have write permission\n"); rc = -EACCES; - path_put(&path); goto err_out; } } else if (d_is_symlink(path.dentry)) { rc = -EACCES; - path_put(&path); goto err_out; } - } - if (rc) { + file_present = true; + idmap = mnt_idmap(path.mnt); + } else { if (rc != -ENOENT) goto err_out; ksmbd_debug(SMB, "can not get linux path for %s, rc = %d\n", name, rc); rc = 0; - } else { - file_present = true; - idmap = mnt_idmap(path.mnt); } + if (stream_name) { if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { if (s_type == DATA_STREAM) { @@ -2892,8 +2898,9 @@ int smb2_open(struct ksmbd_work *work) if ((daccess & FILE_DELETE_LE) || (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { - rc = ksmbd_vfs_may_delete(idmap, - path.dentry); + rc = inode_permission(idmap, + d_inode(path.dentry->d_parent), + MAY_EXEC | MAY_WRITE); if (rc) goto err_out; } @@ -3264,10 +3271,13 @@ int smb2_open(struct ksmbd_work *work) } err_out: - if (file_present || created) - path_put(&path); + if (file_present || created) { + inode_unlock(d_inode(path.dentry->d_parent)); + dput(path.dentry); + } ksmbd_revert_fsids(work); err_out1: + if (rc) { if (rc == -EINVAL) rsp->hdr.Status = STATUS_INVALID_PARAMETER; @@ -4907,6 +4917,9 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, int rc = 0, len; int fs_infoclass_size = 0; + if (!share->path) + return -EIO; + rc = kern_path(share->path, LOOKUP_NO_SYMLINKS, &path); if (rc) { pr_err("cannot create vfs path\n"); @@ -5418,44 +5431,19 @@ int smb2_echo(struct ksmbd_work *work) static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp, - struct mnt_idmap *idmap, struct smb2_file_rename_info *file_info, struct nls_table *local_nls) { struct ksmbd_share_config *share = fp->tcon->share_conf; - char *new_name = NULL, *abs_oldname = NULL, *old_name = NULL; - char *pathname = NULL; - struct path path; - bool file_present = true; - int rc; + char *new_name = NULL; + int rc, flags = 0; ksmbd_debug(SMB, "setting FILE_RENAME_INFO\n"); - pathname = kmalloc(PATH_MAX, GFP_KERNEL); - if (!pathname) - return -ENOMEM; - - abs_oldname = file_path(fp->filp, pathname, PATH_MAX); - if (IS_ERR(abs_oldname)) { - rc = -EINVAL; - goto out; - } - old_name = strrchr(abs_oldname, '/'); - if (old_name && old_name[1] != '\0') { - old_name++; - } else { - ksmbd_debug(SMB, "can't get last component in path %s\n", - abs_oldname); - rc = -ENOENT; - goto out; - } - new_name = smb2_get_name(file_info->FileName, le32_to_cpu(file_info->FileNameLength), local_nls); - if (IS_ERR(new_name)) { - rc = PTR_ERR(new_name); - goto out; - } + if (IS_ERR(new_name)) + return PTR_ERR(new_name); if (strchr(new_name, ':')) { int s_type; @@ -5481,7 +5469,7 @@ static int smb2_rename(struct ksmbd_work *work, if (rc) goto out; - rc = ksmbd_vfs_setxattr(idmap, + rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp), fp->filp->f_path.dentry, xattr_stream_name, NULL, 0, 0); @@ -5496,47 +5484,18 @@ static int smb2_rename(struct ksmbd_work *work, } ksmbd_debug(SMB, "new name %s\n", new_name); - rc = ksmbd_vfs_kern_path(work, new_name, LOOKUP_NO_SYMLINKS, &path, 1); - if (rc) { - if (rc != -ENOENT) - goto out; - file_present = false; - } else { - path_put(&path); - } - if (ksmbd_share_veto_filename(share, new_name)) { rc = -ENOENT; ksmbd_debug(SMB, "Can't rename vetoed file: %s\n", new_name); goto out; } - if (file_info->ReplaceIfExists) { - if (file_present) { - rc = ksmbd_vfs_remove_file(work, new_name); - if (rc) { - if (rc != -ENOTEMPTY) - rc = -EINVAL; - ksmbd_debug(SMB, "cannot delete %s, rc %d\n", - new_name, rc); - goto out; - } - } - } else { - if (file_present && - strncmp(old_name, path.dentry->d_name.name, strlen(old_name))) { - rc = -EEXIST; - ksmbd_debug(SMB, - "cannot rename already existing file\n"); - goto out; - } - } + if (!file_info->ReplaceIfExists) + flags = RENAME_NOREPLACE; - rc = ksmbd_vfs_fp_rename(work, fp, new_name); + rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags); out: - kfree(pathname); - if (!IS_ERR(new_name)) - kfree(new_name); + kfree(new_name); return rc; } @@ -5576,18 +5535,17 @@ static int smb2_create_link(struct ksmbd_work *work, } ksmbd_debug(SMB, "target name is %s\n", target_name); - rc = ksmbd_vfs_kern_path(work, link_name, LOOKUP_NO_SYMLINKS, &path, 0); + rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS, + &path, 0); if (rc) { if (rc != -ENOENT) goto out; file_present = false; - } else { - path_put(&path); } if (file_info->ReplaceIfExists) { if (file_present) { - rc = ksmbd_vfs_remove_file(work, link_name); + rc = ksmbd_vfs_remove_file(work, &path); if (rc) { rc = -EINVAL; ksmbd_debug(SMB, "cannot delete %s\n", @@ -5607,6 +5565,10 @@ static int smb2_create_link(struct ksmbd_work *work, if (rc) rc = -EINVAL; out: + if (file_present) { + inode_unlock(d_inode(path.dentry->d_parent)); + path_put(&path); + } if (!IS_ERR(link_name)) kfree(link_name); kfree(pathname); @@ -5784,12 +5746,6 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, struct smb2_file_rename_info *rename_info, unsigned int buf_len) { - struct mnt_idmap *idmap; - struct ksmbd_file *parent_fp; - struct dentry *parent; - struct dentry *dentry = fp->filp->f_path.dentry; - int ret; - if (!(fp->daccess & FILE_DELETE_LE)) { pr_err("no right to delete : 0x%x\n", fp->daccess); return -EACCES; @@ -5799,32 +5755,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, le32_to_cpu(rename_info->FileNameLength)) return -EINVAL; - idmap = file_mnt_idmap(fp->filp); - if (ksmbd_stream_fd(fp)) - goto next; - - parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); - if (ret) { - dput(parent); - return ret; - } - - parent_fp = ksmbd_lookup_fd_inode(d_inode(parent)); - inode_unlock(d_inode(parent)); - dput(parent); + if (!le32_to_cpu(rename_info->FileNameLength)) + return -EINVAL; - if (parent_fp) { - if (parent_fp->daccess & FILE_DELETE_LE) { - pr_err("parent dir is opened with delete access\n"); - ksmbd_fd_put(work, parent_fp); - return -ESHARE; - } - ksmbd_fd_put(work, parent_fp); - } -next: - return smb2_rename(work, fp, idmap, rename_info, - work->conn->local_nls); + return smb2_rename(work, fp, rename_info, work->conn->local_nls); } static int set_file_disposition_info(struct ksmbd_file *fp, @@ -6931,7 +6865,7 @@ int smb2_lock(struct ksmbd_work *work) nolock = 1; /* check locks in connection list */ - read_lock(&conn_list_lock); + down_read(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { spin_lock(&conn->llist_lock); list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) { @@ -6948,7 +6882,7 @@ int smb2_lock(struct ksmbd_work *work) list_del(&cmp_lock->flist); list_del(&cmp_lock->clist); spin_unlock(&conn->llist_lock); - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); locks_free_lock(cmp_lock->fl); kfree(cmp_lock); @@ -6970,7 +6904,7 @@ int smb2_lock(struct ksmbd_work *work) cmp_lock->start > smb_lock->start && cmp_lock->start < smb_lock->end) { spin_unlock(&conn->llist_lock); - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); pr_err("previous lock conflict with zero byte lock range\n"); goto out; } @@ -6979,7 +6913,7 @@ int smb2_lock(struct ksmbd_work *work) smb_lock->start > cmp_lock->start && smb_lock->start < cmp_lock->end) { spin_unlock(&conn->llist_lock); - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); pr_err("current lock conflict with zero byte lock range\n"); goto out; } @@ -6990,14 +6924,14 @@ int smb2_lock(struct ksmbd_work *work) cmp_lock->end >= smb_lock->end)) && !cmp_lock->zero_len && !smb_lock->zero_len) { spin_unlock(&conn->llist_lock); - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); pr_err("Not allow lock operation on exclusive lock range\n"); goto out; } } spin_unlock(&conn->llist_lock); } - read_unlock(&conn_list_lock); + up_read(&conn_list_lock); out_check_cl: if (smb_lock->fl->fl_type == F_UNLCK && nolock) { pr_err("Try to unlock nolocked range\n"); diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 9420dd2813fb..2767c08a534a 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -61,6 +61,8 @@ struct preauth_integrity_info { #define SMB2_SESSION_IN_PROGRESS BIT(0) #define SMB2_SESSION_VALID BIT(1) +#define SMB2_SESSION_TIMEOUT (10 * HZ) + struct create_durable_req_v2 { struct create_context ccontext; __u8 Name[8]; @@ -70,18 +72,6 @@ struct create_durable_req_v2 { __u8 CreateGuid[16]; } __packed; -struct create_durable_reconn_req { - struct create_context ccontext; - __u8 Name[8]; - union { - __u8 Reserved[16]; - struct { - __u64 PersistentFileId; - __u64 VolatileFileId; - } Fid; - } Data; -} __packed; - struct create_durable_reconn_v2_req { struct create_context ccontext; __u8 Name[8]; @@ -93,28 +83,6 @@ struct create_durable_reconn_v2_req { __le32 Flags; } __packed; -struct create_app_inst_id { - struct create_context ccontext; - __u8 Name[8]; - __u8 Reserved[8]; - __u8 AppInstanceId[16]; -} __packed; - -struct create_app_inst_id_vers { - struct create_context ccontext; - __u8 Name[8]; - __u8 Reserved[2]; - __u8 Padding[4]; - __le64 AppInstanceVersionHigh; - __le64 AppInstanceVersionLow; -} __packed; - -struct create_mxac_req { - struct create_context ccontext; - __u8 Name[8]; - __le64 Timestamp; -} __packed; - struct create_alloc_size_req { struct create_context ccontext; __u8 Name[8]; @@ -137,21 +105,6 @@ struct create_durable_v2_rsp { __le32 Flags; } __packed; -struct create_mxac_rsp { - struct create_context ccontext; - __u8 Name[8]; - __le32 QueryStatus; - __le32 MaximalAccess; -} __packed; - -struct create_disk_id_rsp { - struct create_context ccontext; - __u8 Name[8]; - __le64 DiskFileId; - __le64 VolumeId; - __u8 Reserved[16]; -} __packed; - /* equivalent of the contents of SMB3.1.1 POSIX open context response */ struct create_posix_rsp { struct create_context ccontext; diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index 20e85e2701f2..eff7a1d793f0 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -333,7 +333,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig, if (length == -EINTR) { total_read = -ESHUTDOWN; break; - } else if (conn->status == KSMBD_SESS_NEED_RECONNECT) { + } else if (ksmbd_conn_need_reconnect(conn)) { total_read = -EAGAIN; break; } else if (length == -ERESTARTSYS || length == -EAGAIN) { diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index 5ea9229dad2c..778c152708e4 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -18,8 +18,7 @@ #include <linux/vmalloc.h> #include <linux/sched/xacct.h> #include <linux/crc32c.h> - -#include "../internal.h" /* for vfs_path_lookup */ +#include <linux/namei.h> #include "glob.h" #include "oplock.h" @@ -37,19 +36,6 @@ #include "mgmt/user_session.h" #include "mgmt/user_config.h" -static char *extract_last_component(char *path) -{ - char *p = strrchr(path, '/'); - - if (p && p[1] != '\0') { - *p = '\0'; - p++; - } else { - p = NULL; - } - return p; -} - static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, struct inode *parent_inode, struct inode *inode) @@ -63,65 +49,77 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, /** * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable - * - * the parent dentry got by dget_parent or @parent could be - * unstable, we try to lock a parent inode and lookup the - * child dentry again. - * - * the reference count of @parent isn't incremented. */ -int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent, - struct dentry *child) +int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) { - struct dentry *dentry; - int ret = 0; - inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - dentry = lookup_one(idmap, child->d_name.name, parent, - child->d_name.len); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - goto out_err; - } - - if (dentry != child) { - ret = -ESTALE; - dput(dentry); - goto out_err; + if (child->d_parent != parent) { + inode_unlock(d_inode(parent)); + return -ENOENT; } - dput(dentry); return 0; -out_err: - inode_unlock(d_inode(parent)); - return ret; } -int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, - struct dentry *dentry) +static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, + char *pathname, unsigned int flags, + struct path *path) { - struct dentry *parent; - int ret; + struct qstr last; + struct filename *filename; + struct path *root_share_path = &share_conf->vfs_path; + int err, type; + struct path parent_path; + struct dentry *d; + + if (pathname[0] == '\0') { + pathname = share_conf->path; + root_share_path = NULL; + } else { + flags |= LOOKUP_BENEATH; + } + + filename = getname_kernel(pathname); + if (IS_ERR(filename)) + return PTR_ERR(filename); + + err = vfs_path_parent_lookup(filename, flags, + &parent_path, &last, &type, + root_share_path); + putname(filename); + if (err) + return err; - parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); - if (ret) { - dput(parent); - return ret; + if (unlikely(type != LAST_NORM)) { + path_put(&parent_path); + return -ENOENT; } - ret = inode_permission(idmap, d_inode(parent), - MAY_EXEC | MAY_WRITE); + inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); + if (IS_ERR(d)) + goto err_out; - inode_unlock(d_inode(parent)); - dput(parent); - return ret; + if (d_is_negative(d)) { + dput(d); + goto err_out; + } + + path->dentry = d; + path->mnt = share_conf->vfs_path.mnt; + path_put(&parent_path); + + return 0; + +err_out: + inode_unlock(parent_path.dentry->d_inode); + path_put(&parent_path); + return -ENOENT; } int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess) { - struct dentry *parent; int ret = 0; *daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL); @@ -138,18 +136,9 @@ int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_EXEC)) *daccess |= FILE_EXECUTE_LE; - parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(idmap, parent, dentry); - if (ret) { - dput(parent); - return ret; - } - - if (!inode_permission(idmap, d_inode(parent), MAY_EXEC | MAY_WRITE)) + if (!inode_permission(idmap, d_inode(dentry->d_parent), MAY_EXEC | MAY_WRITE)) *daccess |= FILE_DELETE_LE; - inode_unlock(d_inode(parent)); - dput(parent); return ret; } @@ -582,54 +571,32 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) +int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) { struct mnt_idmap *idmap; - struct path path; - struct dentry *parent; + struct dentry *parent = path->dentry->d_parent; int err; if (ksmbd_override_fsids(work)) return -ENOMEM; - err = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, false); - if (err) { - ksmbd_debug(VFS, "can't get %s, err %d\n", name, err); - ksmbd_revert_fsids(work); - return err; - } - - idmap = mnt_idmap(path.mnt); - parent = dget_parent(path.dentry); - err = ksmbd_vfs_lock_parent(idmap, parent, path.dentry); - if (err) { - dput(parent); - path_put(&path); - ksmbd_revert_fsids(work); - return err; - } - - if (!d_inode(path.dentry)->i_nlink) { + if (!d_inode(path->dentry)->i_nlink) { err = -ENOENT; goto out_err; } - if (S_ISDIR(d_inode(path.dentry)->i_mode)) { - err = vfs_rmdir(idmap, d_inode(parent), path.dentry); + idmap = mnt_idmap(path->mnt); + if (S_ISDIR(d_inode(path->dentry)->i_mode)) { + err = vfs_rmdir(idmap, d_inode(parent), path->dentry); if (err && err != -ENOTEMPTY) - ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name, - err); + ksmbd_debug(VFS, "rmdir failed, err %d\n", err); } else { - err = vfs_unlink(idmap, d_inode(parent), path.dentry, NULL); + err = vfs_unlink(idmap, d_inode(parent), path->dentry, NULL); if (err) - ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name, - err); + ksmbd_debug(VFS, "unlink failed, err %d\n", err); } out_err: - inode_unlock(d_inode(parent)); - dput(parent); - path_put(&path); ksmbd_revert_fsids(work); return err; } @@ -688,149 +655,114 @@ out1: return err; } -static int ksmbd_validate_entry_in_use(struct dentry *src_dent) +int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, + char *newname, int flags) { - struct dentry *dst_dent; - - spin_lock(&src_dent->d_lock); - list_for_each_entry(dst_dent, &src_dent->d_subdirs, d_child) { - struct ksmbd_file *child_fp; + struct dentry *old_parent, *new_dentry, *trap; + struct dentry *old_child = old_path->dentry; + struct path new_path; + struct qstr new_last; + struct renamedata rd; + struct filename *to; + struct ksmbd_share_config *share_conf = work->tcon->share_conf; + struct ksmbd_file *parent_fp; + int new_type; + int err, lookup_flags = LOOKUP_NO_SYMLINKS; - if (d_really_is_negative(dst_dent)) - continue; + if (ksmbd_override_fsids(work)) + return -ENOMEM; - child_fp = ksmbd_lookup_fd_inode(d_inode(dst_dent)); - if (child_fp) { - spin_unlock(&src_dent->d_lock); - ksmbd_debug(VFS, "Forbid rename, sub file/dir is in use\n"); - return -EACCES; - } + to = getname_kernel(newname); + if (IS_ERR(to)) { + err = PTR_ERR(to); + goto revert_fsids; } - spin_unlock(&src_dent->d_lock); - return 0; -} +retry: + err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH, + &new_path, &new_last, &new_type, + &share_conf->vfs_path); + if (err) + goto out1; -static int __ksmbd_vfs_rename(struct ksmbd_work *work, - struct mnt_idmap *src_idmap, - struct dentry *src_dent_parent, - struct dentry *src_dent, - struct mnt_idmap *dst_idmap, - struct dentry *dst_dent_parent, - struct dentry *trap_dent, - char *dst_name) -{ - struct dentry *dst_dent; - int err; + if (old_path->mnt != new_path.mnt) { + err = -EXDEV; + goto out2; + } - if (!work->tcon->posix_extensions) { - err = ksmbd_validate_entry_in_use(src_dent); - if (err) - return err; + trap = lock_rename_child(old_child, new_path.dentry); + + old_parent = dget(old_child->d_parent); + if (d_unhashed(old_child)) { + err = -EINVAL; + goto out3; } - if (d_really_is_negative(src_dent_parent)) - return -ENOENT; - if (d_really_is_negative(dst_dent_parent)) - return -ENOENT; - if (d_really_is_negative(src_dent)) - return -ENOENT; - if (src_dent == trap_dent) - return -EINVAL; + parent_fp = ksmbd_lookup_fd_inode(d_inode(old_child->d_parent)); + if (parent_fp) { + if (parent_fp->daccess & FILE_DELETE_LE) { + pr_err("parent dir is opened with delete access\n"); + err = -ESHARE; + ksmbd_fd_put(work, parent_fp); + goto out3; + } + ksmbd_fd_put(work, parent_fp); + } - if (ksmbd_override_fsids(work)) - return -ENOMEM; + new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, + lookup_flags | LOOKUP_RENAME_TARGET); + if (IS_ERR(new_dentry)) { + err = PTR_ERR(new_dentry); + goto out3; + } - dst_dent = lookup_one(dst_idmap, dst_name, - dst_dent_parent, strlen(dst_name)); - err = PTR_ERR(dst_dent); - if (IS_ERR(dst_dent)) { - pr_err("lookup failed %s [%d]\n", dst_name, err); - goto out; + if (d_is_symlink(new_dentry)) { + err = -EACCES; + goto out4; } - err = -ENOTEMPTY; - if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) { - struct renamedata rd = { - .old_mnt_idmap = src_idmap, - .old_dir = d_inode(src_dent_parent), - .old_dentry = src_dent, - .new_mnt_idmap = dst_idmap, - .new_dir = d_inode(dst_dent_parent), - .new_dentry = dst_dent, - }; - err = vfs_rename(&rd); + if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) { + err = -EEXIST; + goto out4; } - if (err) - pr_err("vfs_rename failed err %d\n", err); - if (dst_dent) - dput(dst_dent); -out: - ksmbd_revert_fsids(work); - return err; -} -int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, - char *newname) -{ - struct mnt_idmap *idmap; - struct path dst_path; - struct dentry *src_dent_parent, *dst_dent_parent; - struct dentry *src_dent, *trap_dent, *src_child; - char *dst_name; - int err; + if (old_child == trap) { + err = -EINVAL; + goto out4; + } - dst_name = extract_last_component(newname); - if (!dst_name) { - dst_name = newname; - newname = ""; + if (new_dentry == trap) { + err = -ENOTEMPTY; + goto out4; } - src_dent_parent = dget_parent(fp->filp->f_path.dentry); - src_dent = fp->filp->f_path.dentry; + rd.old_mnt_idmap = mnt_idmap(old_path->mnt), + rd.old_dir = d_inode(old_parent), + rd.old_dentry = old_child, + rd.new_mnt_idmap = mnt_idmap(new_path.mnt), + rd.new_dir = new_path.dentry->d_inode, + rd.new_dentry = new_dentry, + rd.flags = flags, + err = vfs_rename(&rd); + if (err) + ksmbd_debug(VFS, "vfs_rename failed err %d\n", err); - err = ksmbd_vfs_kern_path(work, newname, - LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, - &dst_path, false); - if (err) { - ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err); - goto out; +out4: + dput(new_dentry); +out3: + dput(old_parent); + unlock_rename(old_parent, new_path.dentry); +out2: + path_put(&new_path); + + if (retry_estale(err, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; } - dst_dent_parent = dst_path.dentry; - - trap_dent = lock_rename(src_dent_parent, dst_dent_parent); - dget(src_dent); - dget(dst_dent_parent); - idmap = file_mnt_idmap(fp->filp); - src_child = lookup_one(idmap, src_dent->d_name.name, src_dent_parent, - src_dent->d_name.len); - if (IS_ERR(src_child)) { - err = PTR_ERR(src_child); - goto out_lock; - } - - if (src_child != src_dent) { - err = -ESTALE; - dput(src_child); - goto out_lock; - } - dput(src_child); - - err = __ksmbd_vfs_rename(work, - idmap, - src_dent_parent, - src_dent, - mnt_idmap(dst_path.mnt), - dst_dent_parent, - trap_dent, - dst_name); -out_lock: - dput(src_dent); - dput(dst_dent_parent); - unlock_rename(src_dent_parent, dst_dent_parent); - path_put(&dst_path); -out: - dput(src_dent_parent); +out1: + putname(to); +revert_fsids: + ksmbd_revert_fsids(work); return err; } @@ -1081,14 +1013,16 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, return vfs_removexattr(idmap, dentry, attr_name); } -int ksmbd_vfs_unlink(struct mnt_idmap *idmap, - struct dentry *dir, struct dentry *dentry) +int ksmbd_vfs_unlink(struct file *filp) { int err = 0; + struct dentry *dir, *dentry = filp->f_path.dentry; + struct mnt_idmap *idmap = file_mnt_idmap(filp); - err = ksmbd_vfs_lock_parent(idmap, dir, dentry); + dir = dget_parent(dentry); + err = ksmbd_vfs_lock_parent(dir, dentry); if (err) - return err; + goto out; dget(dentry); if (S_ISDIR(d_inode(dentry)->i_mode)) @@ -1100,6 +1034,8 @@ int ksmbd_vfs_unlink(struct mnt_idmap *idmap, inode_unlock(d_inode(dir)); if (err) ksmbd_debug(VFS, "failed to delete, err %d\n", err); +out: + dput(dir); return err; } @@ -1202,7 +1138,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, } /** - * ksmbd_vfs_kern_path() - lookup a file and get path info + * ksmbd_vfs_kern_path_locked() - lookup a file and get path info * @name: file path that is relative to share * @flags: lookup flags * @path: if lookup succeed, return path info @@ -1210,24 +1146,20 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, - unsigned int flags, struct path *path, bool caseless) +int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, + unsigned int flags, struct path *path, + bool caseless) { struct ksmbd_share_config *share_conf = work->tcon->share_conf; int err; + struct path parent_path; - flags |= LOOKUP_BENEATH; - err = vfs_path_lookup(share_conf->vfs_path.dentry, - share_conf->vfs_path.mnt, - name, - flags, - path); + err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, path); if (!err) - return 0; + return err; if (caseless) { char *filepath; - struct path parent; size_t path_len, remain_len; filepath = kstrdup(name, GFP_KERNEL); @@ -1237,10 +1169,10 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, path_len = strlen(filepath); remain_len = path_len; - parent = share_conf->vfs_path; - path_get(&parent); + parent_path = share_conf->vfs_path; + path_get(&parent_path); - while (d_can_lookup(parent.dentry)) { + while (d_can_lookup(parent_path.dentry)) { char *filename = filepath + path_len - remain_len; char *next = strchrnul(filename, '/'); size_t filename_len = next - filename; @@ -1249,12 +1181,11 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, if (filename_len == 0) break; - err = ksmbd_vfs_lookup_in_dir(&parent, filename, + err = ksmbd_vfs_lookup_in_dir(&parent_path, filename, filename_len, work->conn->um); - path_put(&parent); if (err) - goto out; + goto out2; next[0] = '\0'; @@ -1262,23 +1193,31 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, share_conf->vfs_path.mnt, filepath, flags, - &parent); + path); if (err) - goto out; - else if (is_last) { - *path = parent; - goto out; - } + goto out2; + else if (is_last) + goto out1; + path_put(&parent_path); + parent_path = *path; next[0] = '/'; remain_len -= filename_len + 1; } - path_put(&parent); err = -EINVAL; -out: +out2: + path_put(&parent_path); +out1: kfree(filepath); } + + if (!err) { + err = ksmbd_vfs_lock_parent(parent_path.dentry, path->dentry); + if (err) + dput(path->dentry); + path_put(&parent_path); + } return err; } diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 9d676ab0cd25..a4ae89f3230d 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -71,9 +71,7 @@ struct ksmbd_kstat { __le32 file_attributes; }; -int ksmbd_vfs_lock_parent(struct mnt_idmap *idmap, struct dentry *parent, - struct dentry *child); -int ksmbd_vfs_may_delete(struct mnt_idmap *idmap, struct dentry *dentry); +int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child); int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess); int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode); @@ -84,12 +82,12 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, char *buf, size_t count, loff_t *pos, bool sync, ssize_t *written); int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id); -int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name); +int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path); int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, const char *newname); int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat); -int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, - char *newname); +int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, + char *newname, int flags); int ksmbd_vfs_truncate(struct ksmbd_work *work, struct ksmbd_file *fp, loff_t size); struct srv_copychunk; @@ -116,9 +114,9 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name); -int ksmbd_vfs_kern_path(struct ksmbd_work *work, - char *name, unsigned int flags, struct path *path, - bool caseless); +int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, + unsigned int flags, struct path *path, + bool caseless); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, @@ -131,8 +129,7 @@ struct file_allocated_range_buffer; int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, unsigned int in_count, unsigned int *out_count); -int ksmbd_vfs_unlink(struct mnt_idmap *idmap, struct dentry *dir, - struct dentry *dentry); +int ksmbd_vfs_unlink(struct file *filp); void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat); int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, struct mnt_idmap *idmap, diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index 054a7d2e0f48..2d0138e72d78 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -244,7 +244,6 @@ void ksmbd_release_inode_hash(void) static void __ksmbd_inode_close(struct ksmbd_file *fp) { - struct dentry *dir, *dentry; struct ksmbd_inode *ci = fp->f_ci; int err; struct file *filp; @@ -263,11 +262,9 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) if (atomic_dec_and_test(&ci->m_count)) { write_lock(&ci->m_lock); if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) { - dentry = filp->f_path.dentry; - dir = dentry->d_parent; ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING); write_unlock(&ci->m_lock); - ksmbd_vfs_unlink(file_mnt_idmap(filp), dir, dentry); + ksmbd_vfs_unlink(filp); write_lock(&ci->m_lock); } write_unlock(&ci->m_lock); diff --git a/fs/libfs.c b/fs/libfs.c index 4eda519c3002..89cf614a3271 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -174,12 +174,6 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(dcache_dir_lseek); -/* Relationship between i_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct inode *inode) -{ - return (inode->i_mode >> 12) & 15; -} - /* * Directory is locked and all positive dentries in it are safe, since * for ramfs-type trees they can't go away without unlink() or rmdir(), @@ -206,7 +200,8 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, - d_inode(next)->i_ino, dt_type(d_inode(next)))) + d_inode(next)->i_ino, + fs_umode_to_dtype(d_inode(next)->i_mode))) break; ctx->pos++; p = &next->d_child; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index fe2f6dd9a874..04ba95b83d16 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -510,24 +510,6 @@ static struct ctl_table nlm_sysctls[] = { { } }; -static struct ctl_table nlm_sysctl_dir[] = { - { - .procname = "nfs", - .mode = 0555, - .child = nlm_sysctls, - }, - { } -}; - -static struct ctl_table nlm_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = nlm_sysctl_dir, - }, - { } -}; - #endif /* CONFIG_SYSCTL */ /* @@ -644,7 +626,7 @@ static int __init init_nlm(void) #ifdef CONFIG_SYSCTL err = -ENOMEM; - nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); + nlm_sysctl_table = register_sysctl("fs/nfs", nlm_sysctls); if (nlm_sysctl_table == NULL) goto err_sysctl; #endif diff --git a/fs/mpage.c b/fs/mpage.c index 22b9de5ddd68..242e213ee064 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -43,23 +43,49 @@ * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ -static void mpage_end_io(struct bio *bio) +static void mpage_read_end_io(struct bio *bio) { - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; + int err = blk_status_to_errno(bio->bi_status); + + bio_for_each_folio_all(fi, bio) { + if (err) + folio_set_error(fi.folio); + else + folio_mark_uptodate(fi.folio); + folio_unlock(fi.folio); + } + + bio_put(bio); +} - bio_for_each_segment_all(bv, bio, iter_all) { - struct page *page = bv->bv_page; - page_endio(page, bio_op(bio), - blk_status_to_errno(bio->bi_status)); +static void mpage_write_end_io(struct bio *bio) +{ + struct folio_iter fi; + int err = blk_status_to_errno(bio->bi_status); + + bio_for_each_folio_all(fi, bio) { + if (err) { + folio_set_error(fi.folio); + mapping_set_error(fi.folio->mapping, err); + } + folio_end_writeback(fi.folio); } bio_put(bio); } -static struct bio *mpage_bio_submit(struct bio *bio) +static struct bio *mpage_bio_submit_read(struct bio *bio) +{ + bio->bi_end_io = mpage_read_end_io; + guard_bio_eod(bio); + submit_bio(bio); + return NULL; +} + +static struct bio *mpage_bio_submit_write(struct bio *bio) { - bio->bi_end_io = mpage_end_io; + bio->bi_end_io = mpage_write_end_io; guard_bio_eod(bio); submit_bio(bio); return NULL; @@ -265,7 +291,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) * This folio will go to BIO. Do we need to send this BIO off first? */ if (args->bio && (args->last_block_in_bio != blocks[0] - 1)) - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); alloc_new: if (args->bio == NULL) { @@ -278,7 +304,7 @@ alloc_new: length = first_hole << blkbits; if (!bio_add_folio(args->bio, folio, length, 0)) { - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); goto alloc_new; } @@ -286,7 +312,7 @@ alloc_new: nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); else args->last_block_in_bio = blocks[blocks_per_page - 1]; out: @@ -294,7 +320,7 @@ out: confused: if (args->bio) - args->bio = mpage_bio_submit(args->bio); + args->bio = mpage_bio_submit_read(args->bio); if (!folio_test_uptodate(folio)) block_read_full_folio(folio, args->get_block); else @@ -356,7 +382,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) args.bio = do_mpage_readpage(&args); } if (args.bio) - mpage_bio_submit(args.bio); + mpage_bio_submit_read(args.bio); } EXPORT_SYMBOL(mpage_readahead); @@ -373,7 +399,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block) args.bio = do_mpage_readpage(&args); if (args.bio) - mpage_bio_submit(args.bio); + mpage_bio_submit_read(args.bio); return 0; } EXPORT_SYMBOL(mpage_read_folio); @@ -577,7 +603,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); alloc_new: if (bio == NULL) { @@ -596,7 +622,7 @@ alloc_new: wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); goto alloc_new; } @@ -606,7 +632,7 @@ alloc_new: folio_start_writeback(folio); folio_unlock(folio); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -618,7 +644,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(bio); + bio = mpage_bio_submit_write(bio); /* * The caller has a ref on the inode, so *mapping is stable @@ -652,7 +678,7 @@ mpage_writepages(struct address_space *mapping, blk_start_plug(&plug); ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) - mpage_bio_submit(mpd.bio); + mpage_bio_submit_write(mpd.bio); blk_finish_plug(&plug); return ret; } diff --git a/fs/namei.c b/fs/namei.c index edfedfbccaef..e4fe0879ae55 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -254,6 +254,7 @@ getname_kernel(const char * filename) return result; } +EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { @@ -271,6 +272,7 @@ void putname(struct filename *name) } else __putname(name); } +EXPORT_SYMBOL(putname); /** * check_acl - perform ACL permission checking @@ -1581,8 +1583,9 @@ static struct dentry *lookup_dcache(const struct qstr *name, * when directory is guaranteed to have no in-lookup children * at all. */ -static struct dentry *__lookup_hash(const struct qstr *name, - struct dentry *base, unsigned int flags) +struct dentry *lookup_one_qstr_excl(const struct qstr *name, + struct dentry *base, + unsigned int flags) { struct dentry *dentry = lookup_dcache(name, base, flags); struct dentry *old; @@ -1606,6 +1609,7 @@ static struct dentry *__lookup_hash(const struct qstr *name, } return dentry; } +EXPORT_SYMBOL(lookup_one_qstr_excl); static struct dentry *lookup_fast(struct nameidata *nd) { @@ -2532,16 +2536,17 @@ static int path_parentat(struct nameidata *nd, unsigned flags, } /* Note: this does not consume "name" */ -static int filename_parentat(int dfd, struct filename *name, - unsigned int flags, struct path *parent, - struct qstr *last, int *type) +static int __filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type, + const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); - set_nameidata(&nd, dfd, name, NULL); + set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); @@ -2556,6 +2561,13 @@ static int filename_parentat(int dfd, struct filename *name, return retval; } +static int filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type) +{ + return __filename_parentat(dfd, name, flags, parent, last, type, NULL); +} + /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(struct filename *name, struct path *path) { @@ -2571,7 +2583,7 @@ static struct dentry *__kern_path_locked(struct filename *name, struct path *pat return ERR_PTR(-EINVAL); } inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - d = __lookup_hash(&last, path->dentry, 0); + d = lookup_one_qstr_excl(&last, path->dentry, 0); if (IS_ERR(d)) { inode_unlock(path->dentry->d_inode); path_put(path); @@ -2600,6 +2612,24 @@ int kern_path(const char *name, unsigned int flags, struct path *path) EXPORT_SYMBOL(kern_path); /** + * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair + * @filename: filename structure + * @flags: lookup flags + * @parent: pointer to struct path to fill + * @last: last component + * @type: type of the last component + * @root: pointer to struct path of the base directory + */ +int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, + struct path *parent, struct qstr *last, int *type, + const struct path *root) +{ + return __filename_parentat(AT_FDCWD, filename, flags, parent, last, + type, root); +} +EXPORT_SYMBOL(vfs_path_parent_lookup); + +/** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory @@ -2980,20 +3010,10 @@ static inline int may_create(struct mnt_idmap *idmap, return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } -/* - * p1 and p2 should be directories on the same fs. - */ -struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) +static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { struct dentry *p; - if (p1 == p2) { - inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); - return NULL; - } - - mutex_lock(&p1->d_sb->s_vfs_rename_mutex); - p = d_ancestor(p2, p1); if (p) { inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); @@ -3012,8 +3032,64 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } + +/* + * p1 and p2 should be directories on the same fs. + */ +struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) +{ + if (p1 == p2) { + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + return NULL; + } + + mutex_lock(&p1->d_sb->s_vfs_rename_mutex); + return lock_two_directories(p1, p2); +} EXPORT_SYMBOL(lock_rename); +/* + * c1 and p2 should be on the same fs. + */ +struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) +{ + if (READ_ONCE(c1->d_parent) == p2) { + /* + * hopefully won't need to touch ->s_vfs_rename_mutex at all. + */ + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + /* + * now that p2 is locked, nobody can move in or out of it, + * so the test below is safe. + */ + if (likely(c1->d_parent == p2)) + return NULL; + + /* + * c1 got moved out of p2 while we'd been taking locks; + * unlock and fall back to slow case. + */ + inode_unlock(p2->d_inode); + } + + mutex_lock(&c1->d_sb->s_vfs_rename_mutex); + /* + * nobody can move out of any directories on this fs. + */ + if (likely(c1->d_parent != p2)) + return lock_two_directories(c1->d_parent, p2); + + /* + * c1 got moved into p2 while we were taking locks; + * we need p2 locked and ->s_vfs_rename_mutex unlocked, + * for consistency with lock_rename(). + */ + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); + return NULL; +} +EXPORT_SYMBOL(lock_rename_child); + void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); @@ -3574,9 +3650,9 @@ static int do_open(struct nameidata *nd, /** * vfs_tmpfile - create tmpfile * @idmap: idmap of the mount the inode was found from - * @dentry: pointer to dentry of the base directory + * @parentpath: pointer to the path of the base directory + * @file: file descriptor of the new tmpfile * @mode: mode of the new tmpfile - * @open_flag: flags * * Create a temporary file. * @@ -3806,7 +3882,8 @@ static struct dentry *filename_create(int dfd, struct filename *name, if (last.name[last.len] && !want_dir) create_flags = 0; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path->dentry, reval_flag | create_flags); + dentry = lookup_one_qstr_excl(&last, path->dentry, + reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; @@ -4166,7 +4243,7 @@ retry: goto exit2; inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path.dentry, lookup_flags); + dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; @@ -4299,7 +4376,7 @@ retry: goto exit2; retry_deleg: inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); - dentry = __lookup_hash(&last, path.dentry, lookup_flags); + dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -4863,7 +4940,8 @@ retry: retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); - old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags); + old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, + lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; @@ -4871,7 +4949,8 @@ retry_deleg: error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; - new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags); + new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, + lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; diff --git a/fs/namespace.c b/fs/namespace.c index 6836e937ee61..54847db5b819 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2617,15 +2617,12 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); - struct tm tm; - time64_to_tm(sb->s_time_max, 0, &tm); - - pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n", + pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", sb->s_type->name, is_mounted(mnt) ? "remounted" : "mounted", - mntpath, - tm.tm_year+1900, (unsigned long long)sb->s_time_max); + mntpath, &sb->s_time_max, + (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; @@ -4197,7 +4194,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, int err = 0; struct ns_common *ns; struct user_namespace *mnt_userns; - struct file *file; + struct fd f; if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; @@ -4213,16 +4210,16 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, if (attr->userns_fd > INT_MAX) return -EINVAL; - file = fget(attr->userns_fd); - if (!file) + f = fdget(attr->userns_fd); + if (!f.file) return -EBADF; - if (!proc_ns_file(file)) { + if (!proc_ns_file(f.file)) { err = -EINVAL; goto out_fput; } - ns = get_proc_ns(file_inode(file)); + ns = get_proc_ns(file_inode(f.file)); if (ns->ops->type != CLONE_NEWUSER) { err = -EINVAL; goto out_fput; @@ -4251,7 +4248,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, kattr->mnt_userns = get_user_ns(mnt_userns); out_fput: - fput(file); + fdput(f); return err; } diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 7679a68e8193..3404707ddbe7 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -341,17 +341,16 @@ int netfs_write_begin(struct netfs_inode *ctx, { struct netfs_io_request *rreq; struct folio *folio; - unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; pgoff_t index = pos >> PAGE_SHIFT; int ret; DEFINE_READAHEAD(ractl, file, NULL, mapping, index); retry: - folio = __filemap_get_folio(mapping, index, fgp_flags, + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); if (ctx->ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. */ diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index c1c7ed2fd860..b6fc169be1b1 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -170,6 +170,7 @@ config ROOT_NFS config NFS_FSCACHE bool "Provide NFS client caching support" depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y + select NETFS_SUPPORT help Say Y here if you want NFS data to be cached locally on disc through the general filesystem cache manager diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6fbcbb8d6587..e63c1d46f189 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -55,7 +55,7 @@ static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, struct dir_context *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); -static void nfs_readdir_free_folio(struct folio *); +static void nfs_readdir_clear_array(struct folio *); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, @@ -67,7 +67,7 @@ const struct file_operations nfs_dir_operations = { }; const struct address_space_operations nfs_dir_aops = { - .free_folio = nfs_readdir_free_folio, + .free_folio = nfs_readdir_clear_array, }; #define NFS_INIT_DTSIZE PAGE_SIZE @@ -146,18 +146,18 @@ struct nfs_cache_array { u64 change_attr; u64 last_cookie; unsigned int size; - unsigned char page_full : 1, - page_is_eof : 1, + unsigned char folio_full : 1, + folio_is_eof : 1, cookies_are_ordered : 1; struct nfs_cache_array_entry array[]; }; struct nfs_readdir_descriptor { struct file *file; - struct page *page; + struct folio *folio; struct dir_context *ctx; - pgoff_t page_index; - pgoff_t page_index_max; + pgoff_t folio_index; + pgoff_t folio_index_max; u64 dir_cookie; u64 last_cookie; loff_t current_index; @@ -198,17 +198,17 @@ static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc) nfs_set_dtsize(desc, desc->dtsize << 1); } -static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie, - u64 change_attr) +static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie, + u64 change_attr) { struct nfs_cache_array *array; - array = kmap_local_page(page); + array = kmap_local_folio(folio, 0); array->change_attr = change_attr; array->last_cookie = last_cookie; array->size = 0; - array->page_full = 0; - array->page_is_eof = 0; + array->folio_full = 0; + array->folio_is_eof = 0; array->cookies_are_ordered = 1; kunmap_local(array); } @@ -216,44 +216,39 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie, /* * we are freeing strings created by nfs_add_to_readdir_array() */ -static void nfs_readdir_clear_array(struct page *page) +static void nfs_readdir_clear_array(struct folio *folio) { struct nfs_cache_array *array; unsigned int i; - array = kmap_local_page(page); + array = kmap_local_folio(folio, 0); for (i = 0; i < array->size; i++) kfree(array->array[i].name); array->size = 0; kunmap_local(array); } -static void nfs_readdir_free_folio(struct folio *folio) +static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie, + u64 change_attr) { - nfs_readdir_clear_array(&folio->page); + nfs_readdir_clear_array(folio); + nfs_readdir_folio_init_array(folio, last_cookie, change_attr); } -static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie, - u64 change_attr) +static struct folio * +nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags) { - nfs_readdir_clear_array(page); - nfs_readdir_page_init_array(page, last_cookie, change_attr); + struct folio *folio = folio_alloc(gfp_flags, 0); + if (folio) + nfs_readdir_folio_init_array(folio, last_cookie, 0); + return folio; } -static struct page * -nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags) +static void nfs_readdir_folio_array_free(struct folio *folio) { - struct page *page = alloc_page(gfp_flags); - if (page) - nfs_readdir_page_init_array(page, last_cookie, 0); - return page; -} - -static void nfs_readdir_page_array_free(struct page *page) -{ - if (page) { - nfs_readdir_clear_array(page); - put_page(page); + if (folio) { + nfs_readdir_clear_array(folio); + folio_put(folio); } } @@ -264,13 +259,13 @@ static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array) static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) { - array->page_is_eof = 1; - array->page_full = 1; + array->folio_is_eof = 1; + array->folio_full = 1; } static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) { - return array->page_full; + return array->folio_full; } /* @@ -302,18 +297,18 @@ static size_t nfs_readdir_array_maxentries(void) */ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) { - if (array->page_full) + if (array->folio_full) return -ENOSPC; if (array->size == nfs_readdir_array_maxentries()) { - array->page_full = 1; + array->folio_full = 1; return -ENOSPC; } return 0; } -static int nfs_readdir_page_array_append(struct page *page, - const struct nfs_entry *entry, - u64 *cookie) +static int nfs_readdir_folio_array_append(struct folio *folio, + const struct nfs_entry *entry, + u64 *cookie) { struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; @@ -322,7 +317,7 @@ static int nfs_readdir_page_array_append(struct page *page, name = nfs_readdir_copy_name(entry->name, entry->len); - array = kmap_atomic(page); + array = kmap_atomic(folio_page(folio, 0)); if (!name) goto out; ret = nfs_readdir_array_can_expand(array); @@ -361,17 +356,17 @@ out: * 127 readdir entries for a typical 64-bit system, that works out to a * cache of ~ 33 million entries per directory. */ -static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie) +static pgoff_t nfs_readdir_folio_cookie_hash(u64 cookie) { if (cookie == 0) return 0; return hash_64(cookie, 18); } -static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie, - u64 change_attr) +static bool nfs_readdir_folio_validate(struct folio *folio, u64 last_cookie, + u64 change_attr) { - struct nfs_cache_array *array = kmap_local_page(page); + struct nfs_cache_array *array = kmap_local_folio(folio, 0); int ret = true; if (array->change_attr != change_attr) @@ -382,81 +377,83 @@ static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie, return ret; } -static void nfs_readdir_page_unlock_and_put(struct page *page) +static void nfs_readdir_folio_unlock_and_put(struct folio *folio) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } -static void nfs_readdir_page_init_and_validate(struct page *page, u64 cookie, - u64 change_attr) +static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie, + u64 change_attr) { - if (PageUptodate(page)) { - if (nfs_readdir_page_validate(page, cookie, change_attr)) + if (folio_test_uptodate(folio)) { + if (nfs_readdir_folio_validate(folio, cookie, change_attr)) return; - nfs_readdir_clear_array(page); + nfs_readdir_clear_array(folio); } - nfs_readdir_page_init_array(page, cookie, change_attr); - SetPageUptodate(page); + nfs_readdir_folio_init_array(folio, cookie, change_attr); + folio_mark_uptodate(folio); } -static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, - u64 cookie, u64 change_attr) +static struct folio *nfs_readdir_folio_get_locked(struct address_space *mapping, + u64 cookie, u64 change_attr) { - pgoff_t index = nfs_readdir_page_cookie_hash(cookie); - struct page *page; + pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); + struct folio *folio; - page = grab_cache_page(mapping, index); - if (!page) + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) return NULL; - nfs_readdir_page_init_and_validate(page, cookie, change_attr); - return page; + nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); + return folio; } -static u64 nfs_readdir_page_last_cookie(struct page *page) +static u64 nfs_readdir_folio_last_cookie(struct folio *folio) { struct nfs_cache_array *array; u64 ret; - array = kmap_local_page(page); + array = kmap_local_folio(folio, 0); ret = array->last_cookie; kunmap_local(array); return ret; } -static bool nfs_readdir_page_needs_filling(struct page *page) +static bool nfs_readdir_folio_needs_filling(struct folio *folio) { struct nfs_cache_array *array; bool ret; - array = kmap_local_page(page); + array = kmap_local_folio(folio, 0); ret = !nfs_readdir_array_is_full(array); kunmap_local(array); return ret; } -static void nfs_readdir_page_set_eof(struct page *page) +static void nfs_readdir_folio_set_eof(struct folio *folio) { struct nfs_cache_array *array; - array = kmap_local_page(page); + array = kmap_local_folio(folio, 0); nfs_readdir_array_set_eof(array); kunmap_local(array); } -static struct page *nfs_readdir_page_get_next(struct address_space *mapping, - u64 cookie, u64 change_attr) +static struct folio *nfs_readdir_folio_get_next(struct address_space *mapping, + u64 cookie, u64 change_attr) { - pgoff_t index = nfs_readdir_page_cookie_hash(cookie); - struct page *page; + pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); + struct folio *folio; - page = grab_cache_page_nowait(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) return NULL; - nfs_readdir_page_init_and_validate(page, cookie, change_attr); - if (nfs_readdir_page_last_cookie(page) != cookie) - nfs_readdir_page_reinit_array(page, cookie, change_attr); - return page; + nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); + if (nfs_readdir_folio_last_cookie(folio) != cookie) + nfs_readdir_folio_reinit_array(folio, cookie, change_attr); + return folio; } static inline @@ -481,11 +478,11 @@ bool nfs_readdir_use_cookie(const struct file *filp) static void nfs_readdir_seek_next_array(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { - if (array->page_full) { + if (array->folio_full) { desc->last_cookie = array->last_cookie; desc->current_index += array->size; desc->cache_entry_index = 0; - desc->page_index++; + desc->folio_index++; } else desc->last_cookie = nfs_readdir_array_index_cookie(array); } @@ -494,7 +491,7 @@ static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc) { desc->current_index = 0; desc->last_cookie = 0; - desc->page_index = 0; + desc->folio_index = 0; } static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, @@ -506,7 +503,7 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->page_is_eof) + if (array->folio_is_eof) goto out_eof; nfs_readdir_seek_next_array(array, desc); return -EAGAIN; @@ -554,7 +551,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, } } check_eof: - if (array->page_is_eof) { + if (array->folio_is_eof) { status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) desc->eof = true; @@ -568,7 +565,7 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) struct nfs_cache_array *array; int status; - array = kmap_local_page(desc->page); + array = kmap_local_folio(desc->folio, 0); if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); @@ -819,16 +816,17 @@ static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc, } /* Perform conversion from xdr to cache array */ -static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, - struct nfs_entry *entry, - struct page **xdr_pages, unsigned int buflen, - struct page **arrays, size_t narrays, - u64 change_attr) +static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, + struct nfs_entry *entry, + struct page **xdr_pages, unsigned int buflen, + struct folio **arrays, size_t narrays, + u64 change_attr) { struct address_space *mapping = desc->file->f_mapping; + struct folio *new, *folio = *arrays; struct xdr_stream stream; + struct page *scratch; struct xdr_buf buf; - struct page *scratch, *new, *page = *arrays; u64 cookie; int status; @@ -844,36 +842,36 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, if (status != 0) break; - status = nfs_readdir_page_array_append(page, entry, &cookie); + status = nfs_readdir_folio_array_append(folio, entry, &cookie); if (status != -ENOSPC) continue; - if (page->mapping != mapping) { + if (folio->mapping != mapping) { if (!--narrays) break; - new = nfs_readdir_page_array_alloc(cookie, GFP_KERNEL); + new = nfs_readdir_folio_array_alloc(cookie, GFP_KERNEL); if (!new) break; arrays++; - *arrays = page = new; + *arrays = folio = new; } else { - new = nfs_readdir_page_get_next(mapping, cookie, - change_attr); + new = nfs_readdir_folio_get_next(mapping, cookie, + change_attr); if (!new) break; - if (page != *arrays) - nfs_readdir_page_unlock_and_put(page); - page = new; + if (folio != *arrays) + nfs_readdir_folio_unlock_and_put(folio); + folio = new; } - desc->page_index_max++; - status = nfs_readdir_page_array_append(page, entry, &cookie); + desc->folio_index_max++; + status = nfs_readdir_folio_array_append(folio, entry, &cookie); } while (!status && !entry->eof); switch (status) { case -EBADCOOKIE: if (!entry->eof) break; - nfs_readdir_page_set_eof(page); + nfs_readdir_folio_set_eof(folio); fallthrough; case -EAGAIN: status = 0; @@ -886,8 +884,8 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, ; } - if (page != *arrays) - nfs_readdir_page_unlock_and_put(page); + if (folio != *arrays) + nfs_readdir_folio_unlock_and_put(folio); put_page(scratch); return status; @@ -927,11 +925,11 @@ out_freepages: static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, __be32 *verf_arg, __be32 *verf_res, - struct page **arrays, size_t narrays) + struct folio **arrays, size_t narrays) { u64 change_attr; struct page **pages; - struct page *page = *arrays; + struct folio *folio = *arrays; struct nfs_entry *entry; size_t array_size; struct inode *inode = file_inode(desc->file); @@ -942,7 +940,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; - entry->cookie = nfs_readdir_page_last_cookie(page); + entry->cookie = nfs_readdir_folio_last_cookie(folio); entry->fh = nfs_alloc_fhandle(); entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); entry->server = NFS_SERVER(inode); @@ -962,10 +960,10 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, pglen = status; if (pglen != 0) - status = nfs_readdir_page_filler(desc, entry, pages, pglen, - arrays, narrays, change_attr); + status = nfs_readdir_folio_filler(desc, entry, pages, pglen, + arrays, narrays, change_attr); else - nfs_readdir_page_set_eof(page); + nfs_readdir_folio_set_eof(folio); desc->buffer_fills++; free_pages: @@ -977,33 +975,33 @@ out: return status; } -static void nfs_readdir_page_put(struct nfs_readdir_descriptor *desc) +static void nfs_readdir_folio_put(struct nfs_readdir_descriptor *desc) { - put_page(desc->page); - desc->page = NULL; + folio_put(desc->folio); + desc->folio = NULL; } static void -nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) +nfs_readdir_folio_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) { - unlock_page(desc->page); - nfs_readdir_page_put(desc); + folio_unlock(desc->folio); + nfs_readdir_folio_put(desc); } -static struct page * -nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) +static struct folio * +nfs_readdir_folio_get_cached(struct nfs_readdir_descriptor *desc) { struct address_space *mapping = desc->file->f_mapping; u64 change_attr = inode_peek_iversion_raw(mapping->host); u64 cookie = desc->last_cookie; - struct page *page; + struct folio *folio; - page = nfs_readdir_page_get_locked(mapping, cookie, change_attr); - if (!page) + folio = nfs_readdir_folio_get_locked(mapping, cookie, change_attr); + if (!folio) return NULL; - if (desc->clear_cache && !nfs_readdir_page_needs_filling(page)) - nfs_readdir_page_reinit_array(page, cookie, change_attr); - return page; + if (desc->clear_cache && !nfs_readdir_folio_needs_filling(folio)) + nfs_readdir_folio_reinit_array(folio, cookie, change_attr); + return folio; } /* @@ -1017,21 +1015,21 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; - desc->page = nfs_readdir_page_get_cached(desc); - if (!desc->page) + desc->folio = nfs_readdir_folio_get_cached(desc); + if (!desc->folio) return -ENOMEM; - if (nfs_readdir_page_needs_filling(desc->page)) { + if (nfs_readdir_folio_needs_filling(desc->folio)) { /* Grow the dtsize if we had to go back for more pages */ - if (desc->page_index == desc->page_index_max) + if (desc->folio_index == desc->folio_index_max) nfs_grow_dtsize(desc); - desc->page_index_max = desc->page_index; + desc->folio_index_max = desc->folio_index; trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf, desc->last_cookie, - desc->page->index, desc->dtsize); + desc->folio->index, desc->dtsize); res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, - &desc->page, 1); + &desc->folio, 1); if (res < 0) { - nfs_readdir_page_unlock_and_put_cached(desc); + nfs_readdir_folio_unlock_and_put_cached(desc); trace_nfs_readdir_cache_fill_done(inode, res); if (res == -EBADCOOKIE || res == -ENOTSYNC) { invalidate_inode_pages2(desc->file->f_mapping); @@ -1059,7 +1057,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) res = nfs_readdir_search_array(desc); if (res == 0) return 0; - nfs_readdir_page_unlock_and_put_cached(desc); + nfs_readdir_folio_unlock_and_put_cached(desc); return res; } @@ -1087,7 +1085,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, unsigned int i; bool first_emit = !desc->dir_cookie; - array = kmap_local_page(desc->page); + array = kmap_local_folio(desc->folio, 0); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; @@ -1114,7 +1112,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, break; } } - if (array->page_is_eof) + if (array->folio_is_eof) desc->eof = !desc->eob; kunmap_local(array); @@ -1136,7 +1134,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, */ static int uncached_readdir(struct nfs_readdir_descriptor *desc) { - struct page **arrays; + struct folio **arrays; size_t i, sz = 512; __be32 verf[NFS_DIR_VERIFIER_SIZE]; int status = -ENOMEM; @@ -1147,14 +1145,14 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL); if (!arrays) goto out; - arrays[0] = nfs_readdir_page_array_alloc(desc->dir_cookie, GFP_KERNEL); + arrays[0] = nfs_readdir_folio_array_alloc(desc->dir_cookie, GFP_KERNEL); if (!arrays[0]) goto out; - desc->page_index = 0; + desc->folio_index = 0; desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; - desc->page_index_max = 0; + desc->folio_index_max = 0; trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie, -1, desc->dtsize); @@ -1166,10 +1164,10 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) } for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { - desc->page = arrays[i]; + desc->folio = arrays[i]; nfs_do_filldir(desc, verf); } - desc->page = NULL; + desc->folio = NULL; /* * Grow the dtsize if we have to go back for more pages, @@ -1179,16 +1177,16 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) if (!desc->eob) nfs_grow_dtsize(desc); else if (desc->buffer_fills == 1 && - i < (desc->page_index_max >> 1)) + i < (desc->folio_index_max >> 1)) nfs_shrink_dtsize(desc); } out_free: for (i = 0; i < sz && arrays[i]; i++) - nfs_readdir_page_array_free(arrays[i]); + nfs_readdir_folio_array_free(arrays[i]); out: if (!nfs_readdir_use_cookie(desc->file)) nfs_readdir_rewind_search(desc); - desc->page_index_max = -1; + desc->folio_index_max = -1; kfree(arrays); dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; @@ -1240,11 +1238,11 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) goto out; desc->file = file; desc->ctx = ctx; - desc->page_index_max = -1; + desc->folio_index_max = -1; spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; - desc->page_index = dir_ctx->page_index; + desc->folio_index = dir_ctx->page_index; desc->last_cookie = dir_ctx->last_cookie; desc->attr_gencount = dir_ctx->attr_gencount; desc->eof = dir_ctx->eof; @@ -1291,8 +1289,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; nfs_do_filldir(desc, nfsi->cookieverf); - nfs_readdir_page_unlock_and_put_cached(desc); - if (desc->page_index == desc->page_index_max) + nfs_readdir_folio_unlock_and_put_cached(desc); + if (desc->folio_index == desc->folio_index_max) desc->clear_cache = force_clear; } while (!desc->eob && !desc->eof); @@ -1300,7 +1298,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->dir_cookie = desc->dir_cookie; dir_ctx->last_cookie = desc->last_cookie; dir_ctx->attr_gencount = desc->attr_gencount; - dir_ctx->page_index = desc->page_index; + dir_ctx->page_index = desc->folio_index; dir_ctx->force_clear = force_clear; dir_ctx->eof = desc->eof; dir_ctx->dtsize = desc->dtsize; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 893625eacab9..f0edf5a36237 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -306,15 +306,6 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, return false; } -static struct folio * -nfs_folio_grab_cache_write_begin(struct address_space *mapping, pgoff_t index) -{ - unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - - return __filemap_get_folio(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); -} - /* * This does the "real" work of the write. We must allocate and lock the * page to be sent back to the generic routine, which then copies the @@ -335,9 +326,10 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file, mapping->host->i_ino, len, (long long) pos); start: - folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT); - if (!folio) - return -ENOMEM; + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); *pagep = &folio->page; ret = nfs_flush_incompatible(file, folio); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index ea5f2976dfab..8c35d88a84b1 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -15,6 +15,9 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/iversion.h> +#include <linux/xarray.h> +#include <linux/fscache.h> +#include <linux/netfs.h> #include "internal.h" #include "iostat.h" @@ -163,13 +166,14 @@ void nfs_fscache_init_inode(struct inode *inode) struct nfs_server *nfss = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); - nfsi->fscache = NULL; + netfs_inode(inode)->cache = NULL; if (!(nfss->fscache && S_ISREG(inode->i_mode))) return; nfs_fscache_update_auxdata(&auxdata, inode); - nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, + netfs_inode(inode)->cache = fscache_acquire_cookie( + nfss->fscache, 0, nfsi->fh.data, /* index_key */ nfsi->fh.size, @@ -183,11 +187,8 @@ void nfs_fscache_init_inode(struct inode *inode) */ void nfs_fscache_clear_inode(struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - struct fscache_cookie *cookie = nfs_i_fscache(inode); - - fscache_relinquish_cookie(cookie, false); - nfsi->fscache = NULL; + fscache_relinquish_cookie(netfs_i_cookie(netfs_inode(inode)), false); + netfs_inode(inode)->cache = NULL; } /* @@ -212,7 +213,7 @@ void nfs_fscache_clear_inode(struct inode *inode) void nfs_fscache_open_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; - struct fscache_cookie *cookie = nfs_i_fscache(inode); + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); bool open_for_write = inode_is_open_for_write(inode); if (!fscache_cookie_valid(cookie)) @@ -230,115 +231,160 @@ EXPORT_SYMBOL_GPL(nfs_fscache_open_file); void nfs_fscache_release_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; - struct fscache_cookie *cookie = nfs_i_fscache(inode); + struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode)); loff_t i_size = i_size_read(inode); nfs_fscache_update_auxdata(&auxdata, inode); fscache_unuse_cookie(cookie, &auxdata, &i_size); } -/* - * Fallback page reading interface. - */ -static int fscache_fallback_read_page(struct inode *inode, struct page *page) +int nfs_netfs_read_folio(struct file *file, struct folio *folio) { - struct netfs_cache_resources cres; - struct fscache_cookie *cookie = nfs_i_fscache(inode); - struct iov_iter iter; - struct bio_vec bvec; - int ret; - - memset(&cres, 0, sizeof(cres)); - bvec_set_page(&bvec, page, PAGE_SIZE, 0); - iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE); - - ret = fscache_begin_read_operation(&cres, cookie); - if (ret < 0) - return ret; - - ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL, - NULL, NULL); - fscache_end_operation(&cres); - return ret; + if (!netfs_inode(folio_inode(folio))->cache) + return -ENOBUFS; + + return netfs_read_folio(file, folio); } -/* - * Fallback page writing interface. - */ -static int fscache_fallback_write_page(struct inode *inode, struct page *page, - bool no_space_allocated_yet) +int nfs_netfs_readahead(struct readahead_control *ractl) { - struct netfs_cache_resources cres; - struct fscache_cookie *cookie = nfs_i_fscache(inode); - struct iov_iter iter; - struct bio_vec bvec; - loff_t start = page_offset(page); - size_t len = PAGE_SIZE; - int ret; - - memset(&cres, 0, sizeof(cres)); - bvec_set_page(&bvec, page, PAGE_SIZE, 0); - iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); - - ret = fscache_begin_write_operation(&cres, cookie); - if (ret < 0) - return ret; - - ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode), - no_space_allocated_yet); - if (ret == 0) - ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL); - fscache_end_operation(&cres); - return ret; + struct inode *inode = ractl->mapping->host; + + if (!netfs_inode(inode)->cache) + return -ENOBUFS; + + netfs_readahead(ractl); + return 0; } -/* - * Retrieve a page from fscache - */ -int __nfs_fscache_read_page(struct inode *inode, struct page *page) +static atomic_t nfs_netfs_debug_id; +static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *file) { - int ret; + rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); + rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); - trace_nfs_fscache_read_page(inode, page); - if (PageChecked(page)) { - ClearPageChecked(page); - ret = 1; - goto out; - } + return 0; +} - ret = fscache_fallback_read_page(inode, page); - if (ret < 0) { - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); - SetPageChecked(page); - goto out; - } +static void nfs_netfs_free_request(struct netfs_io_request *rreq) +{ + put_nfs_open_context(rreq->netfs_priv); +} - /* Read completed synchronously */ - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); - SetPageUptodate(page); - ret = 0; -out: - trace_nfs_fscache_read_page_exit(inode, page, ret); - return ret; +static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq) +{ + return fscache_begin_read_operation(&rreq->cache_resources, + netfs_i_cookie(netfs_inode(rreq->inode))); } -/* - * Store a newly fetched page in fscache. We can be certain there's no page - * stored in the cache as yet otherwise we would've read it from there. - */ -void __nfs_fscache_write_page(struct inode *inode, struct page *page) +static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq) { - int ret; + struct nfs_netfs_io_data *netfs; + + netfs = kzalloc(sizeof(*netfs), GFP_KERNEL_ACCOUNT); + if (!netfs) + return NULL; + netfs->sreq = sreq; + refcount_set(&netfs->refcount, 1); + return netfs; +} - trace_nfs_fscache_write_page(inode, page); +static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq) +{ + size_t rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize; - ret = fscache_fallback_write_page(inode, page, true); + sreq->len = min(sreq->len, rsize); + return true; +} - if (ret != 0) { - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED); - } else { - nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK); +static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) +{ + struct nfs_netfs_io_data *netfs; + struct nfs_pageio_descriptor pgio; + struct inode *inode = sreq->rreq->inode; + struct nfs_open_context *ctx = sreq->rreq->netfs_priv; + struct page *page; + int err; + pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; + pgoff_t last = ((sreq->start + sreq->len - + sreq->transferred - 1) >> PAGE_SHIFT); + XA_STATE(xas, &sreq->rreq->mapping->i_pages, start); + + nfs_pageio_init_read(&pgio, inode, false, + &nfs_async_read_completion_ops); + + netfs = nfs_netfs_alloc(sreq); + if (!netfs) + return netfs_subreq_terminated(sreq, -ENOMEM, false); + + pgio.pg_netfs = netfs; /* used in completion */ + + xas_lock(&xas); + xas_for_each(&xas, page, last) { + /* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs */ + xas_pause(&xas); + xas_unlock(&xas); + err = nfs_read_add_folio(&pgio, ctx, page_folio(page)); + if (err < 0) { + netfs->error = err; + goto out; + } + xas_lock(&xas); } - trace_nfs_fscache_write_page_exit(inode, page, ret); + xas_unlock(&xas); +out: + nfs_pageio_complete_read(&pgio); + nfs_netfs_put(netfs); +} + +void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) +{ + struct nfs_netfs_io_data *netfs = hdr->netfs; + + if (!netfs) + return; + + nfs_netfs_get(netfs); +} + +int nfs_netfs_folio_unlock(struct folio *folio) +{ + struct inode *inode = folio_file_mapping(folio)->host; + + /* + * If fscache is enabled, netfs will unlock pages. + */ + if (netfs_inode(inode)->cache) + return 0; + + return 1; } + +void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) +{ + struct nfs_netfs_io_data *netfs = hdr->netfs; + struct netfs_io_subrequest *sreq; + + if (!netfs) + return; + + sreq = netfs->sreq; + if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); + + if (hdr->error) + netfs->error = hdr->error; + else + atomic64_add(hdr->res.count, &netfs->transferred); + + nfs_netfs_put(netfs); + hdr->netfs = NULL; +} + +const struct netfs_request_ops nfs_netfs_ops = { + .init_request = nfs_netfs_init_request, + .free_request = nfs_netfs_free_request, + .begin_cache_operation = nfs_netfs_begin_cache_operation, + .issue_read = nfs_netfs_issue_read, + .clamp_length = nfs_netfs_clamp_length +}; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 2a37af880978..e1706e736c64 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -34,6 +34,58 @@ struct nfs_fscache_inode_auxdata { u64 change_attr; }; +struct nfs_netfs_io_data { + /* + * NFS may split a netfs_io_subrequest into multiple RPCs, each + * with their own read completion. In netfs, we can only call + * netfs_subreq_terminated() once for each subrequest. Use the + * refcount here to double as a marker of the last RPC completion, + * and only call netfs via netfs_subreq_terminated() once. + */ + refcount_t refcount; + struct netfs_io_subrequest *sreq; + + /* + * Final disposition of the netfs_io_subrequest, sent in + * netfs_subreq_terminated() + */ + atomic64_t transferred; + int error; +}; + +static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs) +{ + refcount_inc(&netfs->refcount); +} + +static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) +{ + ssize_t final_len; + + /* Only the last RPC completion should call netfs_subreq_terminated() */ + if (!refcount_dec_and_test(&netfs->refcount)) + return; + + /* + * The NFS pageio interface may read a complete page, even when netfs + * only asked for a partial page. Specifically, this may be seen when + * one thread is truncating a file while another one is reading the last + * page of the file. + * Correct the final length here to be no larger than the netfs subrequest + * length, and thus avoid netfs's "Subreq overread" warning message. + */ + final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred)); + netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false); + kfree(netfs); +} +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) +{ + netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops); +} +extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr); +extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr); +extern int nfs_netfs_folio_unlock(struct folio *folio); + /* * fscache.c */ @@ -44,9 +96,8 @@ extern void nfs_fscache_init_inode(struct inode *); extern void nfs_fscache_clear_inode(struct inode *); extern void nfs_fscache_open_file(struct inode *, struct file *); extern void nfs_fscache_release_file(struct inode *, struct file *); - -extern int __nfs_fscache_read_page(struct inode *, struct page *); -extern void __nfs_fscache_write_page(struct inode *, struct page *); +extern int nfs_netfs_readahead(struct readahead_control *ractl); +extern int nfs_netfs_read_folio(struct file *file, struct folio *folio); static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) { @@ -54,34 +105,11 @@ static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) if (current_is_kswapd() || !(gfp & __GFP_FS)) return false; folio_wait_fscache(folio); - fscache_note_page_release(nfs_i_fscache(folio->mapping->host)); - nfs_inc_fscache_stats(folio->mapping->host, - NFSIOS_FSCACHE_PAGES_UNCACHED); } + fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host))); return true; } -/* - * Retrieve a page from an inode data storage object. - */ -static inline int nfs_fscache_read_page(struct inode *inode, struct page *page) -{ - if (nfs_i_fscache(inode)) - return __nfs_fscache_read_page(inode, page); - return -ENOBUFS; -} - -/* - * Store a page newly fetched from the server in an inode data storage object - * in the cache. - */ -static inline void nfs_fscache_write_page(struct inode *inode, - struct page *page) -{ - if (nfs_i_fscache(inode)) - __nfs_fscache_write_page(inode, page); -} - static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, struct inode *inode) { @@ -101,13 +129,10 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata * static inline void nfs_fscache_invalidate(struct inode *inode, int flags) { struct nfs_fscache_inode_auxdata auxdata; - struct nfs_inode *nfsi = NFS_I(inode); + struct fscache_cookie *cookie = netfs_i_cookie(&NFS_I(inode)->netfs); - if (nfsi->fscache) { - nfs_fscache_update_auxdata(&auxdata, inode); - fscache_invalidate(nfsi->fscache, &auxdata, - i_size_read(inode), flags); - } + nfs_fscache_update_auxdata(&auxdata, inode); + fscache_invalidate(cookie, &auxdata, i_size_read(inode), flags); } /* @@ -120,7 +145,28 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server) return "no "; } +static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr, + struct nfs_pageio_descriptor *desc) +{ + hdr->netfs = desc->pg_netfs; +} +static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) +{ + desc->pg_netfs = hdr->netfs; +} +static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc) +{ + desc->pg_netfs = NULL; +} #else /* CONFIG_NFS_FSCACHE */ +static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) {} +static inline void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr) {} +static inline void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) {} +static inline int nfs_netfs_folio_unlock(struct folio *folio) +{ + return 1; +} static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {} static inline void nfs_fscache_init_inode(struct inode *inode) {} @@ -128,22 +174,29 @@ static inline void nfs_fscache_clear_inode(struct inode *inode) {} static inline void nfs_fscache_open_file(struct inode *inode, struct file *filp) {} static inline void nfs_fscache_release_file(struct inode *inode, struct file *file) {} - -static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) +static inline int nfs_netfs_readahead(struct readahead_control *ractl) { - return true; /* may release folio */ + return -ENOBUFS; } -static inline int nfs_fscache_read_page(struct inode *inode, struct page *page) +static inline int nfs_netfs_read_folio(struct file *file, struct folio *folio) { return -ENOBUFS; } -static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {} + +static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp) +{ + return true; /* may release folio */ +} static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {} static inline const char *nfs_server_fscache_state(struct nfs_server *server) { return "no "; } - +static inline void nfs_netfs_set_pgio_header(struct nfs_pgio_header *hdr, + struct nfs_pageio_descriptor *desc) {} +static inline void nfs_netfs_set_pageio_descriptor(struct nfs_pageio_descriptor *desc, + struct nfs_pgio_header *hdr) {} +static inline void nfs_netfs_reset_pageio_descriptor(struct nfs_pageio_descriptor *desc) {} #endif /* CONFIG_NFS_FSCACHE */ #endif /* _NFS_FSCACHE_H */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 222a28320e1c..a910b9a638c5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -208,11 +208,12 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) nfsi->cache_validity |= flags; - if (inode->i_mapping->nrpages == 0) - nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA | - NFS_INO_DATA_INVAL_DEFER); - else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER; + if (inode->i_mapping->nrpages == 0) { + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + nfs_ooo_clear(nfsi); + } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + nfs_ooo_clear(nfsi); + } trace_nfs_set_cache_invalid(inode, 0); } EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); @@ -677,9 +678,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) trace_nfs_size_truncate(inode, offset); i_size_write(inode, offset); /* Optimisation */ - if (offset == 0) - NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA | - NFS_INO_DATA_INVAL_DEFER); + if (offset == 0) { + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; + nfs_ooo_clear(NFS_I(inode)); + } NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); @@ -717,9 +719,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, if ((attr->ia_valid & ATTR_KILL_SUID) != 0 && inode->i_mode & S_ISUID) inode->i_mode &= ~S_ISUID; - if ((attr->ia_valid & ATTR_KILL_SGID) != 0 && - (inode->i_mode & (S_ISGID | S_IXGRP)) == - (S_ISGID | S_IXGRP)) + if (setattr_should_drop_sgid(&nop_mnt_idmap, inode)) inode->i_mode &= ~S_ISGID; if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; @@ -1109,7 +1109,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) spin_lock(&inode->i_lock); if (list_empty(&nfsi->open_files) && - (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) + nfs_ooo_test(nfsi)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED); list_add_tail_rcu(&ctx->list, &nfsi->open_files); @@ -1353,8 +1353,8 @@ int nfs_clear_invalid_mapping(struct address_space *mapping) set_bit(NFS_INO_INVALIDATING, bitlock); smp_wmb(); - nfsi->cache_validity &= - ~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + nfs_ooo_clear(nfsi); spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); @@ -1816,6 +1816,66 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr, return 0; } +static void nfs_ooo_merge(struct nfs_inode *nfsi, + u64 start, u64 end) +{ + int i, cnt; + + if (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER) + /* No point merging anything */ + return; + + if (!nfsi->ooo) { + nfsi->ooo = kmalloc(sizeof(*nfsi->ooo), GFP_ATOMIC); + if (!nfsi->ooo) { + nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; + return; + } + nfsi->ooo->cnt = 0; + } + + /* add this range, merging if possible */ + cnt = nfsi->ooo->cnt; + for (i = 0; i < cnt; i++) { + if (end == nfsi->ooo->gap[i].start) + end = nfsi->ooo->gap[i].end; + else if (start == nfsi->ooo->gap[i].end) + start = nfsi->ooo->gap[i].start; + else + continue; + /* Remove 'i' from table and loop to insert the new range */ + cnt -= 1; + nfsi->ooo->gap[i] = nfsi->ooo->gap[cnt]; + i = -1; + } + if (start != end) { + if (cnt >= ARRAY_SIZE(nfsi->ooo->gap)) { + nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; + kfree(nfsi->ooo); + nfsi->ooo = NULL; + return; + } + nfsi->ooo->gap[cnt].start = start; + nfsi->ooo->gap[cnt].end = end; + cnt += 1; + } + nfsi->ooo->cnt = cnt; +} + +static void nfs_ooo_record(struct nfs_inode *nfsi, + struct nfs_fattr *fattr) +{ + /* This reply was out-of-order, so record in the + * pre/post change id, possibly cancelling + * gaps created when iversion was jumpped forward. + */ + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) && + (fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) + nfs_ooo_merge(nfsi, + fattr->change_attr, + fattr->pre_change_attr); +} + static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { @@ -1826,8 +1886,12 @@ static int nfs_refresh_inode_locked(struct inode *inode, if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode)) ret = nfs_update_inode(inode, fattr); - else if (attr_cmp == 0) - ret = nfs_check_inode_attributes(inode, fattr); + else { + nfs_ooo_record(NFS_I(inode), fattr); + + if (attr_cmp == 0) + ret = nfs_check_inode_attributes(inode, fattr); + } trace_nfs_refresh_inode_exit(inode, ret); return ret; @@ -1918,6 +1982,8 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa if (attr_cmp < 0) return 0; if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) { + /* Record the pre/post change info before clearing PRECHANGE */ + nfs_ooo_record(NFS_I(inode), fattr); fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE | NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PREMTIME @@ -2072,6 +2138,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { + if (!have_writers && nfsi->ooo && nfsi->ooo->cnt == 1 && + nfsi->ooo->gap[0].end == inode_peek_iversion_raw(inode)) { + /* There is one remaining gap that hasn't been + * merged into iversion - do that now. + */ + inode_set_iversion_raw(inode, nfsi->ooo->gap[0].start); + kfree(nfsi->ooo); + nfsi->ooo = NULL; + } if (!inode_eq_iversion_raw(inode, fattr->change_attr)) { /* Could it be a race with writeback? */ if (!(have_writers || have_delegation)) { @@ -2093,8 +2168,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - } else if (!have_delegation) - nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; + } else if (!have_delegation) { + nfs_ooo_record(nfsi, fattr); + nfs_ooo_merge(nfsi, inode_peek_iversion_raw(inode), + fattr->change_attr); + } inode_set_iversion_raw(inode, fattr->change_attr); } } else { @@ -2248,18 +2326,22 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; + nfsi->ooo = NULL; #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_V4_2 nfsi->xattr_cache = NULL; #endif + nfs_netfs_inode_init(nfsi); + return &nfsi->vfs_inode; } EXPORT_SYMBOL_GPL(nfs_alloc_inode); void nfs_free_inode(struct inode *inode) { + kfree(NFS_I(inode)->ooo); kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } EXPORT_SYMBOL_GPL(nfs_free_inode); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 10fb5e7573eb..3cc027d3bd58 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -452,6 +452,10 @@ extern void nfs_sb_deactive(struct super_block *sb); extern int nfs_client_for_each_server(struct nfs_client *clp, int (*fn)(struct nfs_server *, void *), void *data); +#ifdef CONFIG_NFS_FSCACHE +extern const struct netfs_request_ops nfs_netfs_ops; +#endif + /* io.c */ extern void nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); @@ -481,9 +485,14 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool struct nfs_pgio_completion_ops; /* read.c */ +extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); +extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, + struct nfs_open_context *ctx, + struct folio *folio); +extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index 2ddaab1ac653..5aa776b5a3e7 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -17,9 +17,6 @@ struct nfs_iostats { unsigned long long bytes[__NFSIOS_BYTESMAX]; -#ifdef CONFIG_NFS_FSCACHE - unsigned long long fscache[__NFSIOS_FSCACHEMAX]; -#endif unsigned long events[__NFSIOS_COUNTSMAX]; } ____cacheline_aligned; @@ -49,20 +46,6 @@ static inline void nfs_add_stats(const struct inode *inode, nfs_add_server_stats(NFS_SERVER(inode), stat, addend); } -#ifdef CONFIG_NFS_FSCACHE -static inline void nfs_add_fscache_stats(struct inode *inode, - enum nfs_stat_fscachecounters stat, - long addend) -{ - this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); -} -static inline void nfs_inc_fscache_stats(struct inode *inode, - enum nfs_stat_fscachecounters stat) -{ - this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]); -} -#endif - static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) { return alloc_percpu(struct nfs_iostats); diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index 4fa37dc038b5..b333ea119ef5 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -17,7 +17,6 @@ extern int nfs3_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t); -extern const struct xattr_handler *nfs3_xattr_handlers[]; #else static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl) diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 1247f544a440..18d8f6529f61 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -21,9 +21,8 @@ static void nfs3_prepare_get_acl(struct posix_acl **p) { struct posix_acl *sentinel = uncached_acl_sentinel(current); - if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) { - /* Not the first reader or sentinel already in place. */ - } + /* If the ACL isn't being read yet, set our sentinel. */ + cmpxchg(p, ACL_NOT_CACHED, sentinel); } static void nfs3_complete_get_acl(struct posix_acl **p, struct posix_acl *acl) @@ -300,12 +299,6 @@ fail: goto out; } -const struct xattr_handler *nfs3_xattr_handlers[] = { - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, - NULL, -}; - static int nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, size_t size, ssize_t *result) diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index 7c5809431e61..8a9be9e47f76 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -14,9 +14,6 @@ struct nfs_subversion nfs_v3 = { .rpc_vers = &nfs_version3, .rpc_ops = &nfs_v3_clientops, .sops = &nfs_sops, -#ifdef CONFIG_NFS_V3_ACL - .xattr = nfs3_xattr_handlers, -#endif }; static int __init init_nfs_v3(void) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index d80ee88ca996..a6df815a140c 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1122,7 +1122,6 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) uint32_t segments; struct read_plus_segment *segs; int status, i; - char scratch_buf[16]; __be32 *p; status = decode_op_hdr(xdr, OP_READ_PLUS); @@ -1143,7 +1142,6 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) if (!segs) return -ENOMEM; - xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf)); status = -EIO; for (i = 0; i < segments; i++) { status = decode_read_plus_segment(xdr, &segs[i]); @@ -1348,6 +1346,8 @@ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; + xdr_set_scratch_buffer(xdr, res->scratch, sizeof(res->scratch)); + status = decode_compound_hdr(xdr, &hdr); if (status) goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5607b1e2b821..18f25ff4bff7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5439,6 +5439,8 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task, static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + if (hdr->res.scratch) + kfree(hdr->res.scratch); if (!nfs4_sequence_done(task, &hdr->res.seq_res)) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) @@ -5452,17 +5454,22 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) } #if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS -static void nfs42_read_plus_support(struct nfs_pgio_header *hdr, +static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr, struct rpc_message *msg) { /* Note: We don't use READ_PLUS with pNFS yet */ - if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) + if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) { msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; + hdr->res.scratch = kmalloc(32, GFP_KERNEL); + return hdr->res.scratch != NULL; + } + return false; } #else -static void nfs42_read_plus_support(struct nfs_pgio_header *hdr, +static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr, struct rpc_message *msg) { + return false; } #endif /* CONFIG_NFS_V4_2 */ @@ -5472,8 +5479,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, hdr->timestamp = jiffies; if (!hdr->pgio_done_cb) hdr->pgio_done_cb = nfs4_read_done_cb; - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; - nfs42_read_plus_support(hdr, msg); + if (!nfs42_read_plus_support(hdr, msg)) + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2a0ca5c7f082..bbe49315d99e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -67,6 +67,8 @@ #define OPENOWNER_POOL_SIZE 8 +static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp); + const nfs4_stateid zero_stateid = { { .data = { 0 } }, .type = NFS4_SPECIAL_STATEID_TYPE, @@ -330,6 +332,8 @@ do_confirm: status = nfs4_proc_create_session(clp, cred); if (status != 0) goto out; + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R)) + nfs4_state_start_reclaim_reboot(clp); nfs41_finish_session_reset(clp); nfs_mark_client_ready(clp, NFS_CS_READY); out: @@ -1205,10 +1209,6 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; - struct rpc_clnt *cl = clp->cl_rpcclient; - - while (cl != cl->cl_parent) - cl = cl->cl_parent; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index c394e4447100..e776200e9a11 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -37,27 +37,10 @@ static struct ctl_table nfs4_cb_sysctls[] = { { } }; -static struct ctl_table nfs4_cb_sysctl_dir[] = { - { - .procname = "nfs", - .mode = 0555, - .child = nfs4_cb_sysctls, - }, - { } -}; - -static struct ctl_table nfs4_cb_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = nfs4_cb_sysctl_dir, - }, - { } -}; - int nfs4_register_sysctl(void) { - nfs4_callback_sysctl_table = register_sysctl_table(nfs4_cb_sysctl_root); + nfs4_callback_sysctl_table = register_sysctl("fs/nfs", + nfs4_cb_sysctls); if (nfs4_callback_sysctl_table == NULL) return -ENOMEM; return 0; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index a778713343df..4e90ca531176 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -39,7 +39,6 @@ { BIT(NFS_INO_STALE), "STALE" }, \ { BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \ { BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \ - { BIT(NFS_INO_FSCACHE), "FSCACHE" }, \ { BIT(NFS_INO_LAYOUTCOMMIT), "NEED_LAYOUTCOMMIT" }, \ { BIT(NFS_INO_LAYOUTCOMMITTING), "LAYOUTCOMMIT" }, \ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ @@ -1243,96 +1242,6 @@ TRACE_EVENT(nfs_readpage_short, ) ); -DECLARE_EVENT_CLASS(nfs_fscache_page_event, - TP_PROTO( - const struct inode *inode, - struct page *page - ), - - TP_ARGS(inode, page), - - TP_STRUCT__entry( - __field(dev_t, dev) - __field(u32, fhandle) - __field(u64, fileid) - __field(loff_t, offset) - ), - - TP_fast_assign( - const struct nfs_inode *nfsi = NFS_I(inode); - const struct nfs_fh *fh = &nfsi->fh; - - __entry->offset = page_index(page) << PAGE_SHIFT; - __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; - __entry->fhandle = nfs_fhandle_hash(fh); - ), - - TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long long)__entry->fileid, - __entry->fhandle, - (long long)__entry->offset - ) -); -DECLARE_EVENT_CLASS(nfs_fscache_page_event_done, - TP_PROTO( - const struct inode *inode, - struct page *page, - int error - ), - - TP_ARGS(inode, page, error), - - TP_STRUCT__entry( - __field(int, error) - __field(dev_t, dev) - __field(u32, fhandle) - __field(u64, fileid) - __field(loff_t, offset) - ), - - TP_fast_assign( - const struct nfs_inode *nfsi = NFS_I(inode); - const struct nfs_fh *fh = &nfsi->fh; - - __entry->offset = page_index(page) << PAGE_SHIFT; - __entry->dev = inode->i_sb->s_dev; - __entry->fileid = nfsi->fileid; - __entry->fhandle = nfs_fhandle_hash(fh); - __entry->error = error; - ), - - TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld error=%d", - MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long long)__entry->fileid, - __entry->fhandle, - (long long)__entry->offset, __entry->error - ) -); -#define DEFINE_NFS_FSCACHE_PAGE_EVENT(name) \ - DEFINE_EVENT(nfs_fscache_page_event, name, \ - TP_PROTO( \ - const struct inode *inode, \ - struct page *page \ - ), \ - TP_ARGS(inode, page)) -#define DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(name) \ - DEFINE_EVENT(nfs_fscache_page_event_done, name, \ - TP_PROTO( \ - const struct inode *inode, \ - struct page *page, \ - int error \ - ), \ - TP_ARGS(inode, page, error)) -DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_read_page); -DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_read_page_exit); -DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_write_page); -DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_write_page_exit); TRACE_EVENT(nfs_pgio_error, TP_PROTO( diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 64fa8de199de..6efb5068c116 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -26,6 +26,7 @@ #include "internal.h" #include "pnfs.h" #include "nfstrace.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -105,6 +106,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, hdr->good_bytes = mirror->pg_count; hdr->io_completion = desc->pg_io_completion; hdr->dreq = desc->pg_dreq; + nfs_netfs_set_pgio_header(hdr, desc); hdr->release = release; hdr->completion_ops = desc->pg_completion_ops; if (hdr->completion_ops->init_hdr) @@ -941,6 +943,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_lseg = NULL; desc->pg_io_completion = NULL; desc->pg_dreq = NULL; + nfs_netfs_reset_pageio_descriptor(desc); desc->pg_bsize = bsize; desc->pg_mirror_count = 1; @@ -1477,6 +1480,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, desc->pg_io_completion = hdr->io_completion; desc->pg_dreq = hdr->dreq; + nfs_netfs_set_pageio_descriptor(desc, hdr); list_splice_init(&hdr->pages, &pages); while (!list_empty(&pages)) { struct nfs_page *req = nfs_list_entry(pages.next); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e90988591df4..f71eeee67e20 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -31,7 +31,7 @@ #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; +const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; static const struct nfs_rw_ops nfs_rw_read_ops; static struct kmem_cache *nfs_rdata_cachep; @@ -74,7 +74,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); -static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio) +void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *pgm; unsigned long npages; @@ -110,28 +110,17 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); static void nfs_readpage_release(struct nfs_page *req, int error) { - struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); struct folio *folio = nfs_page_to_folio(req); - dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), req->wb_bytes, - (long long)req_offset(req)); - if (nfs_error_is_fatal_on_server(error) && error != -ETIMEDOUT) folio_set_error(folio); - if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (folio_test_uptodate(folio)) - nfs_fscache_write_page(inode, &folio->page); - folio_unlock(folio); - } + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); + nfs_release_request(req); } -struct nfs_readdesc { - struct nfs_pageio_descriptor pgio; - struct nfs_open_context *ctx; -}; - static void nfs_page_group_set_uptodate(struct nfs_page *req) { if (nfs_page_group_sync_on_bit(req, PG_UPTODATE)) @@ -153,7 +142,8 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { /* note: regions of the page not covered by a - * request are zeroed in readpage_async_filler */ + * request are zeroed in nfs_read_add_folio + */ if (bytes > hdr->good_bytes) { /* nothing in this request was good, so zero * the full extent of the request */ @@ -181,6 +171,8 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) nfs_list_remove_request(req); nfs_readpage_release(req, error); } + nfs_netfs_read_completion(hdr); + out: hdr->release(hdr); } @@ -191,6 +183,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, struct rpc_task_setup *task_setup_data, int how) { rpc_ops->read_setup(hdr, msg); + nfs_netfs_initiate_read(hdr); trace_nfs_initiate_read(hdr); } @@ -206,7 +199,7 @@ nfs_async_read_error(struct list_head *head, int error) } } -static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { +const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = { .error_cleanup = nfs_async_read_error, .completion = nfs_read_completion, }; @@ -281,7 +274,9 @@ static void nfs_readpage_result(struct rpc_task *task, nfs_readpage_retry(task, hdr); } -static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio) +int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, + struct nfs_open_context *ctx, + struct folio *folio) { struct inode *inode = folio_file_mapping(folio)->host; struct nfs_server *server = NFS_SERVER(inode); @@ -297,29 +292,21 @@ static int readpage_async_filler(struct nfs_readdesc *desc, struct folio *folio) aligned_len = min_t(unsigned int, ALIGN(len, rsize), fsize); - if (!IS_SYNC(inode)) { - error = nfs_fscache_read_page(inode, &folio->page); - if (error == 0) - goto out_unlock; + new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len); + if (IS_ERR(new)) { + error = PTR_ERR(new); + goto out; } - new = nfs_page_create_from_folio(desc->ctx, folio, 0, aligned_len); - if (IS_ERR(new)) - goto out_error; - if (len < fsize) folio_zero_segment(folio, len, fsize); - if (!nfs_pageio_add_request(&desc->pgio, new)) { + if (!nfs_pageio_add_request(pgio, new)) { nfs_list_remove_request(new); - error = desc->pgio.pg_error; + error = pgio->pg_error; nfs_readpage_release(new, error); goto out; } return 0; -out_error: - error = PTR_ERR(new); -out_unlock: - folio_unlock(folio); out: return error; } @@ -332,8 +319,9 @@ out: */ int nfs_read_folio(struct file *file, struct folio *folio) { - struct nfs_readdesc desc; struct inode *inode = file_inode(file); + struct nfs_pageio_descriptor pgio; + struct nfs_open_context *ctx; int ret; trace_nfs_aop_readpage(inode, folio); @@ -357,38 +345,43 @@ int nfs_read_folio(struct file *file, struct folio *folio) if (NFS_STALE(inode)) goto out_unlock; - desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); + ret = nfs_netfs_read_folio(file, folio); + if (!ret) + goto out; + + ctx = get_nfs_open_context(nfs_file_open_context(file)); - xchg(&desc.ctx->error, 0); - nfs_pageio_init_read(&desc.pgio, inode, false, + xchg(&ctx->error, 0); + nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); - ret = readpage_async_filler(&desc, folio); + ret = nfs_read_add_folio(&pgio, ctx, folio); if (ret) - goto out; + goto out_put; - nfs_pageio_complete_read(&desc.pgio); - ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0; + nfs_pageio_complete_read(&pgio); + ret = pgio.pg_error < 0 ? pgio.pg_error : 0; if (!ret) { ret = folio_wait_locked_killable(folio); if (!folio_test_uptodate(folio) && !ret) - ret = xchg(&desc.ctx->error, 0); + ret = xchg(&ctx->error, 0); } +out_put: + put_nfs_open_context(ctx); out: - put_nfs_open_context(desc.ctx); trace_nfs_aop_readpage_done(inode, folio, ret); return ret; out_unlock: folio_unlock(folio); - trace_nfs_aop_readpage_done(inode, folio, ret); - return ret; + goto out; } void nfs_readahead(struct readahead_control *ractl) { + struct nfs_pageio_descriptor pgio; + struct nfs_open_context *ctx; unsigned int nr_pages = readahead_count(ractl); struct file *file = ractl->file; - struct nfs_readdesc desc; struct inode *inode = ractl->mapping->host; struct folio *folio; int ret; @@ -401,26 +394,30 @@ void nfs_readahead(struct readahead_control *ractl) if (NFS_STALE(inode)) goto out; + ret = nfs_netfs_readahead(ractl); + if (!ret) + goto out; + if (file == NULL) { ret = -EBADF; - desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); - if (desc.ctx == NULL) + ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (ctx == NULL) goto out; } else - desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); + ctx = get_nfs_open_context(nfs_file_open_context(file)); - nfs_pageio_init_read(&desc.pgio, inode, false, + nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); while ((folio = readahead_folio(ractl)) != NULL) { - ret = readpage_async_filler(&desc, folio); + ret = nfs_read_add_folio(&pgio, ctx, folio); if (ret) break; } - nfs_pageio_complete_read(&desc.pgio); + nfs_pageio_complete_read(&pgio); - put_nfs_open_context(desc.ctx); + put_nfs_open_context(ctx); out: trace_nfs_aop_readahead_done(inode, nr_pages, ret); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 05ae23657527..30e53e93049e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -692,10 +692,6 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) totals.events[i] += stats->events[i]; for (i = 0; i < __NFSIOS_BYTESMAX; i++) totals.bytes[i] += stats->bytes[i]; -#ifdef CONFIG_NFS_FSCACHE - for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) - totals.fscache[i] += stats->fscache[i]; -#endif preempt_enable(); } @@ -706,13 +702,6 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) seq_puts(m, "\n\tbytes:\t"); for (i = 0; i < __NFSIOS_BYTESMAX; i++) seq_printf(m, "%Lu ", totals.bytes[i]); -#ifdef CONFIG_NFS_FSCACHE - if (nfss->options & NFS_OPTION_FSCACHE) { - seq_puts(m, "\n\tfsc:\t"); - for (i = 0; i < __NFSIOS_FSCACHEMAX; i++) - seq_printf(m, "%Lu ", totals.fscache[i]); - } -#endif seq_putc(m, '\n'); rpc_clnt_show_stats(m, nfss->client); @@ -1274,9 +1263,6 @@ int nfs_get_tree_common(struct fs_context *fc) if (ctx->clone_data.sb->s_flags & SB_SYNCHRONOUS) fc->sb_flags |= SB_SYNCHRONOUS; - if (server->caps & NFS_CAP_SECURITY_LABEL) - fc->lsm_flags |= SECURITY_LSM_NATIVE_LABELS; - /* Get a superblock - note that we may end up sharing one that already exists */ fc->s_fs_info = server; s = sget_fc(fc, compare_super, nfs_set_super); diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 7aea195ddb35..f39e2089bc4c 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -32,27 +32,9 @@ static struct ctl_table nfs_cb_sysctls[] = { { } }; -static struct ctl_table nfs_cb_sysctl_dir[] = { - { - .procname = "nfs", - .mode = 0555, - .child = nfs_cb_sysctls, - }, - { } -}; - -static struct ctl_table nfs_cb_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = nfs_cb_sysctl_dir, - }, - { } -}; - int nfs_register_sysctl(void) { - nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root); + nfs_callback_sysctl_table = register_sysctl("fs/nfs", nfs_cb_sysctls); if (nfs_callback_sysctl_table == NULL) return -ENOMEM; return 0; diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c index 7c1509e968c8..832246b22c51 100644 --- a/fs/nfs_common/nfs_ssc.c +++ b/fs/nfs_common/nfs_ssc.c @@ -12,7 +12,6 @@ #include <linux/nfs_ssc.h> #include "../nfs/nfs4_fs.h" -MODULE_LICENSE("GPL"); struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl; EXPORT_SYMBOL_GPL(nfs_ssc_client_tbl); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e2e485167ac4..76db2fe29624 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3446,8 +3446,7 @@ out_acl: p = xdr_reserve_space(xdr, 4); if (!p) goto out_resource; - err = xattr_supported_namespace(d_inode(dentry), - XATTR_USER_PREFIX); + err = xattr_supports_user_prefix(d_inode(dentry)); *p++ = cpu_to_be32(err == 0); } diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 798a2c1b38c6..7a8f166f2c8d 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -67,20 +67,28 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, down_read(&bmap->b_sem); ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); - if (ret < 0) { - ret = nilfs_bmap_convert_error(bmap, __func__, ret); + if (ret < 0) goto out; - } + if (NILFS_BMAP_USE_VBN(bmap)) { ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, &blocknr); if (!ret) *ptrp = blocknr; + else if (ret == -ENOENT) { + /* + * If there was no valid entry in DAT for the block + * address obtained by b_ops->bop_lookup, then pass + * internal code -EINVAL to nilfs_bmap_convert_error + * to treat it as metadata corruption. + */ + ret = -EINVAL; + } } out: up_read(&bmap->b_sem); - return ret; + return nilfs_bmap_convert_error(bmap, __func__, ret); } int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 41ccd43cd979..5cf30827f244 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -259,10 +259,10 @@ repeat: NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); dfolio = filemap_grab_folio(dmap, folio->index); - if (unlikely(!dfolio)) { + if (unlikely(IS_ERR(dfolio))) { /* No empty page is added to the page cache */ - err = -ENOMEM; folio_unlock(folio); + err = PTR_ERR(dfolio); break; } if (unlikely(!folio_buffers(folio))) @@ -311,7 +311,7 @@ repeat: folio_lock(folio); dfolio = filemap_lock_folio(dmap, index); - if (dfolio) { + if (!IS_ERR(dfolio)) { /* overwrite existing folio in the destination cache */ WARN_ON(folio_test_dirty(dfolio)); nilfs_copy_page(&dfolio->page, &folio->page, 0); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 228659612c0d..ac949fd7603f 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2041,6 +2041,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int err; + if (sb_rdonly(sci->sc_super)) + return -EROFS; + nilfs_sc_cstage_set(sci, NILFS_ST_INIT); sci->sc_cno = nilfs->ns_cno; @@ -2724,7 +2727,7 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) flush_work(&sci->sc_iput_work); - } while (ret && retrycount-- > 0); + } while (ret && ret != -EROFS && retrycount-- > 0); } /** diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 8f430bfad487..22fb1cf7e1fc 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -663,7 +663,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_info *info = fanotify_event_info(event); unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; - struct file *f = NULL; + struct file *f = NULL, *pidfd_file = NULL; int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -718,7 +718,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, !pid_has_task(event->pid, PIDTYPE_TGID)) { pidfd = FAN_NOPIDFD; } else { - pidfd = pidfd_create(event->pid, 0); + pidfd = pidfd_prepare(event->pid, 0, &pidfd_file); if (pidfd < 0) pidfd = FAN_EPIDFD; } @@ -751,6 +751,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (f) fd_install(fd, f); + if (pidfd_file) + fd_install(pidfd, pidfd_file); + return metadata.event_len; out_close_fd: @@ -759,8 +762,10 @@ out_close_fd: fput(f); } - if (pidfd >= 0) - close_fd(pidfd); + if (pidfd >= 0) { + put_unused_fd(pidfd); + fput(pidfd_file); + } return ret; } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 49cfe2ae6d23..993375f0db67 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -65,7 +65,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, struct fsnotify_event *fsn_event; struct fsnotify_group *group = inode_mark->group; int ret; - int len = 0; + int len = 0, wd; int alloc_len = sizeof(struct inotify_event_info); struct mem_cgroup *old_memcg; @@ -81,6 +81,13 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, fsn_mark); /* + * We can be racing with mark being detached. Don't report event with + * invalid wd. + */ + wd = READ_ONCE(i_mark->wd); + if (wd == -1) + return 0; + /* * Whoever is interested in the event, pays for the allocation. Do not * trigger OOM killer in the target monitoring memcg as it may have * security repercussion. @@ -110,7 +117,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, fsn_event = &event->fse; fsnotify_init_event(fsn_event); event->mask = mask; - event->wd = i_mark->wd; + event->wd = wd; event->sync_cookie = cookie; event->name_len = len; if (len) diff --git a/fs/nsfs.c b/fs/nsfs.c index f8df60b3b901..f602a96a1afe 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -235,24 +235,6 @@ bool proc_ns_file(const struct file *file) return file->f_op == &ns_file_operations; } -struct file *proc_ns_fget(int fd) -{ - struct file *file; - - file = fget(fd); - if (!file) - return ERR_PTR(-EBADF); - - if (file->f_op != &ns_file_operations) - goto out_invalid; - - return file; - -out_invalid: - fput(file); - return ERR_PTR(-EINVAL); -} - /** * ns_match() - Returns true if current namespace matches dev/ino provided. * @ns: current namespace diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index a030d00af90c..174fe536a1c0 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -31,16 +31,6 @@ static struct ctl_table ntfs_sysctls[] = { {} }; -/* Define the parent directory /proc/sys/fs. */ -static struct ctl_table sysctls_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = ntfs_sysctls - }, - {} -}; - /* Storage for the sysctls header. */ static struct ctl_table_header *sysctls_root_table; @@ -54,7 +44,7 @@ int ntfs_sysctl(int add) { if (add) { BUG_ON(sysctls_root_table); - sysctls_root_table = register_sysctl_table(sysctls_root); + sysctls_root_table = register_sysctl("fs", ntfs_sysctls); if (!sysctls_root_table) return -ENOMEM; } else { diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 5e6bafb10f42..0b8bc66377db 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -405,8 +405,8 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, int err = 0; struct ntfs_sb_info *sbi = ni->mi.sbi; u8 cluster_bits = sbi->cluster_bits; - bool is_mft = - ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && !name_len; + bool is_mft = ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && + !name_len; u64 old_valid, old_size, old_alloc, new_alloc, new_alloc_tmp; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; @@ -531,11 +531,10 @@ add_alloc_in_same_attr_seg: pre_alloc = 0; if (type == ATTR_DATA && !name_len && sbi->options->prealloc) { - pre_alloc = - bytes_to_cluster( - sbi, - get_pre_allocated(new_size)) - - new_alen; + pre_alloc = bytes_to_cluster( + sbi, get_pre_allocated( + new_size)) - + new_alen; } /* Get the last LCN to allocate from. */ @@ -573,8 +572,8 @@ add_alloc_in_same_attr_seg: err = attr_allocate_clusters( sbi, run, vcn, lcn, to_allocate, &pre_alloc, is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen, - is_mft ? 0 - : (sbi->record_size - + is_mft ? 0 : + (sbi->record_size - le32_to_cpu(rec->used) + 8) / 3 + 1, diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 723fb64e6531..9a6c6a09d70c 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -40,9 +40,9 @@ static struct kmem_cache *ntfs_enode_cachep; int __init ntfs3_init_bitmap(void) { - ntfs_enode_cachep = - kmem_cache_create("ntfs3_enode_cache", sizeof(struct e_node), 0, - SLAB_RECLAIM_ACCOUNT, NULL); + ntfs_enode_cachep = kmem_cache_create("ntfs3_enode_cache", + sizeof(struct e_node), 0, + SLAB_RECLAIM_ACCOUNT, NULL); return ntfs_enode_cachep ? 0 : -ENOMEM; } @@ -286,9 +286,9 @@ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len, if (wnd->uptodated != 1) { /* Check bits before 'bit'. */ ib = wnd->zone_bit == wnd->zone_end || - bit < wnd->zone_end - ? 0 - : wnd->zone_end; + bit < wnd->zone_end ? + 0 : + wnd->zone_end; while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) { bit -= 1; @@ -297,9 +297,9 @@ static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len, /* Check bits after 'end_in'. */ ib = wnd->zone_bit == wnd->zone_end || - end_in > wnd->zone_bit - ? wnd->nbits - : wnd->zone_bit; + end_in > wnd->zone_bit ? + wnd->nbits : + wnd->zone_bit; while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) { end_in += 1; @@ -417,8 +417,8 @@ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len) return; n3 = rb_first(&wnd->count_tree); wnd->extent_max = - n3 ? rb_entry(n3, struct e_node, count.node)->count.key - : 0; + n3 ? rb_entry(n3, struct e_node, count.node)->count.key : + 0; return; } @@ -658,7 +658,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits) if (!wnd->bits_last) wnd->bits_last = wbits; - wnd->free_bits = kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN); + wnd->free_bits = + kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN); if (!wnd->free_bits) return -ENOMEM; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index e9bdc1ff08c9..9a3d55c367d9 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -22,20 +22,21 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) { struct fstrim_range __user *user_range; struct fstrim_range range; + struct block_device *dev; int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!bdev_max_discard_sectors(sbi->sb->s_bdev)) + dev = sbi->sb->s_bdev; + if (!bdev_max_discard_sectors(dev)) return -EOPNOTSUPP; user_range = (struct fstrim_range __user *)arg; if (copy_from_user(&range, user_range, sizeof(range))) return -EFAULT; - range.minlen = max_t(u32, range.minlen, - bdev_discard_granularity(sbi->sb->s_bdev)); + range.minlen = max_t(u32, range.minlen, bdev_discard_granularity(dev)); err = ntfs_trim_fs(sbi, &range); if (err < 0) @@ -190,8 +191,8 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) for (; idx < idx_end; idx += 1, from = 0) { page_off = (loff_t)idx << PAGE_SHIFT; - to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) - : PAGE_SIZE; + to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) : + PAGE_SIZE; iblock = page_off >> inode->i_blkbits; page = find_or_create_page(mapping, idx, @@ -223,16 +224,10 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - lock_buffer(bh); - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(REQ_OP_READ, bh); - - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { + err = bh_read(bh, 0); + if (err < 0) { unlock_page(page); put_page(page); - err = -EIO; goto out; } } @@ -570,13 +565,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) ni_unlock(ni); } else { /* Check new size. */ + u8 cluster_bits = sbi->cluster_bits; /* generic/213: expected -ENOSPC instead of -EFBIG. */ if (!is_supported_holes) { loff_t to_alloc = new_size - inode_get_bytes(inode); if (to_alloc > 0 && - (to_alloc >> sbi->cluster_bits) > + (to_alloc >> cluster_bits) > wnd_zeroes(&sbi->used.bitmap)) { err = -ENOSPC; goto out; @@ -597,7 +593,7 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) } if (is_supported_holes) { - CLST vcn = vbo >> sbi->cluster_bits; + CLST vcn = vbo >> cluster_bits; CLST cend = bytes_to_cluster(sbi, end); CLST cend_v = bytes_to_cluster(sbi, ni->i_valid); CLST lcn, clen; @@ -660,22 +656,12 @@ out: int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { - struct super_block *sb = dentry->d_sb; - struct ntfs_sb_info *sbi = sb->s_fs_info; struct inode *inode = d_inode(dentry); struct ntfs_inode *ni = ntfs_i(inode); u32 ia_valid = attr->ia_valid; umode_t mode = inode->i_mode; int err; - if (sbi->options->noacsrules) { - /* "No access rules" - Force any changes of time etc. */ - attr->ia_valid |= ATTR_FORCE; - /* and disable for editing some attributes. */ - attr->ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE); - ia_valid = attr->ia_valid; - } - err = setattr_prepare(idmap, dentry, attr); if (err) goto out; @@ -719,7 +705,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) - ntfs_save_wsl_perm(inode); + ntfs_save_wsl_perm(inode, NULL); mark_inode_dirty(inode); out: return err; @@ -1065,8 +1051,8 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out; - ret = is_compressed(ni) ? ntfs_compress_write(iocb, from) - : __generic_file_write_iter(iocb, from); + ret = is_compressed(ni) ? ntfs_compress_write(iocb, from) : + __generic_file_write_iter(iocb, from); out: inode_unlock(inode); @@ -1118,8 +1104,9 @@ static int ntfs_file_release(struct inode *inode, struct file *file) int err = 0; /* If we are last writer on the inode, drop the block reservation. */ - if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) && - atomic_read(&inode->i_writecount) == 1)) { + if (sbi->options->prealloc && + ((file->f_mode & FMODE_WRITE) && + atomic_read(&inode->i_writecount) == 1)) { ni_lock(ni); down_write(&ni->file.run_lock); @@ -1159,8 +1146,7 @@ const struct inode_operations ntfs_file_inode_operations = { .getattr = ntfs_getattr, .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, - .permission = ntfs_permission, - .get_inode_acl = ntfs_get_acl, + .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .fiemap = ntfs_fiemap, }; diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index f1df52dfab74..2bfcf1a989c9 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -76,8 +76,8 @@ struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni) const struct ATTRIB *attr; attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); - return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) - : NULL; + return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) : + NULL; } /* @@ -91,8 +91,8 @@ struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni) attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); - return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) - : NULL; + return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) : + NULL; } /* @@ -102,7 +102,7 @@ void ni_clear(struct ntfs_inode *ni) { struct rb_node *node; - if (!ni->vfs_inode.i_nlink && is_rec_inuse(ni->mi.mrec)) + if (!ni->vfs_inode.i_nlink && ni->mi.mrec && is_rec_inuse(ni->mi.mrec)) ni_delete_all(ni); al_destroy(ni); @@ -1439,8 +1439,8 @@ int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, int err; CLST plen; struct ATTRIB *attr; - bool is_ext = - (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) && !svcn; + bool is_ext = (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) && + !svcn; u32 name_size = ALIGN(name_len * sizeof(short), 8); u32 name_off = is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT; u32 run_off = name_off + name_size; @@ -1645,7 +1645,7 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, { struct ATTRIB *attr = NULL; struct ATTR_FILE_NAME *fname; - struct le_str *fns; + struct le_str *fns; if (le) *le = NULL; @@ -1756,9 +1756,9 @@ int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa) } /* Resize nonresident empty attribute in-place only. */ - new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) - ? (SIZEOF_NONRESIDENT_EX + 8) - : (SIZEOF_NONRESIDENT + 8); + new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ? + (SIZEOF_NONRESIDENT_EX + 8) : + (SIZEOF_NONRESIDENT + 8); if (!mi_resize_attr(mi, attr, new_asize - le32_to_cpu(attr->size))) return -EOPNOTSUPP; @@ -2965,14 +2965,14 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, { struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr; - u16 de_key_size = de2 ? le16_to_cpu(de2->key_size) : 0; + u16 de_key_size; switch (undo_step) { case 4: + de_key_size = le16_to_cpu(de2->key_size); if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, - &attr, NULL, NULL)) { + &attr, NULL, NULL)) return false; - } memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de2 + 1, de_key_size); mi_get_ref(&ni->mi, &de2->ref); @@ -2981,19 +2981,16 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, de2->flags = 0; de2->res = 0; - if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL, - 1)) { + if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL, 1)) return false; - } fallthrough; case 2: de_key_size = le16_to_cpu(de->key_size); if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, - &attr, NULL, NULL)) { + &attr, NULL, NULL)) return false; - } memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size); mi_get_ref(&ni->mi, &de->ref); @@ -3162,9 +3159,9 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, u64 data_size = le64_to_cpu(attr->nres.data_size); __le64 valid_le; - dup->alloc_size = is_attr_ext(attr) - ? attr->nres.total_size - : attr->nres.alloc_size; + dup->alloc_size = is_attr_ext(attr) ? + attr->nres.total_size : + attr->nres.alloc_size; dup->data_size = attr->nres.data_size; if (new_valid > data_size) @@ -3258,6 +3255,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) return 0; } + if (!ni->mi.mrec) + goto out; + if (is_rec_inuse(ni->mi.mrec) && !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) { bool modified = false; @@ -3360,7 +3360,7 @@ out: ni_unlock(ni); if (err) { - ntfs_err(sb, "%s r=%lx failed, %d.", hint, inode->i_ino, err); + ntfs_inode_err(inode, "%s failed, %d.", hint, err); ntfs_set_state(sbi, NTFS_DIRTY_ERROR); return err; } diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index c6eb371a3695..57762c5fe68b 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -827,10 +827,10 @@ static inline struct RESTART_TABLE *extend_rsttbl(struct RESTART_TABLE *tbl, memcpy(rt + 1, tbl + 1, esize * used); - rt->free_goal = free_goal == ~0u - ? cpu_to_le32(~0u) - : cpu_to_le32(sizeof(struct RESTART_TABLE) + - free_goal * esize); + rt->free_goal = free_goal == ~0u ? + cpu_to_le32(~0u) : + cpu_to_le32(sizeof(struct RESTART_TABLE) + + free_goal * esize); if (tbl->first_free) { rt->first_free = tbl->first_free; @@ -1089,9 +1089,9 @@ static inline u64 base_lsn(struct ntfs_log *log, (lsn < (lsn_to_vbo(log, h_lsn) & ~log->page_mask) ? 1 : 0)) << log->file_data_bits) + ((((is_log_record_end(hdr) && - h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) - ? le16_to_cpu(hdr->record_hdr.next_record_off) - : log->page_size) + + h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) ? + le16_to_cpu(hdr->record_hdr.next_record_off) : + log->page_size) + lsn) >> 3); @@ -1298,9 +1298,9 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, if (!log->clst_per_page) log->clst_per_page = 1; - log->first_page = major_ver >= 2 - ? 0x22 * page_size - : ((sys_page_size << 1) + (page_size << 1)); + log->first_page = major_ver >= 2 ? + 0x22 * page_size : + ((sys_page_size << 1) + (page_size << 1)); log->major_ver = major_ver; log->minor_ver = minor_ver; } @@ -1512,20 +1512,19 @@ static u32 current_log_avail(struct ntfs_log *log) * have to compute the free range. * If there is no oldest lsn then start at the first page of the file. */ - oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) - ? log->first_page - : (log->oldest_lsn_off & ~log->sys_page_mask); + oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) ? + log->first_page : + (log->oldest_lsn_off & ~log->sys_page_mask); /* * We will use the next log page offset to compute the next free page. * If we are going to reuse this page go to the next page. * If we are at the first page then use the end of the file. */ - next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) - ? log->next_page + log->page_size - : log->next_page == log->first_page - ? log->l_size - : log->next_page; + next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) ? + log->next_page + log->page_size : + log->next_page == log->first_page ? log->l_size : + log->next_page; /* If the two offsets are the same then there is no available space. */ if (oldest_off == next_free_off) @@ -1535,9 +1534,9 @@ static u32 current_log_avail(struct ntfs_log *log) * this range from the total available pages. */ free_bytes = - oldest_off < next_free_off - ? log->total_avail_pages - (next_free_off - oldest_off) - : oldest_off - next_free_off; + oldest_off < next_free_off ? + log->total_avail_pages - (next_free_off - oldest_off) : + oldest_off - next_free_off; free_bytes >>= log->page_bits; return free_bytes * log->reserved; @@ -1671,8 +1670,8 @@ next_tail: } best_lsn1 = first_tail ? base_lsn(log, first_tail, first_file_off) : 0; - best_lsn2 = - second_tail ? base_lsn(log, second_tail, second_file_off) : 0; + best_lsn2 = second_tail ? base_lsn(log, second_tail, second_file_off) : + 0; if (first_tail && second_tail) { if (best_lsn1 > best_lsn2) { @@ -1767,8 +1766,8 @@ tail_read: page_cnt = page_pos = 1; - curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) - : log->next_page; + curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) : + log->next_page; wrapped_file = curpage_off == log->first_page && @@ -1826,9 +1825,9 @@ use_cur_page: le64_to_cpu(cur_page->record_hdr.last_end_lsn) && ((lsn_cur >> log->file_data_bits) + ((curpage_off < - (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) - ? 1 - : 0)) != expected_seq) { + (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) ? + 1 : + 0)) != expected_seq) { goto check_tail; } @@ -2575,7 +2574,7 @@ static int read_next_log_rec(struct ntfs_log *log, struct lcb *lcb, u64 *lsn) return find_log_rec(log, *lsn, lcb); } -static inline bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes) +bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes) { __le16 mask; u32 min_de, de_off, used, total; @@ -2642,9 +2641,10 @@ static inline bool check_index_root(const struct ATTRIB *attr, { bool ret; const struct INDEX_ROOT *root = resident_data(attr); - u8 index_bits = le32_to_cpu(root->index_block_size) >= sbi->cluster_size - ? sbi->cluster_bits - : SECTOR_SHIFT; + u8 index_bits = le32_to_cpu(root->index_block_size) >= + sbi->cluster_size ? + sbi->cluster_bits : + SECTOR_SHIFT; u8 block_clst = root->index_block_clst; if (le32_to_cpu(attr->res.data_size) < sizeof(struct INDEX_ROOT) || @@ -3683,7 +3683,8 @@ move_data: if (a_dirty) { attr = oa->attr; - err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0); + err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, + 0); if (err) goto out; } @@ -3768,11 +3769,10 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) if (!log) return -ENOMEM; - memset(&rst_info, 0, sizeof(struct restart_info)); - log->ni = ni; log->l_size = l_size; log->one_page_buf = kmalloc(page_size, GFP_NOFS); + if (!log->one_page_buf) { err = -ENOMEM; goto out; @@ -3783,6 +3783,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized) log->page_bits = blksize_bits(page_size); /* Look for a restart area on the disk. */ + memset(&rst_info, 0, sizeof(struct restart_info)); err = log_read_rst(log, l_size, true, &rst_info); if (err) goto out; @@ -3859,10 +3860,10 @@ check_restart_area: log->init_ra = !!rst_info.vbo; /* If we have a valid page then grab a pointer to the restart area. */ - ra2 = rst_info.valid_page - ? Add2Ptr(rst_info.r_page, - le16_to_cpu(rst_info.r_page->ra_off)) - : NULL; + ra2 = rst_info.valid_page ? + Add2Ptr(rst_info.r_page, + le16_to_cpu(rst_info.r_page->ra_off)) : + NULL; if (rst_info.chkdsk_was_run || (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { @@ -4256,6 +4257,10 @@ check_attribute_names: rec_len -= t32; attr_names = kmemdup(Add2Ptr(lrh, t32), rec_len, GFP_NOFS); + if (!attr_names) { + err = -ENOMEM; + goto out; + } lcb_put(lcb); lcb = NULL; diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 567563771bf8..28cc421102e5 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -172,8 +172,8 @@ int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, u16 sample, fo, fn; fo = le16_to_cpu(rhdr->fix_off); - fn = simple ? ((bytes >> SECTOR_SHIFT) + 1) - : le16_to_cpu(rhdr->fix_num); + fn = simple ? ((bytes >> SECTOR_SHIFT) + 1) : + le16_to_cpu(rhdr->fix_num); /* Check errors. */ if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || @@ -223,7 +223,7 @@ int ntfs_extend_init(struct ntfs_sb_info *sbi) inode = ntfs_iget5(sb, &ref, &NAME_EXTEND); if (IS_ERR(inode)) { err = PTR_ERR(inode); - ntfs_err(sb, "Failed to load $Extend."); + ntfs_err(sb, "Failed to load $Extend (%d).", err); inode = NULL; goto out; } @@ -282,7 +282,7 @@ int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi) /* Check for 4GB. */ if (ni->vfs_inode.i_size >= 0x100000000ull) { - ntfs_err(sb, "\x24LogFile is too big"); + ntfs_err(sb, "\x24LogFile is large than 4G."); err = -EINVAL; goto out; } @@ -646,13 +646,13 @@ next: NULL, 0, NULL, NULL)) goto next; - __clear_bit_le(ir - MFT_REC_RESERVED, + __clear_bit(ir - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); } } /* Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED */ - zbit = find_next_zero_bit_le(&sbi->mft.reserved_bitmap, + zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap, MFT_REC_FREE, MFT_REC_RESERVED); if (zbit >= MFT_REC_FREE) { sbi->mft.next_reserved = MFT_REC_FREE; @@ -720,7 +720,7 @@ found: if (*rno >= MFT_REC_FREE) wnd_set_used(wnd, *rno, 1); else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) - __set_bit_le(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); + __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); out: if (!mft) @@ -748,7 +748,7 @@ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft) else wnd_set_free(wnd, rno, 1); } else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) { - __clear_bit_le(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); + __clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); } if (rno < wnd_zone_bit(wnd)) @@ -846,18 +846,16 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) { int err; struct super_block *sb = sbi->sb; - u32 blocksize; + u32 blocksize, bytes; sector_t block1, block2; - u32 bytes; - if (!sb) + /* + * sb can be NULL here. In this case sbi->flags should be 0 too. + */ + if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR)) return; blocksize = sb->s_blocksize; - - if (!(sbi->flags & NTFS_FLAGS_MFTMIRR)) - return; - bytes = sbi->mft.recs_mirr << sbi->record_bits; block1 = sbi->mft.lbo >> sb->s_blocksize_bits; block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits; @@ -925,6 +923,7 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty) struct VOLUME_INFO *info; struct mft_inode *mi; struct ntfs_inode *ni; + __le16 info_flags; /* * Do not change state if fs was real_dirty. @@ -957,6 +956,8 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty) goto out; } + info_flags = info->flags; + switch (dirty) { case NTFS_DIRTY_ERROR: ntfs_notice(sbi->sb, "Mark volume as dirty due to NTFS errors"); @@ -970,8 +971,10 @@ int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty) break; } /* Cache current volume flags. */ - sbi->volume.flags = info->flags; - mi->dirty = true; + if (info_flags != info->flags) { + sbi->volume.flags = info->flags; + mi->dirty = true; + } err = 0; out: @@ -1683,6 +1686,7 @@ struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, bool dir) out: if (err) { + make_bad_inode(inode); iput(inode); ni = ERR_PTR(err); } @@ -1859,7 +1863,7 @@ int ntfs_security_init(struct ntfs_sb_info *sbi) inode = ntfs_iget5(sb, &ref, &NAME_SECURE); if (IS_ERR(inode)) { err = PTR_ERR(inode); - ntfs_err(sb, "Failed to load $Secure."); + ntfs_err(sb, "Failed to load $Secure (%d).", err); inode = NULL; goto out; } @@ -1870,41 +1874,43 @@ int ntfs_security_init(struct ntfs_sb_info *sbi) attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SDH_NAME, ARRAY_SIZE(SDH_NAME), NULL, NULL); - if (!attr) { - err = -EINVAL; - goto out; - } - - root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT)); - if (root_sdh->type != ATTR_ZERO || + if (!attr || + !(root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) || + root_sdh->type != ATTR_ZERO || root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH || - offsetof(struct INDEX_ROOT, ihdr) + root_sdh->ihdr.used > attr->res.data_size) { + offsetof(struct INDEX_ROOT, ihdr) + + le32_to_cpu(root_sdh->ihdr.used) > + le32_to_cpu(attr->res.data_size)) { + ntfs_err(sb, "$Secure::$SDH is corrupted."); err = -EINVAL; goto out; } err = indx_init(indx_sdh, sbi, attr, INDEX_MUTEX_SDH); - if (err) + if (err) { + ntfs_err(sb, "Failed to initialize $Secure::$SDH (%d).", err); goto out; + } attr = ni_find_attr(ni, attr, &le, ATTR_ROOT, SII_NAME, ARRAY_SIZE(SII_NAME), NULL, NULL); - if (!attr) { - err = -EINVAL; - goto out; - } - - root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT)); - if (root_sii->type != ATTR_ZERO || + if (!attr || + !(root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) || + root_sii->type != ATTR_ZERO || root_sii->rule != NTFS_COLLATION_TYPE_UINT || - offsetof(struct INDEX_ROOT, ihdr) + root_sii->ihdr.used > attr->res.data_size) { + offsetof(struct INDEX_ROOT, ihdr) + + le32_to_cpu(root_sii->ihdr.used) > + le32_to_cpu(attr->res.data_size)) { + ntfs_err(sb, "$Secure::$SII is corrupted."); err = -EINVAL; goto out; } err = indx_init(indx_sii, sbi, attr, INDEX_MUTEX_SII); - if (err) + if (err) { + ntfs_err(sb, "Failed to initialize $Secure::$SII (%d).", err); goto out; + } fnd_sii = fnd_get(); if (!fnd_sii) { @@ -2594,8 +2600,10 @@ static inline bool is_reserved_name(struct ntfs_sb_info *sbi, if (len == 4 || (len > 4 && le16_to_cpu(name[4]) == '.')) { port_digit = le16_to_cpu(name[3]); if (port_digit >= '1' && port_digit <= '9') - if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, false) || - !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, false)) + if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, + false) || + !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, + false)) return true; } diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 51ab75954640..0a48d2d67219 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -431,8 +431,9 @@ next_run: if (vbo + blocksize > data_size) nbits = 8 * (data_size - vbo); - ok = nbits > from ? (*fn)((ulong *)bh->b_data, from, nbits, ret) - : false; + ok = nbits > from ? + (*fn)((ulong *)bh->b_data, from, nbits, ret) : + false; put_bh(bh); if (ok) { @@ -725,9 +726,13 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, u32 e_size, e_key_len; u32 end = le32_to_cpu(hdr->used); u32 off = le32_to_cpu(hdr->de_off); + u32 total = le32_to_cpu(hdr->total); u16 offs[128]; fill_table: + if (end > total) + return NULL; + if (off + sizeof(struct NTFS_DE) > end) return NULL; @@ -760,8 +765,7 @@ binary_search: return NULL; max_idx = 0; - table_size = min(table_size * 2, - (int)ARRAY_SIZE(offs)); + table_size = min(table_size * 2, (int)ARRAY_SIZE(offs)); goto fill_table; } } else if (diff2 < 0) { @@ -844,6 +848,10 @@ static inline struct NTFS_DE *hdr_delete_de(struct INDEX_HDR *hdr, u32 off = PtrOffset(hdr, re); int bytes = used - (off + esize); + /* check INDEX_HDR valid before using INDEX_HDR */ + if (!check_index_header(hdr, le32_to_cpu(hdr->total))) + return NULL; + if (off >= used || esize < sizeof(struct NTFS_DE) || bytes < sizeof(struct NTFS_DE)) return NULL; @@ -986,6 +994,7 @@ struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *a; const struct INDEX_NAMES *in = &s_index_names[indx->type]; + struct INDEX_ROOT *root; a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL, mi); @@ -995,7 +1004,16 @@ struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, if (attr) *attr = a; - return resident_data_ex(a, sizeof(struct INDEX_ROOT)); + root = resident_data_ex(a, sizeof(struct INDEX_ROOT)); + + /* length check */ + if (root && + offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root->ihdr.used) > + le32_to_cpu(a->res.data_size)) { + return NULL; + } + + return root; } static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni, @@ -1085,7 +1103,8 @@ ok: } /* check for index header length */ - if (offsetof(struct INDEX_BUFFER, ihdr) + ib->ihdr.used > bytes) { + if (offsetof(struct INDEX_BUFFER, ihdr) + le32_to_cpu(ib->ihdr.used) > + bytes) { err = -EINVAL; goto out; } @@ -1151,8 +1170,10 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, /* Read next level. */ err = indx_read(indx, ni, de_get_vbn(e), &node); - if (err) + if (err) { + /* io error? */ return err; + } /* Lookup entry that is <= to the search value. */ e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx, @@ -1654,9 +1675,9 @@ static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni, mi->dirty = true; /* Create alloc and bitmap attributes (if not). */ - err = run_is_empty(&indx->alloc_run) - ? indx_create_allocate(indx, ni, &new_vbn) - : indx_add_allocate(indx, ni, &new_vbn); + err = run_is_empty(&indx->alloc_run) ? + indx_create_allocate(indx, ni, &new_vbn) : + indx_add_allocate(indx, ni, &new_vbn); /* Layout of record may be changed, so rescan root. */ root = indx_get_root(indx, ni, &attr, &mi); @@ -1759,10 +1780,11 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, struct indx_node *n1 = fnd->nodes[level]; struct INDEX_HDR *hdr1 = &n1->index->ihdr; struct INDEX_HDR *hdr2; - u32 to_copy, used; + u32 to_copy, used, used1; CLST new_vbn; __le64 t_vbn, *sub_vbn; u16 sp_size; + void *hdr1_saved = NULL; /* Try the most easy case. */ e = fnd->level - 1 == level ? fnd->de[level] : NULL; @@ -1795,6 +1817,13 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, return -ENOMEM; memcpy(up_e, sp, sp_size); + used1 = le32_to_cpu(hdr1->used); + hdr1_saved = kmemdup(hdr1, used1, GFP_NOFS); + if (!hdr1_saved) { + err = -ENOMEM; + goto out; + } + if (!hdr1->flags) { up_e->flags |= NTFS_IE_HAS_SUBNODES; up_e->size = cpu_to_le16(sp_size + sizeof(u64)); @@ -1827,7 +1856,7 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, hdr_insert_head(hdr2, de_t, to_copy); /* Remove all entries (sp including) from hdr1. */ - used = le32_to_cpu(hdr1->used) - to_copy - sp_size; + used = used1 - to_copy - sp_size; memmove(de_t, Add2Ptr(sp, sp_size), used - le32_to_cpu(hdr1->de_off)); hdr1->used = cpu_to_le32(used); @@ -1838,9 +1867,9 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, hdr_insert_de(indx, (*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size), up_e + 1, le16_to_cpu(up_e->key_size), - ctx) < 0 - ? hdr2 - : hdr1, + ctx) < 0 ? + hdr2 : + hdr1, new_de, NULL, ctx); indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits); @@ -1857,8 +1886,6 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, if (!level) { /* Insert in root. */ err = indx_insert_into_root(indx, ni, up_e, NULL, ctx, fnd, 0); - if (err) - goto out; } else { /* * The target buffer's parent is another index buffer. @@ -1866,12 +1893,20 @@ indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, */ err = indx_insert_into_buffer(indx, ni, root, up_e, ctx, level - 1, fnd); - if (err) - goto out; + } + + if (err) { + /* + * Undo critical operations. + */ + indx_mark_free(indx, ni, new_vbn >> indx->idx2vbn_bits); + memcpy(hdr1, hdr1_saved, used1); + indx_write(indx, ni, n1, 0); } out: kfree(up_e); + kfree(hdr1_saved); return err; } @@ -1930,16 +1965,12 @@ int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni, */ err = indx_insert_into_root(indx, ni, new_de, fnd->root_de, ctx, fnd, undo); - if (err) - goto out; } else { /* * Found a leaf buffer, so we'll insert the new entry into it. */ err = indx_insert_into_buffer(indx, ni, root, new_de, ctx, fnd->level - 1, fnd); - if (err) - goto out; } out: @@ -2308,8 +2339,8 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, err = level ? indx_insert_into_buffer(indx, ni, root, re, ctx, fnd->level - 1, - fnd) - : indx_insert_into_root(indx, ni, re, e, + fnd) : + indx_insert_into_root(indx, ni, re, e, ctx, fnd, 0); kfree(re); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 309d9b46b5d5..6c560245eef4 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -100,6 +100,12 @@ static struct inode *ntfs_read_mft(struct inode *inode, /* Record should contain $I30 root. */ is_dir = rec->flags & RECORD_FLAG_DIR; + /* MFT_REC_MFT is not a dir */ + if (is_dir && ino == MFT_REC_MFT) { + err = -EINVAL; + goto out; + } + inode->i_generation = le16_to_cpu(rec->seq); /* Enumerate all struct Attributes MFT. */ @@ -131,7 +137,13 @@ next_attr: rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size); asize = le32_to_cpu(attr->size); - if (le16_to_cpu(attr->name_off) + attr->name_len > asize) + /* + * Really this check was done in 'ni_enum_attr_ex' -> ... 'mi_enum_attr'. + * There not critical to check this case again + */ + if (attr->name_len && + sizeof(short) * attr->name_len + le16_to_cpu(attr->name_off) > + asize) goto out; if (attr->non_res) { @@ -250,8 +262,8 @@ next_attr: if (!attr->nres.alloc_size) goto next_attr; - run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run - : &ni->file.run; + run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run : + &ni->file.run; break; case ATTR_ROOT: @@ -259,7 +271,6 @@ next_attr: goto out; root = Add2Ptr(attr, roff); - is_root = true; if (attr->name_len != ARRAY_SIZE(I30_NAME) || memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) @@ -272,15 +283,16 @@ next_attr: if (!is_dir) goto next_attr; + is_root = true; ni->ni_flags |= NI_FLAG_DIR; err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30); if (err) goto out; - mode = sb->s_root - ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) - : (S_IFDIR | 0777); + mode = sb->s_root ? + (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) : + (S_IFDIR | 0777); goto next_attr; case ATTR_ALLOC: @@ -437,8 +449,8 @@ end_enum: ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; - inode->i_mapping->a_ops = - is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops; + inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr : + &ntfs_aops; if (ino != MFT_REC_MFT) init_rwsem(&ni->file.run_lock); } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || @@ -636,6 +648,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); set_bh_page(bh, page, off); + err = bh_read(bh, 0); if (err < 0) goto out; @@ -773,8 +786,8 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } ret = blockdev_direct_IO(iocb, inode, iter, - wr ? ntfs_get_block_direct_IO_W - : ntfs_get_block_direct_IO_R); + wr ? ntfs_get_block_direct_IO_W : + ntfs_get_block_direct_IO_R); if (ret > 0) end = vbo + ret; @@ -833,7 +846,7 @@ out: } static int ntfs_resident_writepage(struct folio *folio, - struct writeback_control *wbc, void *data) + struct writeback_control *wbc, void *data) { struct address_space *mapping = data; struct ntfs_inode *ni = ntfs_i(mapping->host); @@ -874,8 +887,8 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, *pagep = NULL; if (is_resident(ni)) { - struct page *page = grab_cache_page_write_begin( - mapping, pos >> PAGE_SHIFT); + struct page *page = + grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT); if (!page) { err = -ENOMEM; @@ -907,9 +920,8 @@ out: /* * ntfs_write_end - Address_space_operations::write_end. */ -int ntfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, u32 copied, struct page *page, - void *fsdata) +int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, + u32 len, u32 copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); @@ -1307,8 +1319,7 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, inode_init_owner(idmap, inode, dir, mode); mode = inode->i_mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = ni->i_crtime = - current_time(inode); + ni->i_crtime = current_time(inode); rec = ni->mi.mrec; rec->hard_links = cpu_to_le16(1); @@ -1349,10 +1360,9 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, attr->res.data_size = cpu_to_le32(dsize); std5->cr_time = std5->m_time = std5->c_time = std5->a_time = - kernel2nt(&inode->i_atime); + kernel2nt(&ni->i_crtime); - ni->std_fa = fa; - std5->fa = fa; + std5->fa = ni->std_fa = fa; attr = Add2Ptr(attr, asize); @@ -1551,11 +1561,15 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, } asize = SIZEOF_NONRESIDENT + ALIGN(err, 8); + /* Write non resident data. */ + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, + nsize, 0); + if (err) + goto out5; } else { attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.data_size = cpu_to_le32(nsize); memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize); - nsize = 0; } /* Size of symlink equals the length of input string. */ inode->i_size = size; @@ -1576,19 +1590,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, rec->used = cpu_to_le32(PtrOffset(rec, attr) + 8); rec->next_attr_id = cpu_to_le16(aid); - /* Step 2: Add new name in index. */ - err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0); - if (err) - goto out6; - - /* Unlock parent directory before ntfs_init_acl. */ - if (!fnd) - ni_unlock(dir_ni); - inode->i_generation = le16_to_cpu(rec->seq); - dir->i_mtime = dir->i_ctime = inode->i_atime; - if (S_ISDIR(mode)) { inode->i_op = &ntfs_dir_inode_operations; inode->i_fop = &ntfs_dir_operations; @@ -1601,8 +1604,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, } else if (S_ISREG(mode)) { inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; - inode->i_mapping->a_ops = - is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops; + inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr : + &ntfs_aops; init_rwsem(&ni->file.run_lock); } else { inode->i_op = &ntfs_special_inode_operations; @@ -1613,41 +1616,58 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { err = ntfs_init_acl(idmap, inode, dir); if (err) - goto out7; + goto out5; } else #endif { inode->i_flags |= S_NOSEC; } - /* Write non resident data. */ - if (nsize) { - err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0); - if (err) - goto out7; + /* + * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute. + * The packed size of extended attribute is stored in direntry too. + * 'fname' here points to inside new_de. + */ + ntfs_save_wsl_perm(inode, &fname->dup.ea_size); + + /* + * update ea_size in file_name attribute too. + * Use ni_find_attr cause layout of MFT record may be changed + * in ntfs_init_acl and ntfs_save_wsl_perm. + */ + attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL); + if (attr) { + struct ATTR_FILE_NAME *fn; + + fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); + if (fn) + fn->dup.ea_size = fname->dup.ea_size; } + /* We do not need to update parent directory later */ + ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT; + + /* Step 2: Add new name in index. */ + err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0); + if (err) + goto out6; + /* * Call 'd_instantiate' after inode->i_op is set * but before finish_open. */ d_instantiate(dentry, inode); - ntfs_save_wsl_perm(inode); + /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */ + inode->i_atime = inode->i_mtime = inode->i_ctime = dir->i_mtime = + dir->i_ctime = ni->i_crtime; + mark_inode_dirty(dir); mark_inode_dirty(inode); /* Normal exit. */ goto out2; -out7: - - /* Undo 'indx_insert_entry'. */ - if (!fnd) - ni_lock_dir(dir_ni); - indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1, - le16_to_cpu(new_de->key_size), sbi); - /* ni_unlock(dir_ni); will be called later. */ out6: if (rp_inserted) ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref); @@ -1669,11 +1689,11 @@ out2: kfree(rp); out1: - if (err) { - if (!fnd) - ni_unlock(dir_ni); + if (!fnd) + ni_unlock(dir_ni); + + if (err) return ERR_PTR(err); - } unlock_new_inode(inode); @@ -1770,9 +1790,6 @@ void ntfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); - if (inode->i_nlink) - _ni_write_inode(inode, inode_needs_sync(inode)); - invalidate_inode_buffers(inode); clear_inode(inode); @@ -2057,7 +2074,6 @@ const struct inode_operations ntfs_link_inode_operations = { .get_link = ntfs_get_link, .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, - .permission = ntfs_permission, }; const struct address_space_operations ntfs_aops = { diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c index 28f654561f27..61e161c7c567 100644 --- a/fs/ntfs3/lznt.c +++ b/fs/ntfs3/lznt.c @@ -296,8 +296,8 @@ next: */ struct lznt *get_lznt_ctx(int level) { - struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) - : sizeof(struct lznt), + struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) : + sizeof(struct lznt), GFP_NOFS); if (r) @@ -392,9 +392,9 @@ ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc, unc_use = err; } else { /* This chunk does not contain compressed data. */ - unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end - ? unc_end - unc_chunk - : LZNT_CHUNK_SIZE; + unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end ? + unc_end - unc_chunk : + LZNT_CHUNK_SIZE; if (cmpr_chunk + sizeof(chunk_hdr) + unc_use > cmpr_end) { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index 407fe92394e2..9736b1e4a0f6 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -88,6 +88,16 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, __putname(uni); } + /* + * Check for a null pointer + * If the MFT record of ntfs inode is not a base record, inode->i_op can be NULL. + * This causes null pointer dereference in d_splice_alias(). + */ + if (!IS_ERR_OR_NULL(inode) && !inode->i_op) { + iput(inode); + inode = ERR_PTR(-EINVAL); + } + return d_splice_alias(inode, dentry); } @@ -423,8 +433,8 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, inode = ntfs_create_inode(&nop_mnt_idmap, dir, dentry, uni, mode, 0, NULL, 0, fnd); - err = IS_ERR(inode) ? PTR_ERR(inode) - : finish_open(file, dentry, ntfs_file_open); + err = IS_ERR(inode) ? PTR_ERR(inode) : + finish_open(file, dentry, ntfs_file_open); dput(d); out2: @@ -597,8 +607,7 @@ const struct inode_operations ntfs_dir_inode_operations = { .rmdir = ntfs_rmdir, .mknod = ntfs_mknod, .rename = ntfs_rename, - .permission = ntfs_permission, - .get_inode_acl = ntfs_get_acl, + .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .setattr = ntfs3_setattr, .getattr = ntfs_getattr, @@ -611,7 +620,7 @@ const struct inode_operations ntfs_special_inode_operations = { .setattr = ntfs3_setattr, .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, - .get_inode_acl = ntfs_get_acl, + .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, }; diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 86ea1826d099..90151e56c122 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -435,9 +435,6 @@ static inline u64 attr_svcn(const struct ATTRIB *attr) return attr->non_res ? le64_to_cpu(attr->nres.svcn) : 0; } -/* The size of resident attribute by its resident size. */ -#define BYTES_PER_RESIDENT(b) (0x18 + (b)) - static_assert(sizeof(struct ATTRIB) == 0x48); static_assert(sizeof(((struct ATTRIB *)NULL)->res) == 0x08); static_assert(sizeof(((struct ATTRIB *)NULL)->nres) == 0x38); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 80072e5f96f7..eb01f7e76479 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -100,7 +100,6 @@ struct ntfs_mount_options { unsigned hide_dot_files : 1; /* Set hidden flag on dot files. */ unsigned windows_names : 1; /* Disallow names forbidden by Windows. */ unsigned force : 1; /* RW mount dirty volume. */ - unsigned noacsrules : 1; /* Exclude acs rules. */ unsigned prealloc : 1; /* Preallocate space when file is growing. */ unsigned nocase : 1; /* case insensitive. */ }; @@ -164,7 +163,6 @@ struct wnd_bitmap { size_t zone_bit; size_t zone_end; - bool set_tail; // Not necessary in driver. bool inited; }; @@ -340,7 +338,7 @@ enum ntfs_inode_mutex_lock_class { }; /* - * sturct ntfs_inode + * struct ntfs_inode * * Ntfs inode - extends linux inode. consists of one or more MFT inodes. */ @@ -581,6 +579,7 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, bool ni_is_dirty(struct inode *inode); /* Globals from fslog.c */ +bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); int log_replay(struct ntfs_inode *ni, bool *initialized); /* Globals from fsntfs.c */ @@ -700,9 +699,8 @@ int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); int ntfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, u32 len, struct page **pagep, void **fsdata); -int ntfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, u32 copied, struct page *page, - void *fsdata); +int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, + u32 len, u32 copied, struct page *page, void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, @@ -858,23 +856,22 @@ unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase, /* globals from xattr.c */ #ifdef CONFIG_NTFS3_FS_POSIX_ACL -struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu); +struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type); int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, - struct inode *dir); + struct inode *dir); #else #define ntfs_get_acl NULL #define ntfs_set_acl NULL #endif int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); -int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode, - int mask); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); extern const struct xattr_handler *ntfs_xattr_handlers[]; -int ntfs_save_wsl_perm(struct inode *inode); +int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); /* globals from lznt.c */ diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index defce6a5c8e1..2a281cead2bc 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -221,7 +221,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) } if (off + asize < off) { - /* overflow check */ + /* Overflow check. */ return NULL; } @@ -247,8 +247,8 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if ((t32 & 0xf) || (t32 > 0x100)) return NULL; - /* Check boundary. */ - if (off + asize > used) + /* Check overflow and boundary. */ + if (off + asize < off || off + asize > used) return NULL; /* Check size of attribute. */ @@ -419,10 +419,9 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, struct ntfs_sb_info *sbi = mi->sbi; u32 used = le32_to_cpu(rec->used); const u16 *upcase = sbi->upcase; - int diff; /* Can we insert mi attribute? */ - if (used + asize > mi->sbi->record_size) + if (used + asize > sbi->record_size) return NULL; /* @@ -431,7 +430,7 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, */ attr = NULL; while ((attr = mi_enum_attr(mi, attr))) { - diff = compare_attr(attr, type, name, name_len, upcase); + int diff = compare_attr(attr, type, name, name_len, upcase); if (diff < 0) continue; @@ -442,9 +441,11 @@ struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, } if (!attr) { - tail = 8; /* Not used, just to suppress warning. */ + /* Append. */ + tail = 8; attr = Add2Ptr(rec, used - 8); } else { + /* Insert before 'attr'. */ tail = used - PtrOffset(rec, attr); } diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index a5af71cd8d14..47612d16c027 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -433,9 +433,9 @@ requires_new_range: should_add_tail = Tovcn < r->len; if (should_add_tail) { - tail_lcn = r->lcn == SPARSE_LCN - ? SPARSE_LCN - : (r->lcn + Tovcn); + tail_lcn = r->lcn == SPARSE_LCN ? + SPARSE_LCN : + (r->lcn + Tovcn); tail_vcn = r->vcn + Tovcn; tail_len = r->len - Tovcn; } diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index ef4ea3f21905..5158dd31fd97 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -39,10 +39,10 @@ * To mount large volumes as ntfs one should use large cluster size (up to 2M) * The maximum volume size in this case is 2^32 * 2^21 = 2^53 = 8P * - * ntfs limits, cluster size is 2M (2^31) + * ntfs limits, cluster size is 2M (2^21) * ----------------------------------------------------------------------------- - * | < 8P, 2^54 | < 2^32 | yes | yes | yes | yes | yes | - * | > 8P, 2^54 | > 2^32 | no | no | yes | yes | yes | + * | < 8P, 2^53 | < 2^32 | yes | yes | yes | yes | yes | + * | > 8P, 2^53 | > 2^32 | no | no | yes | yes | yes | * ----------------------------------------------------------|------------------ * */ @@ -115,9 +115,9 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) return; /* Use static allocated buffer, if possible. */ - name = atomic_dec_and_test(&s_name_buf_cnt) - ? s_name_buf - : kmalloc(sizeof(s_name_buf), GFP_NOFS); + name = atomic_dec_and_test(&s_name_buf_cnt) ? + s_name_buf : + kmalloc(sizeof(s_name_buf), GFP_NOFS); if (name) { struct dentry *de = d_find_alias(inode); @@ -253,7 +253,6 @@ enum Opt { Opt_acl, Opt_iocharset, Opt_prealloc, - Opt_noacsrules, Opt_nocase, Opt_err, }; @@ -271,12 +270,11 @@ static const struct fs_parameter_spec ntfs_fs_parameters[] = { fsparam_flag_no("hidden", Opt_nohidden), fsparam_flag_no("hide_dot_files", Opt_hide_dot_files), fsparam_flag_no("windows_names", Opt_windows_names), - fsparam_flag_no("acl", Opt_acl), fsparam_flag_no("showmeta", Opt_showmeta), + fsparam_flag_no("acl", Opt_acl), + fsparam_string("iocharset", Opt_iocharset), fsparam_flag_no("prealloc", Opt_prealloc), - fsparam_flag_no("acsrules", Opt_noacsrules), fsparam_flag_no("nocase", Opt_nocase), - fsparam_string("iocharset", Opt_iocharset), {} }; @@ -366,19 +364,20 @@ static int ntfs_fs_parse_param(struct fs_context *fc, case Opt_windows_names: opts->windows_names = result.negated ? 0 : 1; break; + case Opt_showmeta: + opts->showmeta = result.negated ? 0 : 1; + break; case Opt_acl: if (!result.negated) #ifdef CONFIG_NTFS3_FS_POSIX_ACL fc->sb_flags |= SB_POSIXACL; #else - return invalf(fc, "ntfs3: Support for ACL not compiled in!"); + return invalf( + fc, "ntfs3: Support for ACL not compiled in!"); #endif else fc->sb_flags &= ~SB_POSIXACL; break; - case Opt_showmeta: - opts->showmeta = result.negated ? 0 : 1; - break; case Opt_iocharset: kfree(opts->nls_name); opts->nls_name = param->string; @@ -387,9 +386,6 @@ static int ntfs_fs_parse_param(struct fs_context *fc, case Opt_prealloc: opts->prealloc = result.negated ? 0 : 1; break; - case Opt_noacsrules: - opts->noacsrules = result.negated ? 1 : 0; - break; case Opt_nocase: opts->nocase = result.negated ? 1 : 0; break; @@ -409,24 +405,29 @@ static int ntfs_fs_reconfigure(struct fs_context *fc) ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY); if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) { - errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); + errorf(fc, + "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); return -EINVAL; } new_opts->nls = ntfs_load_nls(new_opts->nls_name); if (IS_ERR(new_opts->nls)) { new_opts->nls = NULL; - errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name); + errorf(fc, "ntfs3: Cannot load iocharset %s", + new_opts->nls_name); return -EINVAL; } if (new_opts->nls != sbi->options->nls) - return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!"); + return invalf( + fc, + "ntfs3: Cannot use different iocharset when remounting!"); sync_filesystem(sb); if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) && !new_opts->force) { - errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!"); + errorf(fc, + "ntfs3: Volume is dirty and \"force\" flag is not set!"); return -EINVAL; } @@ -544,40 +545,38 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) struct ntfs_mount_options *opts = sbi->options; struct user_namespace *user_ns = seq_user_ns(m); - seq_printf(m, ",uid=%u", - from_kuid_munged(user_ns, opts->fs_uid)); - seq_printf(m, ",gid=%u", - from_kgid_munged(user_ns, opts->fs_gid)); - if (opts->fmask) - seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff); + seq_printf(m, ",uid=%u", from_kuid_munged(user_ns, opts->fs_uid)); + seq_printf(m, ",gid=%u", from_kgid_munged(user_ns, opts->fs_gid)); if (opts->dmask) seq_printf(m, ",dmask=%04o", opts->fs_dmask_inv ^ 0xffff); - if (opts->nls) - seq_printf(m, ",iocharset=%s", opts->nls->charset); - else - seq_puts(m, ",iocharset=utf8"); + if (opts->fmask) + seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff); if (opts->sys_immutable) seq_puts(m, ",sys_immutable"); if (opts->discard) seq_puts(m, ",discard"); + if (opts->force) + seq_puts(m, ",force"); if (opts->sparse) seq_puts(m, ",sparse"); - if (opts->showmeta) - seq_puts(m, ",showmeta"); if (opts->nohidden) seq_puts(m, ",nohidden"); - if (opts->windows_names) - seq_puts(m, ",windows_names"); if (opts->hide_dot_files) seq_puts(m, ",hide_dot_files"); - if (opts->force) - seq_puts(m, ",force"); - if (opts->noacsrules) - seq_puts(m, ",noacsrules"); - if (opts->prealloc) - seq_puts(m, ",prealloc"); + if (opts->windows_names) + seq_puts(m, ",windows_names"); + if (opts->showmeta) + seq_puts(m, ",showmeta"); if (sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); + if (opts->nls) + seq_printf(m, ",iocharset=%s", opts->nls->charset); + else + seq_puts(m, ",iocharset=utf8"); + if (opts->prealloc) + seq_puts(m, ",prealloc"); + if (opts->nocase) + seq_puts(m, ",nocase"); return 0; } @@ -706,7 +705,7 @@ static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) if (boot->sectors_per_clusters <= 0x80) return boot->sectors_per_clusters; if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ - return 1U << -(s8)boot->sectors_per_clusters; + return 1U << (-(s8)boot->sectors_per_clusters); return -EINVAL; } @@ -724,6 +723,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct buffer_head *bh; struct MFT_REC *rec; u16 fn, ao; + u8 cluster_bits; sbi->volume.blocks = dev_size >> PAGE_SHIFT; @@ -734,48 +734,81 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, err = -EINVAL; boot = (struct NTFS_BOOT *)bh->b_data; - if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) + if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) { + ntfs_err(sb, "Boot's signature is not NTFS."); goto out; + } /* 0x55AA is not mandaroty. Thanks Maxim Suhanov*/ /*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1]) * goto out; */ - boot_sector_size = (u32)boot->bytes_per_sector[1] << 8; - if (boot->bytes_per_sector[0] || boot_sector_size < SECTOR_SIZE || + boot_sector_size = ((u32)boot->bytes_per_sector[1] << 8) | + boot->bytes_per_sector[0]; + if (boot_sector_size < SECTOR_SIZE || !is_power_of_2(boot_sector_size)) { + ntfs_err(sb, "Invalid bytes per sector %u.", boot_sector_size); goto out; } /* cluster size: 512, 1K, 2K, 4K, ... 2M */ sct_per_clst = true_sectors_per_clst(boot); - if ((int)sct_per_clst < 0) - goto out; - if (!is_power_of_2(sct_per_clst)) + if ((int)sct_per_clst < 0 || !is_power_of_2(sct_per_clst)) { + ntfs_err(sb, "Invalid sectors per cluster %u.", sct_per_clst); goto out; + } + + sbi->cluster_size = boot_sector_size * sct_per_clst; + sbi->cluster_bits = cluster_bits = blksize_bits(sbi->cluster_size); + sbi->cluster_mask = sbi->cluster_size - 1; + sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; mlcn = le64_to_cpu(boot->mft_clst); mlcn2 = le64_to_cpu(boot->mft2_clst); sectors = le64_to_cpu(boot->sectors_per_volume); - if (mlcn * sct_per_clst >= sectors) + if (mlcn * sct_per_clst >= sectors || mlcn2 * sct_per_clst >= sectors) { + ntfs_err( + sb, + "Start of MFT 0x%llx (0x%llx) is out of volume 0x%llx.", + mlcn, mlcn2, sectors); goto out; + } - if (mlcn2 * sct_per_clst >= sectors) - goto out; + sbi->record_size = record_size = + boot->record_size < 0 ? 1 << (-boot->record_size) : + (u32)boot->record_size << cluster_bits; + sbi->record_bits = blksize_bits(record_size); + sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes /* Check MFT record size. */ - if ((boot->record_size < 0 && - SECTOR_SIZE > (2U << (-boot->record_size))) || - (boot->record_size >= 0 && !is_power_of_2(boot->record_size))) { + if (record_size < SECTOR_SIZE || !is_power_of_2(record_size)) { + ntfs_err(sb, "Invalid bytes per MFT record %u (%d).", + record_size, boot->record_size); + goto out; + } + + if (record_size > MAXIMUM_BYTES_PER_MFT) { + ntfs_err(sb, "Unsupported bytes per MFT record %u.", + record_size); goto out; } + sbi->index_size = boot->index_size < 0 ? + 1u << (-boot->index_size) : + (u32)boot->index_size << cluster_bits; + /* Check index record size. */ - if ((boot->index_size < 0 && - SECTOR_SIZE > (2U << (-boot->index_size))) || - (boot->index_size >= 0 && !is_power_of_2(boot->index_size))) { + if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) { + ntfs_err(sb, "Invalid bytes per index %u(%d).", sbi->index_size, + boot->index_size); + goto out; + } + + if (sbi->index_size > MAXIMUM_BYTES_PER_INDEX) { + ntfs_err(sb, "Unsupported bytes per index %u.", + sbi->index_size); goto out; } @@ -791,53 +824,36 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, if (boot_sector_size != sector_size) { ntfs_warn( sb, - "Different NTFS' sector size (%u) and media sector size (%u)", + "Different NTFS sector size (%u) and media sector size (%u).", boot_sector_size, sector_size); dev_size += sector_size - 1; } - sbi->cluster_size = boot_sector_size * sct_per_clst; - sbi->cluster_bits = blksize_bits(sbi->cluster_size); - - sbi->mft.lbo = mlcn << sbi->cluster_bits; - sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits; + sbi->mft.lbo = mlcn << cluster_bits; + sbi->mft.lbo2 = mlcn2 << cluster_bits; /* Compare boot's cluster and sector. */ - if (sbi->cluster_size < boot_sector_size) + if (sbi->cluster_size < boot_sector_size) { + ntfs_err(sb, "Invalid bytes per cluster (%u).", + sbi->cluster_size); goto out; + } /* Compare boot's cluster and media sector. */ if (sbi->cluster_size < sector_size) { /* No way to use ntfs_get_block in this case. */ ntfs_err( sb, - "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)", + "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u).", sbi->cluster_size, sector_size); goto out; } - sbi->cluster_mask = sbi->cluster_size - 1; - sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; - sbi->record_size = record_size = boot->record_size < 0 - ? 1 << (-boot->record_size) - : (u32)boot->record_size - << sbi->cluster_bits; - - if (record_size > MAXIMUM_BYTES_PER_MFT || record_size < SECTOR_SIZE) - goto out; - - sbi->record_bits = blksize_bits(record_size); - sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes - sbi->max_bytes_per_attr = record_size - ALIGN(MFTRECORD_FIXUP_OFFSET_1, 8) - ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) - ALIGN(sizeof(enum ATTR_TYPE), 8); - sbi->index_size = boot->index_size < 0 - ? 1u << (-boot->index_size) - : (u32)boot->index_size << sbi->cluster_bits; - sbi->volume.ser_num = le64_to_cpu(boot->serial_num); /* Warning if RAW volume. */ @@ -847,18 +863,18 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, gb0 = format_size_gb(dev_size, &mb0); ntfs_warn( sb, - "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only", + "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only.", gb, mb, gb0, mb0); sb->s_flags |= SB_RDONLY; } - clusters = sbi->volume.size >> sbi->cluster_bits; + clusters = sbi->volume.size >> cluster_bits; #ifndef CONFIG_NTFS3_64BIT_CLUSTER /* 32 bits per cluster. */ if (clusters >> 32) { ntfs_notice( sb, - "NTFS %u.%02u Gb is too big to use 32 bits per cluster", + "NTFS %u.%02u Gb is too big to use 32 bits per cluster.", gb, mb); goto out; } @@ -892,17 +908,17 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->volume.blocks = sbi->volume.size >> sb->s_blocksize_bits; /* Maximum size for normal files. */ - sbi->maxbytes = (clusters << sbi->cluster_bits) - 1; + sbi->maxbytes = (clusters << cluster_bits) - 1; #ifdef CONFIG_NTFS3_64BIT_CLUSTER - if (clusters >= (1ull << (64 - sbi->cluster_bits))) + if (clusters >= (1ull << (64 - cluster_bits))) sbi->maxbytes = -1; sbi->maxbytes_sparse = -1; sb->s_maxbytes = MAX_LFS_FILESIZE; #else /* Maximum size for sparse file. */ - sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1; - sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; + sbi->maxbytes_sparse = (1ull << (cluster_bits + 32)) - 1; + sb->s_maxbytes = 0xFFFFFFFFull << cluster_bits; #endif /* @@ -910,7 +926,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, * It would be nice if we are able to allocate 1/8 of * total clusters for MFT but not more then 512 MB. */ - sbi->zone_max = min_t(CLST, 0x20000000 >> sbi->cluster_bits, clusters >> 3); + sbi->zone_max = min_t(CLST, 0x20000000 >> cluster_bits, clusters >> 3); err = 0; @@ -928,6 +944,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) int err; struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; + struct ntfs_mount_options *options; struct inode *inode; struct ntfs_inode *ni; size_t i, tt, bad_len, bad_frags; @@ -942,7 +959,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.high = 0; sbi->sb = sb; - sbi->options = fc->fs_private; + sbi->options = options = fc->fs_private; fc->fs_private = NULL; sb->s_flags |= SB_NODIRATIME; sb->s_magic = 0x7366746e; // "ntfs" @@ -950,12 +967,12 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_export_op = &ntfs_export_ops; sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec sb->s_xattr = ntfs_xattr_handlers; - sb->s_d_op = sbi->options->nocase ? &ntfs_dentry_ops : NULL; + sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL; - sbi->options->nls = ntfs_load_nls(sbi->options->nls_name); - if (IS_ERR(sbi->options->nls)) { - sbi->options->nls = NULL; - errorf(fc, "Cannot load nls %s", sbi->options->nls_name); + options->nls = ntfs_load_nls(options->nls_name); + if (IS_ERR(options->nls)) { + options->nls = NULL; + errorf(fc, "Cannot load nls %s", options->nls_name); err = -EINVAL; goto out; } @@ -980,8 +997,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_VOL); inode = ntfs_iget5(sb, &ref, &NAME_VOLUME); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $Volume."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $Volume (%d).", err); goto out; } @@ -1007,13 +1024,9 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) } attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL); - if (!attr || is_attr_ext(attr)) { - err = -EINVAL; - goto put_inode_out; - } - - info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO); - if (!info) { + if (!attr || is_attr_ext(attr) || + !(info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO))) { + ntfs_err(sb, "$Volume is corrupted."); err = -EINVAL; goto put_inode_out; } @@ -1028,13 +1041,13 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_MIRR); inode = ntfs_iget5(sb, &ref, &NAME_MIRROR); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $MFTMirr."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $MFTMirr (%d).", err); goto out; } - sbi->mft.recs_mirr = - ntfs_up_cluster(sbi, inode->i_size) >> sbi->record_bits; + sbi->mft.recs_mirr = ntfs_up_cluster(sbi, inode->i_size) >> + sbi->record_bits; iput(inode); @@ -1043,8 +1056,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_LOG); inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load \x24LogFile."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load \x24LogFile (%d).", err); goto out; } @@ -1064,7 +1077,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) goto out; } } else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) { - if (!sb_rdonly(sb) && !sbi->options->force) { + if (!sb_rdonly(sb) && !options->force) { ntfs_warn( sb, "volume is dirty and \"force\" flag is not set!"); @@ -1079,8 +1092,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) inode = ntfs_iget5(sb, &ref, &NAME_MFT); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $MFT."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $MFT (%d).", err); goto out; } @@ -1095,8 +1108,10 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) goto put_inode_out; err = ni_load_all_mi(ni); - if (err) + if (err) { + ntfs_err(sb, "Failed to load $MFT's subrecords (%d).", err); goto put_inode_out; + } sbi->mft.ni = ni; @@ -1105,8 +1120,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_BITMAP); inode = ntfs_iget5(sb, &ref, &NAME_BITMAP); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $Bitmap."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $Bitmap (%d).", err); goto out; } @@ -1120,22 +1135,25 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Check bitmap boundary. */ tt = sbi->used.bitmap.nbits; if (inode->i_size < bitmap_size(tt)) { + ntfs_err(sb, "$Bitmap is corrupted."); err = -EINVAL; goto put_inode_out; } - /* Not necessary. */ - sbi->used.bitmap.set_tail = true; err = wnd_init(&sbi->used.bitmap, sb, tt); - if (err) + if (err) { + ntfs_err(sb, "Failed to initialize $Bitmap (%d).", err); goto put_inode_out; + } iput(inode); /* Compute the MFT zone. */ err = ntfs_refresh_zone(sbi); - if (err) + if (err) { + ntfs_err(sb, "Failed to initialize MFT zone (%d).", err); goto out; + } /* Load $BadClus. */ ref.low = cpu_to_le32(MFT_REC_BADCLUST); @@ -1180,15 +1198,23 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_ATTR); inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $AttrDef -> %d", err); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $AttrDef (%d)", err); goto out; } - if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) { + /* + * Typical $AttrDef contains up to 20 entries. + * Check for extremely large/small size. + */ + if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY) || + inode->i_size > 100 * sizeof(struct ATTR_DEF_ENTRY)) { + ntfs_err(sb, "Looks like $AttrDef is corrupted (size=%llu).", + inode->i_size); err = -EINVAL; goto put_inode_out; } + bytes = inode->i_size; sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN); if (!t) { @@ -1202,6 +1228,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) if (IS_ERR(page)) { err = PTR_ERR(page); + ntfs_err(sb, "Failed to read $AttrDef (%d).", err); goto put_inode_out; } memcpy(Add2Ptr(t, done), page_address(page), @@ -1209,6 +1236,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ntfs_unmap_page(page); if (!idx && ATTR_STD != t->type) { + ntfs_err(sb, "$AttrDef is corrupted."); err = -EINVAL; goto put_inode_out; } @@ -1243,13 +1271,14 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) ref.seq = cpu_to_le16(MFT_REC_UPCASE); inode = ntfs_iget5(sb, &ref, &NAME_UPCASE); if (IS_ERR(inode)) { - ntfs_err(sb, "Failed to load $UpCase."); err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $UpCase (%d).", err); goto out; } if (inode->i_size != 0x10000 * sizeof(short)) { err = -EINVAL; + ntfs_err(sb, "$UpCase is corrupted."); goto put_inode_out; } @@ -1260,6 +1289,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) if (IS_ERR(page)) { err = PTR_ERR(page); + ntfs_err(sb, "Failed to read $UpCase (%d).", err); goto put_inode_out; } @@ -1285,23 +1315,31 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) if (is_ntfs3(sbi)) { /* Load $Secure. */ err = ntfs_security_init(sbi); - if (err) + if (err) { + ntfs_err(sb, "Failed to initialize $Secure (%d).", err); goto out; + } /* Load $Extend. */ err = ntfs_extend_init(sbi); - if (err) + if (err) { + ntfs_warn(sb, "Failed to initialize $Extend."); goto load_root; + } - /* Load $Extend\$Reparse. */ + /* Load $Extend/$Reparse. */ err = ntfs_reparse_init(sbi); - if (err) + if (err) { + ntfs_warn(sb, "Failed to initialize $Extend/$Reparse."); goto load_root; + } - /* Load $Extend\$ObjId. */ + /* Load $Extend/$ObjId. */ err = ntfs_objid_init(sbi); - if (err) + if (err) { + ntfs_warn(sb, "Failed to initialize $Extend/$ObjId."); goto load_root; + } } load_root: @@ -1309,12 +1347,21 @@ load_root: ref.low = cpu_to_le32(MFT_REC_ROOT); ref.seq = cpu_to_le16(MFT_REC_ROOT); inode = ntfs_iget5(sb, &ref, &NAME_ROOT); - if (IS_ERR(inode) || !inode->i_op) { - ntfs_err(sb, "Failed to load root."); - err = IS_ERR(inode) ? PTR_ERR(inode) : -EINVAL; + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load root (%d).", err); goto out; } + /* + * Final check. Looks like this case should never occurs. + */ + if (!inode->i_op) { + err = -EINVAL; + ntfs_err(sb, "Failed to load root (%d).", err); + goto put_inode_out; + } + sb->s_root = d_make_root(inode); if (!sb->s_root) { err = -ENOMEM; @@ -1434,7 +1481,7 @@ static const struct fs_context_operations ntfs_context_ops = { }; /* - * ntfs_init_fs_context - Initialize spi and opts + * ntfs_init_fs_context - Initialize sbi and opts * * This will called when mount/remount. We will first initialize * options so that if remount we can use just that. @@ -1507,7 +1554,8 @@ static int __init init_ntfs_fs(void) if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL)) pr_info("ntfs3: Enabled Linux POSIX ACLs support\n"); if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER)) - pr_notice("ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n"); + pr_notice( + "ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n"); if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); @@ -1550,7 +1598,9 @@ MODULE_DESCRIPTION("ntfs3 read/write filesystem"); MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support"); #endif #ifdef CONFIG_NTFS3_64BIT_CLUSTER -MODULE_INFO(cluster, "Warning: Activated 64 bits per cluster. Windows does not support this"); +MODULE_INFO( + cluster, + "Warning: Activated 64 bits per cluster. Windows does not support this"); #endif #ifdef CONFIG_NTFS3_LZX_XPRESS MODULE_INFO(compression, "Read-only lzx/xpress compression included"); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index ff64302e87e5..c3de60a4543f 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -23,8 +23,8 @@ static inline size_t unpacked_ea_size(const struct EA_FULL *ea) { - return ea->size ? le32_to_cpu(ea->size) - : ALIGN(struct_size(ea, name, + return ea->size ? le32_to_cpu(ea->size) : + ALIGN(struct_size(ea, name, 1 + ea->name_len + le16_to_cpu(ea->elength)), 4); @@ -296,7 +296,8 @@ out: static noinline int ntfs_set_ea(struct inode *inode, const char *name, size_t name_len, const void *value, - size_t val_size, int flags, bool locked) + size_t val_size, int flags, bool locked, + __le16 *ea_size) { struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = ni->mi.sbi; @@ -410,7 +411,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, /* * 1. Check ea_info.size_pack for overflow. - * 2. New attibute size must fit value from $AttrDef + * 2. New attribute size must fit value from $AttrDef */ if (new_pack > 0xffff || size > sbi->ea_max_size) { ntfs_inode_warn( @@ -504,6 +505,8 @@ update_ea: if (ea_info.size_pack != size_pack) ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + if (ea_size) + *ea_size = ea_info.size_pack; mark_inode_dirty(&ni->vfs_inode); out: @@ -517,9 +520,14 @@ out: } #ifdef CONFIG_NTFS3_FS_POSIX_ACL -static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type, - int locked) + +/* + * ntfs_get_acl - inode_operations::get_acl + */ +struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type) { + struct inode *inode = d_inode(dentry); struct ntfs_inode *ni = ntfs_i(inode); const char *name; size_t name_len; @@ -542,13 +550,11 @@ static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type, name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1; } - if (!locked) - ni_lock(ni); + ni_lock(ni); err = ntfs_get_ea(inode, name, name_len, buf, PATH_MAX, &req); - if (!locked) - ni_unlock(ni); + ni_unlock(ni); /* Translate extended attribute to acl. */ if (err >= 0) { @@ -567,17 +573,6 @@ static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type, return acl; } -/* - * ntfs_get_acl - inode_operations::get_acl - */ -struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) -{ - if (rcu) - return ERR_PTR(-ECHILD); - - return ntfs_get_acl_ex(inode, type, 0); -} - static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, struct inode *inode, struct posix_acl *acl, int type, bool init_acl) @@ -633,7 +628,7 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, flags = 0; } - err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); + err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0, NULL); if (err == -ENODATA && !size) err = 0; /* Removing non existed xattr. */ if (!err) { @@ -712,20 +707,6 @@ int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry) } /* - * ntfs_permission - inode_operations::permission - */ -int ntfs_permission(struct mnt_idmap *idmap, struct inode *inode, - int mask) -{ - if (ntfs_sb(inode->i_sb)->options->noacsrules) { - /* "No access rules" mode - Allow all changes. */ - return 0; - } - - return generic_permission(idmap, inode, mask); -} - -/* * ntfs_listxattr - inode_operations::listxattr */ ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -780,7 +761,7 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, err = sizeof(u32); *(u32 *)buffer = le32_to_cpu(ni->std_fa); if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) - *(u32 *)buffer = cpu_to_be32(*(u32 *)buffer); + *(__be32 *)buffer = cpu_to_be32(*(u32 *)buffer); } goto out; } @@ -857,7 +838,7 @@ static noinline int ntfs_setxattr(const struct xattr_handler *handler, if (size != sizeof(u32)) goto out; if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) - new_fa = cpu_to_le32(be32_to_cpu(*(u32 *)value)); + new_fa = cpu_to_le32(be32_to_cpu(*(__be32 *)value)); else new_fa = cpu_to_le32(*(u32 *)value); @@ -937,7 +918,8 @@ set_new_fa: } /* Deal with NTFS extended attribute. */ - err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0); + err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0, + NULL); out: inode->i_ctime = current_time(inode); @@ -951,7 +933,7 @@ out: * * save uid/gid/mode in xattr */ -int ntfs_save_wsl_perm(struct inode *inode) +int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size) { int err; __le32 value; @@ -960,26 +942,26 @@ int ntfs_save_wsl_perm(struct inode *inode) ni_lock(ni); value = cpu_to_le32(i_uid_read(inode)); err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, - sizeof(value), 0, true); /* true == already locked. */ + sizeof(value), 0, true, ea_size); if (err) goto out; value = cpu_to_le32(i_gid_read(inode)); err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, - sizeof(value), 0, true); + sizeof(value), 0, true, ea_size); if (err) goto out; value = cpu_to_le32(inode->i_mode); err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, - sizeof(value), 0, true); + sizeof(value), 0, true, ea_size); if (err) goto out; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { value = cpu_to_le32(inode->i_rdev); err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, - sizeof(value), 0, true); + sizeof(value), 0, true, ea_size); if (err) goto out; } @@ -1033,10 +1015,6 @@ static const struct xattr_handler ntfs_other_xattr_handler = { }; const struct xattr_handler *ntfs_xattr_handlers[] = { -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &ntfs_other_xattr_handler, NULL, }; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0394505fdce3..8dfc284e85f0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2463,7 +2463,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, - ocfs2_dio_end_io, NULL, 0); + ocfs2_dio_end_io, 0); } const struct address_space_operations ocfs2_aops = { diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 811a6ea374bb..b1550ba73f96 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -803,8 +803,8 @@ bail: * a better backward&forward compatibility, since a small piece of * request will be less likely to be broken if disk layout get changed. */ -static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, - int compat_flag) +static noinline_for_stack int +ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, int compat_flag) { int i, status = 0; u64 req_addr; @@ -840,27 +840,26 @@ bail: long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - int new_clusters; - int status; - struct ocfs2_space_resv sr; - struct ocfs2_new_group_input input; - struct reflink_arguments args; - const char __user *old_path; - const char __user *new_path; - bool preserve; - struct ocfs2_info info; void __user *argp = (void __user *)arg; + int status; switch (cmd) { case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: case OCFS2_IOC_UNRESVSP64: + { + struct ocfs2_space_resv sr; + if (copy_from_user(&sr, (int __user *) arg, sizeof(sr))) return -EFAULT; return ocfs2_change_file_space(filp, cmd, &sr); + } case OCFS2_IOC_GROUP_EXTEND: + { + int new_clusters; + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -873,8 +872,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) status = ocfs2_group_extend(inode, new_clusters); mnt_drop_write_file(filp); return status; + } case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: + { + struct ocfs2_new_group_input input; + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -887,7 +890,14 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) status = ocfs2_group_add(inode, &input); mnt_drop_write_file(filp); return status; + } case OCFS2_IOC_REFLINK: + { + struct reflink_arguments args; + const char __user *old_path; + const char __user *new_path; + bool preserve; + if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; old_path = (const char __user *)(unsigned long)args.old_path; @@ -895,11 +905,16 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); + } case OCFS2_IOC_INFO: + { + struct ocfs2_info info; + if (copy_from_user(&info, argp, sizeof(struct ocfs2_info))) return -EFAULT; return ocfs2_info_handle(inode, &info, 0); + } case FITRIM: { struct super_block *sb = inode->i_sb; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 9175dbc47201..17c52225b87d 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -242,6 +242,7 @@ static int ocfs2_mknod(struct mnt_idmap *idmap, int want_meta = 0; int xattr_credits = 0; struct ocfs2_security_xattr_info si = { + .name = NULL, .enable = 1, }; int did_quota_inode = 0; @@ -1805,6 +1806,7 @@ static int ocfs2_symlink(struct mnt_idmap *idmap, int want_clusters = 0; int xattr_credits = 0; struct ocfs2_security_xattr_info si = { + .name = NULL, .enable = 1, }; int did_quota = 0, did_quota_inode = 0; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 5a656dc683f1..564ab48d03ef 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2952,10 +2952,11 @@ retry: */ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) { if (PageDirty(page)) { - /* - * write_on_page will unlock the page on return - */ - ret = write_one_page(page); + unlock_page(page); + put_page(page); + + ret = filemap_write_and_wait_range(mapping, + offset, map_end - 1); goto retry; } } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 389308efe854..4ac77ff6e676 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -89,21 +89,17 @@ static struct ocfs2_xattr_def_value_root def_xv = { const struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL }; static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { - [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, - [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] - = &posix_acl_access_xattr_handler, - [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] - = &posix_acl_default_xattr_handler, - [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, - [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, + [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, + [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; struct ocfs2_xattr_info { @@ -7259,9 +7255,21 @@ static int ocfs2_xattr_security_set(const struct xattr_handler *handler, static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { + struct ocfs2_security_xattr_info *si = fs_info; const struct xattr *xattr; int err = 0; + if (si) { + si->value = kmemdup(xattr_array->value, xattr_array->value_len, + GFP_KERNEL); + if (!si->value) + return -ENOMEM; + + si->name = xattr_array->name; + si->value_len = xattr_array->value_len; + return 0; + } + for (xattr = xattr_array; xattr->name != NULL; xattr++) { err = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, xattr->name, xattr->value, @@ -7277,13 +7285,23 @@ int ocfs2_init_security_get(struct inode *inode, const struct qstr *qstr, struct ocfs2_security_xattr_info *si) { + int ret; + /* check whether ocfs2 support feature xattr */ if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) return -EOPNOTSUPP; - if (si) - return security_old_inode_init_security(inode, dir, qstr, - &si->name, &si->value, - &si->value_len); + if (si) { + ret = security_inode_init_security(inode, dir, qstr, + &ocfs2_initxattrs, si); + /* + * security_inode_init_security() does not return -EOPNOTSUPP, + * we have to check the xattr ourselves. + */ + if (!ret && !si->name) + si->enable = 0; + + return ret; + } return security_inode_init_security(inode, dir, qstr, &ocfs2_initxattrs, NULL); diff --git a/fs/open.c b/fs/open.c index 4401a73d4032..4478adcc4f3a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1196,13 +1196,21 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) } /* - * In order to ensure programs get explicit errors when trying to use - * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it - * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we - * have to require userspace to explicitly set it. + * Block bugs where O_DIRECTORY | O_CREAT created regular files. + * Note, that blocking O_DIRECTORY | O_CREAT here also protects + * O_TMPFILE below which requires O_DIRECTORY being raised. */ + if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) + return -EINVAL; + + /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { - if ((flags & O_TMPFILE_MASK) != O_TMPFILE) + /* + * In order to ensure programs get explicit errors when trying + * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY + * is raised alongside __O_TMPFILE. + */ + if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index aefdf1d3be7c..9014bbcc8031 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -244,7 +244,7 @@ static void orangefs_readahead(struct readahead_control *rac) struct iov_iter iter; struct inode *inode = rac->mapping->host; struct xarray *i_pages; - struct page *page; + struct folio *folio; loff_t new_start = readahead_pos(rac); int ret; size_t new_len = 0; @@ -275,9 +275,10 @@ static void orangefs_readahead(struct readahead_control *rac) ret = 0; /* clean up. */ - while ((page = readahead_page(rac))) { - page_endio(page, false, ret); - put_page(page); + while ((folio = readahead_folio(rac))) { + if (!ret) + folio_mark_uptodate(folio); + folio_unlock(folio); } } diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 6ecad4f94ae6..68b62689a63e 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -555,8 +555,6 @@ static const struct xattr_handler orangefs_xattr_default_handler = { }; const struct xattr_handler *orangefs_xattr_handlers[] = { - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, &orangefs_xattr_default_handler, NULL }; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index c14e90764e35..f658cc8ea492 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -81,8 +81,7 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct de int error = 0; size_t slen; - if (!(old->d_inode->i_opflags & IOP_XATTR) || - !(new->d_inode->i_opflags & IOP_XATTR)) + if (!old->d_inode->i_op->listxattr || !new->d_inode->i_op->listxattr) return 0; list_size = vfs_listxattr(old, NULL, 0); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f1d9f75f8786..f97ad8b40dbb 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1055,20 +1055,12 @@ static const struct xattr_handler ovl_other_xattr_handler = { }; static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { -#ifdef CONFIG_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &ovl_own_trusted_xattr_handler, &ovl_other_xattr_handler, NULL }; static const struct xattr_handler *ovl_user_xattr_handlers[] = { -#ifdef CONFIG_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &ovl_own_user_xattr_handler, &ovl_other_xattr_handler, NULL diff --git a/fs/pipe.c b/fs/pipe.c index 42c7ff41c2db..2d88f73f585a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -342,7 +342,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) break; if (ret) break; - if (filp->f_flags & O_NONBLOCK) { + if ((filp->f_flags & O_NONBLOCK) || + (iocb->ki_flags & IOCB_NOWAIT)) { ret = -EAGAIN; break; } @@ -547,7 +548,8 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) continue; /* Wait for buffer space to become available. */ - if (filp->f_flags & O_NONBLOCK) { + if ((filp->f_flags & O_NONBLOCK) || + (iocb->ki_flags & IOCB_NOWAIT)) { if (!ret) ret = -EAGAIN; break; @@ -976,6 +978,9 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags) audit_fd_pair(fdr, fdw); fd[0] = fdr; fd[1] = fdw; + /* pipe groks IOCB_NOWAIT */ + files[0]->f_mode |= FMODE_NOWAIT; + files[1]->f_mode |= FMODE_NOWAIT; return 0; err_fdr: diff --git a/fs/pnode.c b/fs/pnode.c index 468e4e65a615..3cede8b18c8b 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -214,7 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin) /* all accesses are serialized by namespace_sem */ static struct mount *last_dest, *first_source, *last_source, *dest_master; -static struct mountpoint *mp; static struct hlist_head *list; static inline bool peers(struct mount *m1, struct mount *m2) @@ -222,7 +221,7 @@ static inline bool peers(struct mount *m1, struct mount *m2) return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } -static int propagate_one(struct mount *m) +static int propagate_one(struct mount *m, struct mountpoint *dest_mp) { struct mount *child; int type; @@ -230,7 +229,7 @@ static int propagate_one(struct mount *m) if (IS_MNT_NEW(m)) return 0; /* skip if mountpoint isn't covered by it */ - if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) + if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) return 0; if (peers(m, last_dest)) { type = CL_MAKE_SHARED; @@ -262,7 +261,7 @@ static int propagate_one(struct mount *m) if (IS_ERR(child)) return PTR_ERR(child); read_seqlock_excl(&mount_lock); - mnt_set_mountpoint(m, mp, child); + mnt_set_mountpoint(m, dest_mp, child); if (m->mnt_master != dest_master) SET_MNT_MARK(m->mnt_master); read_sequnlock_excl(&mount_lock); @@ -299,13 +298,12 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, last_dest = dest_mnt; first_source = source_mnt; last_source = source_mnt; - mp = dest_mp; list = tree_list; dest_master = dest_mnt->mnt_master; /* all peers of dest_mnt, except dest_mnt itself */ for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { - ret = propagate_one(n); + ret = propagate_one(n, dest_mp); if (ret) goto out; } @@ -316,7 +314,7 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, /* everything in that slave group */ n = m; do { - ret = propagate_one(n); + ret = propagate_one(n, dest_mp); if (ret) goto out; n = next_peer(n); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 5a76fb35923a..7fa1b738bbab 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -957,25 +957,62 @@ set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, } EXPORT_SYMBOL(set_posix_acl); +int posix_acl_listxattr(struct inode *inode, char **buffer, + ssize_t *remaining_size) +{ + int err; + + if (!IS_POSIXACL(inode)) + return 0; + + if (inode->i_acl) { + err = xattr_list_one(buffer, remaining_size, + XATTR_NAME_POSIX_ACL_ACCESS); + if (err) + return err; + } + + if (inode->i_default_acl) { + err = xattr_list_one(buffer, remaining_size, + XATTR_NAME_POSIX_ACL_DEFAULT); + if (err) + return err; + } + + return 0; +} + static bool posix_acl_xattr_list(struct dentry *dentry) { return IS_POSIXACL(d_backing_inode(dentry)); } -const struct xattr_handler posix_acl_access_xattr_handler = { +/* + * nop_posix_acl_access - legacy xattr handler for access POSIX ACLs + * + * This is the legacy POSIX ACL access xattr handler. It is used by some + * filesystems to implement their ->listxattr() inode operation. New code + * should never use them. + */ +const struct xattr_handler nop_posix_acl_access = { .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = ACL_TYPE_ACCESS, .list = posix_acl_xattr_list, }; -EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); +EXPORT_SYMBOL_GPL(nop_posix_acl_access); -const struct xattr_handler posix_acl_default_xattr_handler = { +/* + * nop_posix_acl_default - legacy xattr handler for default POSIX ACLs + * + * This is the legacy POSIX ACL default xattr handler. It is used by some + * filesystems to implement their ->listxattr() inode operation. New code + * should never use them. + */ +const struct xattr_handler nop_posix_acl_default = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = ACL_TYPE_DEFAULT, .list = posix_acl_xattr_list, }; -EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); +EXPORT_SYMBOL_GPL(nop_posix_acl_default); int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) @@ -1094,12 +1131,10 @@ retry_deleg: if (error) goto out_inode_unlock; - if (inode->i_opflags & IOP_XATTR) + if (likely(!is_bad_inode(inode))) error = set_posix_acl(idmap, dentry, acl_type, kacl); - else if (unlikely(is_bad_inode(inode))) - error = -EIO; else - error = -EOPNOTSUPP; + error = -EIO; if (!error) { fsnotify_xattr(dentry); evm_inode_post_set_acl(dentry, acl_name, kacl); @@ -1204,12 +1239,10 @@ retry_deleg: if (error) goto out_inode_unlock; - if (inode->i_opflags & IOP_XATTR) + if (likely(!is_bad_inode(inode))) error = set_posix_acl(idmap, dentry, acl_type, NULL); - else if (unlikely(is_bad_inode(inode))) - error = -EIO; else - error = -EOPNOTSUPP; + error = -EIO; if (!error) { fsnotify_xattr(dentry); evm_inode_post_remove_acl(idmap, dentry, acl_name); diff --git a/fs/proc/array.c b/fs/proc/array.c index 9b0315d34c58..d35bbf35a874 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -91,6 +91,7 @@ #include <linux/user_namespace.h> #include <linux/fs_struct.h> #include <linux/kthread.h> +#include <linux/mmu_context.h> #include <asm/processor.h> #include "internal.h" @@ -219,6 +220,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns)); #endif seq_putc(m, '\n'); + + seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0'); } void render_sigset_t(struct seq_file *m, const char *header, @@ -423,6 +426,11 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } +static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm)); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -438,6 +446,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_mem(m, mm); task_core_dumping(m, task); task_thp_status(m, mm); + task_untag_mask(m, mm); mmput(mm); } task_sig(m, task); diff --git a/fs/proc/base.c b/fs/proc/base.c index 5e0e0ccd47aa..05452c3b9872 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -96,6 +96,7 @@ #include <linux/time_namespace.h> #include <linux/resctrl.h> #include <linux/cn_proc.h> +#include <linux/ksm.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -699,7 +700,6 @@ int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return error; setattr_copy(&nop_mnt_idmap, inode, attr); - mark_inode_dirty(inode); return 0; } @@ -3207,6 +3207,8 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); + seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8379593fa4bb..42ae38ff6e7e 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -127,7 +127,6 @@ static int proc_notify_change(struct mnt_idmap *idmap, return error; setattr_copy(&nop_mnt_idmap, inode, iattr); - mark_inode_dirty(inode); proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 71157ee35c1a..25b44b303b35 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -24,7 +24,7 @@ #include <linux/memblock.h> #include <linux/init.h> #include <linux/slab.h> -#include <linux/uaccess.h> +#include <linux/uio.h> #include <asm/io.h> #include <linux/list.h> #include <linux/ioport.h> @@ -307,10 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name, *i = ALIGN(*i + descsz, 4); } -static ssize_t -read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) +static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) { - char *buf = file->private_data; + loff_t *fpos = &iocb->ki_pos; size_t phdrs_offset, notes_offset, data_offset; size_t page_offline_frozen = 1; size_t phdrs_len, notes_len; @@ -318,6 +317,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) size_t tsz; int nphdr; unsigned long start; + size_t buflen = iov_iter_count(iter); size_t orig_buflen = buflen; int ret = 0; @@ -356,12 +356,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) }; tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); - if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) { + if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) { ret = -EFAULT; goto out; } - buffer += tsz; buflen -= tsz; *fpos += tsz; } @@ -398,15 +397,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos); - if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset, - tsz)) { + if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz, + iter) != tsz) { kfree(phdrs); ret = -EFAULT; goto out; } kfree(phdrs); - buffer += tsz; buflen -= tsz; *fpos += tsz; } @@ -448,14 +446,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) min(vmcoreinfo_size, notes_len - i)); tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); - if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) { + if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) { kfree(notes); ret = -EFAULT; goto out; } kfree(notes); - buffer += tsz; buflen -= tsz; *fpos += tsz; } @@ -497,7 +494,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } if (!m) { - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -506,16 +503,33 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) switch (m->type) { case KCORE_VMALLOC: - vread(buf, (char *)start, tsz); - /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; + { + const char *src = (char *)start; + size_t read = 0, left = tsz; + + /* + * vmalloc uses spinlocks, so we optimistically try to + * read memory. If this fails, fault pages in and try + * again until we are done. + */ + while (true) { + read += vread_iter(iter, src, left); + if (read == tsz) + break; + + src += read; + left -= read; + + if (fault_in_iov_iter_writeable(iter, left)) { + ret = -EFAULT; + goto out; + } } break; + } case KCORE_USER: /* User page is handled prior to normal kernel page: */ - if (copy_to_user(buffer, (char *)start, tsz)) { + if (copy_to_iter((char *)start, tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -531,7 +545,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) */ if (!page || PageOffline(page) || is_page_hwpoison(page) || !pfn_is_ram(pfn)) { - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -541,24 +555,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) case KCORE_VMEMMAP: case KCORE_TEXT: /* - * Using bounce buffer to bypass the - * hardened user copy kernel text checks. + * We use _copy_to_iter() to bypass usermode hardening + * which would otherwise prevent this operation. */ - if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; - goto out; - } - } else { - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; - } + if (_copy_to_iter((char *)start, tsz, iter) != tsz) { + ret = -EFAULT; + goto out; } break; default: pr_warn_once("Unhandled KCORE type: %d\n", m->type); - if (clear_user(buffer, tsz)) { + if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; } @@ -566,7 +573,6 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) skip: buflen -= tsz; *fpos += tsz; - buffer += tsz; start += tsz; tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen); } @@ -589,10 +595,6 @@ static int open_kcore(struct inode *inode, struct file *filp) if (ret) return ret; - filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { @@ -603,16 +605,9 @@ static int open_kcore(struct inode *inode, struct file *filp) return 0; } -static int release_kcore(struct inode *inode, struct file *file) -{ - kfree(file->private_data); - return 0; -} - static const struct proc_ops kcore_proc_ops = { - .proc_read = read_kcore, + .proc_read_iter = read_kcore_iter, .proc_open = open_kcore, - .proc_release = release_kcore, .proc_lseek = default_llseek, }; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 440960110a42..b43d0bd42762 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -6,6 +6,7 @@ #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/mmzone.h> +#include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/percpu.h> #include <linux/seq_file.h> @@ -131,6 +132,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); +#ifdef CONFIG_MEMTEST + if (early_memtest_done) { + unsigned long early_memtest_bad_size_kb; + + early_memtest_bad_size_kb = early_memtest_bad_size>>10; + if (early_memtest_bad_size && !early_memtest_bad_size_kb) + early_memtest_bad_size_kb = 1; + /* When 0 is reported, it means there actually was a successful test */ + seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb); + } +#endif + #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)); diff --git a/fs/proc/page.c b/fs/proc/page.c index 6249c347809a..195b077c0fac 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -125,7 +125,7 @@ u64 stable_page_flags(struct page *page) /* * pseudo flags for the well known (anonymous) memory mapped pages * - * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the + * Note that page->_mapcount is overloaded in SLAB, so the * simple test in page_mapped() is not enough. */ if (!PageSlab(page) && page_mapped(page)) @@ -165,9 +165,8 @@ u64 stable_page_flags(struct page *page) /* - * Caveats on high order pages: page->_refcount will only be set - * -1 on the head page; SLUB/SLQB do the same for PG_slab; - * SLOB won't set PG_slab at all on compound pages. + * Caveats on high order pages: PG_buddy and PG_slab will only be set + * on the head page. */ if (PageBuddy(page)) u |= 1 << KPF_BUDDY; @@ -185,7 +184,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - if (PageTail(page) && PageSlab(compound_head(page))) + if (PageTail(page) && PageSlab(page)) u |= 1 << KPF_SLAB; u |= kpf_copy_bit(k, KPF_ERROR, PG_error); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5851eb5bc726..8038833ff5b0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -841,7 +841,6 @@ static int proc_sys_setattr(struct mnt_idmap *idmap, return error; setattr_copy(&nop_mnt_idmap, inode, attr); - mark_inode_dirty(inode); return 0; } @@ -1283,11 +1282,43 @@ out: return err; } +/* Find the directory for the ctl_table. If one is not found create it. */ +static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) +{ + const char *name, *nextname; + + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + + /* + * namelen ensures if name is "foo/bar/yay" only foo is + * registered first. We traverse as if using mkdir -p and + * return a ctl_dir for the last directory entry. + */ + dir = get_subdir(dir, name, namelen); + if (IS_ERR(dir)) + break; + } + return dir; +} + /** * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure + * @table: the top-level table structure without any child. This table + * should not be free'd after registration. So it should not be + * used on stack. It can either be a global or dynamically allocated + * by the caller and free'd later after sysctl unregistration. * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. @@ -1308,9 +1339,12 @@ out: * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines + * XXX: we should eventually modify these to use long min / max [0] + * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org * * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. + * under /proc; non-leaf nodes (where child is not NULL) are not allowed, + * sysctl_check_table() verifies this. * * There must be a proc_handler routine for any terminal nodes. * Several default handlers are available to cover common cases - @@ -1331,7 +1365,6 @@ struct ctl_table_header *__register_sysctl_table( { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; - const char *name, *nextname; struct ctl_dir *dir; struct ctl_table *entry; struct ctl_node *node; @@ -1352,28 +1385,13 @@ struct ctl_table_header *__register_sysctl_table( spin_lock(&sysctl_lock); dir = &set->dir; - /* Reference moved down the diretory tree get_subdir */ + /* Reference moved down the directory tree get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); - /* Find the directory for the ctl_table */ - for (name = path; name; name = nextname) { - int namelen; - nextname = strchr(name, '/'); - if (nextname) { - namelen = nextname - name; - nextname++; - } else { - namelen = strlen(name); - } - if (namelen == 0) - continue; - - dir = get_subdir(dir, name, namelen); - if (IS_ERR(dir)) - goto fail; - } - + dir = sysctl_mkdir_p(dir, path); + if (IS_ERR(dir)) + goto fail; spin_lock(&sysctl_lock); if (insert_header(dir, header)) goto fail_put_dir_locked; @@ -1394,8 +1412,15 @@ fail: /** * register_sysctl - register a sysctl table - * @path: The path to the directory the sysctl table is in. - * @table: the table structure + * @path: The path to the directory the sysctl table is in. If the path + * doesn't exist we will create it for you. + * @table: the table structure. The calller must ensure the life of the @table + * will be kept during the lifetime use of the syctl. It must not be freed + * until unregister_sysctl_table() is called with the given returned table + * with this registration. If your code is non modular then you don't need + * to call unregister_sysctl_table() and can instead use something like + * register_sysctl_init() which does not care for the result of the syctl + * registration. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. @@ -1411,8 +1436,11 @@ EXPORT_SYMBOL(register_sysctl); /** * __register_sysctl_init() - register sysctl table to path - * @path: path name for sysctl base - * @table: This is the sysctl table that needs to be registered to the path + * @path: path name for sysctl base. If that path doesn't exist we will create + * it for you. + * @table: This is the sysctl table that needs to be registered to the path. + * The caller must ensure the life of the @table will be kept during the + * lifetime use of the sysctl. * @table_name: The name of sysctl table, only used for log printing when * registration fails * @@ -1424,10 +1452,7 @@ EXPORT_SYMBOL(register_sysctl); * register_sysctl() failing on init are extremely low, and so for both reasons * this function does not return any error as it is used by initialization code. * - * Context: Can only be called after your respective sysctl base path has been - * registered. So for instance, most base directories are registered early on - * init before init levels are processed through proc_sys_init() and - * sysctl_init_bases(). + * Context: if your base directory does not exist it will be created for you. */ void __init __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name) @@ -1550,24 +1575,18 @@ out: } /** - * __register_sysctl_paths - register a sysctl table hierarchy - * @set: Sysctl tree to register on - * @path: The path to the directory the sysctl table is in. + * register_sysctl_table - register a sysctl table hierarchy * @table: the top-level table structure * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_table for more details. + * We are slowly deprecating this call so avoid its use. */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_set *set, - const struct ctl_path *path, struct ctl_table *table) +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) { struct ctl_table *ctl_table_arg = table; int nr_subheaders = count_subheaders(table); struct ctl_table_header *header = NULL, **subheaders, **subheader; - const struct ctl_path *component; char *new_path, *pos; pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); @@ -1575,11 +1594,6 @@ struct ctl_table_header *__register_sysctl_paths( return NULL; pos[0] = '\0'; - for (component = path; component->procname; component++) { - pos = append_path(new_path, pos, component->procname); - if (!pos) - goto out; - } while (table->procname && table->child && !table[1].procname) { pos = append_path(new_path, pos, table->procname); if (!pos) @@ -1587,7 +1601,7 @@ struct ctl_table_header *__register_sysctl_paths( table = table->child; } if (nr_subheaders == 1) { - header = __register_sysctl_table(set, new_path, table); + header = __register_sysctl_table(&sysctl_table_root.default_set, new_path, table); if (header) header->ctl_table_arg = ctl_table_arg; } else { @@ -1601,7 +1615,7 @@ struct ctl_table_header *__register_sysctl_paths( header->ctl_table_arg = ctl_table_arg; if (register_leaf_sysctl_tables(new_path, pos, &subheader, - set, table)) + &sysctl_table_root.default_set, table)) goto err_register_leaves; } @@ -1620,40 +1634,6 @@ err_register_leaves: header = NULL; goto out; } - -/** - * register_sysctl_paths - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root.default_set, - path, table); -} -EXPORT_SYMBOL(register_sysctl_paths); - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); -} EXPORT_SYMBOL(register_sysctl_table); int __register_sysctl_base(struct ctl_table *base_table) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 4fb8729a68d4..da60956b2915 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -22,30 +22,6 @@ #define arch_irq_stat() 0 #endif -#ifdef arch_idle_time - -u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) -{ - u64 idle; - - idle = kcs->cpustat[CPUTIME_IDLE]; - if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) - idle += arch_idle_time(cpu); - return idle; -} - -static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) -{ - u64 iowait; - - iowait = kcs->cpustat[CPUTIME_IOWAIT]; - if (cpu_online(cpu) && nr_iowait_cpu(cpu)) - iowait += arch_idle_time(cpu); - return iowait; -} - -#else - u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; @@ -78,8 +54,6 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) return iowait; } -#endif - static void show_irq_gap(struct seq_file *p, unsigned int gap) { static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6a96e1713fd5..420510f6a545 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -782,7 +782,6 @@ static void smap_gather_stats(struct vm_area_struct *vma, if (start >= vma->vm_end) return; -#ifdef CONFIG_SHMEM if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -803,7 +802,7 @@ static void smap_gather_stats(struct vm_area_struct *vma, ops = &smaps_shmem_walk_ops; } } -#endif + /* mmap_lock is held in m_start */ if (!start) walk_page_vma(vma, ops, mss); @@ -1689,8 +1688,13 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* watch out for wraparound */ start_vaddr = end_vaddr; - if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) - start_vaddr = untagged_addr(svpfn << PAGE_SHIFT); + if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_free; + start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); + mmap_read_unlock(mm); + } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 12af614f33ce..03f5963914a1 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -339,7 +339,7 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) return acc; } - /* Read Elf note segment */ + /* Read ELF note segment */ if (*fpos < elfcorebuf_sz + elfnotes_sz) { void *kaddr; @@ -1109,7 +1109,7 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { @@ -1152,7 +1152,7 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { @@ -1188,7 +1188,7 @@ static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, loff_t vmcore_off; struct vmcore *m; - /* Skip Elf header, program headers and Elf note segment. */ + /* Skip ELF header, program headers and ELF note segment. */ vmcore_off = elfsz + elfnotes_sz; list_for_each_entry(m, vc_list, list) { @@ -1213,7 +1213,7 @@ static int __init parse_crash_elf64_headers(void) addr = elfcorehdr_addr; - /* Read Elf header */ + /* Read ELF header */ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr); if (rc < 0) return rc; @@ -1269,7 +1269,7 @@ static int __init parse_crash_elf32_headers(void) addr = elfcorehdr_addr; - /* Read Elf header */ + /* Read ELF header */ rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr); if (rc < 0) return rc; @@ -1376,12 +1376,12 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, } /** - * vmcoredd_update_program_headers - Update all Elf program headers + * vmcoredd_update_program_headers - Update all ELF program headers * @elfptr: Pointer to elf header * @elfnotesz: Size of elf notes aligned to page size * @vmcoreddsz: Size of device dumps to be added to elf note header * - * Determine type of Elf header (Elf64 or Elf32) and update the elf note size. + * Determine type of ELF header (Elf64 or Elf32) and update the elf note size. * Also update the offsets of all the program headers after the elf note header. */ static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, @@ -1439,10 +1439,10 @@ static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, /** * vmcoredd_update_size - Update the total size of the device dumps and update - * Elf header + * ELF header * @dump_size: Size of the current device dump to be added to total size * - * Update the total size of all the device dumps and update the Elf program + * Update the total size of all the device dumps and update the ELF program * headers. Calculate the new offsets for the vmcore list and update the * total vmcore size. */ @@ -1466,7 +1466,7 @@ static void vmcoredd_update_size(size_t dump_size) * @data: dump info. * * Allocate a buffer and invoke the calling driver's dump collect routine. - * Write Elf note at the beginning of the buffer to indicate vmcore device + * Write ELF note at the beginning of the buffer to indicate vmcore device * dump and add the dump to global list. */ int vmcore_add_device_dump(struct vmcoredd_data *data) diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index ab82e5f05346..55f139afa327 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -7,10 +7,9 @@ #include <linux/device.h> #include <linux/fs.h> #include <linux/uaccess.h> -#include <linux/rtmutex.h> #include "internal.h" -static DEFINE_RT_MUTEX(pmsg_lock); +static DEFINE_MUTEX(pmsg_lock); static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -29,9 +28,9 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf, if (!access_ok(buf, count)) return -EFAULT; - rt_mutex_lock(&pmsg_lock); + mutex_lock(&pmsg_lock); ret = psinfo->write_user(&record, buf); - rt_mutex_unlock(&pmsg_lock); + mutex_unlock(&pmsg_lock); return ret ? ret : count; } @@ -64,7 +63,7 @@ void pstore_register_pmsg(void) goto err; } - pmsg_class = class_create(THIS_MODULE, PMSG_NAME); + pmsg_class = class_create(PMSG_NAME); if (IS_ERR(pmsg_class)) { pr_err("device class file already in use\n"); goto err_class; diff --git a/fs/qnx4/README b/fs/qnx4/README deleted file mode 100644 index 1f1e320d91da..000000000000 --- a/fs/qnx4/README +++ /dev/null @@ -1,9 +0,0 @@ - - This is a snapshot of the QNX4 filesystem for Linux. - Please send diffs and remarks to <al@alarsen.net> . - -Credits : - -Richard "Scuba" A. Frowijn <scuba@wxs.nl> -Frank "Jedi/Sector One" Denis <j@pureftpd.org> -Anders Larsen <al@alarsen.net> (Maintainer) diff --git a/fs/qnx6/README b/fs/qnx6/README deleted file mode 100644 index 116d622026cc..000000000000 --- a/fs/qnx6/README +++ /dev/null @@ -1,8 +0,0 @@ - - This is a snapshot of the QNX6 filesystem for Linux. - Please send diffs and remarks to <chaosman@ontika.net> . - -Credits : - -Al Viro <viro@ZenIV.linux.org.uk> (endless patience with me & support ;)) -Kai Bankett <chaosman@ontika.net> (Maintainer) diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index d5a85a8062d0..4c925e55dbcd 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -9,7 +9,7 @@ config QUOTA help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). Currently, it works for the - ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems. + ext2, ext3, ext4, f2fs, jfs, ocfs2 and reiserfs file systems. Note that gfs2 and xfs use their own quota system. Ext3, ext4 and reiserfs also support journaled quotas for which you don't need to run quotacheck(8) after an unclean shutdown. @@ -28,7 +28,7 @@ config QUOTA_NETLINK_INTERFACE config PRINT_QUOTA_WARNING bool "Print quota warnings to console (OBSOLETE)" - depends on QUOTA + depends on QUOTA && BROKEN default y help If you say Y here, quota warnings (about exceeding softlimit, reaching diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a6357f728034..ffd40dc3e4e9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2819,7 +2819,6 @@ EXPORT_SYMBOL(dquot_get_state); int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) { struct mem_dqinfo *mi; - int err = 0; if ((ii->i_fieldmask & QC_WARNS_MASK) || (ii->i_fieldmask & QC_RT_SPC_TIMER)) @@ -2846,8 +2845,7 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) spin_unlock(&dq_data_lock); mark_info_dirty(sb, type); /* Force write to disk */ - sb->dq_op->write_info(sb, type); - return err; + return sb->dq_op->write_info(sb, type); } EXPORT_SYMBOL(dquot_set_dqinfo); @@ -2948,24 +2946,6 @@ static struct ctl_table fs_dqstats_table[] = { { }, }; -static struct ctl_table fs_table[] = { - { - .procname = "quota", - .mode = 0555, - .child = fs_dqstats_table, - }, - { }, -}; - -static struct ctl_table sys_table[] = { - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { }, -}; - static int __init dquot_init(void) { int i, ret; @@ -2973,7 +2953,7 @@ static int __init dquot_init(void) printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); - register_sysctl_table(sys_table); + register_sysctl_init("fs/quota", fs_dqstats_table); dquot_cachep = kmem_cache_create("dquot", sizeof(struct dquot), sizeof(unsigned long) * 4, diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index cd92e5fa0062..a0db3f195e95 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -206,7 +206,7 @@ static int v1_write_file_info(struct super_block *sb, int type) sizeof(struct v1_disk_dqblk), v1_dqoff(0)); if (ret == sizeof(struct v1_disk_dqblk)) ret = 0; - else if (ret > 0) + else if (ret >= 0) ret = -EIO; out: up_write(&dqopt->dqio_sem); diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index b1467f3921c2..ae99e7b88205 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -212,7 +212,7 @@ static int v2_write_file_info(struct super_block *sb, int type) up_write(&dqopt->dqio_sem); if (size != sizeof(struct v2_disk_dqinfo)) { quota_error(sb, "Can't write info structure"); - return -1; + return size < 0 ? size : -EIO; } return 0; } diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2f67516bb9bf..9fbb9b5256f7 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* make various checks */ order = get_order(newsize); - if (unlikely(order >= MAX_ORDER)) + if (unlikely(order > MAX_ORDER)) return -EFBIG; ret = inode_newsize_ok(inode, newsize); diff --git a/fs/read_write.c b/fs/read_write.c index 7a2ff6157eda..a21ba3be7dbe 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -749,15 +749,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return -EOPNOTSUPP; while (iov_iter_count(iter)) { - struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; if (type == READ) { - nr = filp->f_op->read(filp, iovec.iov_base, - iovec.iov_len, ppos); + nr = filp->f_op->read(filp, iter_iov_addr(iter), + iter_iov_len(iter), ppos); } else { - nr = filp->f_op->write(filp, iovec.iov_base, - iovec.iov_len, ppos); + nr = filp->f_op->write(filp, iter_iov_addr(iter), + iter_iov_len(iter), ppos); } if (nr < 0) { @@ -766,7 +765,7 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, break; } ret += nr; - if (nr != iovec.iov_len) + if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 467d13da198f..b54cc7048f02 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -261,3 +261,10 @@ const struct inode_operations reiserfs_file_inode_operations = { .fileattr_get = reiserfs_fileattr_get, .fileattr_set = reiserfs_fileattr_set, }; + +const struct inode_operations reiserfs_priv_file_inode_operations = { + .setattr = reiserfs_setattr, + .permission = reiserfs_permission, + .fileattr_get = reiserfs_fileattr_get, + .fileattr_set = reiserfs_fileattr_set, +}; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d54cab854f60..d8debbb6105f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2087,10 +2087,8 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, * Mark it private if we're creating the privroot * or something under it. */ - if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) { - inode->i_flags |= S_PRIVATE; - inode->i_opflags &= ~IOP_XATTR; - } + if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) + reiserfs_init_priv_inode(inode); if (reiserfs_posixacl(inode->i_sb)) { reiserfs_write_unlock(inode->i_sb); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9ce4ec296b74..4d11d60f493c 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -3031,7 +3031,6 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, unsigned int old_trans_id; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_transaction_handle myth; - int sched_count = 0; int retval; int depth; @@ -3088,7 +3087,6 @@ relock: ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75)) { if (atomic_read(&journal->j_wcount) > 10) { - sched_count++; queue_log_writer(sb); goto relock; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 42d2c20e1345..52240cc891cf 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -378,13 +378,11 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, /* * Propagate the private flag so we know we're - * in the priv tree. Also clear IOP_XATTR + * in the priv tree. Also clear xattr support * since we don't have xattrs on xattr files. */ - if (IS_PRIVATE(dir)) { - inode->i_flags |= S_PRIVATE; - inode->i_opflags &= ~IOP_XATTR; - } + if (IS_PRIVATE(dir)) + reiserfs_init_priv_inode(inode); } reiserfs_write_unlock(dir->i_sb); if (retval == IO_ERROR) { @@ -1649,6 +1647,48 @@ static int reiserfs_rename(struct mnt_idmap *idmap, return retval; } +static const struct inode_operations reiserfs_priv_dir_inode_operations = { + .create = reiserfs_create, + .lookup = reiserfs_lookup, + .link = reiserfs_link, + .unlink = reiserfs_unlink, + .symlink = reiserfs_symlink, + .mkdir = reiserfs_mkdir, + .rmdir = reiserfs_rmdir, + .mknod = reiserfs_mknod, + .rename = reiserfs_rename, + .setattr = reiserfs_setattr, + .permission = reiserfs_permission, + .fileattr_get = reiserfs_fileattr_get, + .fileattr_set = reiserfs_fileattr_set, +}; + +static const struct inode_operations reiserfs_priv_symlink_inode_operations = { + .get_link = page_get_link, + .setattr = reiserfs_setattr, + .permission = reiserfs_permission, +}; + +static const struct inode_operations reiserfs_priv_special_inode_operations = { + .setattr = reiserfs_setattr, + .permission = reiserfs_permission, +}; + +void reiserfs_init_priv_inode(struct inode *inode) +{ + inode->i_flags |= S_PRIVATE; + inode->i_opflags &= ~IOP_XATTR; + + if (S_ISREG(inode->i_mode)) + inode->i_op = &reiserfs_priv_file_inode_operations; + else if (S_ISDIR(inode->i_mode)) + inode->i_op = &reiserfs_priv_dir_inode_operations; + else if (S_ISLNK(inode->i_mode)) + inode->i_op = &reiserfs_priv_symlink_inode_operations; + else + inode->i_op = &reiserfs_priv_special_inode_operations; +} + /* directories can handle most operations... */ const struct inode_operations reiserfs_dir_inode_operations = { .create = reiserfs_create, diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 98e6f53c2fe0..1bccf6a2e908 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -3106,6 +3106,7 @@ int reiserfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); /* namei.c */ +void reiserfs_init_priv_inode(struct inode *inode); void set_de_name_and_namelen(struct reiserfs_dir_entry *de); int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, struct treepath *path, struct reiserfs_dir_entry *de); @@ -3175,6 +3176,7 @@ void reiserfs_unmap_buffer(struct buffer_head *); /* file.c */ extern const struct inode_operations reiserfs_file_inode_operations; +extern const struct inode_operations reiserfs_priv_file_inode_operations; extern const struct file_operations reiserfs_file_operations; extern const struct address_space_operations reiserfs_address_space_operations; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 84c12a1947b2..ce5003986789 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1262,7 +1262,6 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, #ifdef CONFIG_REISERFS_CHECK char mode; - int iter = 0; #endif BUG_ON(!th->t_trans_id); @@ -1274,7 +1273,6 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, removed = 0; #ifdef CONFIG_REISERFS_CHECK - iter++; mode = #endif prepare_for_delete_or_cut(th, inode, path, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 06d810c72c52..651027967159 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -52,6 +52,7 @@ #include <linux/quotaops.h> #include <linux/security.h> #include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" @@ -770,23 +771,34 @@ out: (handler) != NULL; \ (handler) = *(handlers)++) +static inline bool reiserfs_posix_acl_list(const char *name, + struct dentry *dentry) +{ + return (posix_acl_type(name) >= 0) && + IS_POSIXACL(d_backing_inode(dentry)); +} + /* This is the implementation for the xattr plugin infrastructure */ -static inline const struct xattr_handler * -find_xattr_handler_prefix(const struct xattr_handler **handlers, - const char *name) +static inline bool reiserfs_xattr_list(const struct xattr_handler **handlers, + const char *name, struct dentry *dentry) { - const struct xattr_handler *xah; + if (handlers) { + const struct xattr_handler *xah = NULL; - if (!handlers) - return NULL; + for_each_xattr_handler(handlers, xah) { + const char *prefix = xattr_prefix(xah); - for_each_xattr_handler(handlers, xah) { - const char *prefix = xattr_prefix(xah); - if (strncmp(prefix, name, strlen(prefix)) == 0) - break; + if (strncmp(prefix, name, strlen(prefix))) + continue; + + if (!xattr_handler_can_list(xah, dentry)) + return false; + + return true; + } } - return xah; + return reiserfs_posix_acl_list(name, dentry); } struct listxattr_buf { @@ -807,12 +819,8 @@ static bool listxattr_filler(struct dir_context *ctx, const char *name, if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { - const struct xattr_handler *handler; - - handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, - name); - if (!handler /* Unsupported xattr name */ || - (handler->list && !handler->list(b->dentry))) + if (!reiserfs_xattr_list(b->dentry->d_sb->s_xattr, name, + b->dentry)) return true; size = namelen + 1; if (b->buf) { @@ -888,8 +896,7 @@ static int create_privroot(struct dentry *dentry) return -EOPNOTSUPP; } - d_inode(dentry)->i_flags |= S_PRIVATE; - d_inode(dentry)->i_opflags &= ~IOP_XATTR; + reiserfs_init_priv_inode(d_inode(dentry)); reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " "storage.\n", PRIVROOT_NAME); @@ -911,10 +918,6 @@ const struct xattr_handler *reiserfs_xattr_handlers[] = { #ifdef CONFIG_REISERFS_FS_SECURITY &reiserfs_xattr_security_handler, #endif -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif NULL }; @@ -975,10 +978,8 @@ int reiserfs_lookup_privroot(struct super_block *s) if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; d_set_d_op(dentry, &xattr_lookup_poison_ops); - if (d_really_is_positive(dentry)) { - d_inode(dentry)->i_flags |= S_PRIVATE; - d_inode(dentry)->i_opflags &= ~IOP_XATTR; - } + if (d_really_is_positive(dentry)) + reiserfs_init_priv_inode(d_inode(dentry)); } else err = PTR_ERR(dentry); inode_unlock(d_inode(s->s_root)); diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 41c0ea84fbff..6e0a099dd788 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -39,6 +39,22 @@ static bool security_list(struct dentry *dentry) return !IS_PRIVATE(d_inode(dentry)); } +static int +reiserfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + struct reiserfs_security_handle *sec = fs_info; + + sec->value = kmemdup(xattr_array->value, xattr_array->value_len, + GFP_KERNEL); + if (!sec->value) + return -ENOMEM; + + sec->name = xattr_array->name; + sec->length = xattr_array->value_len; + return 0; +} + /* Initializes the security context for a new inode and returns the number * of blocks needed for the transaction. If successful, reiserfs_security * must be released using reiserfs_security_free when the caller is done. */ @@ -56,12 +72,9 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, if (IS_PRIVATE(dir)) return 0; - error = security_old_inode_init_security(inode, dir, qstr, &sec->name, - &sec->value, &sec->length); + error = security_inode_init_security(inode, dir, qstr, + &reiserfs_initxattrs, sec); if (error) { - if (error == -EOPNOTSUPP) - error = 0; - sec->name = NULL; sec->value = NULL; sec->length = 0; @@ -82,11 +95,15 @@ int reiserfs_security_write(struct reiserfs_transaction_handle *th, struct inode *inode, struct reiserfs_security_handle *sec) { + char xattr_name[XATTR_NAME_MAX + 1] = XATTR_SECURITY_PREFIX; int error; - if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX)) + + if (XATTR_SECURITY_PREFIX_LEN + strlen(sec->name) > XATTR_NAME_MAX) return -EINVAL; - error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value, + strlcat(xattr_name, sec->name, sizeof(xattr_name)); + + error = reiserfs_xattr_set_handle(th, inode, xattr_name, sec->value, sec->length, XATTR_CREATE); if (error == -ENODATA || error == -EOPNOTSUPP) error = 0; diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index ace133cf6072..bae590eec871 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -327,17 +327,18 @@ struct smb2_tree_connect_req { #define SMB2_SHAREFLAG_NO_CACHING 0x00000030 #define SHI1005_FLAGS_DFS 0x00000001 #define SHI1005_FLAGS_DFS_ROOT 0x00000002 -#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100 -#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200 -#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 -#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 -#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 -#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 -#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 +#define SMB2_SHAREFLAG_RESTRICT_EXCLUSIVE_OPENS 0x00000100 +#define SMB2_SHAREFLAG_FORCE_SHARED_DELETE 0x00000200 +#define SMB2_SHAREFLAG_ALLOW_NAMESPACE_CACHING 0x00000400 +#define SMB2_SHAREFLAG_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 +#define SMB2_SHAREFLAG_FORCE_LEVELII_OPLOCK 0x00001000 +#define SMB2_SHAREFLAG_ENABLE_HASH_V1 0x00002000 +#define SMB2_SHAREFLAG_ENABLE_HASH_V2 0x00004000 #define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 #define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ #define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ -#define SHI1005_FLAGS_ALL 0x0014FF33 +#define SMB2_SHAREFLAG_ISOLATED_TRANSPORT 0x00200000 +#define SHI1005_FLAGS_ALL 0x0034FF33 /* Possible share capabilities */ #define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ @@ -1171,6 +1172,34 @@ struct create_posix { __u32 Reserved; } __packed; +/* See MS-SMB2 2.2.13.2.3 and MS-SMB2 2.2.13.2.4 */ +struct create_durable { + struct create_context ccontext; + __u8 Name[8]; + union { + __u8 Reserved[16]; + struct { + __u64 PersistentFileId; + __u64 VolatileFileId; + } Fid; + } Data; +} __packed; + +/* See MS-SMB2 2.2.13.2.5 */ +struct create_mxac_req { + struct create_context ccontext; + __u8 Name[8]; + __le64 Timestamp; +} __packed; + +/* See MS-SMB2 2.2.14.2.5 */ +struct create_mxac_rsp { + struct create_context ccontext; + __u8 Name[8]; + __le32 QueryStatus; + __le32 MaximalAccess; +} __packed; + #define SMB2_LEASE_NONE_LE cpu_to_le32(0x00) #define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01) #define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02) @@ -1180,6 +1209,7 @@ struct create_posix { #define SMB2_LEASE_KEY_SIZE 16 +/* See MS-SMB2 2.2.13.2.8 */ struct lease_context { __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; __le32 LeaseState; @@ -1187,6 +1217,7 @@ struct lease_context { __le64 LeaseDuration; } __packed; +/* See MS-SMB2 2.2.13.2.10 */ struct lease_context_v2 { __u8 LeaseKey[SMB2_LEASE_KEY_SIZE]; __le32 LeaseState; @@ -1210,6 +1241,35 @@ struct create_lease_v2 { __u8 Pad[4]; } __packed; +/* See MS-SMB2 2.2.14.2.9 */ +struct create_disk_id_rsp { + struct create_context ccontext; + __u8 Name[8]; + __le64 DiskFileId; + __le64 VolumeId; + __u8 Reserved[16]; +} __packed; + +/* See MS-SMB2 2.2.13.2.13 */ +struct create_app_inst_id { + struct create_context ccontext; + __u8 Name[16]; + __le32 StructureSize; /* Must be 20 */ + __u16 Reserved; + __u8 AppInstanceId[16]; +} __packed; + +/* See MS-SMB2 2.2.13.2.15 */ +struct create_app_inst_id_vers { + struct create_context ccontext; + __u8 Name[16]; + __le32 StructureSize; /* Must be 24 */ + __u16 Reserved; + __u32 Padding; + __le64 AppInstanceVersionHigh; + __le64 AppInstanceVersionLow; +} __packed; + /* See MS-SMB2 2.2.31 and 2.2.32 */ struct smb2_ioctl_req { struct smb2_hdr hdr; diff --git a/fs/splice.c b/fs/splice.c index 2c3dec2b6dfa..3e06611d19ae 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -30,6 +30,7 @@ #include <linux/export.h> #include <linux/syscalls.h> #include <linux/uio.h> +#include <linux/fsnotify.h> #include <linux/security.h> #include <linux/gfp.h> #include <linux/socket.h> @@ -38,6 +39,22 @@ #include "internal.h" /* + * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to + * indicate they support non-blocking reads or writes, we must clear it + * here if set to avoid blocking other users of this pipe if splice is + * being done on it. + */ +static noinline void noinline pipe_clear_nowait(struct file *file) +{ + fmode_t fmode = READ_ONCE(file->f_mode); + + do { + if (!(fmode & FMODE_NOWAIT)) + break; + } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT)); +} + +/* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the * addition of remove_mapping(). If success is returned, the caller may @@ -1165,6 +1182,9 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); + if (ret > 0) + fsnotify_modify(out); + if (!off_out) out->f_pos = offset; else @@ -1188,6 +1208,10 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, flags |= SPLICE_F_NONBLOCK; ret = splice_file_to_pipe(in, opipe, &offset, len, flags); + + if (ret > 0) + fsnotify_access(in); + if (!off_in) in->f_pos = offset; else @@ -1211,10 +1235,16 @@ static long __do_splice(struct file *in, loff_t __user *off_in, ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); - if (ipipe && off_in) - return -ESPIPE; - if (opipe && off_out) - return -ESPIPE; + if (ipipe) { + if (off_in) + return -ESPIPE; + pipe_clear_nowait(in); + } + if (opipe) { + if (off_out) + return -ESPIPE; + pipe_clear_nowait(out); + } if (off_out) { if (copy_from_user(&offset, off_out, sizeof(loff_t))) @@ -1311,6 +1341,8 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, if (!pipe) return -EBADF; + pipe_clear_nowait(file); + if (sd.total_len) { pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, pipe_to_user); @@ -1339,6 +1371,8 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, if (!pipe) return -EBADF; + pipe_clear_nowait(file); + pipe_lock(pipe); ret = wait_for_space(pipe, flags); if (!ret) diff --git a/fs/super.c b/fs/super.c index 04bc62ab7dfe..34afe411cf2b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 999bceb99974..cdb3d632c63d 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -28,12 +28,6 @@ const struct file_operations sysv_dir_operations = { .fsync = generic_file_fsync, }; -inline void dir_put_page(struct page *page, void *page_addr) -{ - kunmap_local((void *)((unsigned long)page_addr & PAGE_MASK)); - put_page(page); -} - static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; @@ -58,7 +52,7 @@ static int sysv_handle_dirsync(struct inode *dir) } /* - * Calls to dir_get_page()/dir_put_page() must be nested according to the + * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the * rules documented in mm/highmem.rst. * * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page() @@ -109,11 +103,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN)) { - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); return 0; } } - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); } return 0; } @@ -137,7 +131,7 @@ static inline int namecompare(int len, int maxlen, * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * - * On Success dir_put_page() should be called on *res_page. + * On Success put_and_unmap_page() should be called on *res_page. * * sysv_find_entry() acts as a call to dir_get_page() and must be treated * accordingly for nesting purposes. @@ -172,7 +166,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ name, de->name)) goto found; } - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); } if (++n >= npages) @@ -215,7 +209,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) goto out_page; de++; } - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); } BUG(); return -EINVAL; @@ -234,7 +228,7 @@ got_it: mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); out_page: - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); return err; out_unlock: unlock_page(page); @@ -327,12 +321,12 @@ int sysv_empty_dir(struct inode * inode) if (de->name[1] != '.' || de->name[2]) goto not_empty; } - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); } return 1; not_empty: - dir_put_page(page, kaddr); + put_and_unmap_page(page, kaddr); return 0; } @@ -358,7 +352,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, } /* - * Calls to dir_get_page()/dir_put_page() must be nested according to the + * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the * rules documented in mm/highmem.rst. * * sysv_dotdot() acts as a call to dir_get_page() and must be treated @@ -382,7 +376,7 @@ ino_t sysv_inode_by_name(struct dentry *dentry) if (de) { res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - dir_put_page(page, de); + put_and_unmap_page(page, de); } return res; } diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index a25862773d82..2b2dba4c4f56 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -164,7 +164,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry) inode->i_ctime = dir->i_ctime; inode_dec_link_count(inode); } - dir_put_page(page, de); + put_and_unmap_page(page, de); return err; } @@ -227,7 +227,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!new_de) goto out_dir; err = sysv_set_link(new_de, new_page, old_inode); - dir_put_page(new_page, new_de); + put_and_unmap_page(new_page, new_de); if (err) goto out_dir; new_inode->i_ctime = current_time(new_inode); @@ -256,9 +256,9 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, out_dir: if (dir_de) - dir_put_page(dir_page, dir_de); + put_and_unmap_page(dir_page, dir_de); out_old: - dir_put_page(old_page, old_de); + put_and_unmap_page(old_page, old_de); out: return err; } diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index f2c36ea42df6..e3f988b469ee 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -148,7 +148,6 @@ extern void sysv_destroy_icache(void); /* dir.c */ -extern void dir_put_page(struct page *page, void *vaddr); extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **); extern int sysv_add_link(struct dentry *, struct inode *); extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *); diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 3a92e6af69b2..75461777c466 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -217,7 +217,6 @@ static void compr_exit(struct ubifs_compressor *compr) { if (compr->capi_name) crypto_free_comp(compr->cc); - return; } /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 1505539f6fe9..ef0499edc248 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -358,7 +358,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) umode_t mode = S_IFCHR | WHITEOUT_MODE; struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - struct fscrypt_name nm; /* * Create an inode('nlink = 1') for whiteout without updating journal, @@ -369,10 +368,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) dbg_gen("dent '%pd', mode %#hx in dir ino %lu", dentry, mode, dir->i_ino); - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); - if (err) - return ERR_PTR(err); - inode = ubifs_new_inode(c, dir, mode, false); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -395,7 +390,6 @@ out_inode: make_bad_inode(inode); iput(inode); out_free: - fscrypt_free_filename(&nm); ubifs_err(c, "cannot create whiteout file, error %d", err); return ERR_PTR(err); } @@ -492,6 +486,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); + fscrypt_free_filename(&nm); return finish_open_simple(file, 0); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 2469f72eeaab..6b7d95b65f4b 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -44,6 +44,33 @@ enum { NOT_ON_MEDIA = 3, }; +static void do_insert_old_idx(struct ubifs_info *c, + struct ubifs_old_idx *old_idx) +{ + struct ubifs_old_idx *o; + struct rb_node **p, *parent = NULL; + + p = &c->old_idx.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_old_idx, rb); + if (old_idx->lnum < o->lnum) + p = &(*p)->rb_left; + else if (old_idx->lnum > o->lnum) + p = &(*p)->rb_right; + else if (old_idx->offs < o->offs) + p = &(*p)->rb_left; + else if (old_idx->offs > o->offs) + p = &(*p)->rb_right; + else { + ubifs_err(c, "old idx added twice!"); + kfree(old_idx); + } + } + rb_link_node(&old_idx->rb, parent, p); + rb_insert_color(&old_idx->rb, &c->old_idx); +} + /** * insert_old_idx - record an index node obsoleted since the last commit start. * @c: UBIFS file-system description object @@ -69,35 +96,15 @@ enum { */ static int insert_old_idx(struct ubifs_info *c, int lnum, int offs) { - struct ubifs_old_idx *old_idx, *o; - struct rb_node **p, *parent = NULL; + struct ubifs_old_idx *old_idx; old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); if (unlikely(!old_idx)) return -ENOMEM; old_idx->lnum = lnum; old_idx->offs = offs; + do_insert_old_idx(c, old_idx); - p = &c->old_idx.rb_node; - while (*p) { - parent = *p; - o = rb_entry(parent, struct ubifs_old_idx, rb); - if (lnum < o->lnum) - p = &(*p)->rb_left; - else if (lnum > o->lnum) - p = &(*p)->rb_right; - else if (offs < o->offs) - p = &(*p)->rb_left; - else if (offs > o->offs) - p = &(*p)->rb_right; - else { - ubifs_err(c, "old idx added twice!"); - kfree(old_idx); - return 0; - } - } - rb_link_node(&old_idx->rb, parent, p); - rb_insert_color(&old_idx->rb, &c->old_idx); return 0; } @@ -199,23 +206,6 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c, __set_bit(DIRTY_ZNODE, &zn->flags); __clear_bit(COW_ZNODE, &zn->flags); - ubifs_assert(c, !ubifs_zn_obsolete(znode)); - __set_bit(OBSOLETE_ZNODE, &znode->flags); - - if (znode->level != 0) { - int i; - const int n = zn->child_cnt; - - /* The children now have new parent */ - for (i = 0; i < n; i++) { - struct ubifs_zbranch *zbr = &zn->zbranch[i]; - - if (zbr->znode) - zbr->znode->parent = zn; - } - } - - atomic_long_inc(&c->dirty_zn_cnt); return zn; } @@ -234,6 +224,42 @@ static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt) } /** + * replace_znode - replace old znode with new znode. + * @c: UBIFS file-system description object + * @new_zn: new znode + * @old_zn: old znode + * @zbr: the branch of parent znode + * + * Replace old znode with new znode in TNC. + */ +static void replace_znode(struct ubifs_info *c, struct ubifs_znode *new_zn, + struct ubifs_znode *old_zn, struct ubifs_zbranch *zbr) +{ + ubifs_assert(c, !ubifs_zn_obsolete(old_zn)); + __set_bit(OBSOLETE_ZNODE, &old_zn->flags); + + if (old_zn->level != 0) { + int i; + const int n = new_zn->child_cnt; + + /* The children now have new parent */ + for (i = 0; i < n; i++) { + struct ubifs_zbranch *child = &new_zn->zbranch[i]; + + if (child->znode) + child->znode->parent = new_zn; + } + } + + zbr->znode = new_zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + + atomic_long_inc(&c->dirty_zn_cnt); +} + +/** * dirty_cow_znode - ensure a znode is not being committed. * @c: UBIFS file-system description object * @zbr: branch of znode to check @@ -265,28 +291,32 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, return zn; if (zbr->len) { - err = insert_old_idx(c, zbr->lnum, zbr->offs); - if (unlikely(err)) - /* - * Obsolete znodes will be freed by tnc_destroy_cnext() - * or free_obsolete_znodes(), copied up znodes should - * be added back to tnc and freed by - * ubifs_destroy_tnc_subtree(). - */ + struct ubifs_old_idx *old_idx; + + old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); + if (unlikely(!old_idx)) { + err = -ENOMEM; goto out; + } + old_idx->lnum = zbr->lnum; + old_idx->offs = zbr->offs; + err = add_idx_dirt(c, zbr->lnum, zbr->len); - } else - err = 0; + if (err) { + kfree(old_idx); + goto out; + } -out: - zbr->znode = zn; - zbr->lnum = 0; - zbr->offs = 0; - zbr->len = 0; + do_insert_old_idx(c, old_idx); + } + + replace_znode(c, zn, znode, zbr); - if (unlikely(err)) - return ERR_PTR(err); return zn; + +out: + kfree(zn); + return ERR_PTR(err); } /** diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 2210e5eb1ea0..1e71e04ae8f6 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -188,14 +188,14 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) static int udf_adinicb_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); - BUG_ON(!PageLocked(page)); - memcpy_from_page(iinfo->i_data + iinfo->i_lenEAttr, page, 0, + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio->index != 0); + memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0, i_size_read(inode)); - unlock_page(page); + folio_unlock(folio); mark_inode_dirty(inode); return 0; diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 871856c69df5..2e7ba234bab8 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -127,7 +127,7 @@ void udf_discard_prealloc(struct inode *inode) uint64_t lbcount = 0; int8_t etype = -1; struct udf_inode_info *iinfo = UDF_I(inode); - int bsize = 1 << inode->i_blkbits; + int bsize = i_blocksize(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize)) @@ -149,7 +149,7 @@ void udf_discard_prealloc(struct inode *inode) lbcount -= elen; udf_delete_aext(inode, prev_epos); udf_free_blocks(inode->i_sb, inode, &eloc, 0, - DIV_ROUND_UP(elen, 1 << inode->i_blkbits)); + DIV_ROUND_UP(elen, bsize)); } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 391efaf1d528..379d75796a5c 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -42,11 +42,10 @@ static inline int ufs_match(struct super_block *sb, int len, return !memcmp(name, de->d_name, len); } -static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - int err = 0; inode_inc_iversion(dir); block_write_end(NULL, mapping, pos, len, len, page, NULL); @@ -54,10 +53,16 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) i_size_write(dir, pos+len); mark_inode_dirty(dir); } - if (IS_DIRSYNC(dir)) - err = write_one_page(page); - else - unlock_page(page); + unlock_page(page); +} + +static int ufs_handle_dirsync(struct inode *dir) +{ + int err; + + err = filemap_write_and_wait(dir->i_mapping); + if (!err) + err = sync_inode_metadata(dir, 1); return err; } @@ -99,11 +104,12 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); ufs_set_de_type(dir->i_sb, de, inode->i_mode); - err = ufs_commit_chunk(page, pos, len); + ufs_commit_chunk(page, pos, len); ufs_put_page(page); if (update_times) dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); + ufs_handle_dirsync(dir); } @@ -390,10 +396,11 @@ got_it: de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); - err = ufs_commit_chunk(page, pos, rec_len); + ufs_commit_chunk(page, pos, rec_len); dir->i_mtime = dir->i_ctime = current_time(dir); mark_inode_dirty(dir); + err = ufs_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: ufs_put_page(page); @@ -531,9 +538,10 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, if (pde) pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; - err = ufs_commit_chunk(page, pos, to - from); + ufs_commit_chunk(page, pos, to - from); inode->i_ctime = inode->i_mtime = current_time(inode); mark_inode_dirty(inode); + err = ufs_handle_dirsync(inode); out: ufs_put_page(page); UFSD("EXIT\n"); @@ -579,7 +587,8 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) strcpy (de->d_name, ".."); kunmap(page); - err = ufs_commit_chunk(page, 0, chunk_size); + ufs_commit_chunk(page, 0, chunk_size); + err = ufs_handle_dirsync(inode); fail: put_page(page); return err; diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c index 67aaadc3ab07..8395066341a4 100644 --- a/fs/unicode/utf8-core.c +++ b/fs/unicode/utf8-core.c @@ -214,4 +214,3 @@ void utf8_unload(struct unicode_map *um) } EXPORT_SYMBOL(utf8_unload); -MODULE_LICENSE("GPL v2"); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 40f9e1a2ebdd..0fd96d6e39ce 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -32,7 +32,22 @@ #include <linux/swapops.h> #include <linux/miscdevice.h> -int sysctl_unprivileged_userfaultfd __read_mostly; +static int sysctl_unprivileged_userfaultfd __read_mostly; + +#ifdef CONFIG_SYSCTL +static struct ctl_table vm_userfaultfd_table[] = { + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { } +}; +#endif static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; @@ -108,6 +123,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +/* + * Whether WP_UNPOPULATED is enabled on the uffd context. It is only + * meaningful when userfaultfd_wp()==true on the vma and when it's + * anonymous. + */ +bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx) + return false; + + return ctx->features & UFFD_FEATURE_WP_UNPOPULATED; +} + static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t flags) { @@ -1629,7 +1659,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, /* Reset ptes for the whole vma range if wr-protected */ if (userfaultfd_wp(vma)) - uffd_wp_range(mm, vma, start, vma_end - start, false); + uffd_wp_range(vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, @@ -1714,6 +1744,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, struct uffdio_copy uffdio_copy; struct uffdio_copy __user *user_uffdio_copy; struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; user_uffdio_copy = (struct uffdio_copy __user *) arg; @@ -1740,10 +1771,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; + if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing, - uffdio_copy.mode); + ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len, &ctx->mmap_changing, + flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1793,9 +1826,9 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { - ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1875,6 +1908,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; struct userfaultfd_wake_range range; + uffd_flags_t flags = 0; user_uffdio_continue = (struct uffdio_continue __user *)arg; @@ -1899,13 +1933,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) uffdio_continue.range.start) { goto out; } - if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | + UFFDIO_CONTINUE_MODE_WP)) goto out; + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mcopy_continue(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len, + &ctx->mmap_changing, flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1973,6 +2010,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #endif #ifndef CONFIG_PTE_MARKER_UFFD_WP uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; @@ -2180,6 +2218,9 @@ static int __init userfaultfd_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, init_once_userfaultfd_ctx); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_userfaultfd_table); +#endif return 0; } __initcall(userfaultfd_init); diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 7a0e3a84d370..fc4c50e5219d 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -13,6 +13,7 @@ struct block_buffer { u32 filled; + bool is_root_hash; u8 *data; }; @@ -24,6 +25,14 @@ static int hash_one_block(struct inode *inode, struct block_buffer *next = cur + 1; int err; + /* + * Safety check to prevent a buffer overflow in case of a filesystem bug + * that allows the file size to change despite deny_write_access(), or a + * bug in the Merkle tree logic itself + */ + if (WARN_ON_ONCE(next->is_root_hash && next->filled != 0)) + return -EINVAL; + /* Zero-pad the block if it's shorter than the block size. */ memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); @@ -97,6 +106,7 @@ static int build_merkle_tree(struct file *filp, } } buffers[num_levels].data = root_hash; + buffers[num_levels].is_root_hash = true; BUILD_BUG_ON(sizeof(level_offset) != sizeof(params->level_start)); memcpy(level_offset, params->level_start, sizeof(level_offset)); @@ -165,7 +175,7 @@ static int build_merkle_tree(struct file *filp, } } /* The root hash was filled by the last call to hash_one_block(). */ - if (WARN_ON(buffers[num_levels].filled != params->digest_size)) { + if (WARN_ON_ONCE(buffers[num_levels].filled != params->digest_size)) { err = -EINVAL; goto out; } @@ -277,7 +287,7 @@ static int enable_verity(struct file *filp, fsverity_err(inode, "%ps() failed with err %d", vops->end_enable_verity, err); fsverity_free_info(vi); - } else if (WARN_ON(!IS_VERITY(inode))) { + } else if (WARN_ON_ONCE(!IS_VERITY(inode))) { err = -EINVAL; fsverity_free_info(vi); } else { @@ -347,6 +357,13 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) err = file_permission(filp, MAY_WRITE); if (err) return err; + /* + * __kernel_read() is used while building the Merkle tree. So, we can't + * allow file descriptors that were opened for ioctl access only, using + * the special nonstandard access mode 3. O_RDONLY only, please! + */ + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; if (IS_APPEND(inode)) return -EPERM; diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 13fcf31be844..ea00dbedf756 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -84,9 +84,9 @@ struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, } err = -EINVAL; - if (WARN_ON(alg->digest_size != crypto_ahash_digestsize(tfm))) + if (WARN_ON_ONCE(alg->digest_size != crypto_ahash_digestsize(tfm))) goto err_free_tfm; - if (WARN_ON(alg->block_size != crypto_ahash_blocksize(tfm))) + if (WARN_ON_ONCE(alg->block_size != crypto_ahash_blocksize(tfm))) goto err_free_tfm; err = mempool_init_kmalloc_pool(&alg->req_pool, 1, diff --git a/fs/verity/open.c b/fs/verity/open.c index 9366b441d01c..52048b7630dc 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -83,7 +83,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, params->log_blocks_per_page = PAGE_SHIFT - log_blocksize; params->blocks_per_page = 1 << params->log_blocks_per_page; - if (WARN_ON(!is_power_of_2(params->digest_size))) { + if (WARN_ON_ONCE(!is_power_of_2(params->digest_size))) { err = -EINVAL; goto out_err; } diff --git a/fs/verity/signature.c b/fs/verity/signature.c index e7d3ca919a1e..b8c51ad40d3a 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -88,12 +88,6 @@ int fsverity_verify_signature(const struct fsverity_info *vi, #ifdef CONFIG_SYSCTL static struct ctl_table_header *fsverity_sysctl_header; -static const struct ctl_path fsverity_sysctl_path[] = { - { .procname = "fs", }, - { .procname = "verity", }, - { } -}; - static struct ctl_table fsverity_sysctl_table[] = { { .procname = "require_signatures", @@ -109,8 +103,7 @@ static struct ctl_table fsverity_sysctl_table[] = { static int __init fsverity_sysctl_init(void) { - fsverity_sysctl_header = register_sysctl_paths(fsverity_sysctl_path, - fsverity_sysctl_table); + fsverity_sysctl_header = register_sysctl("fs/verity", fsverity_sysctl_table); if (!fsverity_sysctl_header) { pr_err("sysctl registration failed!\n"); return -ENOMEM; diff --git a/fs/xattr.c b/fs/xattr.c index 14a7eb3c8fa8..fcf67d80d7f9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -160,11 +160,10 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode, * Look for any handler that deals with the specified namespace. */ int -xattr_supported_namespace(struct inode *inode, const char *prefix) +xattr_supports_user_prefix(struct inode *inode) { const struct xattr_handler **handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; - size_t preflen; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) @@ -172,16 +171,15 @@ xattr_supported_namespace(struct inode *inode, const char *prefix) return -EOPNOTSUPP; } - preflen = strlen(prefix); - for_each_xattr_handler(handlers, handler) { - if (!strncmp(xattr_prefix(handler), prefix, preflen)) + if (!strncmp(xattr_prefix(handler), XATTR_USER_PREFIX, + XATTR_USER_PREFIX_LEN)) return 0; } return -EOPNOTSUPP; } -EXPORT_SYMBOL(xattr_supported_namespace); +EXPORT_SYMBOL(xattr_supports_user_prefix); int __vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, @@ -460,6 +458,28 @@ nolsm: } EXPORT_SYMBOL_GPL(vfs_getxattr); +/** + * vfs_listxattr - retrieve \0 separated list of xattr names + * @dentry: the dentry from whose inode the xattr names are retrieved + * @list: buffer to store xattr names into + * @size: size of the buffer + * + * This function returns the names of all xattrs associated with the + * inode of @dentry. + * + * Note, for legacy reasons the vfs_listxattr() function lists POSIX + * ACLs as well. Since POSIX ACLs are decoupled from IOP_XATTR the + * vfs_listxattr() function doesn't check for this flag since a + * filesystem could implement POSIX ACLs without implementing any other + * xattrs. + * + * However, since all codepaths that remove IOP_XATTR also assign of + * inode operations that either don't implement or implement a stub + * ->listxattr() operation. + * + * Return: On success, the size of the buffer that was used. On error a + * negative error code. + */ ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) { @@ -469,7 +489,8 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size) error = security_inode_listxattr(dentry); if (error) return error; - if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) { + + if (inode->i_op->listxattr) { error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); @@ -949,6 +970,21 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) return error; } +int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) +{ + size_t len; + + len = strlen(name) + 1; + if (*buffer) { + if (*remaining_size < len) + return -ERANGE; + memcpy(*buffer, name, len); + *buffer += len; + } + *remaining_size -= len; + return 0; +} + /* * Combine the results of the list() operation from every xattr_handler in the * list. @@ -957,33 +993,22 @@ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; - unsigned int size = 0; - - if (!buffer) { - for_each_xattr_handler(handlers, handler) { - if (!handler->name || - (handler->list && !handler->list(dentry))) - continue; - size += strlen(handler->name) + 1; - } - } else { - char *buf = buffer; - size_t len; - - for_each_xattr_handler(handlers, handler) { - if (!handler->name || - (handler->list && !handler->list(dentry))) - continue; - len = strlen(handler->name); - if (len + 1 > buffer_size) - return -ERANGE; - memcpy(buf, handler->name, len + 1); - buf += len + 1; - buffer_size -= len + 1; - } - size = buf - buffer; + ssize_t remaining_size = buffer_size; + int err = 0; + + err = posix_acl_listxattr(d_inode(dentry), &buffer, &remaining_size); + if (err) + return err; + + for_each_xattr_handler(handlers, handler) { + if (!handler->name || (handler->list && !handler->list(dentry))) + continue; + err = xattr_list_one(&buffer, &remaining_size, handler->name); + if (err) + return err; } - return size; + + return err ? err : buffer_size - remaining_size; } EXPORT_SYMBOL(generic_listxattr); @@ -1245,20 +1270,6 @@ static bool xattr_is_trusted(const char *name) return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } -static int xattr_list_one(char **buffer, ssize_t *remaining_size, - const char *name) -{ - size_t len = strlen(name) + 1; - if (*buffer) { - if (*remaining_size < len) - return -ERANGE; - memcpy(*buffer, name, len); - *buffer += len; - } - *remaining_size -= len; - return 0; -} - /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs @@ -1287,22 +1298,9 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, ssize_t remaining_size = size; int err = 0; -#ifdef CONFIG_FS_POSIX_ACL - if (IS_POSIXACL(inode)) { - if (inode->i_acl) { - err = xattr_list_one(&buffer, &remaining_size, - XATTR_NAME_POSIX_ACL_ACCESS); - if (err) - return err; - } - if (inode->i_default_acl) { - err = xattr_list_one(&buffer, &remaining_size, - XATTR_NAME_POSIX_ACL_DEFAULT); - if (err) - return err; - } - } -#endif + err = posix_acl_listxattr(inode, &buffer, &remaining_size); + if (err) + return err; read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 9fac5ea8d0e4..52e1823241fb 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -47,6 +47,33 @@ config XFS_SUPPORT_V4 To continue supporting the old V4 format (crc=0), say Y. To close off an attack surface, say N. +config XFS_SUPPORT_ASCII_CI + bool "Support deprecated case-insensitive ascii (ascii-ci=1) format" + depends on XFS_FS + default y + help + The ASCII case insensitivity filesystem feature only works correctly + on systems that have been coerced into using ISO 8859-1, and it does + not work on extended attributes. The kernel has no visibility into + the locale settings in userspace, so it corrupts UTF-8 names. + Enabling this feature makes XFS vulnerable to mixed case sensitivity + attacks. Because of this, the feature is deprecated. All users + should upgrade by backing up their files, reformatting, and restoring + from the backup. + + Administrators and users can detect such a filesystem by running + xfs_info against a filesystem mountpoint and checking for a string + beginning with "ascii-ci=". If the string "ascii-ci=1" is found, the + filesystem is a case-insensitive filesystem. If no such string is + found, please upgrade xfsprogs to the latest version and try again. + + This option will become default N in September 2025. Support for the + feature will be removed entirely in September 2030. Distributors + can say N here to withdraw support earlier. + + To continue supporting case-insensitivity (ascii-ci=1), say Y. + To close off an attack surface, say N. + config XFS_QUOTA bool "XFS Quota support" depends on XFS_FS @@ -93,10 +120,15 @@ config XFS_RT If unsure, say N. +config XFS_DRAIN_INTENTS + bool + select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL + config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n depends on XFS_FS + select XFS_DRAIN_INTENTS help If you say Y here you will be able to check metadata on a mounted XFS filesystem. This feature is intended to reduce diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 92d88dc3c9f7..16e4eb431230 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -136,6 +136,8 @@ ifeq ($(CONFIG_MEMORY_FAILURE),y) xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o endif +xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o + # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) @@ -146,6 +148,7 @@ xfs-y += $(addprefix scrub/, \ agheader.o \ alloc.o \ attr.o \ + bitmap.o \ bmap.o \ btree.o \ common.o \ @@ -156,6 +159,7 @@ xfs-y += $(addprefix scrub/, \ ialloc.o \ inode.o \ parent.o \ + readdir.o \ refcount.o \ rmap.o \ scrub.o \ @@ -169,7 +173,6 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ - bitmap.o \ repair.o \ ) endif diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 86696a1c6891..9b373a0c7aaf 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -81,6 +81,19 @@ xfs_perag_get_tag( return pag; } +/* Get a passive reference to the given perag. */ +struct xfs_perag * +xfs_perag_hold( + struct xfs_perag *pag) +{ + ASSERT(atomic_read(&pag->pag_ref) > 0 || + atomic_read(&pag->pag_active_ref) > 0); + + trace_xfs_perag_hold(pag, _RET_IP_); + atomic_inc(&pag->pag_ref); + return pag; +} + void xfs_perag_put( struct xfs_perag *pag) @@ -247,6 +260,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); + xfs_defer_drain_free(&pag->pag_intents_drain); cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_buf_hash_destroy(pag); @@ -372,6 +386,7 @@ xfs_initialize_perag( spin_lock_init(&pag->pag_state_lock); INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + xfs_defer_drain_init(&pag->pag_intents_drain); init_waitqueue_head(&pag->pagb_wait); init_waitqueue_head(&pag->pag_active_wq); pag->pagb_count = 0; @@ -408,6 +423,7 @@ xfs_initialize_perag( return 0; out_remove_pag: + xfs_defer_drain_free(&pag->pag_intents_drain); radix_tree_delete(&mp->m_perag_tree, index); out_free_pag: kmem_free(pag); @@ -418,6 +434,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + xfs_defer_drain_free(&pag->pag_intents_drain); kmem_free(pag); } return error; @@ -478,10 +495,12 @@ xfs_freesp_init_recs( ASSERT(start >= mp->m_ag_prealloc_blocks); if (start != mp->m_ag_prealloc_blocks) { /* - * Modify first record to pad stripe align of log + * Modify first record to pad stripe align of log and + * bump the record count. */ arec->ar_blockcount = cpu_to_be32(start - mp->m_ag_prealloc_blocks); + be16_add_cpu(&block->bb_numrecs, 1); nrec = arec + 1; /* @@ -492,7 +511,6 @@ xfs_freesp_init_recs( be32_to_cpu(arec->ar_startblock) + be32_to_cpu(arec->ar_blockcount)); arec = nrec; - be16_add_cpu(&block->bb_numrecs, 1); } /* * Change record start to after the internal log @@ -501,15 +519,13 @@ xfs_freesp_init_recs( } /* - * Calculate the record block count and check for the case where - * the log might have consumed all available space in the AG. If - * so, reset the record count to 0 to avoid exposure of an invalid - * record start block. + * Calculate the block count of this record; if it is nonzero, + * increment the record count. */ arec->ar_blockcount = cpu_to_be32(id->agsize - be32_to_cpu(arec->ar_startblock)); - if (!arec->ar_blockcount) - block->bb_numrecs = 0; + if (arec->ar_blockcount) + be16_add_cpu(&block->bb_numrecs, 1); } /* @@ -521,7 +537,7 @@ xfs_bnoroot_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno); + xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno); xfs_freesp_init_recs(mp, bp, id); } @@ -531,7 +547,7 @@ xfs_cntroot_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno); + xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno); xfs_freesp_init_recs(mp, bp, id); } @@ -1043,10 +1059,8 @@ xfs_ag_extend_space( if (error) return error; - error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, - be32_to_cpu(agf->agf_length) - len), - len, &XFS_RMAP_OINFO_SKIP_UPDATE, - XFS_AG_RESV_NONE); + error = xfs_free_extent(tp, pag, be32_to_cpu(agf->agf_length) - len, + len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 5e18536dfdce..2e0aef87d633 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -101,6 +101,14 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; + /* + * We use xfs_drain to track the number of deferred log intent items + * that have been queued (but not yet processed) so that waiters (e.g. + * scrub) will not lock resources when other threads are in the middle + * of processing a chain of intent items only to find momentary + * inconsistencies. + */ + struct xfs_defer_drain pag_intents_drain; #endif /* __KERNEL__ */ }; @@ -134,6 +142,7 @@ void xfs_free_perag(struct xfs_mount *mp); struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int tag); +struct xfs_perag *xfs_perag_hold(struct xfs_perag *pag); void xfs_perag_put(struct xfs_perag *pag); /* Active AG references */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 203f16c48c19..fdfa08cbf4db 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -233,6 +233,52 @@ xfs_alloc_update( return xfs_btree_update(cur, &rec); } +/* Convert the ondisk btree record to its incore representation. */ +void +xfs_alloc_btrec_to_irec( + const union xfs_btree_rec *rec, + struct xfs_alloc_rec_incore *irec) +{ + irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); + irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); +} + +/* Simple checks for free space records. */ +xfs_failaddr_t +xfs_alloc_check_irec( + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *irec) +{ + struct xfs_perag *pag = cur->bc_ag.pag; + + if (irec->ar_blockcount == 0) + return __this_address; + + /* check for valid extent range, including overflow */ + if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount)) + return __this_address; + + return NULL; +} + +static inline int +xfs_alloc_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_alloc_rec_incore *irec) +{ + struct xfs_mount *mp = cur->bc_mp; + + xfs_warn(mp, + "%s Freespace BTree record corruption in AG %d detected at %pS!", + cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", + cur->bc_ag.pag->pag_agno, fa); + xfs_warn(mp, + "start block 0x%x block count 0x%x", irec->ar_startblock, + irec->ar_blockcount); + return -EFSCORRUPTED; +} + /* * Get the data from the pointed-to record. */ @@ -243,35 +289,23 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat) /* output: success/failure */ { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; + struct xfs_alloc_rec_incore irec; union xfs_btree_rec *rec; + xfs_failaddr_t fa; int error; error = xfs_btree_get_rec(cur, &rec, stat); if (error || !(*stat)) return error; - *bno = be32_to_cpu(rec->alloc.ar_startblock); - *len = be32_to_cpu(rec->alloc.ar_blockcount); - - if (*len == 0) - goto out_bad_rec; - - /* check for valid extent range, including overflow */ - if (!xfs_verify_agbext(pag, *bno, *len)) - goto out_bad_rec; + xfs_alloc_btrec_to_irec(rec, &irec); + fa = xfs_alloc_check_irec(cur, &irec); + if (fa) + return xfs_alloc_complain_bad_rec(cur, fa, &irec); + *bno = irec.ar_startblock; + *len = irec.ar_blockcount; return 0; - -out_bad_rec: - xfs_warn(mp, - "%s Freespace BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", - pag->pag_agno); - xfs_warn(mp, - "start block 0x%x block count 0x%x", *bno, *len); - return -EFSCORRUPTED; } /* @@ -2405,6 +2439,7 @@ xfs_defer_agfl_block( trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); + xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); } @@ -2421,8 +2456,8 @@ __xfs_free_extent_later( bool skip_discard) { struct xfs_extent_free_item *xefi; -#ifdef DEBUG struct xfs_mount *mp = tp->t_mountp; +#ifdef DEBUG xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -2456,9 +2491,11 @@ __xfs_free_extent_later( } else { xefi->xefi_owner = XFS_RMAP_OWN_NULL; } - trace_xfs_bmap_free_defer(tp->t_mountp, + trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); + + xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); } @@ -3596,7 +3633,8 @@ xfs_free_extent_fix_freelist( int __xfs_free_extent( struct xfs_trans *tp, - xfs_fsblock_t bno, + struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, @@ -3604,12 +3642,9 @@ __xfs_free_extent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); struct xfs_agf *agf; int error; unsigned int busy_flags = 0; - struct xfs_perag *pag; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); @@ -3618,10 +3653,9 @@ __xfs_free_extent( XFS_ERRTAG_FREE_EXTENT)) return -EIO; - pag = xfs_perag_get(mp, agno); error = xfs_free_extent_fix_freelist(tp, pag, &agbp); if (error) - goto err; + return error; agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { @@ -3635,20 +3669,18 @@ __xfs_free_extent( goto err_release; } - error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); + error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo, + type); if (error) goto err_release; if (skip_discard) busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); - xfs_perag_put(pag); return 0; err_release: xfs_trans_brelse(tp, agbp); -err: - xfs_perag_put(pag); return error; } @@ -3666,9 +3698,13 @@ xfs_alloc_query_range_helper( { struct xfs_alloc_query_range_info *query = priv; struct xfs_alloc_rec_incore irec; + xfs_failaddr_t fa; + + xfs_alloc_btrec_to_irec(rec, &irec); + fa = xfs_alloc_check_irec(cur, &irec); + if (fa) + return xfs_alloc_complain_bad_rec(cur, fa, &irec); - irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); - irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); return query->fn(cur, &irec, query->priv); } @@ -3709,13 +3745,16 @@ xfs_alloc_query_all( return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); } -/* Is there a record covering a given extent? */ +/* + * Scan part of the keyspace of the free space and tell us if the area has no + * records, is fully mapped by records, or is partially filled. + */ int -xfs_alloc_has_record( +xfs_alloc_has_records( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { union xfs_btree_irec low; union xfs_btree_irec high; @@ -3725,7 +3764,7 @@ xfs_alloc_has_record( memset(&high, 0xFF, sizeof(high)); high.a.ar_startblock = bno + len - 1; - return xfs_btree_has_record(cur, &low, &high, exists); + return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } /* diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 2b246d74c189..5dbb25546d0b 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -141,7 +141,8 @@ int xfs_alloc_vextent_first_ag(struct xfs_alloc_arg *args, int /* error */ __xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ + struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len, /* length of extent */ const struct xfs_owner_info *oinfo, /* extent owner */ enum xfs_ag_resv_type type, /* block reservation type */ @@ -150,12 +151,13 @@ __xfs_free_extent( static inline int xfs_free_extent( struct xfs_trans *tp, - xfs_fsblock_t bno, + struct xfs_perag *pag, + xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { - return __xfs_free_extent(tp, bno, len, oinfo, type, false); + return __xfs_free_extent(tp, pag, agbno, len, oinfo, type, false); } int /* error */ @@ -179,6 +181,12 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ +union xfs_btree_rec; +void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec, + struct xfs_alloc_rec_incore *irec); +xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *irec); + int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, struct xfs_buf **agfbpp); int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, @@ -205,8 +213,8 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur, int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); -int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, bool *exist); +int xfs_alloc_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, enum xbtree_recpacking *outcome); typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno, void *priv); @@ -235,9 +243,13 @@ struct xfs_extent_free_item { uint64_t xefi_owner; xfs_fsblock_t xefi_startblock;/* starting fs block number */ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ + struct xfs_perag *xefi_pag; unsigned int xefi_flags; }; +void xfs_extent_free_get_group(struct xfs_mount *mp, + struct xfs_extent_free_item *xefi); + #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 0f29c7b1b39f..c65228efed4a 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -260,20 +260,27 @@ STATIC int64_t xfs_bnobt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { + ASSERT(!mask || mask->alloc.ar_startblock); + return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) - - be32_to_cpu(k2->alloc.ar_startblock); + be32_to_cpu(k2->alloc.ar_startblock); } STATIC int64_t xfs_cntbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { int64_t diff; + ASSERT(!mask || (mask->alloc.ar_blockcount && + mask->alloc.ar_startblock)); + diff = be32_to_cpu(k1->alloc.ar_blockcount) - be32_to_cpu(k2->alloc.ar_blockcount); if (diff) @@ -423,6 +430,19 @@ xfs_cntbt_recs_inorder( be32_to_cpu(r2->alloc.ar_startblock)); } +STATIC enum xbtree_key_contig +xfs_allocbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->alloc.ar_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->alloc.ar_startblock), + be32_to_cpu(key2->alloc.ar_startblock)); +} + static const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -443,6 +463,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .diff_two_keys = xfs_bnobt_diff_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, + .keys_contiguous = xfs_allocbt_keys_contiguous, }; static const struct xfs_btree_ops xfs_cntbt_ops = { @@ -465,6 +486,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .diff_two_keys = xfs_cntbt_diff_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, + .keys_contiguous = NULL, /* not needed right now */ }; /* Allocate most of a new allocation btree cursor. */ @@ -492,9 +514,7 @@ xfs_allocbt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); } - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; + cur->bc_ag.pag = xfs_perag_hold(pag); if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 34de6e6898c4..cd8870a16fd1 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1083,6 +1083,34 @@ struct xfs_iread_state { xfs_extnum_t loaded; }; +int +xfs_bmap_complain_bad_rec( + struct xfs_inode *ip, + int whichfork, + xfs_failaddr_t fa, + const struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + const char *forkname; + + switch (whichfork) { + case XFS_DATA_FORK: forkname = "data"; break; + case XFS_ATTR_FORK: forkname = "attr"; break; + case XFS_COW_FORK: forkname = "CoW"; break; + default: forkname = "???"; break; + } + + xfs_warn(mp, + "Bmap BTree record corruption in inode 0x%llx %s fork detected at %pS!", + ip->i_ino, forkname, fa); + xfs_warn(mp, + "Offset 0x%llx, start block 0x%llx, block count 0x%llx state 0x%x", + irec->br_startoff, irec->br_startblock, irec->br_blockcount, + irec->br_state); + + return -EFSCORRUPTED; +} + /* Stuff every bmbt record from this block into the incore extent map. */ static int xfs_iread_bmbt_block( @@ -1125,7 +1153,8 @@ xfs_iread_bmbt_block( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iread_extents(2)", frp, sizeof(*frp), fa); - return -EFSCORRUPTED; + return xfs_bmap_complain_bad_rec(ip, whichfork, fa, + &new); } xfs_iext_insert(ip, &ir->icur, &new, xfs_bmap_fork_to_state(whichfork)); @@ -1171,6 +1200,12 @@ xfs_iread_extents( goto out; } ASSERT(ir.loaded == xfs_iext_count(ifp)); + /* + * Use release semantics so that we can use acquire semantics in + * xfs_need_iread_extents and be guaranteed to see a valid mapping tree + * after that load. + */ + smp_store_release(&ifp->if_needextents, 0); return 0; out: xfs_iext_destroy(ifp); @@ -3459,8 +3494,10 @@ xfs_bmap_btalloc_at_eof( if (!caller_pag) args->pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ap->blkno)); error = xfs_alloc_vextent_exact_bno(args, ap->blkno); - if (!caller_pag) + if (!caller_pag) { xfs_perag_put(args->pag); + args->pag = NULL; + } if (error) return error; @@ -3470,7 +3507,6 @@ xfs_bmap_btalloc_at_eof( * Exact allocation failed. Reset to try an aligned allocation * according to the original allocation specification. */ - args->pag = NULL; args->alignment = stripe_align; args->minlen = nextminlen; args->minalignslop = 0; @@ -3505,7 +3541,6 @@ xfs_bmap_btalloc_at_eof( * original non-aligned state so the caller can proceed on allocation * failure as if this function was never called. */ - args->fsbno = ap->blkno; args->alignment = 1; return 0; } @@ -6075,6 +6110,7 @@ __xfs_bmap_add( bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; + xfs_bmap_update_get_group(tp->t_mountp, bi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); return 0; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index dd08361ca5a6..e33470e39728 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -145,7 +145,7 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { BMAP_COWFORK, "COW" } /* Return true if the extent is an allocated extent, written or not. */ -static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec) { return irec->br_startblock != HOLESTARTBLOCK && irec->br_startblock != DELAYSTARTBLOCK && @@ -238,9 +238,13 @@ struct xfs_bmap_intent { enum xfs_bmap_intent_type bi_type; int bi_whichfork; struct xfs_inode *bi_owner; + struct xfs_perag *bi_pag; struct xfs_bmbt_irec bi_bmap; }; +void xfs_bmap_update_get_group(struct xfs_mount *mp, + struct xfs_bmap_intent *bi); + int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); @@ -261,6 +265,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork) xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); +int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, + xfs_failaddr_t fa, const struct xfs_bmbt_irec *irec); int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index b8ad95050c9b..1b40e5f8b1ec 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -382,11 +382,14 @@ STATIC int64_t xfs_bmbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); + ASSERT(!mask || mask->bmbt.br_startoff); + /* * Note: This routine previously casted a and b to int64 and subtracted * them to generate a result. This lead to problems if b was the @@ -500,6 +503,19 @@ xfs_bmbt_recs_inorder( xfs_bmbt_disk_get_startoff(&r2->bmbt); } +STATIC enum xbtree_key_contig +xfs_bmbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->bmbt.br_startoff); + + return xbtree_key_contig(be64_to_cpu(key1->bmbt.br_startoff), + be64_to_cpu(key2->bmbt.br_startoff)); +} + static const struct xfs_btree_ops xfs_bmbt_ops = { .rec_len = sizeof(xfs_bmbt_rec_t), .key_len = sizeof(xfs_bmbt_key_t), @@ -520,6 +536,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .buf_ops = &xfs_bmbt_buf_ops, .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, + .keys_contiguous = xfs_bmbt_keys_contiguous, }; /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c4649cc624e1..6a6503ab0cd7 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2067,8 +2067,7 @@ xfs_btree_get_leaf_keys( for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { rec = xfs_btree_rec_addr(cur, n, block); cur->bc_ops->init_high_key_from_rec(&hkey, rec); - if (cur->bc_ops->diff_two_keys(cur, &hkey, &max_hkey) - > 0) + if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey)) max_hkey = hkey; } @@ -2096,7 +2095,7 @@ xfs_btree_get_node_keys( max_hkey = xfs_btree_high_key_addr(cur, 1, block); for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { hkey = xfs_btree_high_key_addr(cur, n, block); - if (cur->bc_ops->diff_two_keys(cur, hkey, max_hkey) > 0) + if (xfs_btree_keycmp_gt(cur, hkey, max_hkey)) max_hkey = hkey; } @@ -2183,8 +2182,8 @@ __xfs_btree_updkeys( nlkey = xfs_btree_key_addr(cur, ptr, block); nhkey = xfs_btree_high_key_addr(cur, ptr, block); if (!force_all && - !(cur->bc_ops->diff_two_keys(cur, nlkey, lkey) != 0 || - cur->bc_ops->diff_two_keys(cur, nhkey, hkey) != 0)) + xfs_btree_keycmp_eq(cur, nlkey, lkey) && + xfs_btree_keycmp_eq(cur, nhkey, hkey)) break; xfs_btree_copy_keys(cur, nlkey, lkey, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); @@ -4716,7 +4715,6 @@ xfs_btree_simple_query_range( { union xfs_btree_rec *recp; union xfs_btree_key rec_key; - int64_t diff; int stat; bool firstrec = true; int error; @@ -4746,20 +4744,17 @@ xfs_btree_simple_query_range( if (error || !stat) break; - /* Skip if high_key(rec) < low_key. */ + /* Skip if low_key > high_key(rec). */ if (firstrec) { cur->bc_ops->init_high_key_from_rec(&rec_key, recp); firstrec = false; - diff = cur->bc_ops->diff_two_keys(cur, low_key, - &rec_key); - if (diff > 0) + if (xfs_btree_keycmp_gt(cur, low_key, &rec_key)) goto advloop; } - /* Stop if high_key < low_key(rec). */ + /* Stop if low_key(rec) > high_key. */ cur->bc_ops->init_key_from_rec(&rec_key, recp); - diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); - if (diff > 0) + if (xfs_btree_keycmp_gt(cur, &rec_key, high_key)) break; /* Callback */ @@ -4813,8 +4808,6 @@ xfs_btree_overlapped_query_range( union xfs_btree_key *hkp; union xfs_btree_rec *recp; struct xfs_btree_block *block; - int64_t ldiff; - int64_t hdiff; int level; struct xfs_buf *bp; int i; @@ -4854,25 +4847,23 @@ pop_up: block); cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); - ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey, - low_key); - cur->bc_ops->init_key_from_rec(&rec_key, recp); - hdiff = cur->bc_ops->diff_two_keys(cur, high_key, - &rec_key); /* + * If (query's high key < record's low key), then there + * are no more interesting records in this block. Pop + * up to the leaf level to find more record blocks. + * * If (record's high key >= query's low key) and * (query's high key >= record's low key), then * this record overlaps the query range; callback. */ - if (ldiff >= 0 && hdiff >= 0) { + if (xfs_btree_keycmp_lt(cur, high_key, &rec_key)) + goto pop_up; + if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) { error = fn(cur, recp, priv); if (error) break; - } else if (hdiff < 0) { - /* Record is larger than high key; pop. */ - goto pop_up; } cur->bc_levels[level].ptr++; continue; @@ -4884,15 +4875,18 @@ pop_up: block); pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); - ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key); - hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp); - /* + * If (query's high key < pointer's low key), then there are no + * more interesting keys in this block. Pop up one leaf level + * to continue looking for records. + * * If (pointer's high key >= query's low key) and * (query's high key >= pointer's low key), then * this record overlaps the query range; follow pointer. */ - if (ldiff >= 0 && hdiff >= 0) { + if (xfs_btree_keycmp_lt(cur, high_key, lkp)) + goto pop_up; + if (xfs_btree_keycmp_ge(cur, hkp, low_key)) { level--; error = xfs_btree_lookup_get_block(cur, level, pp, &block); @@ -4907,9 +4901,6 @@ pop_up: #endif cur->bc_levels[level].ptr = 1; continue; - } else if (hdiff < 0) { - /* The low key is larger than the upper range; pop. */ - goto pop_up; } cur->bc_levels[level].ptr++; } @@ -4937,6 +4928,19 @@ out: return error; } +static inline void +xfs_btree_key_from_irec( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + const union xfs_btree_irec *irec) +{ + union xfs_btree_rec rec; + + cur->bc_rec = *irec; + cur->bc_ops->init_rec_from_cur(cur, &rec); + cur->bc_ops->init_key_from_rec(key, &rec); +} + /* * Query a btree for all records overlapping a given interval of keys. The * supplied function will be called with each record found; return one of the @@ -4951,21 +4955,15 @@ xfs_btree_query_range( xfs_btree_query_range_fn fn, void *priv) { - union xfs_btree_rec rec; union xfs_btree_key low_key; union xfs_btree_key high_key; /* Find the keys of both ends of the interval. */ - cur->bc_rec = *high_rec; - cur->bc_ops->init_rec_from_cur(cur, &rec); - cur->bc_ops->init_key_from_rec(&high_key, &rec); + xfs_btree_key_from_irec(cur, &high_key, high_rec); + xfs_btree_key_from_irec(cur, &low_key, low_rec); - cur->bc_rec = *low_rec; - cur->bc_ops->init_rec_from_cur(cur, &rec); - cur->bc_ops->init_key_from_rec(&low_key, &rec); - - /* Enforce low key < high key. */ - if (cur->bc_ops->diff_two_keys(cur, &low_key, &high_key) > 0) + /* Enforce low key <= high key. */ + if (!xfs_btree_keycmp_le(cur, &low_key, &high_key)) return -EINVAL; if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) @@ -5027,34 +5025,132 @@ xfs_btree_diff_two_ptrs( return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); } -/* If there's an extent, we're done. */ +struct xfs_btree_has_records { + /* Keys for the start and end of the range we want to know about. */ + union xfs_btree_key start_key; + union xfs_btree_key end_key; + + /* Mask for key comparisons, if desired. */ + const union xfs_btree_key *key_mask; + + /* Highest record key we've seen so far. */ + union xfs_btree_key high_key; + + enum xbtree_recpacking outcome; +}; + STATIC int -xfs_btree_has_record_helper( +xfs_btree_has_records_helper( struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv) { - return -ECANCELED; + union xfs_btree_key rec_key; + union xfs_btree_key rec_high_key; + struct xfs_btree_has_records *info = priv; + enum xbtree_key_contig key_contig; + + cur->bc_ops->init_key_from_rec(&rec_key, rec); + + if (info->outcome == XBTREE_RECPACKING_EMPTY) { + info->outcome = XBTREE_RECPACKING_SPARSE; + + /* + * If the first record we find does not overlap the start key, + * then there is a hole at the start of the search range. + * Classify this as sparse and stop immediately. + */ + if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key, + info->key_mask)) + return -ECANCELED; + } else { + /* + * If a subsequent record does not overlap with the any record + * we've seen so far, there is a hole in the middle of the + * search range. Classify this as sparse and stop. + * If the keys overlap and this btree does not allow overlap, + * signal corruption. + */ + key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key, + &rec_key, info->key_mask); + if (key_contig == XBTREE_KEY_OVERLAP && + !(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return -EFSCORRUPTED; + if (key_contig == XBTREE_KEY_GAP) + return -ECANCELED; + } + + /* + * If high_key(rec) is larger than any other high key we've seen, + * remember it for later. + */ + cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec); + if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key, + info->key_mask)) + info->high_key = rec_high_key; /* struct copy */ + + return 0; } -/* Is there a record covering a given range of keys? */ +/* + * Scan part of the keyspace of a btree and tell us if that keyspace does not + * map to any records; is fully mapped to records; or is partially mapped to + * records. This is the btree record equivalent to determining if a file is + * sparse. + * + * For most btree types, the record scan should use all available btree key + * fields to compare the keys encountered. These callers should pass NULL for + * @mask. However, some callers (e.g. scanning physical space in the rmapbt) + * want to ignore some part of the btree record keyspace when performing the + * comparison. These callers should pass in a union xfs_btree_key object with + * the fields that *should* be a part of the comparison set to any nonzero + * value, and the rest zeroed. + */ int -xfs_btree_has_record( +xfs_btree_has_records( struct xfs_btree_cur *cur, const union xfs_btree_irec *low, const union xfs_btree_irec *high, - bool *exists) + const union xfs_btree_key *mask, + enum xbtree_recpacking *outcome) { + struct xfs_btree_has_records info = { + .outcome = XBTREE_RECPACKING_EMPTY, + .key_mask = mask, + }; int error; - error = xfs_btree_query_range(cur, low, high, - &xfs_btree_has_record_helper, NULL); - if (error == -ECANCELED) { - *exists = true; - return 0; + /* Not all btrees support this operation. */ + if (!cur->bc_ops->keys_contiguous) { + ASSERT(0); + return -EOPNOTSUPP; } - *exists = false; - return error; + + xfs_btree_key_from_irec(cur, &info.start_key, low); + xfs_btree_key_from_irec(cur, &info.end_key, high); + + error = xfs_btree_query_range(cur, low, high, + xfs_btree_has_records_helper, &info); + if (error == -ECANCELED) + goto out; + if (error) + return error; + + if (info.outcome == XBTREE_RECPACKING_EMPTY) + goto out; + + /* + * If the largest high_key(rec) we saw during the walk is greater than + * the end of the search range, classify this as full. Otherwise, + * there is a hole at the end of the search range. + */ + if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key, + mask)) + info.outcome = XBTREE_RECPACKING_FULL; + +out: + *outcome = info.outcome; + return 0; } /* Are there more records in this btree? */ diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 29c4b4ccb909..a2aa36b23e25 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -90,6 +90,27 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); #define XFS_BTREE_STATS_ADD(cur, stat, val) \ XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) +enum xbtree_key_contig { + XBTREE_KEY_GAP = 0, + XBTREE_KEY_CONTIGUOUS, + XBTREE_KEY_OVERLAP, +}; + +/* + * Decide if these two numeric btree key fields are contiguous, overlapping, + * or if there's a gap between them. @x should be the field from the high + * key and @y should be the field from the low key. + */ +static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) +{ + x++; + if (x < y) + return XBTREE_KEY_GAP; + if (x == y) + return XBTREE_KEY_CONTIGUOUS; + return XBTREE_KEY_OVERLAP; +} + struct xfs_btree_ops { /* size of the key and record structures */ size_t key_len; @@ -140,11 +161,14 @@ struct xfs_btree_ops { /* * Difference between key2 and key1 -- positive if key1 > key2, - * negative if key1 < key2, and zero if equal. + * negative if key1 < key2, and zero if equal. If the @mask parameter + * is non NULL, each key field to be used in the comparison must + * contain a nonzero value. */ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, - const union xfs_btree_key *key2); + const union xfs_btree_key *key2, + const union xfs_btree_key *mask); const struct xfs_buf_ops *buf_ops; @@ -157,6 +181,22 @@ struct xfs_btree_ops { int (*recs_inorder)(struct xfs_btree_cur *cur, const union xfs_btree_rec *r1, const union xfs_btree_rec *r2); + + /* + * Are these two btree keys immediately adjacent? + * + * Given two btree keys @key1 and @key2, decide if it is impossible for + * there to be a third btree key K satisfying the relationship + * @key1 < K < @key2. To determine if two btree records are + * immediately adjacent, @key1 should be the high key of the first + * record and @key2 should be the low key of the second record. + * If the @mask parameter is non NULL, each key field to be used in the + * comparison must contain a nonzero value. + */ + enum xbtree_key_contig (*keys_contiguous)(struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask); }; /* @@ -540,12 +580,105 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); -int xfs_btree_has_record(struct xfs_btree_cur *cur, +typedef bool (*xfs_btree_key_gap_fn)(struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2); + +int xfs_btree_has_records(struct xfs_btree_cur *cur, const union xfs_btree_irec *low, - const union xfs_btree_irec *high, bool *exists); + const union xfs_btree_irec *high, + const union xfs_btree_key *mask, + enum xbtree_recpacking *outcome); + bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); +/* Key comparison helpers */ +static inline bool +xfs_btree_keycmp_lt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0; +} + +static inline bool +xfs_btree_keycmp_gt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0; +} + +static inline bool +xfs_btree_keycmp_eq( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0; +} + +static inline bool +xfs_btree_keycmp_le( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return !xfs_btree_keycmp_gt(cur, key1, key2); +} + +static inline bool +xfs_btree_keycmp_ge( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return !xfs_btree_keycmp_lt(cur, key1, key2); +} + +static inline bool +xfs_btree_keycmp_ne( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2) +{ + return !xfs_btree_keycmp_eq(cur, key1, key2); +} + +/* Masked key comparison helpers */ +static inline bool +xfs_btree_masked_keycmp_lt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0; +} + +static inline bool +xfs_btree_masked_keycmp_gt( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0; +} + +static inline bool +xfs_btree_masked_keycmp_ge( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + return !xfs_btree_masked_keycmp_lt(cur, key1, key2, mask); +} + /* Does this cursor point to the last block in the given level? */ static inline bool xfs_btree_islastblock( diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 5a321b783398..bcfb6a4203cd 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -397,6 +397,7 @@ xfs_defer_cancel_list( list_for_each_safe(pwi, n, &dfp->dfp_work) { list_del(pwi); dfp->dfp_count--; + trace_xfs_defer_cancel_item(mp, dfp, pwi); ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); @@ -476,6 +477,7 @@ xfs_defer_finish_one( list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; + trace_xfs_defer_finish_item(tp->t_mountp, dfp, li); error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { int ret; @@ -623,7 +625,7 @@ xfs_defer_add( struct list_head *li) { struct xfs_defer_pending *dfp = NULL; - const struct xfs_defer_op_type *ops; + const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); @@ -636,7 +638,6 @@ xfs_defer_add( if (!list_empty(&tp->t_dfops)) { dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, dfp_list); - ops = defer_op_types[dfp->dfp_type]; if (dfp->dfp_type != type || (ops->max_items && dfp->dfp_count >= ops->max_items)) dfp = NULL; @@ -653,6 +654,7 @@ xfs_defer_add( } list_add_tail(li, &dfp->dfp_work); + trace_xfs_defer_add_item(tp->t_mountp, dfp, li); dfp->dfp_count++; } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 92bac3373f1f..f5462fd582d5 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -64,7 +64,7 @@ xfs_ascii_ci_hashname( int i; for (i = 0, hash = 0; i < name->len; i++) - hash = tolower(name->name[i]) ^ rol32(hash, 7); + hash = xfs_ascii_ci_xfrm(name->name[i]) ^ rol32(hash, 7); return hash; } @@ -85,7 +85,8 @@ xfs_ascii_ci_compname( for (i = 0; i < len; i++) { if (args->name[i] == name[i]) continue; - if (tolower(args->name[i]) != tolower(name[i])) + if (xfs_ascii_ci_xfrm(args->name[i]) != + xfs_ascii_ci_xfrm(name[i])) return XFS_CMP_DIFFERENT; result = XFS_CMP_CASE; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index dd39f17dd9a9..19af22a16c41 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -248,4 +248,35 @@ unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); bool xfs_dir2_namecheck(const void *name, size_t length); +/* + * The "ascii-ci" feature was created to speed up case-insensitive lookups for + * a Samba product. Because of the inherent problems with CI and UTF-8 + * encoding, etc, it was decided that Samba would be configured to export + * latin1/iso 8859-1 encodings as that covered >90% of the target markets for + * the product. Hence the "ascii-ci" casefolding code could be encoded into + * the XFS directory operations and remove all the overhead of casefolding from + * Samba. + * + * To provide consistent hashing behavior between the userspace and kernel, + * these functions prepare names for hashing by transforming specific bytes + * to other bytes. Robustness with other encodings is not guaranteed. + */ +static inline bool xfs_ascii_ci_need_xfrm(unsigned char c) +{ + if (c >= 0x41 && c <= 0x5a) /* A-Z */ + return true; + if (c >= 0xc0 && c <= 0xd6) /* latin A-O with accents */ + return true; + if (c >= 0xd8 && c <= 0xde) /* latin O-Y with accents */ + return true; + return false; +} + +static inline unsigned char xfs_ascii_ci_xfrm(unsigned char c) +{ + if (xfs_ascii_ci_need_xfrm(c)) + c -= 'A' - 'a'; + return c; +} + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 7ee292aecbeb..a16d5de16933 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -95,33 +95,25 @@ xfs_inobt_btrec_to_irec( irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } -/* - * Get the data from the pointed-to record. - */ -int -xfs_inobt_get_rec( - struct xfs_btree_cur *cur, - struct xfs_inobt_rec_incore *irec, - int *stat) +/* Simple checks for inode records. */ +xfs_failaddr_t +xfs_inobt_check_irec( + struct xfs_btree_cur *cur, + const struct xfs_inobt_rec_incore *irec) { - struct xfs_mount *mp = cur->bc_mp; - union xfs_btree_rec *rec; - int error; uint64_t realfree; - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || *stat == 0) - return error; - - xfs_inobt_btrec_to_irec(mp, rec, irec); - + /* Record has to be properly aligned within the AG. */ if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) - goto out_bad_rec; + return __this_address; + if (!xfs_verify_agino(cur->bc_ag.pag, + irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) + return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || irec->ir_count > XFS_INODES_PER_CHUNK) - goto out_bad_rec; + return __this_address; if (irec->ir_freecount > XFS_INODES_PER_CHUNK) - goto out_bad_rec; + return __this_address; /* if there are no holes, return the first available offset */ if (!xfs_inobt_issparse(irec->ir_holemask)) @@ -129,15 +121,23 @@ xfs_inobt_get_rec( else realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); if (hweight64(realfree) != irec->ir_freecount) - goto out_bad_rec; + return __this_address; - return 0; + return NULL; +} + +static inline int +xfs_inobt_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = cur->bc_mp; -out_bad_rec: xfs_warn(mp, - "%s Inode BTree record corruption in AG %d detected!", + "%s Inode BTree record corruption in AG %d detected at %pS!", cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", - cur->bc_ag.pag->pag_agno); + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, @@ -146,6 +146,32 @@ out_bad_rec: } /* + * Get the data from the pointed-to record. + */ +int +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, + struct xfs_inobt_rec_incore *irec, + int *stat) +{ + struct xfs_mount *mp = cur->bc_mp; + union xfs_btree_rec *rec; + xfs_failaddr_t fa; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || *stat == 0) + return error; + + xfs_inobt_btrec_to_irec(mp, rec, irec); + fa = xfs_inobt_check_irec(cur, irec); + if (fa) + return xfs_inobt_complain_bad_rec(cur, fa, irec); + + return 0; +} + +/* * Insert a single inobt record. Cursor must already point to desired location. */ int @@ -1952,8 +1978,6 @@ xfs_difree_inobt( */ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { - struct xfs_perag *pag = agbp->b_pag; - xic->deleted = true; xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino); @@ -2617,44 +2641,50 @@ xfs_ialloc_read_agi( return 0; } -/* Is there an inode record covering a given range of inode numbers? */ -int -xfs_ialloc_has_inode_record( - struct xfs_btree_cur *cur, - xfs_agino_t low, - xfs_agino_t high, - bool *exists) +/* How many inodes are backed by inode clusters ondisk? */ +STATIC int +xfs_ialloc_count_ondisk( + struct xfs_btree_cur *cur, + xfs_agino_t low, + xfs_agino_t high, + unsigned int *allocated) { struct xfs_inobt_rec_incore irec; - xfs_agino_t agino; - uint16_t holemask; - int has_record; - int i; - int error; + unsigned int ret = 0; + int has_record; + int error; - *exists = false; error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record); - while (error == 0 && has_record) { + if (error) + return error; + + while (has_record) { + unsigned int i, hole_idx; + error = xfs_inobt_get_rec(cur, &irec, &has_record); - if (error || irec.ir_startino > high) + if (error) + return error; + if (irec.ir_startino > high) break; - agino = irec.ir_startino; - holemask = irec.ir_holemask; - for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1, - i++, agino += XFS_INODES_PER_HOLEMASK_BIT) { - if (holemask & 1) + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (irec.ir_startino + i < low) continue; - if (agino + XFS_INODES_PER_HOLEMASK_BIT > low && - agino <= high) { - *exists = true; - return 0; - } + if (irec.ir_startino + i > high) + break; + + hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT; + if (!(irec.ir_holemask & (1U << hole_idx))) + ret++; } error = xfs_btree_increment(cur, 0, &has_record); + if (error) + return error; } - return error; + + *allocated = ret; + return 0; } /* Is there an inode record covering a given extent? */ @@ -2663,15 +2693,27 @@ xfs_ialloc_has_inodes_at_extent( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { - xfs_agino_t low; - xfs_agino_t high; + xfs_agino_t agino; + xfs_agino_t last_agino; + unsigned int allocated; + int error; + + agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno); + last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; - low = XFS_AGB_TO_AGINO(cur->bc_mp, bno); - high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; + error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated); + if (error) + return error; - return xfs_ialloc_has_inode_record(cur, low, high, exists); + if (allocated == 0) + *outcome = XBTREE_RECPACKING_EMPTY; + else if (allocated == last_agino - agino + 1) + *outcome = XBTREE_RECPACKING_FULL; + else + *outcome = XBTREE_RECPACKING_SPARSE; + return 0; } struct xfs_ialloc_count_inodes { @@ -2688,8 +2730,13 @@ xfs_ialloc_count_inodes_rec( { struct xfs_inobt_rec_incore irec; struct xfs_ialloc_count_inodes *ci = priv; + xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); + fa = xfs_inobt_check_irec(cur, &irec); + if (fa) + return xfs_inobt_complain_bad_rec(cur, fa, &irec); + ci->count += irec.ir_count; ci->freecount += irec.ir_freecount; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index ab8c30b4ec22..fe824bb04a09 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -93,10 +93,11 @@ union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); +xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur, + const struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, - xfs_agblock_t bno, xfs_extlen_t len, bool *exists); -int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low, - xfs_agino_t high, bool *exists); + xfs_agblock_t bno, xfs_extlen_t len, + enum xbtree_recpacking *outcome); int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count, xfs_agino_t *freecount); int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9b28211d5a4c..5a945ae21b5d 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -156,9 +156,12 @@ __xfs_inobt_free_block( struct xfs_buf *bp, enum xfs_ag_resv_type resv) { + xfs_fsblock_t fsbno; + xfs_inobt_mod_blockcount(cur, -1); - return xfs_free_extent(cur->bc_tp, - XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1, + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + return xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, + XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, &XFS_RMAP_OINFO_INOBT, resv); } @@ -266,10 +269,13 @@ STATIC int64_t xfs_inobt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { + ASSERT(!mask || mask->inobt.ir_startino); + return (int64_t)be32_to_cpu(k1->inobt.ir_startino) - - be32_to_cpu(k2->inobt.ir_startino); + be32_to_cpu(k2->inobt.ir_startino); } static xfs_failaddr_t @@ -380,6 +386,19 @@ xfs_inobt_recs_inorder( be32_to_cpu(r2->inobt.ir_startino); } +STATIC enum xbtree_key_contig +xfs_inobt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->inobt.ir_startino); + + return xbtree_key_contig(be32_to_cpu(key1->inobt.ir_startino), + be32_to_cpu(key2->inobt.ir_startino)); +} + static const struct xfs_btree_ops xfs_inobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), @@ -399,6 +418,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, + .keys_contiguous = xfs_inobt_keys_contiguous, }; static const struct xfs_btree_ops xfs_finobt_ops = { @@ -420,6 +440,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, + .keys_contiguous = xfs_inobt_keys_contiguous, }; /* @@ -447,9 +468,7 @@ xfs_inobt_init_common( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; + cur->bc_ag.pag = xfs_perag_hold(pag); return cur; } @@ -607,7 +626,7 @@ xfs_iallocbt_maxlevels_ondisk(void) */ uint64_t xfs_inobt_irec_to_allocmask( - struct xfs_inobt_rec_incore *rec) + const struct xfs_inobt_rec_incore *rec) { uint64_t bitmap = 0; uint64_t inodespbit; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index e859a6e05230..3262c3fe5ebe 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -53,7 +53,7 @@ struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag, extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ -uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *); +uint64_t xfs_inobt_irec_to_allocmask(const struct xfs_inobt_rec_incore *irec); #if defined(DEBUG) || defined(XFS_WARN) int xfs_inobt_rec_check_count(struct xfs_mount *, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 6b21760184d9..5a2e7ddfa76d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -140,7 +140,8 @@ xfs_iformat_extents( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(2)", dp, sizeof(*dp), fa); - return -EFSCORRUPTED; + return xfs_bmap_complain_bad_rec(ip, whichfork, + fa, &new); } xfs_iext_insert(ip, &icur, &new, state); @@ -226,10 +227,15 @@ xfs_iformat_data_fork( /* * Initialize the extent count early, as the per-format routines may - * depend on it. + * depend on it. Use release semantics to set needextents /after/ we + * set the format. This ensures that we can use acquire semantics on + * needextents in xfs_need_iread_extents() and be guaranteed to see a + * valid format value after that load. */ ip->i_df.if_format = dip->di_format; ip->i_df.if_nextents = xfs_dfork_data_extents(dip); + smp_store_release(&ip->i_df.if_needextents, + ip->i_df.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0); switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -282,8 +288,17 @@ xfs_ifork_init_attr( enum xfs_dinode_fmt format, xfs_extnum_t nextents) { + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. Use release semantics to set needextents /after/ we + * set the format. This ensures that we can use acquire semantics on + * needextents in xfs_need_iread_extents() and be guaranteed to see a + * valid format value after that load. + */ ip->i_af.if_format = format; ip->i_af.if_nextents = nextents; + smp_store_release(&ip->i_af.if_needextents, + ip->i_af.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0); } void diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index d3943d6ad0b9..96d307784c85 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -24,6 +24,7 @@ struct xfs_ifork { xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ + uint8_t if_needextents; /* extents have not been read */ }; /* @@ -260,9 +261,10 @@ int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, uint nr_to_add); /* returns true if the fork has extents but they are not read in yet. */ -static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) +static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp) { - return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0; + /* see xfs_iformat_{data,attr}_fork() for needextents semantics */ + return smp_load_acquire(&ifp->if_needextents) != 0; } #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index bcf46aa0d08b..c1c65774dcc2 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -120,45 +120,41 @@ xfs_refcount_btrec_to_irec( irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); } -/* - * Get the data from the pointed-to record. - */ -int -xfs_refcount_get_rec( +/* Simple checks for refcount records. */ +xfs_failaddr_t +xfs_refcount_check_irec( struct xfs_btree_cur *cur, - struct xfs_refcount_irec *irec, - int *stat) + const struct xfs_refcount_irec *irec) { - struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = cur->bc_ag.pag; - union xfs_btree_rec *rec; - int error; - - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || !*stat) - return error; - xfs_refcount_btrec_to_irec(rec, irec); if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) - goto out_bad_rec; + return __this_address; if (!xfs_refcount_check_domain(irec)) - goto out_bad_rec; + return __this_address; /* check for valid extent range, including overflow */ if (!xfs_verify_agbext(pag, irec->rc_startblock, irec->rc_blockcount)) - goto out_bad_rec; + return __this_address; if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) - goto out_bad_rec; + return __this_address; - trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec); - return 0; + return NULL; +} + +static inline int +xfs_refcount_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_refcount_irec *irec) +{ + struct xfs_mount *mp = cur->bc_mp; -out_bad_rec: xfs_warn(mp, - "Refcount BTree record corruption in AG %d detected!", - pag->pag_agno); + "Refcount BTree record corruption in AG %d detected at %pS!", + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); @@ -166,6 +162,32 @@ out_bad_rec: } /* + * Get the data from the pointed-to record. + */ +int +xfs_refcount_get_rec( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + xfs_failaddr_t fa; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || !*stat) + return error; + + xfs_refcount_btrec_to_irec(rec, irec); + fa = xfs_refcount_check_irec(cur, irec); + if (fa) + return xfs_refcount_complain_bad_rec(cur, fa, irec); + + trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + return 0; +} + +/* * Update the record referred to by cur to the value given * by [bno, len, refcount]. * This either works (return 0) or gets an EFSCORRUPTED error. @@ -1332,26 +1354,22 @@ xfs_refcount_finish_one( xfs_agblock_t bno; unsigned long nr_ops = 0; int shape_changes = 0; - struct xfs_perag *pag; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock)); bno = XFS_FSB_TO_AGBNO(mp, ri->ri_startblock); trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, ri->ri_startblock), ri->ri_type, XFS_FSB_TO_AGBNO(mp, ri->ri_startblock), ri->ri_blockcount); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { - error = -EIO; - goto out_drop; - } + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + return -EIO; /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.pag != pag) { + if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { nr_ops = rcur->bc_ag.refc.nr_ops; shape_changes = rcur->bc_ag.refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); @@ -1359,12 +1377,12 @@ xfs_refcount_finish_one( *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, - &agbp); + error = xfs_alloc_read_agf(ri->ri_pag, tp, + XFS_ALLOC_FLAG_FREEING, &agbp); if (error) - goto out_drop; + return error; - rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag); rcur->bc_ag.refc.nr_ops = nr_ops; rcur->bc_ag.refc.shape_changes = shape_changes; } @@ -1375,7 +1393,7 @@ xfs_refcount_finish_one( error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, XFS_REFCOUNT_ADJUST_INCREASE); if (error) - goto out_drop; + return error; if (ri->ri_blockcount > 0) error = xfs_refcount_continue_op(rcur, ri, bno); break; @@ -1383,31 +1401,29 @@ xfs_refcount_finish_one( error = xfs_refcount_adjust(rcur, &bno, &ri->ri_blockcount, XFS_REFCOUNT_ADJUST_DECREASE); if (error) - goto out_drop; + return error; if (ri->ri_blockcount > 0) error = xfs_refcount_continue_op(rcur, ri, bno); break; case XFS_REFCOUNT_ALLOC_COW: error = __xfs_refcount_cow_alloc(rcur, bno, ri->ri_blockcount); if (error) - goto out_drop; + return error; ri->ri_blockcount = 0; break; case XFS_REFCOUNT_FREE_COW: error = __xfs_refcount_cow_free(rcur, bno, ri->ri_blockcount); if (error) - goto out_drop; + return error; ri->ri_blockcount = 0; break; default: ASSERT(0); - error = -EFSCORRUPTED; + return -EFSCORRUPTED; } if (!error && ri->ri_blockcount > 0) - trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, + trace_xfs_refcount_finish_one_leftover(mp, ri->ri_pag->pag_agno, ri->ri_type, bno, ri->ri_blockcount); -out_drop: - xfs_perag_put(pag); return error; } @@ -1435,6 +1451,7 @@ __xfs_refcount_add( ri->ri_startblock = startblock; ri->ri_blockcount = blockcount; + xfs_refcount_update_get_group(tp->t_mountp, ri); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); } @@ -1876,7 +1893,8 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (XFS_IS_CORRUPT(cur->bc_mp, + if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL || + XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { kfree(rr); return -EFSCORRUPTED; @@ -1980,14 +1998,17 @@ out_free: return error; } -/* Is there a record covering a given extent? */ +/* + * Scan part of the keyspace of the refcount records and tell us if the area + * has no records, is fully mapped by records, or is partially filled. + */ int -xfs_refcount_has_record( +xfs_refcount_has_records( struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { union xfs_btree_irec low; union xfs_btree_irec high; @@ -1998,7 +2019,7 @@ xfs_refcount_has_record( high.rc.rc_startblock = bno + len - 1; low.rc.rc_domain = high.rc.rc_domain = domain; - return xfs_btree_has_record(cur, &low, &high, exists); + return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } int __init diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index c633477ce3ce..783cd89ca195 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -50,6 +50,7 @@ enum xfs_refcount_intent_type { struct xfs_refcount_intent { struct list_head ri_list; + struct xfs_perag *ri_pag; enum xfs_refcount_intent_type ri_type; xfs_extlen_t ri_blockcount; xfs_fsblock_t ri_startblock; @@ -67,6 +68,9 @@ xfs_refcount_check_domain( return true; } +void xfs_refcount_update_get_group(struct xfs_mount *mp, + struct xfs_refcount_intent *ri); + void xfs_refcount_increase_extent(struct xfs_trans *tp, struct xfs_bmbt_irec *irec); void xfs_refcount_decrease_extent(struct xfs_trans *tp, @@ -107,12 +111,14 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, */ #define XFS_REFCOUNT_ITEM_OVERHEAD 32 -extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, +extern int xfs_refcount_has_records(struct xfs_btree_cur *cur, enum xfs_refc_domain domain, xfs_agblock_t bno, - xfs_extlen_t len, bool *exists); + xfs_extlen_t len, enum xbtree_recpacking *outcome); union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); +xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index f3b860970b26..d4afc5f4e6a5 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -112,8 +112,9 @@ xfs_refcountbt_free_block( XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); - error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC, - XFS_AG_RESV_METADATA); + error = xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, + XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); if (error) return error; @@ -201,10 +202,13 @@ STATIC int64_t xfs_refcountbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { + ASSERT(!mask || mask->refc.rc_startblock); + return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - - be32_to_cpu(k2->refc.rc_startblock); + be32_to_cpu(k2->refc.rc_startblock); } STATIC xfs_failaddr_t @@ -299,6 +303,19 @@ xfs_refcountbt_recs_inorder( be32_to_cpu(r2->refc.rc_startblock); } +STATIC enum xbtree_key_contig +xfs_refcountbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->refc.rc_startblock); + + return xbtree_key_contig(be32_to_cpu(key1->refc.rc_startblock), + be32_to_cpu(key2->refc.rc_startblock)); +} + static const struct xfs_btree_ops xfs_refcountbt_ops = { .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), @@ -318,6 +335,7 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { .diff_two_keys = xfs_refcountbt_diff_two_keys, .keys_inorder = xfs_refcountbt_keys_inorder, .recs_inorder = xfs_refcountbt_recs_inorder, + .keys_contiguous = xfs_refcountbt_keys_contiguous, }; /* @@ -339,10 +357,7 @@ xfs_refcountbt_init_common( cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; - + cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.refc.nr_ops = 0; cur->bc_ag.refc.shape_changes = 0; cur->bc_ops = &xfs_refcountbt_ops; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index df720041cd3d..f4dc23b3b837 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -193,7 +193,7 @@ done: } /* Convert an internal btree record to an rmap record. */ -int +xfs_failaddr_t xfs_rmap_btrec_to_irec( const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec) @@ -205,51 +205,74 @@ xfs_rmap_btrec_to_irec( irec); } -/* - * Get the data from the pointed-to record. - */ -int -xfs_rmap_get_rec( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *irec, - int *stat) +/* Simple checks for rmap records. */ +xfs_failaddr_t +xfs_rmap_check_irec( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_perag *pag = cur->bc_ag.pag; - union xfs_btree_rec *rec; - int error; - - error = xfs_btree_get_rec(cur, &rec, stat); - if (error || !*stat) - return error; - - if (xfs_rmap_btrec_to_irec(rec, irec)) - goto out_bad_rec; + struct xfs_mount *mp = cur->bc_mp; + bool is_inode; + bool is_unwritten; + bool is_bmbt; + bool is_attr; if (irec->rm_blockcount == 0) - goto out_bad_rec; + return __this_address; if (irec->rm_startblock <= XFS_AGFL_BLOCK(mp)) { if (irec->rm_owner != XFS_RMAP_OWN_FS) - goto out_bad_rec; + return __this_address; if (irec->rm_blockcount != XFS_AGFL_BLOCK(mp) + 1) - goto out_bad_rec; + return __this_address; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbext(pag, irec->rm_startblock, - irec->rm_blockcount)) - goto out_bad_rec; + if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock, + irec->rm_blockcount)) + return __this_address; } if (!(xfs_verify_ino(mp, irec->rm_owner) || (irec->rm_owner <= XFS_RMAP_OWN_FS && irec->rm_owner >= XFS_RMAP_OWN_MIN))) - goto out_bad_rec; + return __this_address; + + /* Check flags. */ + is_inode = !XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); + is_bmbt = irec->rm_flags & XFS_RMAP_BMBT_BLOCK; + is_attr = irec->rm_flags & XFS_RMAP_ATTR_FORK; + is_unwritten = irec->rm_flags & XFS_RMAP_UNWRITTEN; + + if (is_bmbt && irec->rm_offset != 0) + return __this_address; + + if (!is_inode && irec->rm_offset != 0) + return __this_address; + + if (is_unwritten && (is_bmbt || !is_inode || is_attr)) + return __this_address; + + if (!is_inode && (is_bmbt || is_unwritten || is_attr)) + return __this_address; + + /* Check for a valid fork offset, if applicable. */ + if (is_inode && !is_bmbt && + !xfs_verify_fileext(mp, irec->rm_offset, irec->rm_blockcount)) + return __this_address; + + return NULL; +} + +static inline int +xfs_rmap_complain_bad_rec( + struct xfs_btree_cur *cur, + xfs_failaddr_t fa, + const struct xfs_rmap_irec *irec) +{ + struct xfs_mount *mp = cur->bc_mp; - return 0; -out_bad_rec: xfs_warn(mp, - "Reverse Mapping BTree record corruption in AG %d detected!", - pag->pag_agno); + "Reverse Mapping BTree record corruption in AG %d detected at %pS!", + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, @@ -257,6 +280,32 @@ out_bad_rec: return -EFSCORRUPTED; } +/* + * Get the data from the pointed-to record. + */ +int +xfs_rmap_get_rec( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + xfs_failaddr_t fa; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (error || !*stat) + return error; + + fa = xfs_rmap_btrec_to_irec(rec, irec); + if (!fa) + fa = xfs_rmap_check_irec(cur, irec); + if (fa) + return xfs_rmap_complain_bad_rec(cur, fa, irec); + + return 0; +} + struct xfs_find_left_neighbor_info { struct xfs_rmap_irec high; struct xfs_rmap_irec *irec; @@ -2320,11 +2369,14 @@ xfs_rmap_query_range_helper( { struct xfs_rmap_query_range_info *query = priv; struct xfs_rmap_irec irec; - int error; + xfs_failaddr_t fa; + + fa = xfs_rmap_btrec_to_irec(rec, &irec); + if (!fa) + fa = xfs_rmap_check_irec(cur, &irec); + if (fa) + return xfs_rmap_complain_bad_rec(cur, fa, &irec); - error = xfs_rmap_btrec_to_irec(rec, &irec); - if (error) - return error; return query->fn(cur, &irec, query->priv); } @@ -2394,7 +2446,6 @@ xfs_rmap_finish_one( struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; struct xfs_btree_cur *rcur; struct xfs_buf *agbp = NULL; int error = 0; @@ -2402,26 +2453,22 @@ xfs_rmap_finish_one( xfs_agblock_t bno; bool unwritten; - pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock)); bno = XFS_FSB_TO_AGBNO(mp, ri->ri_bmap.br_startblock); - trace_xfs_rmap_deferred(mp, pag->pag_agno, ri->ri_type, bno, + trace_xfs_rmap_deferred(mp, ri->ri_pag->pag_agno, ri->ri_type, bno, ri->ri_owner, ri->ri_whichfork, ri->ri_bmap.br_startoff, ri->ri_bmap.br_blockcount, ri->ri_bmap.br_state); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { - error = -EIO; - goto out_drop; - } - + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) + return -EIO; /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.pag != pag) { + if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -2432,15 +2479,13 @@ xfs_rmap_finish_one( * rmapbt, because a shape change could cause us to * allocate blocks. */ - error = xfs_free_extent_fix_freelist(tp, pag, &agbp); + error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp); if (error) - goto out_drop; - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { - error = -EFSCORRUPTED; - goto out_drop; - } + return error; + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) + return -EFSCORRUPTED; - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); + rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); } *pcur = rcur; @@ -2480,8 +2525,7 @@ xfs_rmap_finish_one( ASSERT(0); error = -EFSCORRUPTED; } -out_drop: - xfs_perag_put(pag); + return error; } @@ -2526,6 +2570,7 @@ __xfs_rmap_add( ri->ri_whichfork = whichfork; ri->ri_bmap = *bmap; + xfs_rmap_update_get_group(tp->t_mountp, ri); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); } @@ -2664,14 +2709,21 @@ xfs_rmap_compare( return 0; } -/* Is there a record covering a given extent? */ +/* + * Scan the physical storage part of the keyspace of the reverse mapping index + * and tell us if the area has no records, is fully mapped by records, or is + * partially filled. + */ int -xfs_rmap_has_record( +xfs_rmap_has_records( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - bool *exists) + enum xbtree_recpacking *outcome) { + union xfs_btree_key mask = { + .rmap.rm_startblock = cpu_to_be32(-1U), + }; union xfs_btree_irec low; union xfs_btree_irec high; @@ -2680,68 +2732,144 @@ xfs_rmap_has_record( memset(&high, 0xFF, sizeof(high)); high.r.rm_startblock = bno + len - 1; - return xfs_btree_has_record(cur, &low, &high, exists); + return xfs_btree_has_records(cur, &low, &high, &mask, outcome); } -/* - * Is there a record for this owner completely covering a given physical - * extent? If so, *has_rmap will be set to true. If there is no record - * or the record only covers part of the range, we set *has_rmap to false. - * This function doesn't perform range lookups or offset checks, so it is - * not suitable for checking data fork blocks. - */ -int -xfs_rmap_record_exists( - struct xfs_btree_cur *cur, +struct xfs_rmap_ownercount { + /* Owner that we're looking for. */ + struct xfs_rmap_irec good; + + /* rmap search keys */ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + + struct xfs_rmap_matches *results; + + /* Stop early if we find a nonmatch? */ + bool stop_on_nonmatch; +}; + +/* Does this rmap represent space that can have multiple owners? */ +static inline bool +xfs_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + if (!xfs_has_reflink(mp)) + return false; + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK)) + return false; + return true; +} + +static inline void +xfs_rmap_ownercount_init( + struct xfs_rmap_ownercount *roc, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap) + struct xfs_rmap_matches *results) { - uint64_t owner; - uint64_t offset; - unsigned int flags; - int has_record; - struct xfs_rmap_irec irec; - int error; + memset(roc, 0, sizeof(*roc)); + roc->results = results; + + roc->low.rm_startblock = bno; + memset(&roc->high, 0xFF, sizeof(roc->high)); + roc->high.rm_startblock = bno + len - 1; + + memset(results, 0, sizeof(*results)); + roc->good.rm_startblock = bno; + roc->good.rm_blockcount = len; + roc->good.rm_owner = oinfo->oi_owner; + roc->good.rm_offset = oinfo->oi_offset; + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) + roc->good.rm_flags |= XFS_RMAP_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) + roc->good.rm_flags |= XFS_RMAP_BMBT_BLOCK; +} - xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); - ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || - (flags & XFS_RMAP_BMBT_BLOCK)); +/* Figure out if this is a match for the owner. */ +STATIC int +xfs_rmap_count_owners_helper( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_rmap_ownercount *roc = priv; + struct xfs_rmap_irec check = *rec; + unsigned int keyflags; + bool filedata; + int64_t delta; + + filedata = !XFS_RMAP_NON_INODE_OWNER(check.rm_owner) && + !(check.rm_flags & XFS_RMAP_BMBT_BLOCK); + + /* Trim the part of check that comes before the comparison range. */ + delta = (int64_t)roc->good.rm_startblock - check.rm_startblock; + if (delta > 0) { + check.rm_startblock += delta; + check.rm_blockcount -= delta; + if (filedata) + check.rm_offset += delta; + } - error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec, - &has_record); - if (error) - return error; - if (!has_record) { - *has_rmap = false; - return 0; + /* Trim the part of check that comes after the comparison range. */ + delta = (check.rm_startblock + check.rm_blockcount) - + (roc->good.rm_startblock + roc->good.rm_blockcount); + if (delta > 0) + check.rm_blockcount -= delta; + + /* Don't care about unwritten status for establishing ownership. */ + keyflags = check.rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK); + + if (check.rm_startblock == roc->good.rm_startblock && + check.rm_blockcount == roc->good.rm_blockcount && + check.rm_owner == roc->good.rm_owner && + check.rm_offset == roc->good.rm_offset && + keyflags == roc->good.rm_flags) { + roc->results->matches++; + } else { + roc->results->non_owner_matches++; + if (xfs_rmap_shareable(cur->bc_mp, &roc->good) ^ + xfs_rmap_shareable(cur->bc_mp, &check)) + roc->results->bad_non_owner_matches++; } - *has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno && - irec.rm_startblock + irec.rm_blockcount >= bno + len); + if (roc->results->non_owner_matches && roc->stop_on_nonmatch) + return -ECANCELED; + return 0; } -struct xfs_rmap_key_state { - uint64_t owner; - uint64_t offset; - unsigned int flags; -}; - -/* For each rmap given, figure out if it doesn't match the key we want. */ -STATIC int -xfs_rmap_has_other_keys_helper( +/* Count the number of owners and non-owners of this range of blocks. */ +int +xfs_rmap_count_owners( struct xfs_btree_cur *cur, - const struct xfs_rmap_irec *rec, - void *priv) + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + struct xfs_rmap_matches *results) { - struct xfs_rmap_key_state *rks = priv; + struct xfs_rmap_ownercount roc; + int error; - if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && - ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) - return 0; - return -ECANCELED; + xfs_rmap_ownercount_init(&roc, bno, len, oinfo, results); + error = xfs_rmap_query_range(cur, &roc.low, &roc.high, + xfs_rmap_count_owners_helper, &roc); + if (error) + return error; + + /* + * There can't be any non-owner rmaps that conflict with the given + * owner if we didn't find any rmaps matching the owner. + */ + if (!results->matches) + results->bad_non_owner_matches = 0; + + return 0; } /* @@ -2754,28 +2882,26 @@ xfs_rmap_has_other_keys( xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap) + bool *has_other) { - struct xfs_rmap_irec low = {0}; - struct xfs_rmap_irec high; - struct xfs_rmap_key_state rks; + struct xfs_rmap_matches res; + struct xfs_rmap_ownercount roc; int error; - xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); - *has_rmap = false; - - low.rm_startblock = bno; - memset(&high, 0xFF, sizeof(high)); - high.rm_startblock = bno + len - 1; + xfs_rmap_ownercount_init(&roc, bno, len, oinfo, &res); + roc.stop_on_nonmatch = true; - error = xfs_rmap_query_range(cur, &low, &high, - xfs_rmap_has_other_keys_helper, &rks); + error = xfs_rmap_query_range(cur, &roc.low, &roc.high, + xfs_rmap_count_owners_helper, &roc); if (error == -ECANCELED) { - *has_rmap = true; + *has_other = true; return 0; } + if (error) + return error; - return error; + *has_other = false; + return 0; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 2dac88cea28d..3c98d9d50afb 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -62,13 +62,14 @@ xfs_rmap_irec_offset_pack( return x; } -static inline int +static inline xfs_failaddr_t xfs_rmap_irec_offset_unpack( __u64 offset, struct xfs_rmap_irec *irec) { if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS)) - return -EFSCORRUPTED; + return __this_address; + irec->rm_offset = XFS_RMAP_OFF(offset); irec->rm_flags = 0; if (offset & XFS_RMAP_OFF_ATTR_FORK) @@ -77,7 +78,7 @@ xfs_rmap_irec_offset_unpack( irec->rm_flags |= XFS_RMAP_BMBT_BLOCK; if (offset & XFS_RMAP_OFF_UNWRITTEN) irec->rm_flags |= XFS_RMAP_UNWRITTEN; - return 0; + return NULL; } static inline void @@ -162,8 +163,12 @@ struct xfs_rmap_intent { int ri_whichfork; uint64_t ri_owner; struct xfs_bmbt_irec ri_bmap; + struct xfs_perag *ri_pag; }; +void xfs_rmap_update_get_group(struct xfs_mount *mp, + struct xfs_rmap_intent *ri); + /* functions for updating the rmapbt based on bmbt map/unmap operations */ void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); @@ -188,16 +193,31 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_compare(const struct xfs_rmap_irec *a, const struct xfs_rmap_irec *b); union xfs_btree_rec; -int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, +xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); -int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, bool *exists); -int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno, +xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec); + +int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, enum xbtree_recpacking *outcome); + +struct xfs_rmap_matches { + /* Number of owner matches. */ + unsigned long long matches; + + /* Number of non-owner matches. */ + unsigned long long non_owner_matches; + + /* Number of non-owner matches that conflict with the owner matches. */ + unsigned long long bad_non_owner_matches; +}; + +int xfs_rmap_count_owners(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap); + struct xfs_rmap_matches *rmatch); int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo, - bool *has_rmap); + bool *has_other); int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap); extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index d3285684bb5e..6c81b20e97d2 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -156,6 +156,16 @@ xfs_rmapbt_get_maxrecs( return cur->bc_mp->m_rmap_mxr[level != 0]; } +/* + * Convert the ondisk record's offset field into the ondisk key's offset field. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. + */ +static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec) +{ + return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); +} + STATIC void xfs_rmapbt_init_key_from_rec( union xfs_btree_key *key, @@ -163,7 +173,7 @@ xfs_rmapbt_init_key_from_rec( { key->rmap.rm_startblock = rec->rmap.rm_startblock; key->rmap.rm_owner = rec->rmap.rm_owner; - key->rmap.rm_offset = rec->rmap.rm_offset; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); } /* @@ -186,7 +196,7 @@ xfs_rmapbt_init_high_key_from_rec( key->rmap.rm_startblock = rec->rmap.rm_startblock; be32_add_cpu(&key->rmap.rm_startblock, adj); key->rmap.rm_owner = rec->rmap.rm_owner; - key->rmap.rm_offset = rec->rmap.rm_offset; + key->rmap.rm_offset = ondisk_rec_offset_to_key(rec); if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) return; @@ -219,6 +229,16 @@ xfs_rmapbt_init_ptr_from_cur( ptr->s = agf->agf_roots[cur->bc_btnum]; } +/* + * Mask the appropriate parts of the ondisk key field for a key comparison. + * Fork and bmbt are significant parts of the rmap record key, but written + * status is merely a record attribute. + */ +static inline uint64_t offset_keymask(uint64_t offset) +{ + return offset & ~XFS_RMAP_OFF_UNWRITTEN; +} + STATIC int64_t xfs_rmapbt_key_diff( struct xfs_btree_cur *cur, @@ -240,8 +260,8 @@ xfs_rmapbt_key_diff( else if (y > x) return -1; - x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); - y = rec->rm_offset; + x = offset_keymask(be64_to_cpu(kp->rm_offset)); + y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); if (x > y) return 1; else if (y > x) @@ -253,31 +273,43 @@ STATIC int64_t xfs_rmapbt_diff_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, - const union xfs_btree_key *k2) + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) { const struct xfs_rmap_key *kp1 = &k1->rmap; const struct xfs_rmap_key *kp2 = &k2->rmap; int64_t d; __u64 x, y; + /* Doesn't make sense to mask off the physical space part */ + ASSERT(!mask || mask->rmap.rm_startblock); + d = (int64_t)be32_to_cpu(kp1->rm_startblock) - - be32_to_cpu(kp2->rm_startblock); + be32_to_cpu(kp2->rm_startblock); if (d) return d; - x = be64_to_cpu(kp1->rm_owner); - y = be64_to_cpu(kp2->rm_owner); - if (x > y) - return 1; - else if (y > x) - return -1; + if (!mask || mask->rmap.rm_owner) { + x = be64_to_cpu(kp1->rm_owner); + y = be64_to_cpu(kp2->rm_owner); + if (x > y) + return 1; + else if (y > x) + return -1; + } + + if (!mask || mask->rmap.rm_offset) { + /* Doesn't make sense to allow offset but not owner */ + ASSERT(!mask || mask->rmap.rm_owner); + + x = offset_keymask(be64_to_cpu(kp1->rm_offset)); + y = offset_keymask(be64_to_cpu(kp2->rm_offset)); + if (x > y) + return 1; + else if (y > x) + return -1; + } - x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); - y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); - if (x > y) - return 1; - else if (y > x) - return -1; return 0; } @@ -387,8 +419,8 @@ xfs_rmapbt_keys_inorder( return 1; else if (a > b) return 0; - a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); - b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); + a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset)); if (a <= b) return 1; return 0; @@ -417,13 +449,33 @@ xfs_rmapbt_recs_inorder( return 1; else if (a > b) return 0; - a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); - b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); + a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset)); + b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset)); if (a <= b) return 1; return 0; } +STATIC enum xbtree_key_contig +xfs_rmapbt_keys_contiguous( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask) +{ + ASSERT(!mask || mask->rmap.rm_startblock); + + /* + * We only support checking contiguity of the physical space component. + * If any callers ever need more specificity than that, they'll have to + * implement it here. + */ + ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset)); + + return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock), + be32_to_cpu(key2->rmap.rm_startblock)); +} + static const struct xfs_btree_ops xfs_rmapbt_ops = { .rec_len = sizeof(struct xfs_rmap_rec), .key_len = 2 * sizeof(struct xfs_rmap_key), @@ -443,6 +495,7 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .diff_two_keys = xfs_rmapbt_diff_two_keys, .keys_inorder = xfs_rmapbt_keys_inorder, .recs_inorder = xfs_rmapbt_recs_inorder, + .keys_contiguous = xfs_rmapbt_keys_contiguous, }; static struct xfs_btree_cur * @@ -460,10 +513,7 @@ xfs_rmapbt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_ops = &xfs_rmapbt_ops; - /* take a reference for the cursor */ - atomic_inc(&pag->pag_ref); - cur->bc_ag.pag = pag; - + cur->bc_ag.pag = xfs_perag_hold(pag); return cur; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 99cc03a298e2..ba0f17bc1dc0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -72,7 +72,8 @@ xfs_sb_validate_v5_features( } /* - * We support all XFS versions newer than a v4 superblock with V2 directories. + * We current support XFS v5 formats with known features and v4 superblocks with + * at least V2 directories. */ bool xfs_sb_good_version( @@ -86,16 +87,16 @@ xfs_sb_good_version( if (xfs_sb_is_v5(sbp)) return xfs_sb_validate_v5_features(sbp); + /* versions prior to v4 are not supported */ + if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_4) + return false; + /* We must not have any unknown v4 feature bits set */ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) return false; - /* versions prior to v4 are not supported */ - if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) - return false; - /* V4 filesystems need v2 directories and unwritten extents */ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) return false; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 5ebdda7e1078..851220021484 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -204,6 +204,18 @@ enum xfs_ag_resv_type { XFS_AG_RESV_RMAPBT, }; +/* Results of scanning a btree keyspace to check occupancy. */ +enum xbtree_recpacking { + /* None of the keyspace maps to records. */ + XBTREE_RECPACKING_EMPTY = 0, + + /* Some, but not all, of the keyspace maps to records. */ + XBTREE_RECPACKING_SPARSE, + + /* The entire keyspace maps to records. */ + XBTREE_RECPACKING_FULL, +}; + /* * Type verifier functions */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 4dd52b15f09c..6c6e5eba42c8 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -18,6 +18,15 @@ #include "scrub/scrub.h" #include "scrub/common.h" +int +xchk_setup_agheader( + struct xfs_scrub *sc) +{ + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_fs(sc); +} + /* Superblock */ /* Cross-reference with the other btrees. */ @@ -42,8 +51,9 @@ xchk_superblock_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); /* scrub teardown will take care of sc->sa for us */ } @@ -505,9 +515,10 @@ xchk_agf_xref( xchk_agf_xref_freeblks(sc); xchk_agf_xref_cntbt(sc); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_agf_xref_btreeblks(sc); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); xchk_agf_xref_refcblks(sc); /* scrub teardown will take care of sc->sa for us */ @@ -633,8 +644,9 @@ xchk_agfl_block_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); } /* Scrub an AGFL block. */ @@ -689,8 +701,9 @@ xchk_agfl_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); /* * Scrub teardown will take care of sc->sa for us. Leave sc->sa @@ -844,8 +857,9 @@ xchk_agi_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); xchk_agi_xref_icounts(sc); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); xchk_agi_xref_fiblocks(sc); /* scrub teardown will take care of sc->sa for us */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index c37e6d72760b..bbaa65422c4f 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -487,10 +487,11 @@ xrep_agfl_walk_rmap( /* Strike out the blocks that are cross-linked according to the rmapbt. */ STATIC int xrep_agfl_check_extent( - struct xrep_agfl *ra, uint64_t start, - uint64_t len) + uint64_t len, + void *priv) { + struct xrep_agfl *ra = priv; xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); xfs_agblock_t last_agbno = agbno + len - 1; int error; @@ -538,7 +539,6 @@ xrep_agfl_collect_blocks( struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; - struct xbitmap_range *br, *n; int error; ra.sc = sc; @@ -579,11 +579,7 @@ xrep_agfl_collect_blocks( /* Strike out the blocks that are cross-linked. */ ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); - for_each_xbitmap_extent(br, n, agfl_extents) { - error = xrep_agfl_check_extent(&ra, br->start, br->len); - if (error) - break; - } + error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra); xfs_btree_del_cursor(ra.rmap_cur, error); if (error) goto out_bmp; @@ -629,21 +625,58 @@ xrep_agfl_update_agf( XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); } +struct xrep_agfl_fill { + struct xbitmap used_extents; + struct xfs_scrub *sc; + __be32 *agfl_bno; + xfs_agblock_t flcount; + unsigned int fl_off; +}; + +/* Fill the AGFL with whatever blocks are in this extent. */ +static int +xrep_agfl_fill( + uint64_t start, + uint64_t len, + void *priv) +{ + struct xrep_agfl_fill *af = priv; + struct xfs_scrub *sc = af->sc; + xfs_fsblock_t fsbno = start; + int error; + + while (fsbno < start + len && af->fl_off < af->flcount) + af->agfl_bno[af->fl_off++] = + cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++)); + + trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno, + XFS_FSB_TO_AGBNO(sc->mp, start), len); + + error = xbitmap_set(&af->used_extents, start, fsbno - 1); + if (error) + return error; + + if (af->fl_off == af->flcount) + return -ECANCELED; + + return 0; +} + /* Write out a totally new AGFL. */ -STATIC void +STATIC int xrep_agfl_init_header( struct xfs_scrub *sc, struct xfs_buf *agfl_bp, struct xbitmap *agfl_extents, xfs_agblock_t flcount) { + struct xrep_agfl_fill af = { + .sc = sc, + .flcount = flcount, + }; struct xfs_mount *mp = sc->mp; - __be32 *agfl_bno; - struct xbitmap_range *br; - struct xbitmap_range *n; struct xfs_agfl *agfl; - xfs_agblock_t agbno; - unsigned int fl_off; + int error; ASSERT(flcount <= xfs_agfl_size(mp)); @@ -662,36 +695,18 @@ xrep_agfl_init_header( * blocks than fit in the AGFL, they will be freed in a subsequent * step. */ - fl_off = 0; - agfl_bno = xfs_buf_to_agfl_bno(agfl_bp); - for_each_xbitmap_extent(br, n, agfl_extents) { - agbno = XFS_FSB_TO_AGBNO(mp, br->start); - - trace_xrep_agfl_insert(mp, sc->sa.pag->pag_agno, agbno, - br->len); - - while (br->len > 0 && fl_off < flcount) { - agfl_bno[fl_off] = cpu_to_be32(agbno); - fl_off++; - agbno++; - - /* - * We've now used br->start by putting it in the AGFL, - * so bump br so that we don't reap the block later. - */ - br->start++; - br->len--; - } - - if (br->len) - break; - list_del(&br->list); - kfree(br); - } + xbitmap_init(&af.used_extents); + af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp), + xbitmap_walk(agfl_extents, xrep_agfl_fill, &af); + error = xbitmap_disunion(agfl_extents, &af.used_extents); + if (error) + return error; /* Write new AGFL to disk. */ xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF); xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1); + xbitmap_destroy(&af.used_extents); + return 0; } /* Repair the AGFL. */ @@ -744,7 +759,9 @@ xrep_agfl( * buffers until we know that part works. */ xrep_agfl_update_agf(sc, agf_bp, flcount); - xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount); + error = xrep_agfl_init_header(sc, agfl_bp, &agfl_extents, flcount); + if (error) + goto err; /* * Ok, the AGFL should be ready to go now. Roll the transaction to diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 3b38f4e2a537..279af72b1671 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -24,10 +24,19 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_ag_btree(sc, false); } /* Free space btree scrubber. */ + +struct xchk_alloc { + /* Previous free space extent. */ + struct xfs_alloc_rec_incore prev; +}; + /* * Ensure there's a corresponding cntbt/bnobt record matching this * bnobt/cntbt record, respectively. @@ -75,9 +84,11 @@ xchk_allocbt_xref_other( STATIC void xchk_allocbt_xref( struct xfs_scrub *sc, - xfs_agblock_t agbno, - xfs_extlen_t len) + const struct xfs_alloc_rec_incore *irec) { + xfs_agblock_t agbno = irec->ar_startblock; + xfs_extlen_t len = irec->ar_blockcount; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; @@ -85,25 +96,44 @@ xchk_allocbt_xref( xchk_xref_is_not_inode_chunk(sc, agbno, len); xchk_xref_has_no_owner(sc, agbno, len); xchk_xref_is_not_shared(sc, agbno, len); + xchk_xref_is_not_cow_staging(sc, agbno, len); +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_allocbt_mergeable( + struct xchk_btree *bs, + struct xchk_alloc *ca, + const struct xfs_alloc_rec_incore *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (ca->prev.ar_blockcount > 0 && + ca->prev.ar_startblock + ca->prev.ar_blockcount == irec->ar_startblock && + ca->prev.ar_blockcount + irec->ar_blockcount < (uint32_t)~0U) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&ca->prev, irec, sizeof(*irec)); } /* Scrub a bnobt/cntbt record. */ STATIC int xchk_allocbt_rec( - struct xchk_btree *bs, - const union xfs_btree_rec *rec) + struct xchk_btree *bs, + const union xfs_btree_rec *rec) { - struct xfs_perag *pag = bs->cur->bc_ag.pag; - xfs_agblock_t bno; - xfs_extlen_t len; + struct xfs_alloc_rec_incore irec; + struct xchk_alloc *ca = bs->private; - bno = be32_to_cpu(rec->alloc.ar_startblock); - len = be32_to_cpu(rec->alloc.ar_blockcount); - - if (!xfs_verify_agbext(pag, bno, len)) + xfs_alloc_btrec_to_irec(rec, &irec); + if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } - xchk_allocbt_xref(bs->sc, bno, len); + xchk_allocbt_mergeable(bs, ca, &irec); + xchk_allocbt_xref(bs->sc, &irec); return 0; } @@ -114,10 +144,11 @@ xchk_allocbt( struct xfs_scrub *sc, xfs_btnum_t which) { + struct xchk_alloc ca = { }; struct xfs_btree_cur *cur; cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur; - return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL); + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); } int @@ -141,15 +172,15 @@ xchk_xref_is_used_space( xfs_agblock_t agbno, xfs_extlen_t len) { - bool is_freesp; + enum xbtree_recpacking outcome; int error; if (!sc->sa.bno_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp); + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur)) return; - if (is_freesp) + if (outcome != XBTREE_RECPACKING_EMPTY) xchk_btree_xref_set_corrupt(sc, sc->sa.bno_cur, 0); } diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 31529b9bf389..6c16d9530cca 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -15,11 +15,51 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" +#include "xfs_attr_sf.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/attr.h" +/* Free the buffers linked from the xattr buffer. */ +static void +xchk_xattr_buf_cleanup( + void *priv) +{ + struct xchk_xattr_buf *ab = priv; + + kvfree(ab->freemap); + ab->freemap = NULL; + kvfree(ab->usedmap); + ab->usedmap = NULL; + kvfree(ab->value); + ab->value = NULL; + ab->value_sz = 0; +} + +/* + * Allocate the free space bitmap if we're trying harder; there are leaf blocks + * in the attr fork; or we can't tell if there are leaf blocks. + */ +static inline bool +xchk_xattr_want_freemap( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + + if (sc->flags & XCHK_TRY_HARDER) + return true; + + if (!sc->ip) + return true; + + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + if (!ifp) + return false; + + return xfs_ifork_has_extents(ifp); +} + /* * Allocate enough memory to hold an attr value and attr block bitmaps, * reallocating the buffer if necessary. Buffer contents are not preserved @@ -28,41 +68,49 @@ static int xchk_setup_xattr_buf( struct xfs_scrub *sc, - size_t value_size, - gfp_t flags) + size_t value_size) { - size_t sz; + size_t bmp_sz; struct xchk_xattr_buf *ab = sc->buf; + void *new_val; - /* - * We need enough space to read an xattr value from the file or enough - * space to hold three copies of the xattr free space bitmap. We don't - * need the buffer space for both purposes at the same time. - */ - sz = 3 * sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); - sz = max_t(size_t, sz, value_size); + bmp_sz = sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); - /* - * If there's already a buffer, figure out if we need to reallocate it - * to accommodate a larger size. - */ - if (ab) { - if (sz <= ab->sz) - return 0; - kvfree(ab); - sc->buf = NULL; - } + if (ab) + goto resize_value; - /* - * Don't zero the buffer upon allocation to avoid runtime overhead. - * All users must be careful never to read uninitialized contents. - */ - ab = kvmalloc(sizeof(*ab) + sz, flags); + ab = kvzalloc(sizeof(struct xchk_xattr_buf), XCHK_GFP_FLAGS); if (!ab) return -ENOMEM; - - ab->sz = sz; sc->buf = ab; + sc->buf_cleanup = xchk_xattr_buf_cleanup; + + ab->usedmap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS); + if (!ab->usedmap) + return -ENOMEM; + + if (xchk_xattr_want_freemap(sc)) { + ab->freemap = kvmalloc(bmp_sz, XCHK_GFP_FLAGS); + if (!ab->freemap) + return -ENOMEM; + } + +resize_value: + if (ab->value_sz >= value_size) + return 0; + + if (ab->value) { + kvfree(ab->value); + ab->value = NULL; + ab->value_sz = 0; + } + + new_val = kvmalloc(value_size, XCHK_GFP_FLAGS); + if (!new_val) + return -ENOMEM; + + ab->value = new_val; + ab->value_sz = value_size; return 0; } @@ -79,8 +127,7 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, - XCHK_GFP_FLAGS); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX); if (error) return error; } @@ -111,11 +158,24 @@ xchk_xattr_listent( int namelen, int valuelen) { + struct xfs_da_args args = { + .op_flags = XFS_DA_OP_NOTIME, + .attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK, + .geo = context->dp->i_mount->m_attr_geo, + .whichfork = XFS_ATTR_FORK, + .dp = context->dp, + .name = name, + .namelen = namelen, + .hashval = xfs_da_hashname(name, namelen), + .trans = context->tp, + .valuelen = valuelen, + }; + struct xchk_xattr_buf *ab; struct xchk_xattr *sx; - struct xfs_da_args args = { NULL }; int error = 0; sx = container_of(context, struct xchk_xattr, context); + ab = sx->sc->buf; if (xchk_should_terminate(sx->sc, &error)) { context->seen_enough = error; @@ -128,18 +188,32 @@ xchk_xattr_listent( return; } + /* Only one namespace bit allowed. */ + if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + goto fail_xref; + } + /* Does this name make sense? */ if (!xfs_attr_namecheck(name, namelen)) { xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - return; + goto fail_xref; } /* + * Local xattr values are stored in the attr leaf block, so we don't + * need to retrieve the value from a remote block to detect corruption + * problems. + */ + if (flags & XFS_ATTR_LOCAL) + goto fail_xref; + + /* * Try to allocate enough memory to extrat the attr value. If that * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS); + error = xchk_setup_xattr_buf(sx->sc, valuelen); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -147,17 +221,7 @@ xchk_xattr_listent( return; } - args.op_flags = XFS_DA_OP_NOTIME; - args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK; - args.geo = context->dp->i_mount->m_attr_geo; - args.whichfork = XFS_ATTR_FORK; - args.dp = context->dp; - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.trans = context->tp; - args.value = xchk_xattr_valuebuf(sx->sc); - args.valuelen = valuelen; + args.value = ab->value; error = xfs_attr_get_ilocked(&args); /* ENODATA means the hash lookup failed and the attr is bad */ @@ -213,25 +277,23 @@ xchk_xattr_set_map( STATIC bool xchk_xattr_check_freemap( struct xfs_scrub *sc, - unsigned long *map, struct xfs_attr3_icleaf_hdr *leafhdr) { - unsigned long *freemap = xchk_xattr_freemap(sc); - unsigned long *dstmap = xchk_xattr_dstmap(sc); + struct xchk_xattr_buf *ab = sc->buf; unsigned int mapsize = sc->mp->m_attr_geo->blksize; int i; /* Construct bitmap of freemap contents. */ - bitmap_zero(freemap, mapsize); + bitmap_zero(ab->freemap, mapsize); for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { - if (!xchk_xattr_set_map(sc, freemap, + if (!xchk_xattr_set_map(sc, ab->freemap, leafhdr->freemap[i].base, leafhdr->freemap[i].size)) return false; } /* Look for bits that are set in freemap and are marked in use. */ - return bitmap_and(dstmap, freemap, map, mapsize) == 0; + return !bitmap_intersects(ab->freemap, ab->usedmap, mapsize); } /* @@ -251,7 +313,7 @@ xchk_xattr_entry( __u32 *last_hashval) { struct xfs_mount *mp = ds->state->mp; - unsigned long *usedmap = xchk_xattr_usedmap(ds->sc); + struct xchk_xattr_buf *ab = ds->sc->buf; char *name_end; struct xfs_attr_leaf_name_local *lentry; struct xfs_attr_leaf_name_remote *rentry; @@ -291,7 +353,7 @@ xchk_xattr_entry( if (name_end > buf_end) xchk_da_set_corrupt(ds, level); - if (!xchk_xattr_set_map(ds->sc, usedmap, nameidx, namesize)) + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, nameidx, namesize)) xchk_da_set_corrupt(ds, level); if (!(ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) *usedbytes += namesize; @@ -311,35 +373,26 @@ xchk_xattr_block( struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *ent; struct xfs_attr_leaf_entry *entries; - unsigned long *usedmap; + struct xchk_xattr_buf *ab = ds->sc->buf; char *buf_end; size_t off; __u32 last_hashval = 0; unsigned int usedbytes = 0; unsigned int hdrsize; int i; - int error; if (*last_checked == blk->blkno) return 0; - /* Allocate memory for block usage checking. */ - error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS); - if (error == -ENOMEM) - return -EDEADLOCK; - if (error) - return error; - usedmap = xchk_xattr_usedmap(ds->sc); - *last_checked = blk->blkno; - bitmap_zero(usedmap, mp->m_attr_geo->blksize); + bitmap_zero(ab->usedmap, mp->m_attr_geo->blksize); /* Check all the padding. */ if (xfs_has_crc(ds->sc->mp)) { - struct xfs_attr3_leafblock *leaf = bp->b_addr; + struct xfs_attr3_leafblock *leaf3 = bp->b_addr; - if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 || - leaf->hdr.info.hdr.pad != 0) + if (leaf3->hdr.pad1 != 0 || leaf3->hdr.pad2 != 0 || + leaf3->hdr.info.hdr.pad != 0) xchk_da_set_corrupt(ds, level); } else { if (leaf->hdr.pad1 != 0 || leaf->hdr.info.pad != 0) @@ -356,7 +409,7 @@ xchk_xattr_block( xchk_da_set_corrupt(ds, level); if (leafhdr.firstused < hdrsize) xchk_da_set_corrupt(ds, level); - if (!xchk_xattr_set_map(ds->sc, usedmap, 0, hdrsize)) + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, 0, hdrsize)) xchk_da_set_corrupt(ds, level); if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -370,7 +423,7 @@ xchk_xattr_block( for (i = 0, ent = entries; i < leafhdr.count; ent++, i++) { /* Mark the leaf entry itself. */ off = (char *)ent - (char *)leaf; - if (!xchk_xattr_set_map(ds->sc, usedmap, off, + if (!xchk_xattr_set_map(ds->sc, ab->usedmap, off, sizeof(xfs_attr_leaf_entry_t))) { xchk_da_set_corrupt(ds, level); goto out; @@ -384,7 +437,7 @@ xchk_xattr_block( goto out; } - if (!xchk_xattr_check_freemap(ds->sc, usedmap, &leafhdr)) + if (!xchk_xattr_check_freemap(ds->sc, &leafhdr)) xchk_da_set_corrupt(ds, level); if (leafhdr.usedbytes != usedbytes) @@ -468,38 +521,115 @@ out: return error; } +/* Check space usage of shortform attrs. */ +STATIC int +xchk_xattr_check_sf( + struct xfs_scrub *sc) +{ + struct xchk_xattr_buf *ab = sc->buf; + struct xfs_attr_shortform *sf; + struct xfs_attr_sf_entry *sfe; + struct xfs_attr_sf_entry *next; + struct xfs_ifork *ifp; + unsigned char *end; + int i; + int error = 0; + + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + + bitmap_zero(ab->usedmap, ifp->if_bytes); + sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data; + end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; + xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr)); + + sfe = &sf->list[0]; + if ((unsigned char *)sfe > end) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + return 0; + } + + for (i = 0; i < sf->hdr.count; i++) { + unsigned char *name = sfe->nameval; + unsigned char *value = &sfe->nameval[sfe->namelen]; + + if (xchk_should_terminate(sc, &error)) + return error; + + next = xfs_attr_sf_nextentry(sfe); + if ((unsigned char *)next > end) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)sfe - (char *)sf, + sizeof(struct xfs_attr_sf_entry))) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)name - (char *)sf, + sfe->namelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)value - (char *)sf, + sfe->valuelen)) { + xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); + break; + } + + sfe = next; + } + + return 0; +} + /* Scrub the extended attribute metadata. */ int xchk_xattr( struct xfs_scrub *sc) { - struct xchk_xattr sx; + struct xchk_xattr sx = { + .sc = sc, + .context = { + .dp = sc->ip, + .tp = sc->tp, + .resynch = 1, + .put_listent = xchk_xattr_listent, + .allow_incomplete = true, + }, + }; xfs_dablk_t last_checked = -1U; int error = 0; if (!xfs_inode_hasattr(sc->ip)) return -ENOENT; - memset(&sx, 0, sizeof(sx)); - /* Check attribute tree structure */ - error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec, - &last_checked); + /* Allocate memory for xattr checking. */ + error = xchk_setup_xattr_buf(sc, 0); + if (error == -ENOMEM) + return -EDEADLOCK; if (error) - goto out; + return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + /* Check the physical structure of the xattr. */ + if (sc->ip->i_af.if_format == XFS_DINODE_FMT_LOCAL) + error = xchk_xattr_check_sf(sc); + else + error = xchk_da_btree(sc, XFS_ATTR_FORK, xchk_xattr_rec, + &last_checked); + if (error) + return error; - /* Check that every attr key can also be looked up by hash. */ - sx.context.dp = sc->ip; - sx.context.resynch = 1; - sx.context.put_listent = xchk_xattr_listent; - sx.context.tp = sc->tp; - sx.context.allow_incomplete = true; - sx.sc = sc; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; /* - * Look up every xattr in this file by name. + * Look up every xattr in this file by name and hash. * * Use the backend implementation of xfs_attr_list to call * xchk_xattr_listent on every attribute key in this inode. @@ -516,11 +646,11 @@ xchk_xattr( */ error = xfs_attr_list_ilocked(&sx.context); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) - goto out; + return error; /* Did our listent function try to return any errors? */ if (sx.context.seen_enough < 0) - error = sx.context.seen_enough; -out: - return error; + return sx.context.seen_enough; + + return 0; } diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 3590e10e3e62..48fd9402c432 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_ATTR_H__ #define __XFS_SCRUB_ATTR_H__ @@ -10,59 +10,15 @@ * Temporary storage for online scrub and repair of extended attributes. */ struct xchk_xattr_buf { - /* Size of @buf, in bytes. */ - size_t sz; + /* Bitmap of used space in xattr leaf blocks and shortform forks. */ + unsigned long *usedmap; - /* - * Memory buffer -- either used for extracting attr values while - * walking the attributes; or for computing attr block bitmaps when - * checking the attribute tree. - * - * Each bitmap contains enough bits to track every byte in an attr - * block (rounded up to the size of an unsigned long). The attr block - * used space bitmap starts at the beginning of the buffer; the free - * space bitmap follows immediately after; and we have a third buffer - * for storing intermediate bitmap results. - */ - uint8_t buf[]; -}; - -/* A place to store attribute values. */ -static inline uint8_t * -xchk_xattr_valuebuf( - struct xfs_scrub *sc) -{ - struct xchk_xattr_buf *ab = sc->buf; - - return ab->buf; -} - -/* A bitmap of space usage computed by walking an attr leaf block. */ -static inline unsigned long * -xchk_xattr_usedmap( - struct xfs_scrub *sc) -{ - struct xchk_xattr_buf *ab = sc->buf; + /* Bitmap of free space in xattr leaf blocks. */ + unsigned long *freemap; - return (unsigned long *)ab->buf; -} - -/* A bitmap of free space computed by walking attr leaf block free info. */ -static inline unsigned long * -xchk_xattr_freemap( - struct xfs_scrub *sc) -{ - return xchk_xattr_usedmap(sc) + - BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); -} - -/* A bitmap used to hold temporary results. */ -static inline unsigned long * -xchk_xattr_dstmap( - struct xfs_scrub *sc) -{ - return xchk_xattr_freemap(sc) + - BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); -} + /* Memory buffer used to extract xattr values. */ + void *value; + size_t value_sz; +}; #endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index a255f09e9f0a..0c959be396ea 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -1,11 +1,12 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" @@ -13,27 +14,160 @@ #include "scrub/scrub.h" #include "scrub/bitmap.h" +#include <linux/interval_tree_generic.h> + +struct xbitmap_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint64_t bn_start; + + /* Last set bit of this interval. */ + uint64_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint64_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint64_t parameters. */ + +#define START(node) ((node)->bn_start) +#define LAST(node) ((node)->bn_last) + /* - * Set a range of this bitmap. Caller must ensure the range is not set. - * - * This is the logical equivalent of bitmap |= mask(start, len). + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. */ +static inline void +xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); + +static inline void +xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); + +static inline struct xbitmap_node * +xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, + uint64_t last); + +static inline struct xbitmap_node * +xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, + uint64_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline, xbitmap_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap_node, bn_rbnode)) + +/* Clear a range of this bitmap. */ +int +xbitmap_clear( + struct xbitmap *bitmap, + uint64_t start, + uint64_t len) +{ + struct xbitmap_node *bn; + struct xbitmap_node *new_bn; + uint64_t last = start + len - 1; + + while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint64_t old_last = bn->bn_last; + + /* overlaps with the entire clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap_tree_insert(new_bn, &bitmap->xb_root); + } else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } + + return 0; +} + +/* Set a range of this bitmap. */ int xbitmap_set( struct xbitmap *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_range *bmr; + struct xbitmap_node *left; + struct xbitmap_node *right; + uint64_t last = start + len - 1; + int error; - bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS); - if (!bmr) - return -ENOMEM; + /* Is this whole range already set? */ + left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? */ + right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); - INIT_LIST_HEAD(&bmr->list); - bmr->start = start; - bmr->len = len; - list_add_tail(&bmr->list, &bitmap->list); + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap_tree_insert(left, &bitmap->xb_root); + } return 0; } @@ -43,12 +177,11 @@ void xbitmap_destroy( struct xbitmap *bitmap) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; + struct xbitmap_node *bn; - for_each_xbitmap_extent(bmr, n, bitmap) { - list_del(&bmr->list); - kfree(bmr); + while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap_tree_remove(bn, &bitmap->xb_root); + kfree(bn); } } @@ -57,27 +190,7 @@ void xbitmap_init( struct xbitmap *bitmap) { - INIT_LIST_HEAD(&bitmap->list); -} - -/* Compare two btree extents. */ -static int -xbitmap_range_cmp( - void *priv, - const struct list_head *a, - const struct list_head *b) -{ - struct xbitmap_range *ap; - struct xbitmap_range *bp; - - ap = container_of(a, struct xbitmap_range, list); - bp = container_of(b, struct xbitmap_range, list); - - if (ap->start > bp->start) - return 1; - if (ap->start < bp->start) - return -1; - return 0; + bitmap->xb_root = RB_ROOT_CACHED; } /* @@ -94,118 +207,26 @@ xbitmap_range_cmp( * * This is the logical equivalent of bitmap &= ~sub. */ -#define LEFT_ALIGNED (1 << 0) -#define RIGHT_ALIGNED (1 << 1) int xbitmap_disunion( struct xbitmap *bitmap, struct xbitmap *sub) { - struct list_head *lp; - struct xbitmap_range *br; - struct xbitmap_range *new_br; - struct xbitmap_range *sub_br; - uint64_t sub_start; - uint64_t sub_len; - int state; - int error = 0; + struct xbitmap_node *bn; + int error; - if (list_empty(&bitmap->list) || list_empty(&sub->list)) + if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) return 0; - ASSERT(!list_empty(&sub->list)); - - list_sort(NULL, &bitmap->list, xbitmap_range_cmp); - list_sort(NULL, &sub->list, xbitmap_range_cmp); - - /* - * Now that we've sorted both lists, we iterate bitmap once, rolling - * forward through sub and/or bitmap as necessary until we find an - * overlap or reach the end of either list. We do not reset lp to the - * head of bitmap nor do we reset sub_br to the head of sub. The - * list traversal is similar to merge sort, but we're deleting - * instead. In this manner we avoid O(n^2) operations. - */ - sub_br = list_first_entry(&sub->list, struct xbitmap_range, - list); - lp = bitmap->list.next; - while (lp != &bitmap->list) { - br = list_entry(lp, struct xbitmap_range, list); - - /* - * Advance sub_br and/or br until we find a pair that - * intersect or we run out of extents. - */ - while (sub_br->start + sub_br->len <= br->start) { - if (list_is_last(&sub_br->list, &sub->list)) - goto out; - sub_br = list_next_entry(sub_br, list); - } - if (sub_br->start >= br->start + br->len) { - lp = lp->next; - continue; - } - /* trim sub_br to fit the extent we have */ - sub_start = sub_br->start; - sub_len = sub_br->len; - if (sub_br->start < br->start) { - sub_len -= br->start - sub_br->start; - sub_start = br->start; - } - if (sub_len > br->len) - sub_len = br->len; - - state = 0; - if (sub_start == br->start) - state |= LEFT_ALIGNED; - if (sub_start + sub_len == br->start + br->len) - state |= RIGHT_ALIGNED; - switch (state) { - case LEFT_ALIGNED: - /* Coincides with only the left. */ - br->start += sub_len; - br->len -= sub_len; - break; - case RIGHT_ALIGNED: - /* Coincides with only the right. */ - br->len -= sub_len; - lp = lp->next; - break; - case LEFT_ALIGNED | RIGHT_ALIGNED: - /* Total overlap, just delete ex. */ - lp = lp->next; - list_del(&br->list); - kfree(br); - break; - case 0: - /* - * Deleting from the middle: add the new right extent - * and then shrink the left extent. - */ - new_br = kmalloc(sizeof(struct xbitmap_range), - XCHK_GFP_FLAGS); - if (!new_br) { - error = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&new_br->list); - new_br->start = sub_start + sub_len; - new_br->len = br->start + br->len - new_br->start; - list_add(&new_br->list, &br->list); - br->len = sub_start - br->start; - lp = lp->next; - break; - default: - ASSERT(0); - break; - } + for_each_xbitmap_extent(bn, sub) { + error = xbitmap_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); + if (error) + return error; } -out: - return error; + return 0; } -#undef LEFT_ALIGNED -#undef RIGHT_ALIGNED /* * Record all btree blocks seen while iterating all records of a btree. @@ -242,6 +263,38 @@ out: * For the 300th record we just exit, with the list being [1, 4, 2, 3]. */ +/* Mark a btree block to the agblock bitmap. */ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. */ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + /* * Record all the buffers pointed to by the btree cursor. Callers already * engaged in a btree walk should call this function to capture the list of @@ -304,12 +357,97 @@ uint64_t xbitmap_hweight( struct xbitmap *bitmap) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; + struct xbitmap_node *bn; uint64_t ret = 0; - for_each_xbitmap_extent(bmr, n, bitmap) - ret += bmr->len; + for_each_xbitmap_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; return ret; } + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap_walk( + struct xbitmap *bitmap, + xbitmap_walk_fn fn, + void *priv) +{ + struct xbitmap_node *bn; + int error = 0; + + for_each_xbitmap_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +struct xbitmap_walk_bits { + xbitmap_walk_bits_fn fn; + void *priv; +}; + +/* Walk all the bits in a run. */ +static int +xbitmap_walk_bits_in_run( + uint64_t start, + uint64_t len, + void *priv) +{ + struct xbitmap_walk_bits *wb = priv; + uint64_t i; + int error = 0; + + for (i = start; i < start + len; i++) { + error = wb->fn(i, wb->priv); + if (error) + break; + } + + return error; +} + +/* Call a function for every set bit in this bitmap. */ +int +xbitmap_walk_bits( + struct xbitmap *bitmap, + xbitmap_walk_bits_fn fn, + void *priv) +{ + struct xbitmap_walk_bits wb = {.fn = fn, .priv = priv}; + + return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb); +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap_empty( + struct xbitmap *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap_test( + struct xbitmap *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 900646b72de1..84981724ecaf 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -1,31 +1,19 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xbitmap_range { - struct list_head list; - uint64_t start; - uint64_t len; -}; - struct xbitmap { - struct list_head list; + struct rb_root_cached xb_root; }; void xbitmap_init(struct xbitmap *bitmap); void xbitmap_destroy(struct xbitmap *bitmap); -#define for_each_xbitmap_extent(bex, n, bitmap) \ - list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) - -#define for_each_xbitmap_block(b, bex, n, bitmap) \ - list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \ - for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++) - +int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); int xbitmap_set_btcur_path(struct xbitmap *bitmap, @@ -34,4 +22,93 @@ int xbitmap_set_btblocks(struct xbitmap *bitmap, struct xfs_btree_cur *cur); uint64_t xbitmap_hweight(struct xbitmap *bitmap); +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. + */ +typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, + void *priv); + +typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv); +int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn, + void *priv); + +bool xbitmap_empty(struct xbitmap *bitmap); +bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap_set(&bitmap->agbitmap, start, len); +} + +static inline bool +xagb_bitmap_test( + struct xagb_bitmap *bitmap, + xfs_agblock_t start, + xfs_extlen_t *len) +{ + uint64_t biglen = *len; + bool ret; + + ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); + + if (start + biglen >= UINT_MAX) { + ASSERT(0); + biglen = UINT_MAX - start; + } + + *len = biglen; + return ret; +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap_walk_fn fn, void *priv) +{ + return xbitmap_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index dbbc7037074c..69bc89d0fc68 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -31,22 +31,28 @@ xchk_setup_inode_bmap( { int error; - error = xchk_get_inode(sc); + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + error = xchk_iget_for_scrubbing(sc); if (error) goto out; - sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + sc->ilock_flags = XFS_IOLOCK_EXCL; + xfs_ilock(sc->ip, XFS_IOLOCK_EXCL); /* - * We don't want any ephemeral data fork updates sitting around + * We don't want any ephemeral data/cow fork updates sitting around * while we inspect block mappings, so wait for directio to finish * and flush dirty data if we have delalloc reservations. */ if (S_ISREG(VFS_I(sc->ip)->i_mode) && - sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) { + sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + sc->ilock_flags |= XFS_MMAPLOCK_EXCL; + xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL); + inode_dio_wait(VFS_I(sc->ip)); /* @@ -90,11 +96,23 @@ out: struct xchk_bmap_info { struct xfs_scrub *sc; + + /* Incore extent tree cursor */ struct xfs_iext_cursor icur; - xfs_fileoff_t lastoff; + + /* Previous fork mapping that we examined */ + struct xfs_bmbt_irec prev_rec; + + /* Is this a realtime fork? */ bool is_rt; + + /* May mappings point to shared space? */ bool is_shared; + + /* Was the incore extent tree loaded? */ bool was_loaded; + + /* Which inode fork are we checking? */ int whichfork; }; @@ -147,49 +165,7 @@ xchk_bmap_get_rmap( return has_rmap; } -static inline bool -xchk_bmap_has_prev( - struct xchk_bmap_info *info, - struct xfs_bmbt_irec *irec) -{ - struct xfs_bmbt_irec got; - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); - - if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got)) - return false; - if (got.br_startoff + got.br_blockcount != irec->br_startoff) - return false; - if (got.br_startblock + got.br_blockcount != irec->br_startblock) - return false; - if (got.br_state != irec->br_state) - return false; - return true; -} - -static inline bool -xchk_bmap_has_next( - struct xchk_bmap_info *info, - struct xfs_bmbt_irec *irec) -{ - struct xfs_bmbt_irec got; - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); - - if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got)) - return false; - if (irec->br_startoff + irec->br_blockcount != got.br_startoff) - return false; - if (irec->br_startblock + irec->br_blockcount != got.br_startblock) - return false; - if (got.br_state != irec->br_state) - return false; - return true; -} - -/* Make sure that we have rmapbt records for this extent. */ +/* Make sure that we have rmapbt records for this data/attr fork extent. */ STATIC void xchk_bmap_xref_rmap( struct xchk_bmap_info *info, @@ -198,41 +174,39 @@ xchk_bmap_xref_rmap( { struct xfs_rmap_irec rmap; unsigned long long rmap_end; - uint64_t owner; + uint64_t owner = info->sc->ip->i_ino; if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) return; - if (info->whichfork == XFS_COW_FORK) - owner = XFS_RMAP_OWN_COW; - else - owner = info->sc->ip->i_ino; - /* Find the rmap record for this irec. */ if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) return; - /* Check the rmap. */ + /* + * The rmap must be an exact match for this incore file mapping record, + * which may have arisen from multiple ondisk records. + */ + if (rmap.rm_startblock != agbno) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (rmap.rm_startblock > agbno || - agbno + irec->br_blockcount > rmap_end) + if (rmap_end != agbno + irec->br_blockcount) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - /* - * Check the logical offsets if applicable. CoW staging extents - * don't track logical offsets since the mappings only exist in - * memory. - */ - if (info->whichfork != XFS_COW_FORK) { - rmap_end = (unsigned long long)rmap.rm_offset + - rmap.rm_blockcount; - if (rmap.rm_offset > irec->br_startoff || - irec->br_startoff + irec->br_blockcount > rmap_end) - xchk_fblock_xref_set_corrupt(info->sc, - info->whichfork, irec->br_startoff); - } + /* Check the logical offsets. */ + if (rmap.rm_offset != irec->br_startoff) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + rmap_end = (unsigned long long)rmap.rm_offset + rmap.rm_blockcount; + if (rmap_end != irec->br_startoff + irec->br_blockcount) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Check the owner */ if (rmap.rm_owner != owner) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -244,8 +218,7 @@ xchk_bmap_xref_rmap( * records because the blocks are owned (on-disk) by the refcountbt, * which doesn't track unwritten state. */ - if (owner != XFS_RMAP_OWN_COW && - !!(irec->br_state == XFS_EXT_UNWRITTEN) != + if (!!(irec->br_state == XFS_EXT_UNWRITTEN) != !!(rmap.rm_flags & XFS_RMAP_UNWRITTEN)) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -257,34 +230,60 @@ xchk_bmap_xref_rmap( if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); +} + +/* Make sure that we have rmapbt records for this COW fork extent. */ +STATIC void +xchk_bmap_xref_rmap_cow( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec, + xfs_agblock_t agbno) +{ + struct xfs_rmap_irec rmap; + unsigned long long rmap_end; + uint64_t owner = XFS_RMAP_OWN_COW; + + if (!info->sc->sa.rmap_cur || xchk_skip_xref(info->sc->sm)) + return; + + /* Find the rmap record for this irec. */ + if (!xchk_bmap_get_rmap(info, irec, agbno, owner, &rmap)) + return; /* - * If the rmap starts before this bmbt record, make sure there's a bmbt - * record for the previous offset that is contiguous with this mapping. - * Skip this for CoW fork extents because the refcount btree (and not - * the inode) is the ondisk owner for those extents. + * CoW staging extents are owned by the refcount btree, so the rmap + * can start before and end after the physical space allocated to this + * mapping. There are no offsets to check. */ - if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno && - !xchk_bmap_has_prev(info, irec)) { + if (rmap.rm_startblock > agbno) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; + if (rmap_end < agbno + irec->br_blockcount) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + /* Check the owner */ + if (rmap.rm_owner != owner) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - return; - } /* - * If the rmap ends after this bmbt record, make sure there's a bmbt - * record for the next offset that is contiguous with this mapping. - * Skip this for CoW fork extents because the refcount btree (and not - * the inode) is the ondisk owner for those extents. + * No flags allowed. Note that the (in-memory) CoW fork distinguishes + * between unwritten and written extents, but we don't track that in + * the rmap records because the blocks are owned (on-disk) by the + * refcountbt, which doesn't track unwritten state. */ - rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount; - if (info->whichfork != XFS_COW_FORK && - rmap_end > agbno + irec->br_blockcount && - !xchk_bmap_has_next(info, irec)) { + if (rmap.rm_flags & XFS_RMAP_ATTR_FORK) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK) + xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + if (rmap.rm_flags & XFS_RMAP_UNWRITTEN) xchk_fblock_xref_set_corrupt(info->sc, info->whichfork, irec->br_startoff); - return; - } } /* Cross-reference a single rtdev extent record. */ @@ -305,6 +304,7 @@ xchk_bmap_iextent_xref( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { + struct xfs_owner_info oinfo; struct xfs_mount *mp = info->sc->mp; xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -322,17 +322,35 @@ xchk_bmap_iextent_xref( xchk_xref_is_used_space(info->sc, agbno, len); xchk_xref_is_not_inode_chunk(info->sc, agbno, len); - xchk_bmap_xref_rmap(info, irec, agbno); switch (info->whichfork) { case XFS_DATA_FORK: - if (xfs_is_reflink_inode(info->sc->ip)) - break; - fallthrough; + xchk_bmap_xref_rmap(info, irec, agbno); + if (!xfs_is_reflink_inode(info->sc->ip)) { + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_owned_by(info->sc, agbno, + irec->br_blockcount, &oinfo); + xchk_xref_is_not_shared(info->sc, agbno, + irec->br_blockcount); + } + xchk_xref_is_not_cow_staging(info->sc, agbno, + irec->br_blockcount); + break; case XFS_ATTR_FORK: + xchk_bmap_xref_rmap(info, irec, agbno); + xfs_rmap_ino_owner(&oinfo, info->sc->ip->i_ino, + info->whichfork, irec->br_startoff); + xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount, + &oinfo); xchk_xref_is_not_shared(info->sc, agbno, irec->br_blockcount); + xchk_xref_is_not_cow_staging(info->sc, agbno, + irec->br_blockcount); break; case XFS_COW_FORK: + xchk_bmap_xref_rmap_cow(info, irec, agbno); + xchk_xref_is_only_owned_by(info->sc, agbno, irec->br_blockcount, + &XFS_RMAP_OINFO_COW); xchk_xref_is_cow_staging(info->sc, agbno, irec->br_blockcount); xchk_xref_is_not_shared(info->sc, agbno, @@ -382,7 +400,8 @@ xchk_bmap_iextent( * Check for out-of-order extents. This record could have come * from the incore list, for which there is no ordering check. */ - if (irec->br_startoff < info->lastoff) + if (irec->br_startoff < info->prev_rec.br_startoff + + info->prev_rec.br_blockcount) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -392,15 +411,7 @@ xchk_bmap_iextent( xchk_bmap_dirattr_extent(ip, info, irec); - /* There should never be a "hole" extent in either extent list. */ - if (irec->br_startblock == HOLESTARTBLOCK) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); - /* Make sure the extent points to a valid place. */ - if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) - xchk_fblock_set_corrupt(info->sc, info->whichfork, - irec->br_startoff); if (info->is_rt && !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount)) xchk_fblock_set_corrupt(info->sc, info->whichfork, @@ -468,6 +479,12 @@ xchk_bmapbt_rec( return 0; xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + if (xfs_bmap_validate_extent(ip, info->whichfork, &irec) != NULL) { + xchk_fblock_set_corrupt(bs->sc, info->whichfork, + irec.br_startoff); + return 0; + } + if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur, &iext_irec) || irec.br_startoff != iext_irec.br_startoff || @@ -618,45 +635,57 @@ xchk_bmap_check_ag_rmaps( return error; } -/* Make sure each rmap has a corresponding bmbt entry. */ -STATIC int -xchk_bmap_check_rmaps( - struct xfs_scrub *sc, - int whichfork) +/* + * Decide if we want to walk every rmap btree in the fs to make sure that each + * rmap for this file fork has corresponding bmbt entries. + */ +static bool +xchk_bmap_want_check_rmaps( + struct xchk_bmap_info *info) { - struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); - struct xfs_perag *pag; - xfs_agnumber_t agno; - bool zero_size; - int error; + struct xfs_scrub *sc = info->sc; + struct xfs_ifork *ifp; - if (!xfs_has_rmapbt(sc->mp) || - whichfork == XFS_COW_FORK || - (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - return 0; + if (!xfs_has_rmapbt(sc->mp)) + return false; + if (info->whichfork == XFS_COW_FORK) + return false; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return false; /* Don't support realtime rmap checks yet. */ - if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) - return 0; - - ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); + if (info->is_rt) + return false; /* - * Only do this for complex maps that are in btree format, or for - * situations where we would seem to have a size but zero extents. - * The inode repair code can zap broken iforks, which means we have - * to flag this bmap as corrupt if there are rmaps that need to be - * reattached. + * The inode repair code zaps broken inode forks by resetting them back + * to EXTENTS format and zero extent records. If we encounter a fork + * in this state along with evidence that the fork isn't supposed to be + * empty, we need to scan the reverse mappings to decide if we're going + * to rebuild the fork. Data forks with nonzero file size are scanned. + * xattr forks are never empty of content, so they are always scanned. */ + ifp = xfs_ifork_ptr(sc->ip, info->whichfork); + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { + if (info->whichfork == XFS_DATA_FORK && + i_size_read(VFS_I(sc->ip)) == 0) + return false; - if (whichfork == XFS_DATA_FORK) - zero_size = i_size_read(VFS_I(sc->ip)) == 0; - else - zero_size = false; + return true; + } - if (ifp->if_format != XFS_DINODE_FMT_BTREE && - (zero_size || ifp->if_nextents > 0)) - return 0; + return false; +} + +/* Make sure each rmap has a corresponding bmbt entry. */ +STATIC int +xchk_bmap_check_rmaps( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error; for_each_perag(sc->mp, agno, pag) { error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); @@ -683,7 +712,8 @@ xchk_bmap_iextent_delalloc( * Check for out-of-order extents. This record could have come * from the incore list, for which there is no ordering check. */ - if (irec->br_startoff < info->lastoff) + if (irec->br_startoff < info->prev_rec.br_startoff + + info->prev_rec.br_blockcount) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); @@ -697,6 +727,101 @@ xchk_bmap_iextent_delalloc( irec->br_startoff); } +/* Decide if this individual fork mapping is ok. */ +static bool +xchk_bmap_iext_mapping( + struct xchk_bmap_info *info, + const struct xfs_bmbt_irec *irec) +{ + /* There should never be a "hole" extent in either extent list. */ + if (irec->br_startblock == HOLESTARTBLOCK) + return false; + if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN) + return false; + return true; +} + +/* Are these two mappings contiguous with each other? */ +static inline bool +xchk_are_bmaps_contiguous( + const struct xfs_bmbt_irec *b1, + const struct xfs_bmbt_irec *b2) +{ + /* Don't try to combine unallocated mappings. */ + if (!xfs_bmap_is_real_extent(b1)) + return false; + if (!xfs_bmap_is_real_extent(b2)) + return false; + + /* Does b2 come right after b1 in the logical and physical range? */ + if (b1->br_startoff + b1->br_blockcount != b2->br_startoff) + return false; + if (b1->br_startblock + b1->br_blockcount != b2->br_startblock) + return false; + if (b1->br_state != b2->br_state) + return false; + return true; +} + +/* + * Walk the incore extent records, accumulating consecutive contiguous records + * into a single incore mapping. Returns true if @irec has been set to a + * mapping or false if there are no more mappings. Caller must ensure that + * @info.icur is zeroed before the first call. + */ +static int +xchk_bmap_iext_iter( + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + xfs_filblks_t prev_len; + + ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); + + /* Advance to the next iextent record and check the mapping. */ + xfs_iext_next(ifp, &info->icur); + if (!xfs_iext_get_extent(ifp, &info->icur, irec)) + return false; + + if (!xchk_bmap_iext_mapping(info, irec)) { + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + return false; + } + + /* + * Iterate subsequent iextent records and merge them with the one + * that we just read, if possible. + */ + prev_len = irec->br_blockcount; + while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) { + if (!xchk_are_bmaps_contiguous(irec, &got)) + break; + + if (!xchk_bmap_iext_mapping(info, &got)) { + xchk_fblock_set_corrupt(info->sc, info->whichfork, + got.br_startoff); + return false; + } + + /* + * Notify the user of mergeable records in the data or attr + * forks. CoW forks only exist in memory so we ignore them. + */ + if (info->whichfork != XFS_COW_FORK && + prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK) + xchk_ino_set_preen(info->sc, info->sc->ip->i_ino); + + irec->br_blockcount += got.br_blockcount; + prev_len = got.br_blockcount; + xfs_iext_next(ifp, &info->icur); + } + + return true; +} + /* * Scrub an inode fork's block mappings. * @@ -776,10 +901,15 @@ xchk_bmap( if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) goto out; - /* Scrub extent records. */ - info.lastoff = 0; - ifp = xfs_ifork_ptr(ip, whichfork); - for_each_xfs_iext(ifp, &info.icur, &irec) { + /* + * Scrub extent records. We use a special iterator function here that + * combines adjacent mappings if they are logically and physically + * contiguous. For large allocations that require multiple bmbt + * records, this reduces the number of cross-referencing calls, which + * reduces runtime. Cross referencing with the rmap is simpler because + * the rmap must match the combined mapping exactly. + */ + while (xchk_bmap_iext_iter(&info, &irec)) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) goto out; @@ -794,12 +924,14 @@ xchk_bmap( xchk_bmap_iextent_delalloc(ip, &info, &irec); else xchk_bmap_iextent(ip, &info, &irec); - info.lastoff = irec.br_startoff + irec.br_blockcount; + memcpy(&info.prev_rec, &irec, sizeof(struct xfs_bmbt_irec)); } - error = xchk_bmap_check_rmaps(sc, whichfork); - if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) - goto out; + if (xchk_bmap_want_check_rmaps(&info)) { + error = xchk_bmap_check_rmaps(sc, whichfork); + if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) + goto out; + } out: return error; } diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 0fd36d5b4646..1935b9ce1885 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -36,6 +36,7 @@ __xchk_btree_process_error( switch (*error) { case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; @@ -118,6 +119,16 @@ xchk_btree_xref_set_corrupt( __return_address); } +void +xchk_btree_set_preen( + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + int level) +{ + __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN, + __return_address); +} + /* * Make sure this record is in order and doesn't stray outside of the parent * keys. @@ -140,29 +151,30 @@ xchk_btree_rec( trace_xchk_btree_rec(bs->sc, cur, 0); - /* If this isn't the first record, are they in order? */ - if (cur->bc_levels[0].ptr > 1 && + /* Are all records across all record blocks in order? */ + if (bs->lastrec_valid && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec)) xchk_btree_set_corrupt(bs->sc, cur, 0); memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len); + bs->lastrec_valid = true; if (cur->bc_nlevels == 1) return; - /* Is this at least as large as the parent low key? */ + /* Is low_key(rec) at least as large as the parent low key? */ cur->bc_ops->init_key_from_rec(&key, rec); keyblock = xfs_btree_get_block(cur, 1, &bp); keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0) + if (xfs_btree_keycmp_lt(cur, &key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, 1); if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) return; - /* Is this no larger than the parent high key? */ + /* Is high_key(rec) no larger than the parent high key? */ cur->bc_ops->init_high_key_from_rec(&hkey, rec); keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0) + if (xfs_btree_keycmp_lt(cur, keyp, &hkey)) xchk_btree_set_corrupt(bs->sc, cur, 1); } @@ -187,29 +199,30 @@ xchk_btree_key( trace_xchk_btree_key(bs->sc, cur, level); - /* If this isn't the first key, are they in order? */ - if (cur->bc_levels[level].ptr > 1 && - !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key)) + /* Are all low keys across all node blocks in order? */ + if (bs->lastkey[level - 1].valid && + !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1].key, key)) xchk_btree_set_corrupt(bs->sc, cur, level); - memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len); + memcpy(&bs->lastkey[level - 1].key, key, cur->bc_ops->key_len); + bs->lastkey[level - 1].valid = true; if (level + 1 >= cur->bc_nlevels) return; - /* Is this at least as large as the parent low key? */ + /* Is this block's low key at least as large as the parent low key? */ keyblock = xfs_btree_get_block(cur, level + 1, &bp); keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0) + if (xfs_btree_keycmp_lt(cur, key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, level); if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) return; - /* Is this no larger than the parent high key? */ + /* Is this block's high key no larger than the parent high key? */ key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block); keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock); - if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0) + if (xfs_btree_keycmp_lt(cur, keyp, key)) xchk_btree_set_corrupt(bs->sc, cur, level); } @@ -389,7 +402,7 @@ xchk_btree_check_block_owner( if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO) bs->cur = NULL; - xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo); + xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo); if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) bs->cur = NULL; @@ -519,6 +532,48 @@ xchk_btree_check_minrecs( } /* + * If this btree block has a parent, make sure that the parent's keys capture + * the keyspace contained in this block. + */ +STATIC void +xchk_btree_block_check_keys( + struct xchk_btree *bs, + int level, + struct xfs_btree_block *block) +{ + union xfs_btree_key block_key; + union xfs_btree_key *block_high_key; + union xfs_btree_key *parent_low_key, *parent_high_key; + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *parent_block; + struct xfs_buf *bp; + + if (level == cur->bc_nlevels - 1) + return; + + xfs_btree_get_keys(cur, block, &block_key); + + /* Make sure the low key of this block matches the parent. */ + parent_block = xfs_btree_get_block(cur, level + 1, &bp); + parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, + parent_block); + if (xfs_btree_keycmp_ne(cur, &block_key, parent_low_key)) { + xchk_btree_set_corrupt(bs->sc, bs->cur, level); + return; + } + + if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + return; + + /* Make sure the high key of this block matches the parent. */ + parent_high_key = xfs_btree_high_key_addr(cur, + cur->bc_levels[level + 1].ptr, parent_block); + block_high_key = xfs_btree_high_key_from_key(cur, &block_key); + if (xfs_btree_keycmp_ne(cur, block_high_key, parent_high_key)) + xchk_btree_set_corrupt(bs->sc, bs->cur, level); +} + +/* * Grab and scrub a btree block given a btree pointer. Returns block * and buffer pointers (if applicable) if they're ok to use. */ @@ -569,7 +624,12 @@ xchk_btree_get_block( * Check the block's siblings; this function absorbs error codes * for us. */ - return xchk_btree_block_check_siblings(bs, *pblock); + error = xchk_btree_block_check_siblings(bs, *pblock); + if (error) + return error; + + xchk_btree_block_check_keys(bs, level, *pblock); + return 0; } /* @@ -601,7 +661,7 @@ xchk_btree_block_keys( parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, parent_block); - if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0) + if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys)) xchk_btree_set_corrupt(bs->sc, cur, 1); if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) @@ -612,7 +672,7 @@ xchk_btree_block_keys( high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr, parent_block); - if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0) + if (xfs_btree_keycmp_ne(cur, high_bk, high_pk)) xchk_btree_set_corrupt(bs->sc, cur, 1); } diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index da61a53a0b61..9d7b9ee8bef4 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_BTREE_H__ #define __XFS_SCRUB_BTREE_H__ @@ -19,6 +19,8 @@ bool xchk_btree_xref_process_error(struct xfs_scrub *sc, /* Check for btree corruption. */ void xchk_btree_set_corrupt(struct xfs_scrub *sc, struct xfs_btree_cur *cur, int level); +void xchk_btree_set_preen(struct xfs_scrub *sc, struct xfs_btree_cur *cur, + int level); /* Check for btree xref discrepancies. */ void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc, @@ -29,6 +31,11 @@ typedef int (*xchk_btree_rec_fn)( struct xchk_btree *bs, const union xfs_btree_rec *rec); +struct xchk_btree_key { + union xfs_btree_key key; + bool valid; +}; + struct xchk_btree { /* caller-provided scrub state */ struct xfs_scrub *sc; @@ -38,11 +45,12 @@ struct xchk_btree { void *private; /* internal scrub state */ + bool lastrec_valid; union xfs_btree_rec lastrec; struct list_head to_check; /* this element must come last! */ - union xfs_btree_key lastkey[]; + struct xchk_btree_key lastkey[]; }; /* diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 848a8e32e56f..7a20256be969 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -75,6 +75,7 @@ __xchk_process_error( case 0: return true; case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry( sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), @@ -130,6 +131,7 @@ __xchk_fblock_process_error( case 0: return true; case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; @@ -396,26 +398,19 @@ want_ag_read_header_failure( } /* - * Grab the perag structure and all the headers for an AG. + * Grab the AG header buffers for the attached perag structure. * * The headers should be released by xchk_ag_free, but as a fail safe we attach * all the buffers we grab to the scrub transaction so they'll all be freed - * when we cancel it. Returns ENOENT if we can't grab the perag structure. + * when we cancel it. */ -int -xchk_ag_read_headers( +static inline int +xchk_perag_read_headers( struct xfs_scrub *sc, - xfs_agnumber_t agno, struct xchk_ag *sa) { - struct xfs_mount *mp = sc->mp; int error; - ASSERT(!sa->pag); - sa->pag = xfs_perag_get(mp, agno); - if (!sa->pag) - return -ENOENT; - error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; @@ -427,6 +422,104 @@ xchk_ag_read_headers( return 0; } +/* + * Grab the AG headers for the attached perag structure and wait for pending + * intents to drain. + */ +static int +xchk_perag_drain_and_lock( + struct xfs_scrub *sc) +{ + struct xchk_ag *sa = &sc->sa; + int error = 0; + + ASSERT(sa->pag != NULL); + ASSERT(sa->agi_bp == NULL); + ASSERT(sa->agf_bp == NULL); + + do { + if (xchk_should_terminate(sc, &error)) + return error; + + error = xchk_perag_read_headers(sc, sa); + if (error) + return error; + + /* + * If we've grabbed an inode for scrubbing then we assume that + * holding its ILOCK will suffice to coordinate with any intent + * chains involving this inode. + */ + if (sc->ip) + return 0; + + /* + * Decide if this AG is quiet enough for all metadata to be + * consistent with each other. XFS allows the AG header buffer + * locks to cycle across transaction rolls while processing + * chains of deferred ops, which means that there could be + * other threads in the middle of processing a chain of + * deferred ops. For regular operations we are careful about + * ordering operations to prevent collisions between threads + * (which is why we don't need a per-AG lock), but scrub and + * repair have to serialize against chained operations. + * + * We just locked all the AG headers buffers; now take a look + * to see if there are any intents in progress. If there are, + * drop the AG headers and wait for the intents to drain. + * Since we hold all the AG header locks for the duration of + * the scrub, this is the only time we have to sample the + * intents counter; any threads increasing it after this point + * can't possibly be in the middle of a chain of AG metadata + * updates. + * + * Obviously, this should be slanted against scrub and in favor + * of runtime threads. + */ + if (!xfs_perag_intent_busy(sa->pag)) + return 0; + + if (sa->agf_bp) { + xfs_trans_brelse(sc->tp, sa->agf_bp); + sa->agf_bp = NULL; + } + + if (sa->agi_bp) { + xfs_trans_brelse(sc->tp, sa->agi_bp); + sa->agi_bp = NULL; + } + + if (!(sc->flags & XCHK_FSGATES_DRAIN)) + return -ECHRNG; + error = xfs_perag_intent_drain(sa->pag); + if (error == -ERESTARTSYS) + error = -EINTR; + } while (!error); + + return error; +} + +/* + * Grab the per-AG structure, grab all AG header buffers, and wait until there + * aren't any pending intents. Returns -ENOENT if we can't grab the perag + * structure. + */ +int +xchk_ag_read_headers( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + struct xchk_ag *sa) +{ + struct xfs_mount *mp = sc->mp; + + ASSERT(!sa->pag); + sa->pag = xfs_perag_get(mp, agno); + if (!sa->pag) + return -ENOENT; + + return xchk_perag_drain_and_lock(sc); +} + /* Release all the AG btree cursors. */ void xchk_ag_btcur_free( @@ -550,6 +643,14 @@ xchk_ag_init( /* Per-scrubber setup functions */ +void +xchk_trans_cancel( + struct xfs_scrub *sc) +{ + xfs_trans_cancel(sc->tp); + sc->tp = NULL; +} + /* * Grab an empty transaction so that we can re-grab locked buffers if * one of our btrees turns out to be cyclic. @@ -625,80 +726,273 @@ xchk_checkpoint_log( return 0; } +/* Verify that an inode is allocated ondisk, then return its cached inode. */ +int +xchk_iget( + struct xfs_scrub *sc, + xfs_ino_t inum, + struct xfs_inode **ipp) +{ + return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); +} + +/* + * Try to grab an inode in a manner that avoids races with physical inode + * allocation. If we can't, return the locked AGI buffer so that the caller + * can single-step the loading process to see where things went wrong. + * Callers must have a valid scrub transaction. + * + * If the iget succeeds, return 0, a NULL AGI, and the inode. + * + * If the iget fails, return the error, the locked AGI, and a NULL inode. This + * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are + * no longer allocated; or any other corruption or runtime error. + * + * If the AGI read fails, return the error, a NULL AGI, and NULL inode. + * + * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode. + */ +int +xchk_iget_agi( + struct xfs_scrub *sc, + xfs_ino_t inum, + struct xfs_buf **agi_bpp, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + struct xfs_perag *pag; + int error; + + ASSERT(sc->tp != NULL); + +again: + *agi_bpp = NULL; + *ipp = NULL; + error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Attach the AGI buffer to the scrub transaction to avoid deadlocks + * in the iget cache miss path. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); + error = xfs_ialloc_read_agi(pag, tp, agi_bpp); + xfs_perag_put(pag); + if (error) + return error; + + error = xfs_iget(mp, tp, inum, + XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp); + if (error == -EAGAIN) { + /* + * The inode may be in core but temporarily unavailable and may + * require the AGI buffer before it can be returned. Drop the + * AGI buffer and retry the lookup. + * + * Incore lookup will fail with EAGAIN on a cache hit if the + * inode is queued to the inactivation list. The inactivation + * worker may remove the inode from the unlinked list and hence + * needs the AGI. + * + * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN + * to allow inodegc to make progress and move the inode to + * IRECLAIMABLE state where xfs_iget will be able to return it + * again if it can lock the inode. + */ + xfs_trans_brelse(tp, *agi_bpp); + delay(1); + goto again; + } + if (error) + return error; + + /* We got the inode, so we can release the AGI. */ + ASSERT(*ipp != NULL); + xfs_trans_brelse(tp, *agi_bpp); + *agi_bpp = NULL; + return 0; +} + +/* Install an inode that we opened by handle for scrubbing. */ +int +xchk_install_handle_inode( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (VFS_I(ip)->i_generation != sc->sm->sm_gen) { + xchk_irele(sc, ip); + return -ENOENT; + } + + sc->ip = ip; + return 0; +} + /* - * Given an inode and the scrub control structure, grab either the - * inode referenced in the control structure or the inode passed in. - * The inode is not locked. + * In preparation to scrub metadata structures that hang off of an inode, + * grab either the inode referenced in the scrub control structure or the + * inode passed in. If the inumber does not reference an allocated inode + * record, the function returns ENOENT to end the scrub early. The inode + * is not locked. */ int -xchk_get_inode( +xchk_iget_for_scrubbing( struct xfs_scrub *sc) { struct xfs_imap imap; struct xfs_mount *mp = sc->mp; struct xfs_perag *pag; + struct xfs_buf *agi_bp; struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); struct xfs_inode *ip = NULL; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino); int error; + ASSERT(sc->tp == NULL); + /* We want to scan the inode we already had opened. */ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { sc->ip = ip_in; return 0; } - /* Look up the inode, see if the generation number matches. */ + /* Reject internal metadata files and obviously bad inode numbers. */ if (xfs_internal_inum(mp, sc->sm->sm_ino)) return -ENOENT; - error = xfs_iget(mp, NULL, sc->sm->sm_ino, - XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip); - switch (error) { - case -ENOENT: - /* Inode doesn't exist, just bail out. */ + if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) + return -ENOENT; + + /* Try a regular untrusted iget. */ + error = xchk_iget(sc, sc->sm->sm_ino, &ip); + if (!error) + return xchk_install_handle_inode(sc, ip); + if (error == -ENOENT) return error; - case 0: - /* Got an inode, continue. */ - break; - case -EINVAL: + if (error != -EINVAL) + goto out_error; + + /* + * EINVAL with IGET_UNTRUSTED probably means one of several things: + * userspace gave us an inode number that doesn't correspond to fs + * space; the inode btree lacks a record for this inode; or there is a + * record, and it says this inode is free. + * + * We want to look up this inode in the inobt to distinguish two + * scenarios: (1) the inobt says the inode is free, in which case + * there's nothing to do; and (2) the inobt says the inode is + * allocated, but loading it failed due to corruption. + * + * Allocate a transaction and grab the AGI to prevent inobt activity + * in this AG. Retry the iget in case someone allocated a new inode + * after the first iget failed. + */ + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_error; + + error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip); + if (error == 0) { + /* Actually got the inode, so install it. */ + xchk_trans_cancel(sc); + return xchk_install_handle_inode(sc, ip); + } + if (error == -ENOENT) + goto out_gone; + if (error != -EINVAL) + goto out_cancel; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + error = -ECANCELED; + goto out_cancel; + } + + /* + * Untrusted iget failed a second time. Let's try an inobt lookup. + * If the inobt thinks this the inode neither can exist inside the + * filesystem nor is allocated, return ENOENT to signal that the check + * can be skipped. + * + * If the lookup returns corruption, we'll mark this inode corrupt and + * exit to userspace. There's little chance of fixing anything until + * the inobt is straightened out, but there's nothing we can do here. + * + * If the lookup encounters any other error, exit to userspace. + * + * If the lookup succeeds, something else must be very wrong in the fs + * such that setting up the incore inode failed in some strange way. + * Treat those as corruptions. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); + if (!pag) { + error = -EFSCORRUPTED; + goto out_cancel; + } + + error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED); + xfs_perag_put(pag); + if (error == -EINVAL || error == -ENOENT) + goto out_gone; + if (!error) + error = -EFSCORRUPTED; + +out_cancel: + xchk_trans_cancel(sc); +out_error: + trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); + return error; +out_gone: + /* The file is gone, so there's nothing to check. */ + xchk_trans_cancel(sc); + return -ENOENT; +} + +/* Release an inode, possibly dropping it in the process. */ +void +xchk_irele( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (current->journal_info != NULL) { + ASSERT(current->journal_info == sc->tp); + /* - * -EINVAL with IGET_UNTRUSTED could mean one of several - * things: userspace gave us an inode number that doesn't - * correspond to fs space, or doesn't have an inobt entry; - * or it could simply mean that the inode buffer failed the - * read verifiers. + * If we are in a transaction, we /cannot/ drop the inode + * ourselves, because the VFS will trigger writeback, which + * can require a transaction. Clear DONTCACHE to force the + * inode to the LRU, where someone else can take care of + * dropping it. * - * Try just the inode mapping lookup -- if it succeeds, then - * the inode buffer verifier failed and something needs fixing. - * Otherwise, we really couldn't find it so tell userspace - * that it no longer exists. + * Note that when we grabbed our reference to the inode, it + * could have had an active ref and DONTCACHE set if a sysadmin + * is trying to coerce a change in file access mode. icache + * hits do not clear DONTCACHE, so we must do it here. */ - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); - if (pag) { - error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, - XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE); - xfs_perag_put(pag); - if (error) - return -ENOENT; - } - error = -EFSCORRUPTED; - fallthrough; - default: - trace_xchk_op_error(sc, - XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), - XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), - error, __return_address); - return error; - } - if (VFS_I(ip)->i_generation != sc->sm->sm_gen) { - xfs_irele(ip); - return -ENOENT; + spin_lock(&VFS_I(ip)->i_lock); + VFS_I(ip)->i_state &= ~I_DONTCACHE; + spin_unlock(&VFS_I(ip)->i_lock); + } else if (atomic_read(&VFS_I(ip)->i_count) == 1) { + /* + * If this is the last reference to the inode and the caller + * permits it, set DONTCACHE to avoid thrashing. + */ + d_mark_dontcache(VFS_I(ip)); } - sc->ip = ip; - return 0; + xfs_irele(ip); } -/* Set us up to scrub a file's contents. */ +/* + * Set us up to scrub metadata mapped by a file's fork. Callers must not use + * this to operate on user-accessible regular file data because the MMAPLOCK is + * not taken. + */ int xchk_setup_inode_contents( struct xfs_scrub *sc, @@ -706,13 +1000,14 @@ xchk_setup_inode_contents( { int error; - error = xchk_get_inode(sc); + error = xchk_iget_for_scrubbing(sc); if (error) return error; - /* Got the inode, lock it and we're ready to go. */ - sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + /* Lock the inode so the VFS cannot touch this file. */ + sc->ilock_flags = XFS_IOLOCK_EXCL; xfs_ilock(sc->ip, sc->ilock_flags); + error = xchk_trans_alloc(sc, resblks); if (error) goto out; @@ -870,49 +1165,23 @@ xchk_metadata_inode_forks( } /* - * Try to lock an inode in violation of the usual locking order rules. For - * example, trying to get the IOLOCK while in transaction context, or just - * plain breaking AG-order or inode-order inode locking rules. Either way, - * the only way to avoid an ABBA deadlock is to use trylock and back off if - * we can't. + * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub + * operation. Callers must not hold any locks that intersect with the CPU + * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs + * to change kernel code. */ -int -xchk_ilock_inverted( - struct xfs_inode *ip, - uint lock_mode) +void +xchk_fsgates_enable( + struct xfs_scrub *sc, + unsigned int scrub_fsgates) { - int i; + ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL)); + ASSERT(!(sc->flags & scrub_fsgates)); - for (i = 0; i < 20; i++) { - if (xfs_ilock_nowait(ip, lock_mode)) - return 0; - delay(1); - } - return -EDEADLOCK; -} + trace_xchk_fsgates_enable(sc, scrub_fsgates); -/* Pause background reaping of resources. */ -void -xchk_stop_reaping( - struct xfs_scrub *sc) -{ - sc->flags |= XCHK_REAPING_DISABLED; - xfs_blockgc_stop(sc->mp); - xfs_inodegc_stop(sc->mp); -} + if (scrub_fsgates & XCHK_FSGATES_DRAIN) + xfs_drain_wait_enable(); -/* Restart background reaping of resources. */ -void -xchk_start_reaping( - struct xfs_scrub *sc) -{ - /* - * Readonly filesystems do not perform inactivation or speculative - * preallocation, so there's no need to restart the workers. - */ - if (!xfs_is_readonly(sc->mp)) { - xfs_inodegc_start(sc->mp); - xfs_blockgc_start(sc->mp); - } - sc->flags &= ~XCHK_REAPING_DISABLED; + sc->flags |= scrub_fsgates; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index b73648d81d23..791235cd9b00 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_COMMON_H__ #define __XFS_SCRUB_COMMON_H__ @@ -32,6 +32,8 @@ xchk_should_terminate( } int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); +void xchk_trans_cancel(struct xfs_scrub *sc); + bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int *error); bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork, @@ -72,6 +74,7 @@ bool xchk_should_check_xref(struct xfs_scrub *sc, int *error, struct xfs_btree_cur **curpp); /* Setup functions */ +int xchk_setup_agheader(struct xfs_scrub *sc); int xchk_setup_fs(struct xfs_scrub *sc); int xchk_setup_ag_allocbt(struct xfs_scrub *sc); int xchk_setup_ag_iallocbt(struct xfs_scrub *sc); @@ -132,10 +135,16 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks); int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log); -int xchk_get_inode(struct xfs_scrub *sc); +int xchk_iget_for_scrubbing(struct xfs_scrub *sc); int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); +int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); +int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum, + struct xfs_buf **agi_bpp, struct xfs_inode **ipp); +void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip); +int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip); + /* * Don't bother cross-referencing if we already found corruption or cross * referencing discrepancies. @@ -147,8 +156,19 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) } int xchk_metadata_inode_forks(struct xfs_scrub *sc); -int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode); -void xchk_stop_reaping(struct xfs_scrub *sc); -void xchk_start_reaping(struct xfs_scrub *sc); + +/* + * Setting up a hook to wait for intents to drain is costly -- we have to take + * the CPU hotplug lock and force an i-cache flush on all CPUs once to set it + * up, and again to tear it down. These costs add up quickly, so we only want + * to enable the drain waiter if the drain actually detected a conflict with + * running intent chains. + */ +static inline bool xchk_need_intent_drain(struct xfs_scrub *sc) +{ + return sc->flags & XCHK_NEED_DRAIN; +} + +void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index d17cee177085..82b150d3b8b7 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -39,6 +39,7 @@ xchk_da_process_error( switch (*error) { case -EDEADLOCK: + case -ECHRNG: /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h index 1f3515c6d5a8..4f8c2138a1ec 100644 --- a/fs/xfs/scrub/dabtree.h +++ b/fs/xfs/scrub/dabtree.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_DABTREE_H__ #define __XFS_SCRUB_DABTREE_H__ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index d1b0f23c2c59..0b491784b759 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -18,6 +18,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" +#include "scrub/readdir.h" /* Set us up to scrub directories. */ int @@ -31,168 +32,114 @@ xchk_setup_directory( /* Scrub a directory entry. */ -struct xchk_dir_ctx { - /* VFS fill-directory iterator */ - struct dir_context dir_iter; - - struct xfs_scrub *sc; -}; - -/* Check that an inode's mode matches a given DT_ type. */ -STATIC int +/* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */ +STATIC void xchk_dir_check_ftype( - struct xchk_dir_ctx *sdc, + struct xfs_scrub *sc, xfs_fileoff_t offset, - xfs_ino_t inum, - int dtype) + struct xfs_inode *ip, + int ftype) { - struct xfs_mount *mp = sdc->sc->mp; - struct xfs_inode *ip; - int ino_dtype; - int error = 0; + struct xfs_mount *mp = sc->mp; if (!xfs_has_ftype(mp)) { - if (dtype != DT_UNKNOWN && dtype != DT_DIR) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - goto out; - } - - /* - * Grab the inode pointed to by the dirent. We release the - * inode before we cancel the scrub transaction. Since we're - * don't know a priori that releasing the inode won't trigger - * eofblocks cleanup (which allocates what would be a nested - * transaction), we can't use DONTCACHE here because DONTCACHE - * inodes can trigger immediate inactive cleanup of the inode. - * - * If _iget returns -EINVAL or -ENOENT then the child inode number is - * garbage and the directory is corrupt. If the _iget returns - * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a - * cross referencing error. Any other error is an operational error. - */ - error = xfs_iget(mp, sdc->sc->tp, inum, 0, 0, &ip); - if (error == -EINVAL || error == -ENOENT) { - error = -EFSCORRUPTED; - xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, 0, &error); - goto out; + if (ftype != XFS_DIR3_FT_UNKNOWN && ftype != XFS_DIR3_FT_DIR) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return; } - if (!xchk_fblock_xref_process_error(sdc->sc, XFS_DATA_FORK, offset, - &error)) - goto out; - /* Convert mode to the DT_* values that dir_emit uses. */ - ino_dtype = xfs_dir3_get_dtype(mp, - xfs_mode_to_ftype(VFS_I(ip)->i_mode)); - if (ino_dtype != dtype) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - xfs_irele(ip); -out: - return error; + if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } /* * Scrub a single directory entry. * - * We use the VFS directory iterator (i.e. readdir) to call this - * function for every directory entry in a directory. Once we're here, - * we check the inode number to make sure it's sane, then we check that - * we can look up this filename. Finally, we check the ftype. + * Check the inode number to make sure it's sane, then we check that we can + * look up this filename. Finally, we check the ftype. */ -STATIC bool +STATIC int xchk_dir_actor( - struct dir_context *dir_iter, - const char *name, - int namelen, - loff_t pos, - u64 ino, - unsigned type) + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) { - struct xfs_mount *mp; + struct xfs_mount *mp = dp->i_mount; struct xfs_inode *ip; - struct xchk_dir_ctx *sdc; - struct xfs_name xname; xfs_ino_t lookup_ino; xfs_dablk_t offset; - bool checked_ftype = false; int error = 0; - sdc = container_of(dir_iter, struct xchk_dir_ctx, dir_iter); - ip = sdc->sc->ip; - mp = ip->i_mount; offset = xfs_dir2_db_to_da(mp->m_dir_geo, - xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos)); + xfs_dir2_dataptr_to_db(mp->m_dir_geo, dapos)); - if (xchk_should_terminate(sdc->sc, &error)) - return !error; + if (xchk_should_terminate(sc, &error)) + return error; /* Does this inode number make sense? */ if (!xfs_verify_dir_ino(mp, ino)) { - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - goto out; + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; } /* Does this name make sense? */ - if (!xfs_dir2_namecheck(name, namelen)) { - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - goto out; + if (!xfs_dir2_namecheck(name->name, name->len)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; } - if (!strncmp(".", name, namelen)) { + if (!strncmp(".", name->name, name->len)) { /* If this is "." then check that the inum matches the dir. */ - if (xfs_has_ftype(mp) && type != DT_DIR) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - checked_ftype = true; - if (ino != ip->i_ino) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - } else if (!strncmp("..", name, namelen)) { + if (ino != dp->i_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else if (!strncmp("..", name->name, name->len)) { /* * If this is ".." in the root inode, check that the inum * matches this dir. */ - if (xfs_has_ftype(mp) && type != DT_DIR) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); - checked_ftype = true; - if (ip->i_ino == mp->m_sb.sb_rootino && ino != ip->i_ino) - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, - offset); + if (dp->i_ino == mp->m_sb.sb_rootino && ino != dp->i_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); } /* Verify that we can look up this name by hash. */ - xname.name = name; - xname.len = namelen; - xname.type = XFS_DIR3_FT_UNKNOWN; - - error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); + error = xchk_dir_lookup(sc, dp, name, &lookup_ino); /* ENOENT means the hash lookup failed and the dir is corrupt */ if (error == -ENOENT) error = -EFSCORRUPTED; - if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, - &error)) + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) goto out; if (lookup_ino != ino) { - xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); - goto out; + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return -ECANCELED; } - /* Verify the file type. This function absorbs error codes. */ - if (!checked_ftype) { - error = xchk_dir_check_ftype(sdc, offset, lookup_ino, type); - if (error) - goto out; - } -out: /* - * A negative error code returned here is supposed to cause the - * dir_emit caller (xfs_readdir) to abort the directory iteration - * and return zero to xchk_directory. + * Grab the inode pointed to by the dirent. We release the inode + * before we cancel the scrub transaction. + * + * If _iget returns -EINVAL or -ENOENT then the child inode number is + * garbage and the directory is corrupt. If the _iget returns + * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a + * cross referencing error. Any other error is an operational error. */ - if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return false; - return !error; + error = xchk_iget(sc, ino, &ip); + if (error == -EINVAL || error == -ENOENT) { + error = -EFSCORRUPTED; + xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); + goto out; + } + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, offset, &error)) + goto out; + + xchk_dir_check_ftype(sc, offset, ip, name->type); + xchk_irele(sc, ip); +out: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + return error; } /* Scrub a directory btree record. */ @@ -201,6 +148,7 @@ xchk_dir_rec( struct xchk_da_btree *ds, int level) { + struct xfs_name dname = { }; struct xfs_da_state_blk *blk = &ds->state->path.blk[level]; struct xfs_mount *mp = ds->state->mp; struct xfs_inode *dp = ds->dargs.dp; @@ -297,7 +245,11 @@ xchk_dir_rec( xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); goto out_relse; } - calc_hash = xfs_da_hashname(dent->name, dent->namelen); + + /* Does the directory hash match? */ + dname.name = dent->name; + dname.len = dent->namelen; + calc_hash = xfs_dir2_hashname(mp, &dname); if (calc_hash != hash) xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno); @@ -803,14 +755,7 @@ int xchk_directory( struct xfs_scrub *sc) { - struct xchk_dir_ctx sdc = { - .dir_iter.actor = xchk_dir_actor, - .dir_iter.pos = 0, - .sc = sc, - }; - size_t bufsize; - loff_t oldpos; - int error = 0; + int error; if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) return -ENOENT; @@ -818,7 +763,7 @@ xchk_directory( /* Plausible size? */ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); - goto out; + return 0; } /* Check directory tree structure */ @@ -827,7 +772,7 @@ xchk_directory( return error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; + return 0; /* Check the freespace. */ error = xchk_directory_blocks(sc); @@ -835,44 +780,11 @@ xchk_directory( return error; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - - /* - * Check that every dirent we see can also be looked up by hash. - * Userspace usually asks for a 32k buffer, so we will too. - */ - bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, - sc->ip->i_disk_size); - - /* - * Look up every name in this directory by hash. - * - * Use the xfs_readdir function to call xchk_dir_actor on - * every directory entry in this directory. In _actor, we check - * the name, inode number, and ftype (if applicable) of the - * entry. xfs_readdir uses the VFS filldir functions to provide - * iteration context. - * - * The VFS grabs a read or write lock via i_rwsem before it reads - * or writes to a directory. If we've gotten this far we've - * already obtained IOLOCK_EXCL, which (since 4.10) is the same as - * getting a write lock on i_rwsem. Therefore, it is safe for us - * to drop the ILOCK here in order to reuse the _readdir and - * _dir_lookup routines, which do their own ILOCK locking. - */ - oldpos = 0; - sc->ilock_flags &= ~XFS_ILOCK_EXCL; - xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); - while (true) { - error = xfs_readdir(sc->tp, sc->ip, &sdc.dir_iter, bufsize); - if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, - &error)) - goto out; - if (oldpos == sdc.dir_iter.pos) - break; - oldpos = sdc.dir_iter.pos; - } + return 0; -out: + /* Look up every name in this directory by hash. */ + error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); + if (error == -ECANCELED) + error = 0; return error; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index f0c7f41897b9..e382a35e98d8 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -130,6 +130,13 @@ xchk_setup_fscounters( struct xchk_fscounters *fsc; int error; + /* + * If the AGF doesn't track btreeblks, we have to lock the AGF to count + * btree block usage by walking the actual btrees. + */ + if (!xfs_has_lazysbcount(sc->mp)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; @@ -143,13 +150,6 @@ xchk_setup_fscounters( if (error) return error; - /* - * Pause background reclaim while we're scrubbing to reduce the - * likelihood of background perturbations to the counters throwing off - * our calculations. - */ - xchk_stop_reaping(sc); - return xchk_trans_alloc(sc, 0); } @@ -447,6 +447,12 @@ xchk_fscounters( xchk_set_corrupt(sc); /* + * XXX: We can't quiesce percpu counter updates, so exit early. + * This can be re-enabled when we gain exclusive freeze functionality. + */ + return 0; + + /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. */ diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index aa65ec88a0c0..d2b2a1cb6533 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -1,12 +1,14 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index d0b938d3d028..66a273f8585b 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2019 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2019-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_HEALTH_H__ #define __XFS_SCRUB_HEALTH_H__ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index e312be7cd375..575f22a02ebe 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -32,6 +32,8 @@ int xchk_setup_ag_iallocbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); return xchk_setup_ag_btree(sc, sc->flags & XCHK_TRY_HARDER); } @@ -49,83 +51,237 @@ struct xchk_iallocbt { }; /* - * If we're checking the finobt, cross-reference with the inobt. - * Otherwise we're checking the inobt; if there is an finobt, make sure - * we have a record or not depending on freecount. + * Does the finobt have a record for this inode with the same hole/free state? + * This is a bit complicated because of the following: + * + * - The finobt need not have a record if all inodes in the inobt record are + * allocated. + * - The finobt need not have a record if all inodes in the inobt record are + * free. + * - The finobt need not have a record if the inobt record says this is a hole. + * This likely doesn't happen in practice. */ -static inline void -xchk_iallocbt_chunk_xref_other( +STATIC int +xchk_inobt_xref_finobt( + struct xfs_scrub *sc, + struct xfs_inobt_rec_incore *irec, + xfs_agino_t agino, + bool free, + bool hole) +{ + struct xfs_inobt_rec_incore frec; + struct xfs_btree_cur *cur = sc->sa.fino_cur; + bool ffree, fhole; + unsigned int frec_idx, fhole_idx; + int has_record; + int error; + + ASSERT(cur->bc_btnum == XFS_BTNUM_FINO); + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); + if (error) + return error; + if (!has_record) + goto no_record; + + error = xfs_inobt_get_rec(cur, &frec, &has_record); + if (!has_record) + return -EFSCORRUPTED; + + if (frec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + goto no_record; + + /* There's a finobt record; free and hole status must match. */ + frec_idx = agino - frec.ir_startino; + ffree = frec.ir_free & (1ULL << frec_idx); + fhole_idx = frec_idx / XFS_INODES_PER_HOLEMASK_BIT; + fhole = frec.ir_holemask & (1U << fhole_idx); + + if (ffree != free) + xchk_btree_xref_set_corrupt(sc, cur, 0); + if (fhole != hole) + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; + +no_record: + /* inobt record is fully allocated */ + if (irec->ir_free == 0) + return 0; + + /* inobt record is totally unallocated */ + if (irec->ir_free == XFS_INOBT_ALL_FREE) + return 0; + + /* inobt record says this is a hole */ + if (hole) + return 0; + + /* finobt doesn't care about allocated inodes */ + if (!free) + return 0; + + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; +} + +/* + * Make sure that each inode of this part of an inobt record has the same + * sparse and free status as the finobt. + */ +STATIC void +xchk_inobt_chunk_xref_finobt( struct xfs_scrub *sc, struct xfs_inobt_rec_incore *irec, - xfs_agino_t agino) + xfs_agino_t agino, + unsigned int nr_inodes) { - struct xfs_btree_cur **pcur; - bool has_irec; + xfs_agino_t i; + unsigned int rec_idx; int error; - if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) - pcur = &sc->sa.ino_cur; - else - pcur = &sc->sa.fino_cur; - if (!(*pcur)) - return; - error = xfs_ialloc_has_inode_record(*pcur, agino, agino, &has_irec); - if (!xchk_should_check_xref(sc, &error, pcur)) + ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT); + + if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm)) return; - if (((irec->ir_freecount > 0 && !has_irec) || - (irec->ir_freecount == 0 && has_irec))) - xchk_btree_xref_set_corrupt(sc, *pcur, 0); + + for (i = agino, rec_idx = agino - irec->ir_startino; + i < agino + nr_inodes; + i++, rec_idx++) { + bool free, hole; + unsigned int hole_idx; + + free = irec->ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + hole = irec->ir_holemask & (1U << hole_idx); + + error = xchk_inobt_xref_finobt(sc, irec, i, free, hole); + if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur)) + return; + } +} + +/* + * Does the inobt have a record for this inode with the same hole/free state? + * The inobt must always have a record if there's a finobt record. + */ +STATIC int +xchk_finobt_xref_inobt( + struct xfs_scrub *sc, + struct xfs_inobt_rec_incore *frec, + xfs_agino_t agino, + bool ffree, + bool fhole) +{ + struct xfs_inobt_rec_incore irec; + struct xfs_btree_cur *cur = sc->sa.ino_cur; + bool free, hole; + unsigned int rec_idx, hole_idx; + int has_record; + int error; + + ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); + if (error) + return error; + if (!has_record) + goto no_record; + + error = xfs_inobt_get_rec(cur, &irec, &has_record); + if (!has_record) + return -EFSCORRUPTED; + + if (irec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + goto no_record; + + /* There's an inobt record; free and hole status must match. */ + rec_idx = agino - irec.ir_startino; + free = irec.ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + hole = irec.ir_holemask & (1U << hole_idx); + + if (ffree != free) + xchk_btree_xref_set_corrupt(sc, cur, 0); + if (fhole != hole) + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; + +no_record: + /* finobt should never have a record for which the inobt does not */ + xchk_btree_xref_set_corrupt(sc, cur, 0); + return 0; } -/* Cross-reference with the other btrees. */ +/* + * Make sure that each inode of this part of an finobt record has the same + * sparse and free status as the inobt. + */ STATIC void -xchk_iallocbt_chunk_xref( +xchk_finobt_chunk_xref_inobt( struct xfs_scrub *sc, - struct xfs_inobt_rec_incore *irec, + struct xfs_inobt_rec_incore *frec, xfs_agino_t agino, - xfs_agblock_t agbno, - xfs_extlen_t len) + unsigned int nr_inodes) { - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + xfs_agino_t i; + unsigned int rec_idx; + int error; + + ASSERT(sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT); + + if (!sc->sa.ino_cur || xchk_skip_xref(sc->sm)) return; - xchk_xref_is_used_space(sc, agbno, len); - xchk_iallocbt_chunk_xref_other(sc, irec, agino); - xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES); - xchk_xref_is_not_shared(sc, agbno, len); + for (i = agino, rec_idx = agino - frec->ir_startino; + i < agino + nr_inodes; + i++, rec_idx++) { + bool ffree, fhole; + unsigned int hole_idx; + + ffree = frec->ir_free & (1ULL << rec_idx); + hole_idx = rec_idx / XFS_INODES_PER_HOLEMASK_BIT; + fhole = frec->ir_holemask & (1U << hole_idx); + + error = xchk_finobt_xref_inobt(sc, frec, i, ffree, fhole); + if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur)) + return; + } } -/* Is this chunk worth checking? */ +/* Is this chunk worth checking and cross-referencing? */ STATIC bool xchk_iallocbt_chunk( struct xchk_btree *bs, struct xfs_inobt_rec_incore *irec, xfs_agino_t agino, - xfs_extlen_t len) + unsigned int nr_inodes) { + struct xfs_scrub *sc = bs->sc; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_perag *pag = bs->cur->bc_ag.pag; - xfs_agblock_t bno; + xfs_agblock_t agbno; + xfs_extlen_t len; - bno = XFS_AGINO_TO_AGBNO(mp, agino); + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + len = XFS_B_TO_FSB(mp, nr_inodes * mp->m_sb.sb_inodesize); - if (!xfs_verify_agbext(pag, bno, len)) + if (!xfs_verify_agbext(pag, agbno, len)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return false; + xchk_xref_is_used_space(sc, agbno, len); + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + xchk_inobt_chunk_xref_finobt(sc, irec, agino, nr_inodes); + else + xchk_finobt_chunk_xref_inobt(sc, irec, agino, nr_inodes); + xchk_xref_is_only_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES); + xchk_xref_is_not_shared(sc, agbno, len); + xchk_xref_is_not_cow_staging(sc, agbno, len); return true; } -/* Count the number of free inodes. */ -static unsigned int -xchk_iallocbt_freecount( - xfs_inofree_t freemask) -{ - BUILD_BUG_ON(sizeof(freemask) != sizeof(__u64)); - return hweight64(freemask); -} - /* * Check that an inode's allocation status matches ir_free in the inobt * record. First we try querying the in-core inode state, and if the inode @@ -272,7 +428,7 @@ xchk_iallocbt_check_cluster( return 0; } - xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster, + xchk_xref_is_only_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster, &XFS_RMAP_OINFO_INODES); /* Grab the inode cluster buffer. */ @@ -420,36 +576,22 @@ xchk_iallocbt_rec( const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_perag *pag = bs->cur->bc_ag.pag; struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; xfs_agino_t agino; - xfs_extlen_t len; int holecount; int i; int error = 0; - unsigned int real_freecount; uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - - if (irec.ir_count > XFS_INODES_PER_CHUNK || - irec.ir_freecount > XFS_INODES_PER_CHUNK) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - - real_freecount = irec.ir_freecount + - (XFS_INODES_PER_CHUNK - irec.ir_count); - if (real_freecount != xchk_iallocbt_freecount(irec.ir_free)) + if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } agino = irec.ir_startino; - /* Record has to be properly aligned within the AG. */ - if (!xfs_verify_agino(pag, agino) || - !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) { - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - goto out; - } xchk_iallocbt_rec_alignment(bs, &irec); if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) @@ -459,12 +601,11 @@ xchk_iallocbt_rec( /* Handle non-sparse inodes */ if (!xfs_inobt_issparse(irec.ir_holemask)) { - len = XFS_B_TO_FSB(mp, - XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize); if (irec.ir_count != XFS_INODES_PER_CHUNK) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) + if (!xchk_iallocbt_chunk(bs, &irec, agino, + XFS_INODES_PER_CHUNK)) goto out; goto check_clusters; } @@ -472,8 +613,6 @@ xchk_iallocbt_rec( /* Check each chunk of a sparse inode cluster. */ holemask = irec.ir_holemask; holecount = 0; - len = XFS_B_TO_FSB(mp, - XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize); holes = ~xfs_inobt_irec_to_allocmask(&irec); if ((holes & irec.ir_free) != holes || irec.ir_freecount > irec.ir_count) @@ -482,8 +621,9 @@ xchk_iallocbt_rec( for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) { if (holemask & 1) holecount += XFS_INODES_PER_HOLEMASK_BIT; - else if (!xchk_iallocbt_chunk(bs, &irec, agino, len)) - break; + else if (!xchk_iallocbt_chunk(bs, &irec, agino, + XFS_INODES_PER_HOLEMASK_BIT)) + goto out; holemask >>= 1; agino += XFS_INODES_PER_HOLEMASK_BIT; } @@ -493,6 +633,9 @@ xchk_iallocbt_rec( xchk_btree_set_corrupt(bs->sc, bs->cur, 0); check_clusters: + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + error = xchk_iallocbt_check_clusters(bs, &irec); if (error) goto out; @@ -622,18 +765,18 @@ xchk_xref_inode_check( xfs_agblock_t agbno, xfs_extlen_t len, struct xfs_btree_cur **icur, - bool should_have_inodes) + enum xbtree_recpacking expected) { - bool has_inodes; + enum xbtree_recpacking outcome; int error; if (!(*icur) || xchk_skip_xref(sc->sm)) return; - error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes); + error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &outcome); if (!xchk_should_check_xref(sc, &error, icur)) return; - if (has_inodes != should_have_inodes) + if (outcome != expected) xchk_btree_xref_set_corrupt(sc, *icur, 0); } @@ -644,8 +787,10 @@ xchk_xref_is_not_inode_chunk( xfs_agblock_t agbno, xfs_extlen_t len) { - xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, false); - xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, false); + xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, + XBTREE_RECPACKING_EMPTY); + xchk_xref_inode_check(sc, agbno, len, &sc->sa.fino_cur, + XBTREE_RECPACKING_EMPTY); } /* xref check that the extent is covered by inodes */ @@ -655,5 +800,6 @@ xchk_xref_is_inode_chunk( xfs_agblock_t agbno, xfs_extlen_t len) { - xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, true); + xchk_xref_inode_check(sc, agbno, len, &sc->sa.ino_cur, + XBTREE_RECPACKING_FULL); } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 7a2f38e5202c..3e1e02e340a6 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -11,8 +11,11 @@ #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_ag.h" #include "xfs_inode.h" #include "xfs_ialloc.h" +#include "xfs_icache.h" #include "xfs_da_format.h" #include "xfs_reflink.h" #include "xfs_rmap.h" @@ -20,45 +23,176 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/trace.h" + +/* Prepare the attached inode for scrubbing. */ +static inline int +xchk_prepare_iscrub( + struct xfs_scrub *sc) +{ + int error; + + sc->ilock_flags = XFS_IOLOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + sc->ilock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + return 0; +} + +/* Install this scrub-by-handle inode and prepare it for scrubbing. */ +static inline int +xchk_install_handle_iscrub( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + int error; + + error = xchk_install_handle_inode(sc, ip); + if (error) + return error; + + return xchk_prepare_iscrub(sc); +} /* - * Grab total control of the inode metadata. It doesn't matter here if - * the file data is still changing; exclusive access to the metadata is - * the goal. + * Grab total control of the inode metadata. In the best case, we grab the + * incore inode and take all locks on it. If the incore inode cannot be + * constructed due to corruption problems, lock the AGI so that we can single + * step the loading process to fix everything that can go wrong. */ int xchk_setup_inode( struct xfs_scrub *sc) { + struct xfs_imap imap; + struct xfs_inode *ip; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); + struct xfs_buf *agi_bp; + struct xfs_perag *pag; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino); int error; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + /* We want to scan the opened inode, so lock it and exit. */ + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { + sc->ip = ip_in; + return xchk_prepare_iscrub(sc); + } + + /* Reject internal metadata files and obviously bad inode numbers. */ + if (xfs_internal_inum(mp, sc->sm->sm_ino)) + return -ENOENT; + if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) + return -ENOENT; + + /* Try a regular untrusted iget. */ + error = xchk_iget(sc, sc->sm->sm_ino, &ip); + if (!error) + return xchk_install_handle_iscrub(sc, ip); + if (error == -ENOENT) + return error; + if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL) + goto out_error; + /* - * Try to get the inode. If the verifiers fail, we try again - * in raw mode. + * EINVAL with IGET_UNTRUSTED probably means one of several things: + * userspace gave us an inode number that doesn't correspond to fs + * space; the inode btree lacks a record for this inode; or there is + * a record, and it says this inode is free. + * + * EFSCORRUPTED/EFSBADCRC could mean that the inode was mappable, but + * some other metadata corruption (e.g. inode forks) prevented + * instantiation of the incore inode. Or it could mean the inobt is + * corrupt. + * + * We want to look up this inode in the inobt directly to distinguish + * three different scenarios: (1) the inobt says the inode is free, + * in which case there's nothing to do; (2) the inobt is corrupt so we + * should flag the corruption and exit to userspace to let it fix the + * inobt; and (3) the inobt says the inode is allocated, but loading it + * failed due to corruption. + * + * Allocate a transaction and grab the AGI to prevent inobt activity in + * this AG. Retry the iget in case someone allocated a new inode after + * the first iget failed. */ - error = xchk_get_inode(sc); - switch (error) { - case 0: - break; - case -EFSCORRUPTED: - case -EFSBADCRC: - return xchk_trans_alloc(sc, 0); - default: - return error; + error = xchk_trans_alloc(sc, 0); + if (error) + goto out_error; + + error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip); + if (error == 0) { + /* Actually got the incore inode, so install it and proceed. */ + xchk_trans_cancel(sc); + return xchk_install_handle_iscrub(sc, ip); + } + if (error == -ENOENT) + goto out_gone; + if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL) + goto out_cancel; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + error = -ECANCELED; + goto out_cancel; } - /* Got the inode, lock it and we're ready to go. */ - sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); - error = xchk_trans_alloc(sc, 0); + /* + * Untrusted iget failed a second time. Let's try an inobt lookup. + * If the inobt doesn't think this is an allocated inode then we'll + * return ENOENT to signal that the check can be skipped. + * + * If the lookup signals corruption, we'll mark this inode corrupt and + * exit to userspace. There's little chance of fixing anything until + * the inobt is straightened out, but there's nothing we can do here. + * + * If the lookup encounters a runtime error, exit to userspace. + */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); + if (!pag) { + error = -EFSCORRUPTED; + goto out_cancel; + } + + error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, + XFS_IGET_UNTRUSTED); + xfs_perag_put(pag); + if (error == -EINVAL || error == -ENOENT) + goto out_gone; if (error) - goto out; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + goto out_cancel; -out: - /* scrub teardown will unlock and release the inode for us */ + /* + * The lookup succeeded. Chances are the ondisk inode is corrupt and + * preventing iget from reading it. Retain the scrub transaction and + * the AGI buffer to prevent anyone from allocating or freeing inodes. + * This ensures that we preserve the inconsistency between the inobt + * saying the inode is allocated and the icache being unable to load + * the inode until we can flag the corruption in xchk_inode. The + * scrub function has to note the corruption, since we're not really + * supposed to do that from the setup function. + */ + return 0; + +out_cancel: + xchk_trans_cancel(sc); +out_error: + trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), + error, __return_address); return error; +out_gone: + /* The file is gone, so there's nothing to check. */ + xchk_trans_cancel(sc); + return -ENOENT; } /* Inode core */ @@ -553,8 +687,9 @@ xchk_inode_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_inode_xref_finobt(sc, ino); - xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); + xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); xchk_xref_is_not_shared(sc, agbno, 1); + xchk_xref_is_not_cow_staging(sc, agbno, 1); xchk_inode_xref_bmap(sc, dip); out_free: diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index d8dff3fd8053..58d5dfb7ea21 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -16,6 +16,7 @@ #include "xfs_dir2_priv.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/readdir.h" /* Set us up to scrub parents. */ int @@ -30,122 +31,93 @@ xchk_setup_parent( /* Look for an entry in a parent pointing to this inode. */ struct xchk_parent_ctx { - struct dir_context dc; struct xfs_scrub *sc; - xfs_ino_t ino; xfs_nlink_t nlink; - bool cancelled; }; /* Look for a single entry in a directory pointing to an inode. */ -STATIC bool +STATIC int xchk_parent_actor( - struct dir_context *dc, - const char *name, - int namelen, - loff_t pos, - u64 ino, - unsigned type) + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) { - struct xchk_parent_ctx *spc; + struct xchk_parent_ctx *spc = priv; int error = 0; - spc = container_of(dc, struct xchk_parent_ctx, dc); - if (spc->ino == ino) + /* Does this name make sense? */ + if (!xfs_dir2_namecheck(name->name, name->len)) + error = -EFSCORRUPTED; + if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + + if (sc->ip->i_ino == ino) spc->nlink++; - /* - * If we're facing a fatal signal, bail out. Store the cancellation - * status separately because the VFS readdir code squashes error codes - * into short directory reads. - */ if (xchk_should_terminate(spc->sc, &error)) - spc->cancelled = true; + return error; - return !error; + return 0; } -/* Count the number of dentries in the parent dir that point to this inode. */ -STATIC int -xchk_parent_count_parent_dentries( - struct xfs_scrub *sc, - struct xfs_inode *parent, - xfs_nlink_t *nlink) +/* + * Try to lock a parent directory for checking dirents. Returns the inode + * flags for the locks we now hold, or zero if we failed. + */ +STATIC unsigned int +xchk_parent_ilock_dir( + struct xfs_inode *dp) { - struct xchk_parent_ctx spc = { - .dc.actor = xchk_parent_actor, - .ino = sc->ip->i_ino, - .sc = sc, - }; - size_t bufsize; - loff_t oldpos; - uint lock_mode; - int error = 0; + if (!xfs_ilock_nowait(dp, XFS_ILOCK_SHARED)) + return 0; - /* - * If there are any blocks, read-ahead block 0 as we're almost - * certain to have the next operation be a read there. This is - * how we guarantee that the parent's extent map has been loaded, - * if there is one. - */ - lock_mode = xfs_ilock_data_map_shared(parent); - if (parent->i_df.if_nextents > 0) - error = xfs_dir3_data_readahead(parent, 0, 0); - xfs_iunlock(parent, lock_mode); - if (error) - return error; + if (!xfs_need_iread_extents(&dp->i_df)) + return XFS_ILOCK_SHARED; - /* - * Iterate the parent dir to confirm that there is - * exactly one entry pointing back to the inode being - * scanned. - */ - bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, - parent->i_disk_size); - oldpos = 0; - while (true) { - error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize); - if (error) - goto out; - if (spc.cancelled) { - error = -EAGAIN; - goto out; - } - if (oldpos == spc.dc.pos) - break; - oldpos = spc.dc.pos; - } - *nlink = spc.nlink; -out: - return error; + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + if (!xfs_ilock_nowait(dp, XFS_ILOCK_EXCL)) + return 0; + + return XFS_ILOCK_EXCL; } /* - * Given the inode number of the alleged parent of the inode being - * scrubbed, try to validate that the parent has exactly one directory - * entry pointing back to the inode being scrubbed. + * Given the inode number of the alleged parent of the inode being scrubbed, + * try to validate that the parent has exactly one directory entry pointing + * back to the inode being scrubbed. Returns -EAGAIN if we need to revalidate + * the dotdot entry. */ STATIC int xchk_parent_validate( struct xfs_scrub *sc, - xfs_ino_t dnum, - bool *try_again) + xfs_ino_t parent_ino) { + struct xchk_parent_ctx spc = { + .sc = sc, + .nlink = 0, + }; struct xfs_mount *mp = sc->mp; struct xfs_inode *dp = NULL; xfs_nlink_t expected_nlink; - xfs_nlink_t nlink; + unsigned int lock_mode; int error = 0; - *try_again = false; - - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + /* Is this the root dir? Then '..' must point to itself. */ + if (sc->ip == mp->m_rootip) { + if (sc->ip->i_ino != mp->m_sb.sb_rootino || + sc->ip->i_ino != parent_ino) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } /* '..' must not point to ourselves. */ - if (sc->ip->i_ino == dnum) { + if (sc->ip->i_ino == parent_ino) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* @@ -155,106 +127,51 @@ xchk_parent_validate( expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; /* - * Grab this parent inode. We release the inode before we - * cancel the scrub transaction. Since we're don't know a - * priori that releasing the inode won't trigger eofblocks - * cleanup (which allocates what would be a nested transaction) - * if the parent pointer erroneously points to a file, we - * can't use DONTCACHE here because DONTCACHE inodes can trigger - * immediate inactive cleanup of the inode. + * Grab the parent directory inode. This must be released before we + * cancel the scrub transaction. * * If _iget returns -EINVAL or -ENOENT then the parent inode number is * garbage and the directory is corrupt. If the _iget returns * -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a * cross referencing error. Any other error is an operational error. */ - error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp); + error = xchk_iget(sc, parent_ino, &dp); if (error == -EINVAL || error == -ENOENT) { error = -EFSCORRUPTED; xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); - goto out; + return error; } if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out_rele; } - /* - * We prefer to keep the inode locked while we lock and search - * its alleged parent for a forward reference. If we can grab - * the iolock, validate the pointers and we're done. We must - * use nowait here to avoid an ABBA deadlock on the parent and - * the child inodes. - */ - if (xfs_ilock_nowait(dp, XFS_IOLOCK_SHARED)) { - error = xchk_parent_count_parent_dentries(sc, dp, &nlink); - if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, - &error)) - goto out_unlock; - if (nlink != expected_nlink) - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out_unlock; - } - - /* - * The game changes if we get here. We failed to lock the parent, - * so we're going to try to verify both pointers while only holding - * one lock so as to avoid deadlocking with something that's actually - * trying to traverse down the directory tree. - */ - xfs_iunlock(sc->ip, sc->ilock_flags); - sc->ilock_flags = 0; - error = xchk_ilock_inverted(dp, XFS_IOLOCK_SHARED); - if (error) + lock_mode = xchk_parent_ilock_dir(dp); + if (!lock_mode) { + xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + error = -EAGAIN; goto out_rele; + } - /* Go looking for our dentry. */ - error = xchk_parent_count_parent_dentries(sc, dp, &nlink); + /* Look for a directory entry in the parent pointing to the child. */ + error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out_unlock; - /* Drop the parent lock, relock this inode. */ - xfs_iunlock(dp, XFS_IOLOCK_SHARED); - error = xchk_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL); - if (error) - goto out_rele; - sc->ilock_flags = XFS_IOLOCK_EXCL; - - /* - * If we're an unlinked directory, the parent /won't/ have a link - * to us. Otherwise, it should have one link. We have to re-set - * it here because we dropped the lock on sc->ip. - */ - expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1; - - /* Look up '..' to see if the inode changed. */ - error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL); - if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out_rele; - - /* Drat, parent changed. Try again! */ - if (dnum != dp->i_ino) { - xfs_irele(dp); - *try_again = true; - return 0; - } - xfs_irele(dp); - /* - * '..' didn't change, so check that there was only one entry - * for us in the parent. + * Ensure that the parent has as many links to the child as the child + * thinks it has to the parent. */ - if (nlink != expected_nlink) + if (spc.nlink != expected_nlink) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - return error; out_unlock: - xfs_iunlock(dp, XFS_IOLOCK_SHARED); + xfs_iunlock(dp, lock_mode); out_rele: - xfs_irele(dp); -out: + xchk_irele(sc, dp); return error; } @@ -264,9 +181,7 @@ xchk_parent( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; - xfs_ino_t dnum; - bool try_again; - int tries = 0; + xfs_ino_t parent_ino; int error = 0; /* @@ -279,56 +194,29 @@ xchk_parent( /* We're not a special inode, are we? */ if (!xfs_verify_dir_ino(mp, sc->ip->i_ino)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } - /* - * The VFS grabs a read or write lock via i_rwsem before it reads - * or writes to a directory. If we've gotten this far we've - * already obtained IOLOCK_EXCL, which (since 4.10) is the same as - * getting a write lock on i_rwsem. Therefore, it is safe for us - * to drop the ILOCK here in order to do directory lookups. - */ - sc->ilock_flags &= ~(XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL); - xfs_iunlock(sc->ip, XFS_ILOCK_EXCL | XFS_MMAPLOCK_EXCL); - - /* Look up '..' */ - error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &dnum, NULL); - if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; - if (!xfs_verify_dir_ino(mp, dnum)) { - xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; - } + do { + if (xchk_should_terminate(sc, &error)) + break; - /* Is this the root dir? Then '..' must point to itself. */ - if (sc->ip == mp->m_rootip) { - if (sc->ip->i_ino != mp->m_sb.sb_rootino || - sc->ip->i_ino != dnum) + /* Look up '..' */ + error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, + &parent_ino); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) + return error; + if (!xfs_verify_dir_ino(mp, parent_ino)) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; - } + return 0; + } - do { - error = xchk_parent_validate(sc, dnum, &try_again); - if (error) - goto out; - } while (try_again && ++tries < 20); + /* + * Check that the dotdot entry points to a parent directory + * containing a dirent pointing to this subdirectory. + */ + error = xchk_parent_validate(sc, parent_ino); + } while (error == -EAGAIN); - /* - * We gave it our best shot but failed, so mark this scrub - * incomplete. Userspace can decide if it wants to try again. - */ - if (try_again && tries == 20) - xchk_set_incomplete(sc); -out: - /* - * If we failed to lock the parent inode even after a retry, just mark - * this scrub incomplete and return. - */ - if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { - error = 0; - xchk_set_incomplete(sc); - } return error; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 9eeac8565394..e6caa358cbda 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -53,6 +53,9 @@ xchk_setup_quota( if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + error = xchk_setup_fs(sc); if (error) return error; diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c new file mode 100644 index 000000000000..e51c1544be63 --- /dev/null +++ b/fs/xfs/scrub/readdir.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_error.h" +#include "scrub/scrub.h" +#include "scrub/readdir.h" + +/* Call a function for every entry in a shortform directory. */ +STATIC int +xchk_dir_walk_sf( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_name name = { + .name = ".", + .len = 1, + .type = XFS_DIR3_FT_DIR, + }; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_hdr *sfp; + xfs_ino_t ino; + xfs_dir2_dataptr_t dapos; + unsigned int i; + int error; + + ASSERT(dp->i_df.if_bytes == dp->i_disk_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + + /* dot entry */ + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + geo->data_entry_offset); + + error = dirent_fn(sc, dp, dapos, &name, dp->i_ino, priv); + if (error) + return error; + + /* dotdot entry */ + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + geo->data_entry_offset + + xfs_dir2_data_entsize(mp, sizeof(".") - 1)); + ino = xfs_dir2_sf_get_parent_ino(sfp); + name.name = ".."; + name.len = 2; + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + return error; + + /* iterate everything else */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + xfs_dir2_sf_get_offset(sfep)); + ino = xfs_dir2_sf_get_ino(mp, sfp, sfep); + name.name = sfep->name; + name.len = sfep->namelen; + name.type = xfs_dir2_sf_get_ftype(mp, sfep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + return error; + + sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); + } + + return 0; +} + +/* Call a function for every entry in a block directory. */ +STATIC int +xchk_dir_walk_block( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_buf *bp; + unsigned int off, next_off, end; + int error; + + error = xfs_dir3_block_read(sc->tp, dp, &bp); + if (error) + return error; + + /* Walk each directory entry. */ + end = xfs_dir3_data_end_offset(geo, bp->b_addr); + for (off = geo->data_entry_offset; off < end; off = next_off) { + struct xfs_name name = { }; + struct xfs_dir2_data_unused *dup = bp->b_addr + off; + struct xfs_dir2_data_entry *dep = bp->b_addr + off; + xfs_ino_t ino; + xfs_dir2_dataptr_t dapos; + + /* Skip an empty entry. */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + next_off = off + be16_to_cpu(dup->length); + continue; + } + + /* Otherwise, find the next entry and report it. */ + next_off = off + xfs_dir2_data_entsize(mp, dep->namelen); + if (next_off > end) + break; + + dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, off); + ino = be64_to_cpu(dep->inumber); + name.name = dep->name; + name.len = dep->namelen; + name.type = xfs_dir2_data_get_ftype(mp, dep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + break; + } + + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* Read a leaf-format directory buffer. */ +STATIC int +xchk_read_leaf_dir_buf( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_da_geometry *geo, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec map; + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); + xfs_dablk_t last_da; + xfs_dablk_t map_off; + xfs_dir2_off_t new_off; + + *bpp = NULL; + + /* + * Look for mapped directory blocks at or above the current offset. + * Truncate down to the nearest directory block to start the scanning + * operation. + */ + last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET); + map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *curoff)); + + if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map)) + return 0; + if (map.br_startoff >= last_da) + return 0; + xfs_trim_extent(&map, map_off, last_da - map_off); + + /* Read the directory block of that first mapping. */ + new_off = xfs_dir2_da_to_byte(geo, map.br_startoff); + if (new_off > *curoff) + *curoff = new_off; + + return xfs_dir3_data_read(tp, dp, map.br_startoff, 0, bpp); +} + +/* Call a function for every entry in a leaf directory. */ +STATIC int +xchk_dir_walk_leaf( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_geometry *geo = mp->m_dir_geo; + struct xfs_buf *bp = NULL; + xfs_dir2_off_t curoff = 0; + unsigned int offset = 0; + int error; + + /* Iterate every directory offset in this directory. */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + struct xfs_name name = { }; + struct xfs_dir2_data_unused *dup; + struct xfs_dir2_data_entry *dep; + xfs_ino_t ino; + unsigned int length; + xfs_dir2_dataptr_t dapos; + + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || offset >= geo->blksize) { + if (bp) { + xfs_trans_brelse(sc->tp, bp); + bp = NULL; + } + + error = xchk_read_leaf_dir_buf(sc->tp, dp, geo, &curoff, + &bp); + if (error || !bp) + break; + + /* + * Find our position in the block. + */ + offset = geo->data_entry_offset; + curoff += geo->data_entry_offset; + } + + /* Skip an empty entry. */ + dup = bp->b_addr + offset; + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + offset += length; + curoff += length; + continue; + } + + /* Otherwise, find the next entry and report it. */ + dep = bp->b_addr + offset; + length = xfs_dir2_data_entsize(mp, dep->namelen); + + dapos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + ino = be64_to_cpu(dep->inumber); + name.name = dep->name; + name.len = dep->namelen; + name.type = xfs_dir2_data_get_ftype(mp, dep); + + error = dirent_fn(sc, dp, dapos, &name, ino, priv); + if (error) + break; + + /* Advance to the next entry. */ + offset += length; + curoff += length; + } + + if (bp) + xfs_trans_brelse(sc->tp, bp); + return error; +} + +/* + * Call a function for every entry in a directory. + * + * Callers must hold the ILOCK. File types are XFS_DIR3_FT_*. + */ +int +xchk_dir_walk( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, + void *priv) +{ + struct xfs_da_args args = { + .dp = dp, + .geo = dp->i_mount->m_dir_geo, + .trans = sc->tp, + }; + bool isblock; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) + return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); + + /* dir2 functions require that the data fork is loaded */ + error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xfs_dir2_isblock(&args, &isblock); + if (error) + return error; + + if (isblock) + return xchk_dir_walk_block(sc, dp, dirent_fn, priv); + + return xchk_dir_walk_leaf(sc, dp, dirent_fn, priv); +} + +/* + * Look up the inode number for an exact name in a directory. + * + * Callers must hold the ILOCK. File types are XFS_DIR3_FT_*. Names are not + * checked for correctness. + */ +int +xchk_dir_lookup( + struct xfs_scrub *sc, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t *ino) +{ + struct xfs_da_args args = { + .dp = dp, + .geo = dp->i_mount->m_dir_geo, + .trans = sc->tp, + .name = name->name, + .namelen = name->len, + .filetype = name->type, + .hashval = xfs_dir2_hashname(dp->i_mount, name), + .whichfork = XFS_DATA_FORK, + .op_flags = XFS_DA_OP_OKNOENT, + }; + bool isblock, isleaf; + int error; + + if (xfs_is_shutdown(dp->i_mount)) + return -EIO; + + ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); + ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + error = xfs_dir2_sf_lookup(&args); + goto out_check_rval; + } + + /* dir2 functions require that the data fork is loaded */ + error = xfs_iread_extents(sc->tp, dp, XFS_DATA_FORK); + if (error) + return error; + + error = xfs_dir2_isblock(&args, &isblock); + if (error) + return error; + + if (isblock) { + error = xfs_dir2_block_lookup(&args); + goto out_check_rval; + } + + error = xfs_dir2_isleaf(&args, &isleaf); + if (error) + return error; + + if (isleaf) { + error = xfs_dir2_leaf_lookup(&args); + goto out_check_rval; + } + + error = xfs_dir2_node_lookup(&args); + +out_check_rval: + if (error == -EEXIST) + error = 0; + if (!error) + *ino = args.inumber; + return error; +} diff --git a/fs/xfs/scrub/readdir.h b/fs/xfs/scrub/readdir.h new file mode 100644 index 000000000000..55787f4df123 --- /dev/null +++ b/fs/xfs/scrub/readdir.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_READDIR_H__ +#define __XFS_SCRUB_READDIR_H__ + +typedef int (*xchk_dirent_fn)(struct xfs_scrub *sc, struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, const struct xfs_name *name, + xfs_ino_t ino, void *priv); + +int xchk_dir_walk(struct xfs_scrub *sc, struct xfs_inode *dp, + xchk_dirent_fn dirent_fn, void *priv); + +int xchk_dir_lookup(struct xfs_scrub *sc, struct xfs_inode *dp, + const struct xfs_name *name, xfs_ino_t *ino); + +#endif /* __XFS_SCRUB_READDIR_H__ */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index d9c1b3cea4a5..304ea1e1bfb0 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -1,21 +1,22 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_refcount.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" -#include "xfs_ag.h" +#include "scrub/trace.h" /* * Set us up to scrub reference count btrees. @@ -24,6 +25,8 @@ int xchk_setup_ag_refcountbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); return xchk_setup_ag_btree(sc, false); } @@ -300,8 +303,10 @@ xchk_refcountbt_xref_rmap( goto out_free; xchk_refcountbt_process_rmap_fragments(&refchk); - if (irec->rc_refcount != refchk.seen) + if (irec->rc_refcount != refchk.seen) { + trace_xchk_refcount_incorrect(sc->sa.pag, irec, refchk.seen); xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + } out_free: list_for_each_entry_safe(frag, n, &refchk.fragments, list) { @@ -325,6 +330,107 @@ xchk_refcountbt_xref( xchk_refcountbt_xref_rmap(sc, irec); } +struct xchk_refcbt_records { + /* Previous refcount record. */ + struct xfs_refcount_irec prev_rec; + + /* The next AG block where we aren't expecting shared extents. */ + xfs_agblock_t next_unshared_agbno; + + /* Number of CoW blocks we expect. */ + xfs_agblock_t cow_blocks; + + /* Was the last record a shared or CoW staging extent? */ + enum xfs_refc_domain prev_domain; +}; + +STATIC int +xchk_refcountbt_rmap_check_gap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + xfs_agblock_t *next_bno = priv; + + if (*next_bno != NULLAGBLOCK && rec->rm_startblock < *next_bno) + return -ECANCELED; + + *next_bno = rec->rm_startblock + rec->rm_blockcount; + return 0; +} + +/* + * Make sure that a gap in the reference count records does not correspond to + * overlapping records (i.e. shared extents) in the reverse mappings. + */ +static inline void +xchk_refcountbt_xref_gaps( + struct xfs_scrub *sc, + struct xchk_refcbt_records *rrc, + xfs_agblock_t bno) +{ + struct xfs_rmap_irec low; + struct xfs_rmap_irec high; + xfs_agblock_t next_bno = NULLAGBLOCK; + int error; + + if (bno <= rrc->next_unshared_agbno || !sc->sa.rmap_cur || + xchk_skip_xref(sc->sm)) + return; + + memset(&low, 0, sizeof(low)); + low.rm_startblock = rrc->next_unshared_agbno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno - 1; + + error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high, + xchk_refcountbt_rmap_check_gap, &next_bno); + if (error == -ECANCELED) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + else + xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur); +} + +static inline bool +xchk_refcount_mergeable( + struct xchk_refcbt_records *rrc, + const struct xfs_refcount_irec *r2) +{ + const struct xfs_refcount_irec *r1 = &rrc->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (r1->rc_blockcount > 0) + return false; + + if (r1->rc_domain != r2->rc_domain) + return false; + if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock) + return false; + if (r1->rc_refcount != r2->rc_refcount) + return false; + if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount > + MAXREFCEXTLEN) + return false; + + return true; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_refcountbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_refcbt_records *rrc, + const struct xfs_refcount_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_refcount_mergeable(rrc, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec)); +} + /* Scrub a refcountbt record. */ STATIC int xchk_refcountbt_rec( @@ -332,27 +438,37 @@ xchk_refcountbt_rec( const union xfs_btree_rec *rec) { struct xfs_refcount_irec irec; - xfs_agblock_t *cow_blocks = bs->private; - struct xfs_perag *pag = bs->cur->bc_ag.pag; + struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - - /* Check the domain and refcount are not incompatible. */ - if (!xfs_refcount_check_domain(&irec)) + if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } if (irec.rc_domain == XFS_REFC_DOMAIN_COW) - (*cow_blocks) += irec.rc_blockcount; - - /* Check the extent. */ - if (!xfs_verify_agbext(pag, irec.rc_startblock, irec.rc_blockcount)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->cow_blocks += irec.rc_blockcount; - if (irec.rc_refcount == 0) + /* Shared records always come before CoW records. */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED && + rrc->prev_domain == XFS_REFC_DOMAIN_COW) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + rrc->prev_domain = irec.rc_domain; + xchk_refcountbt_check_mergeable(bs, rrc, &irec); xchk_refcountbt_xref(bs->sc, &irec); + /* + * If this is a record for a shared extent, check that all blocks + * between the previous record and this one have at most one reverse + * mapping. + */ + if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) { + xchk_refcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock); + rrc->next_unshared_agbno = irec.rc_startblock + + irec.rc_blockcount; + } + return 0; } @@ -394,15 +510,25 @@ int xchk_refcountbt( struct xfs_scrub *sc) { - xfs_agblock_t cow_blocks = 0; + struct xchk_refcbt_records rrc = { + .cow_blocks = 0, + .next_unshared_agbno = 0, + .prev_domain = XFS_REFC_DOMAIN_SHARED, + }; int error; error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec, - &XFS_RMAP_OINFO_REFC, &cow_blocks); + &XFS_RMAP_OINFO_REFC, &rrc); if (error) return error; - xchk_refcount_xref_rmap(sc, cow_blocks); + /* + * Check that all blocks between the last refcount > 1 record and the + * end of the AG have at most one reverse mapping. + */ + xchk_refcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_agblocks); + + xchk_refcount_xref_rmap(sc, rrc.cow_blocks); return 0; } @@ -458,16 +584,37 @@ xchk_xref_is_not_shared( xfs_agblock_t agbno, xfs_extlen_t len) { - bool shared; + enum xbtree_recpacking outcome; + int error; + + if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, agbno, len, &outcome); + if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) + return; + if (outcome != XBTREE_RECPACKING_EMPTY) + xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); +} + +/* xref check that the extent is not being used for CoW staging. */ +void +xchk_xref_is_not_cow_staging( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + enum xbtree_recpacking outcome; int error; if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_refcount_has_record(sc->sa.refc_cur, XFS_REFC_DOMAIN_SHARED, - agbno, len, &shared); + error = xfs_refcount_has_records(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW, + agbno, len, &outcome); if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur)) return; - if (shared) + if (outcome != XBTREE_RECPACKING_EMPTY) xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0); } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1b71174ec0d6..ac6d8803e660 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -60,6 +60,9 @@ xrep_attempt( sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; + case -ECHRNG: + sc->flags |= XCHK_NEED_DRAIN; + return -EAGAIN; case -EDEADLOCK: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { @@ -442,6 +445,30 @@ xrep_init_btblock( * buffers associated with @bitmap. */ +static int +xrep_invalidate_block( + uint64_t fsbno, + void *priv) +{ + struct xfs_scrub *sc = priv; + struct xfs_buf *bp; + int error; + + /* Skip AG headers and post-EOFS blocks */ + if (!xfs_verify_fsbno(sc->mp, fsbno)) + return 0; + + error = xfs_buf_incore(sc->mp->m_ddev_targp, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); + if (error) + return 0; + + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + return 0; +} + /* * Invalidate buffers for per-AG btree blocks we're dumping. This function * is not intended for use with file data repairs; we have bunmapi for that. @@ -451,11 +478,6 @@ xrep_invalidate_blocks( struct xfs_scrub *sc, struct xbitmap *bitmap) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - /* * For each block in each extent, see if there's an incore buffer for * exactly that block; if so, invalidate it. The buffer cache only @@ -464,23 +486,7 @@ xrep_invalidate_blocks( * because we never own those; and if we can't TRYLOCK the buffer we * assume it's owned by someone else. */ - for_each_xbitmap_block(fsbno, bmr, n, bitmap) { - int error; - - /* Skip AG headers and post-EOFS blocks */ - if (!xfs_verify_fsbno(sc->mp, fsbno)) - continue; - error = xfs_buf_incore(sc->mp->m_ddev_targp, - XFS_FSB_TO_DADDR(sc->mp, fsbno), - XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); - if (error) - continue; - - xfs_trans_bjoin(sc->tp, bp); - xfs_trans_binval(sc->tp, bp); - } - - return 0; + return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc); } /* Ensure the freelist is the correct size. */ @@ -501,6 +507,15 @@ xrep_fix_freelist( can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); } +/* Information about reaping extents after a repair. */ +struct xrep_reap_state { + struct xfs_scrub *sc; + + /* Reverse mapping owner and metadata reservation type. */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; +}; + /* * Put a block back on the AGFL. */ @@ -545,17 +560,23 @@ xrep_put_freelist( /* Dispose of a single block. */ STATIC int xrep_reap_block( - struct xfs_scrub *sc, - xfs_fsblock_t fsbno, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type resv) + uint64_t fsbno, + void *priv) { + struct xrep_reap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; struct xfs_btree_cur *cur; struct xfs_buf *agf_bp = NULL; xfs_agblock_t agbno; bool has_other_rmap; int error; + ASSERT(sc->ip != NULL || + XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); + trace_xrep_dispose_btree_extent(sc->mp, + XFS_FSB_TO_AGNO(sc->mp, fsbno), + XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); + agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); @@ -574,7 +595,8 @@ xrep_reap_block( cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag); /* Can we find any other rmappings? */ - error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap); + error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, + &has_other_rmap); xfs_btree_del_cursor(cur, error); if (error) goto out_free; @@ -594,11 +616,12 @@ xrep_reap_block( */ if (has_other_rmap) error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno, - 1, oinfo); - else if (resv == XFS_AG_RESV_AGFL) + 1, rs->oinfo); + else if (rs->resv == XFS_AG_RESV_AGFL) error = xrep_put_freelist(sc, agbno); else - error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv); + error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo, + rs->resv); if (agf_bp != sc->sa.agf_bp) xfs_trans_brelse(sc->tp, agf_bp); if (error) @@ -622,26 +645,15 @@ xrep_reap_extents( const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { - struct xbitmap_range *bmr; - struct xbitmap_range *n; - xfs_fsblock_t fsbno; - int error = 0; + struct xrep_reap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = type, + }; ASSERT(xfs_has_rmapbt(sc->mp)); - for_each_xbitmap_block(fsbno, bmr, n, bitmap) { - ASSERT(sc->ip != NULL || - XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); - trace_xrep_dispose_btree_extent(sc->mp, - XFS_FSB_TO_AGNO(sc->mp, fsbno), - XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); - - error = xrep_reap_block(sc, fsbno, oinfo, type); - if (error) - break; - } - - return error; + return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs); } /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 840f74ec431c..dce791c679ee 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2018 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_REPAIR_H__ #define __XFS_SCRUB_REPAIR_H__ @@ -31,6 +31,7 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, const struct xfs_buf_ops *ops); struct xbitmap; +struct xagb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist); diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 229826b2e1c0..d29a26ecddd6 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -1,21 +1,29 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_refcount.h" +#include "xfs_ag.h" +#include "xfs_bit.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_refcount_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_ag.h" +#include "scrub/bitmap.h" /* * Set us up to scrub reverse mapping btrees. @@ -24,11 +32,39 @@ int xchk_setup_ag_rmapbt( struct xfs_scrub *sc) { + if (xchk_need_intent_drain(sc)) + xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + return xchk_setup_ag_btree(sc, false); } /* Reverse-mapping scrubber. */ +struct xchk_rmap { + /* + * The furthest-reaching of the rmapbt records that we've already + * processed. This enables us to detect overlapping records for space + * allocations that cannot be shared. + */ + struct xfs_rmap_irec overlap_rec; + + /* + * The previous rmapbt record, so that we can check for two records + * that could be one. + */ + struct xfs_rmap_irec prev_rec; + + /* Bitmaps containing all blocks for each type of AG metadata. */ + struct xagb_bitmap fs_owned; + struct xagb_bitmap log_owned; + struct xagb_bitmap ag_owned; + struct xagb_bitmap inobt_owned; + struct xagb_bitmap refcbt_owned; + + /* Did we complete the AG space metadata bitmaps? */ + bool bitmaps_complete; +}; + /* Cross-reference a rmap against the refcount btree. */ STATIC void xchk_rmapbt_xref_refc( @@ -84,80 +120,415 @@ xchk_rmapbt_xref( xchk_rmapbt_xref_refc(sc, irec); } -/* Scrub an rmapbt record. */ -STATIC int -xchk_rmapbt_rec( - struct xchk_btree *bs, - const union xfs_btree_rec *rec) +/* + * Check for bogus UNWRITTEN flags in the rmapbt node block keys. + * + * In reverse mapping records, the file mapping extent state + * (XFS_RMAP_OFF_UNWRITTEN) is a record attribute, not a key field. It is not + * involved in lookups in any way. In older kernels, the functions that + * convert rmapbt records to keys forgot to filter out the extent state bit, + * even though the key comparison functions have filtered the flag correctly. + * If we spot an rmap key with the unwritten bit set in rm_offset, we should + * mark the btree as needing optimization to rebuild the btree without those + * flags. + */ +STATIC void +xchk_rmapbt_check_unwritten_in_keyflags( + struct xchk_btree *bs) { - struct xfs_mount *mp = bs->cur->bc_mp; - struct xfs_rmap_irec irec; - struct xfs_perag *pag = bs->cur->bc_ag.pag; - bool non_inode; - bool is_unwritten; - bool is_bmbt; - bool is_attr; - int error; + struct xfs_scrub *sc = bs->sc; + struct xfs_btree_cur *cur = bs->cur; + struct xfs_btree_block *keyblock; + union xfs_btree_key *lkey, *hkey; + __be64 badflag = cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN); + unsigned int level; - error = xfs_rmap_btrec_to_irec(rec, &irec); - if (!xchk_btree_process_error(bs->sc, bs->cur, 0, &error)) - goto out; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_PREEN) + return; + + for (level = 1; level < cur->bc_nlevels; level++) { + struct xfs_buf *bp; + unsigned int ptr; + + /* Only check the first time we've seen this node block. */ + if (cur->bc_levels[level].ptr > 1) + continue; + + keyblock = xfs_btree_get_block(cur, level, &bp); + for (ptr = 1; ptr <= be16_to_cpu(keyblock->bb_numrecs); ptr++) { + lkey = xfs_btree_key_addr(cur, ptr, keyblock); + + if (lkey->rmap.rm_offset & badflag) { + xchk_btree_set_preen(sc, cur, level); + break; + } + + hkey = xfs_btree_high_key_addr(cur, ptr, keyblock); + if (hkey->rmap.rm_offset & badflag) { + xchk_btree_set_preen(sc, cur, level); + break; + } + } + } +} + +static inline bool +xchk_rmapbt_is_shareable( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *irec) +{ + if (!xfs_has_reflink(sc->mp)) + return false; + if (XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)) + return false; + if (irec->rm_flags & (XFS_RMAP_BMBT_BLOCK | XFS_RMAP_ATTR_FORK | + XFS_RMAP_UNWRITTEN)) + return false; + return true; +} + +/* Flag failures for records that overlap but cannot. */ +STATIC void +xchk_rmapbt_check_overlapping( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + xfs_agblock_t pnext, inext; + + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + /* No previous record? */ + if (cr->overlap_rec.rm_blockcount == 0) + goto set_prev; + + /* Do overlap_rec and irec overlap? */ + pnext = cr->overlap_rec.rm_startblock + cr->overlap_rec.rm_blockcount; + if (pnext <= irec->rm_startblock) + goto set_prev; - /* Check extent. */ - if (irec.rm_startblock + irec.rm_blockcount <= irec.rm_startblock) + /* Overlap is only allowed if both records are data fork mappings. */ + if (!xchk_rmapbt_is_shareable(bs->sc, &cr->overlap_rec) || + !xchk_rmapbt_is_shareable(bs->sc, irec)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - if (irec.rm_owner == XFS_RMAP_OWN_FS) { + /* Save whichever rmap record extends furthest. */ + inext = irec->rm_startblock + irec->rm_blockcount; + if (pnext > inext) + return; + +set_prev: + memcpy(&cr->overlap_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Decide if two reverse-mapping records can be merged. */ +static inline bool +xchk_rmap_mergeable( + struct xchk_rmap *cr, + const struct xfs_rmap_irec *r2) +{ + const struct xfs_rmap_irec *r1 = &cr->prev_rec; + + /* Ignore if prev_rec is not yet initialized. */ + if (cr->prev_rec.rm_blockcount == 0) + return false; + + if (r1->rm_owner != r2->rm_owner) + return false; + if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock) + return false; + if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount > + XFS_RMAP_LEN_MAX) + return false; + if (XFS_RMAP_NON_INODE_OWNER(r2->rm_owner)) + return true; + /* must be an inode owner below here */ + if (r1->rm_flags != r2->rm_flags) + return false; + if (r1->rm_flags & XFS_RMAP_BMBT_BLOCK) + return true; + return r1->rm_offset + r1->rm_blockcount == r2->rm_offset; +} + +/* Flag failures for records that could be merged. */ +STATIC void +xchk_rmapbt_check_mergeable( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return; + + if (xchk_rmap_mergeable(cr, irec)) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + + memcpy(&cr->prev_rec, irec, sizeof(struct xfs_rmap_irec)); +} + +/* Compare an rmap for AG metadata against the metadata walk. */ +STATIC int +xchk_rmapbt_mark_bitmap( + struct xchk_btree *bs, + struct xchk_rmap *cr, + const struct xfs_rmap_irec *irec) +{ + struct xfs_scrub *sc = bs->sc; + struct xagb_bitmap *bmp = NULL; + xfs_extlen_t fsbcount = irec->rm_blockcount; + + /* + * Skip corrupt records. It is essential that we detect records in the + * btree that cannot overlap but do, flag those as CORRUPT, and skip + * the bitmap comparison to avoid generating false XCORRUPT reports. + */ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * If the AG metadata walk didn't complete, there's no point in + * comparing against partial results. + */ + if (!cr->bitmaps_complete) + return 0; + + switch (irec->rm_owner) { + case XFS_RMAP_OWN_FS: + bmp = &cr->fs_owned; + break; + case XFS_RMAP_OWN_LOG: + bmp = &cr->log_owned; + break; + case XFS_RMAP_OWN_AG: + bmp = &cr->ag_owned; + break; + case XFS_RMAP_OWN_INOBT: + bmp = &cr->inobt_owned; + break; + case XFS_RMAP_OWN_REFC: + bmp = &cr->refcbt_owned; + break; + } + + if (!bmp) + return 0; + + if (xagb_bitmap_test(bmp, irec->rm_startblock, &fsbcount)) { /* - * xfs_verify_agbno returns false for static fs metadata. - * Since that only exists at the start of the AG, validate - * that by hand. + * The start of this reverse mapping corresponds to a set + * region in the bitmap. If the mapping covers more area than + * the set region, then it covers space that wasn't found by + * the AG metadata walk. */ - if (irec.rm_startblock != 0 || - irec.rm_blockcount != XFS_AGFL_BLOCK(mp) + 1) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + if (fsbcount < irec->rm_blockcount) + xchk_btree_xref_set_corrupt(bs->sc, + bs->sc->sa.rmap_cur, 0); } else { /* - * Otherwise we must point somewhere past the static metadata - * but before the end of the FS. Run the regular check. + * The start of this reverse mapping does not correspond to a + * completely set region in the bitmap. The region wasn't + * fully set by walking the AG metadata, so this is a + * cross-referencing corruption. */ - if (!xfs_verify_agbno(pag, irec.rm_startblock) || - !xfs_verify_agbno(pag, irec.rm_startblock + - irec.rm_blockcount - 1)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_btree_xref_set_corrupt(bs->sc, bs->sc->sa.rmap_cur, 0); } - /* Check flags. */ - non_inode = XFS_RMAP_NON_INODE_OWNER(irec.rm_owner); - is_bmbt = irec.rm_flags & XFS_RMAP_BMBT_BLOCK; - is_attr = irec.rm_flags & XFS_RMAP_ATTR_FORK; - is_unwritten = irec.rm_flags & XFS_RMAP_UNWRITTEN; + /* Unset the region so that we can detect missing rmap records. */ + return xagb_bitmap_clear(bmp, irec->rm_startblock, irec->rm_blockcount); +} - if (is_bmbt && irec.rm_offset != 0) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); +/* Scrub an rmapbt record. */ +STATIC int +xchk_rmapbt_rec( + struct xchk_btree *bs, + const union xfs_btree_rec *rec) +{ + struct xchk_rmap *cr = bs->private; + struct xfs_rmap_irec irec; - if (non_inode && irec.rm_offset != 0) + if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || + xfs_rmap_check_irec(bs->cur, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } - if (is_unwritten && (is_bmbt || non_inode || is_attr)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + xchk_rmapbt_check_unwritten_in_keyflags(bs); + xchk_rmapbt_check_mergeable(bs, cr, &irec); + xchk_rmapbt_check_overlapping(bs, cr, &irec); + xchk_rmapbt_xref(bs->sc, &irec); - if (non_inode && (is_bmbt || is_unwritten || is_attr)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return xchk_rmapbt_mark_bitmap(bs, cr, &irec); +} - if (!non_inode) { - if (!xfs_verify_ino(mp, irec.rm_owner)) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - } else { - /* Non-inode owner within the magic values? */ - if (irec.rm_owner <= XFS_RMAP_OWN_MIN || - irec.rm_owner > XFS_RMAP_OWN_FS) - xchk_btree_set_corrupt(bs->sc, bs->cur, 0); +/* Add an AGFL block to the rmap list. */ +STATIC int +xchk_rmapbt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* + * Set up bitmaps mapping all the AG metadata to compare with the rmapbt + * records. + * + * Grab our own btree cursors here if the scrub setup function didn't give us a + * btree cursor due to reports of poor health. We need to find out if the + * rmapbt disagrees with primary metadata btrees to tag the rmapbt as being + * XCORRUPT. + */ +STATIC int +xchk_rmapbt_walk_ag_metadata( + struct xfs_scrub *sc, + struct xchk_rmap *cr) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agfl_bp; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_btree_cur *cur; + int error; + + /* OWN_FS: AG headers */ + error = xagb_bitmap_set(&cr->fs_owned, XFS_SB_BLOCK(mp), + XFS_AGFL_BLOCK(mp) - XFS_SB_BLOCK(mp) + 1); + if (error) + goto out; + + /* OWN_LOG: Internal log */ + if (xfs_ag_contains_log(mp, sc->sa.pag->pag_agno)) { + error = xagb_bitmap_set(&cr->log_owned, + XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart), + mp->m_sb.sb_logblocks); + if (error) + goto out; + } + + /* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */ + cur = sc->sa.bno_cur; + if (!cur) + cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); + error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); + if (cur != sc->sa.bno_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + cur = sc->sa.cnt_cur; + if (!cur) + cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); + error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); + if (cur != sc->sa.cnt_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + error = xagb_bitmap_set_btblocks(&cr->ag_owned, sc->sa.rmap_cur); + if (error) + goto out; + + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + goto out; + + error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xchk_rmapbt_walk_agfl, + &cr->ag_owned); + xfs_trans_brelse(sc->tp, agfl_bp); + if (error) + goto out; + + /* OWN_INOBT: inobt, finobt */ + cur = sc->sa.ino_cur; + if (!cur) + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp, + XFS_BTNUM_INO); + error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); + if (cur != sc->sa.ino_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + cur = sc->sa.fino_cur; + if (!cur) + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, + sc->sa.agi_bp, XFS_BTNUM_FINO); + error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); + if (cur != sc->sa.fino_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; + } + + /* OWN_REFC: refcountbt */ + if (xfs_has_reflink(sc->mp)) { + cur = sc->sa.refc_cur; + if (!cur) + cur = xfs_refcountbt_init_cursor(sc->mp, sc->tp, + sc->sa.agf_bp, sc->sa.pag); + error = xagb_bitmap_set_btblocks(&cr->refcbt_owned, cur); + if (cur != sc->sa.refc_cur) + xfs_btree_del_cursor(cur, error); + if (error) + goto out; } - xchk_rmapbt_xref(bs->sc, &irec); out: - return error; + /* + * If there's an error, set XFAIL and disable the bitmap + * cross-referencing checks, but proceed with the scrub anyway. + */ + if (error) + xchk_btree_xref_process_error(sc, sc->sa.rmap_cur, + sc->sa.rmap_cur->bc_nlevels - 1, &error); + else + cr->bitmaps_complete = true; + return 0; +} + +/* + * Check for set regions in the bitmaps; if there are any, the rmap records do + * not describe all the AG metadata. + */ +STATIC void +xchk_rmapbt_check_bitmaps( + struct xfs_scrub *sc, + struct xchk_rmap *cr) +{ + struct xfs_btree_cur *cur = sc->sa.rmap_cur; + unsigned int level; + + if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XFAIL)) + return; + if (!cur) + return; + level = cur->bc_nlevels - 1; + + /* + * Any bitmap with bits still set indicates that the reverse mapping + * doesn't cover the entire primary structure. + */ + if (xagb_bitmap_hweight(&cr->fs_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->log_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->ag_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->inobt_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); + + if (xagb_bitmap_hweight(&cr->refcbt_owned) != 0) + xchk_btree_xref_set_corrupt(sc, cur, level); } /* Scrub the rmap btree for some AG. */ @@ -165,42 +536,63 @@ int xchk_rmapbt( struct xfs_scrub *sc) { - return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec, - &XFS_RMAP_OINFO_AG, NULL); + struct xchk_rmap *cr; + int error; + + cr = kzalloc(sizeof(struct xchk_rmap), XCHK_GFP_FLAGS); + if (!cr) + return -ENOMEM; + + xagb_bitmap_init(&cr->fs_owned); + xagb_bitmap_init(&cr->log_owned); + xagb_bitmap_init(&cr->ag_owned); + xagb_bitmap_init(&cr->inobt_owned); + xagb_bitmap_init(&cr->refcbt_owned); + + error = xchk_rmapbt_walk_ag_metadata(sc, cr); + if (error) + goto out; + + error = xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec, + &XFS_RMAP_OINFO_AG, cr); + if (error) + goto out; + + xchk_rmapbt_check_bitmaps(sc, cr); + +out: + xagb_bitmap_destroy(&cr->refcbt_owned); + xagb_bitmap_destroy(&cr->inobt_owned); + xagb_bitmap_destroy(&cr->ag_owned); + xagb_bitmap_destroy(&cr->log_owned); + xagb_bitmap_destroy(&cr->fs_owned); + kfree(cr); + return error; } -/* xref check that the extent is owned by a given owner */ -static inline void -xchk_xref_check_owner( +/* xref check that the extent is owned only by a given owner */ +void +xchk_xref_is_only_owned_by( struct xfs_scrub *sc, xfs_agblock_t bno, xfs_extlen_t len, - const struct xfs_owner_info *oinfo, - bool should_have_rmap) + const struct xfs_owner_info *oinfo) { - bool has_rmap; + struct xfs_rmap_matches res; int error; if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo, - &has_rmap); + error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; - if (has_rmap != should_have_rmap) + if (res.matches != 1) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.non_owner_matches) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); -} - -/* xref check that the extent is owned by a given owner */ -void -xchk_xref_is_owned_by( - struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - const struct xfs_owner_info *oinfo) -{ - xchk_xref_check_owner(sc, bno, len, oinfo, true); } /* xref check that the extent is not owned by a given owner */ @@ -211,7 +603,19 @@ xchk_xref_is_not_owned_by( xfs_extlen_t len, const struct xfs_owner_info *oinfo) { - xchk_xref_check_owner(sc, bno, len, oinfo, false); + struct xfs_rmap_matches res; + int error; + + if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) + return; + + error = xfs_rmap_count_owners(sc->sa.rmap_cur, bno, len, oinfo, &res); + if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) + return; + if (res.matches != 0) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); + if (res.bad_non_owner_matches) + xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } /* xref check that the extent has no reverse mapping at all */ @@ -221,15 +625,15 @@ xchk_xref_has_no_owner( xfs_agblock_t bno, xfs_extlen_t len) { - bool has_rmap; + enum xbtree_recpacking outcome; int error; if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) return; - error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap); + error = xfs_rmap_has_records(sc->sa.rmap_cur, bno, len, &outcome); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; - if (has_rmap) + if (outcome != XBTREE_RECPACKING_EMPTY) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 0a3bde64c675..e7dace7b4be8 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 07a7a75f987f..3d98f604765e 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" @@ -145,6 +145,21 @@ xchk_probe( /* Scrub setup and teardown */ +static inline void +xchk_fsgates_disable( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XCHK_FSGATES_ALL)) + return; + + trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL); + + if (sc->flags & XCHK_FSGATES_DRAIN) + xfs_drain_wait_disable(); + + sc->flags &= ~XCHK_FSGATES_ALL; +} + /* Free all the resources and finish the transactions. */ STATIC int xchk_teardown( @@ -166,17 +181,20 @@ xchk_teardown( xfs_iunlock(sc->ip, sc->ilock_flags); if (sc->ip != ip_in && !xfs_internal_inum(sc->mp, sc->ip->i_ino)) - xfs_irele(sc->ip); + xchk_irele(sc, sc->ip); sc->ip = NULL; } if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) mnt_drop_write_file(sc->file); - if (sc->flags & XCHK_REAPING_DISABLED) - xchk_start_reaping(sc); if (sc->buf) { + if (sc->buf_cleanup) + sc->buf_cleanup(sc->buf); kvfree(sc->buf); + sc->buf_cleanup = NULL; sc->buf = NULL; } + + xchk_fsgates_disable(sc); return error; } @@ -191,25 +209,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_SB] = { /* superblock */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_superblock, .repair = xrep_superblock, }, [XFS_SCRUB_TYPE_AGF] = { /* agf */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agf, .repair = xrep_agf, }, [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agfl, .repair = xrep_agfl, }, [XFS_SCRUB_TYPE_AGI] = { /* agi */ .type = ST_PERAG, - .setup = xchk_setup_fs, + .setup = xchk_setup_agheader, .scrub = xchk_agi, .repair = xrep_agi, }, @@ -491,23 +509,20 @@ retry_op: /* Set up for the operation. */ error = sc->ops->setup(sc); + if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) + goto try_harder; + if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) + goto need_drain; if (error) goto out_teardown; /* Scrub for errors. */ error = sc->ops->scrub(sc); - if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { - /* - * Scrubbers return -EDEADLOCK to mean 'try harder'. - * Tear down everything we hold, then set up again with - * preparation for worst-case scenarios. - */ - error = xchk_teardown(sc, 0); - if (error) - goto out_sc; - sc->flags |= XCHK_TRY_HARDER; - goto retry_op; - } else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) + if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) + goto try_harder; + if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) + goto need_drain; + if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) goto out_teardown; xchk_update_health(sc); @@ -565,4 +580,21 @@ out: error = 0; } return error; +need_drain: + error = xchk_teardown(sc, 0); + if (error) + goto out_sc; + sc->flags |= XCHK_NEED_DRAIN; + goto retry_op; +try_harder: + /* + * Scrubbers return -EDEADLOCK to mean 'try harder'. Tear down + * everything we hold, then set up again with preparation for + * worst-case scenarios. + */ + error = xchk_teardown(sc, 0); + if (error) + goto out_sc; + sc->flags |= XCHK_TRY_HARDER; + goto retry_op; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index b4d391b4c938..b38e93830dde 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_SCRUB_H__ #define __XFS_SCRUB_SCRUB_H__ @@ -77,7 +77,17 @@ struct xfs_scrub { */ struct xfs_inode *ip; + /* Kernel memory buffer used by scrubbers; freed at teardown. */ void *buf; + + /* + * Clean up resources owned by whatever is in the buffer. Cleanup can + * be deferred with this hook as a means for scrub functions to pass + * data to repair functions. This function must not free the buffer + * itself. + */ + void (*buf_cleanup)(void *buf); + uint ilock_flags; /* See the XCHK/XREP state flags below. */ @@ -96,9 +106,18 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ #define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ -#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ +#define XCHK_FSGATES_DRAIN (1 << 2) /* defer ops draining enabled */ +#define XCHK_NEED_DRAIN (1 << 3) /* scrub needs to drain defer ops */ #define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ +/* + * The XCHK_FSGATES* flags reflect functionality in the main filesystem that + * are only enabled for this particular online fsck. When not in use, the + * features are gated off via dynamic code patching, which is why the state + * must be enabled during scrub setup and can only be torn down afterwards. + */ +#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN) + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); @@ -152,7 +171,7 @@ void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len); void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len); -void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, +void xchk_xref_is_only_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); @@ -162,6 +181,8 @@ void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, xfs_extlen_t len); void xchk_xref_is_not_shared(struct xfs_scrub *sc, xfs_agblock_t bno, xfs_extlen_t len); +void xchk_xref_is_not_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, + xfs_extlen_t len); #ifdef CONFIG_XFS_RT void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, xfs_extlen_t len); diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index c1c99ffe7408..38708fb9a5d7 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index b5f94676c37c..0a975439d2b6 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_fs.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 93ece6df02e3..b3894daeb86a 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> * * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. See xfs_trace.h for documentation of @@ -30,6 +30,9 @@ TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); +TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); + TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF); @@ -93,6 +96,12 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_OFLAG_WARNING, "warning" }, \ { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" } +#define XFS_SCRUB_STATE_STRINGS \ + { XCHK_TRY_HARDER, "try_harder" }, \ + { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ + { XCHK_NEED_DRAIN, "need_drain" }, \ + { XREP_ALREADY_FIXED, "already_fixed" } + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -139,6 +148,33 @@ DEFINE_SCRUB_EVENT(xchk_deadlock_retry); DEFINE_SCRUB_EVENT(xrep_attempt); DEFINE_SCRUB_EVENT(xrep_done); +DECLARE_EVENT_CLASS(xchk_fsgate_class, + TP_PROTO(struct xfs_scrub *sc, unsigned int fsgate_flags), + TP_ARGS(sc, fsgate_flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(unsigned int, fsgate_flags) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->fsgate_flags = fsgate_flags; + ), + TP_printk("dev %d:%d type %s fsgates '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_flags(__entry->fsgate_flags, "|", XFS_SCRUB_STATE_STRINGS)) +) + +#define DEFINE_SCRUB_FSHOOK_EVENT(name) \ +DEFINE_EVENT(xchk_fsgate_class, name, \ + TP_PROTO(struct xfs_scrub *sc, unsigned int fsgates_flags), \ + TP_ARGS(sc, fsgates_flags)) + +DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_enable); +DEFINE_SCRUB_FSHOOK_EVENT(xchk_fsgates_disable); + TRACE_EVENT(xchk_op_error, TP_PROTO(struct xfs_scrub *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int error, void *ret_ip), @@ -657,6 +693,38 @@ TRACE_EVENT(xchk_fscounters_within_range, __entry->old_value) ) +TRACE_EVENT(xchk_refcount_incorrect, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, + xfs_nlink_t seen), + TP_ARGS(pag, irec, seen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + __field(xfs_nlink_t, seen) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = irec->rc_domain; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + __entry->seen = seen; + ), + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u seen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), + __entry->startblock, + __entry->blockcount, + __entry->refcount, + __entry->seen) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/scrub/xfs_scrub.h b/fs/xfs/scrub/xfs_scrub.h index 2ceae614ade8..a39befa743ce 100644 --- a/fs/xfs/scrub/xfs_scrub.h +++ b/fs/xfs/scrub/xfs_scrub.h @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * Copyright (C) 2017 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> */ #ifndef __XFS_SCRUB_H__ #define __XFS_SCRUB_H__ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 6e2f0013380a..7551c3ec4ea5 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -24,6 +24,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_ag.h" struct kmem_cache *xfs_bui_cache; struct kmem_cache *xfs_bud_cache; @@ -363,6 +364,34 @@ xfs_bmap_update_create_done( return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; } +/* Take a passive ref to the AG containing the space we're mapping. */ +void +xfs_bmap_update_get_group( + struct xfs_mount *mp, + struct xfs_bmap_intent *bi) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock); + + /* + * Bump the intent count on behalf of the deferred rmap and refcount + * intent items that that we can queue when we finish this bmap work. + * This new intent item will bump the intent count before the bmap + * intent drops the intent count, ensuring that the intent count + * remains nonzero across the transaction roll. + */ + bi->bi_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after finishing mapping work. */ +static inline void +xfs_bmap_update_put_group( + struct xfs_bmap_intent *bi) +{ + xfs_perag_intent_put(bi->bi_pag); +} + /* Process a deferred rmap update. */ STATIC int xfs_bmap_update_finish_item( @@ -381,6 +410,8 @@ xfs_bmap_update_finish_item( ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; } + + xfs_bmap_update_put_group(bi); kmem_cache_free(xfs_bmap_intent_cache, bi); return error; } @@ -393,7 +424,7 @@ xfs_bmap_update_abort_intent( xfs_bui_release(BUI_ITEM(intent)); } -/* Cancel a deferred rmap update. */ +/* Cancel a deferred bmap update. */ STATIC void xfs_bmap_update_cancel_item( struct list_head *item) @@ -401,6 +432,8 @@ xfs_bmap_update_cancel_item( struct xfs_bmap_intent *bi; bi = container_of(item, struct xfs_bmap_intent, bi_list); + + xfs_bmap_update_put_group(bi); kmem_cache_free(xfs_bmap_intent_cache, bi); } @@ -509,10 +542,12 @@ xfs_bui_item_recover( fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_bmap_update_get_group(mp, &fake); error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, sizeof(*map)); + xfs_bmap_update_put_group(&fake); if (error) goto err_cancel; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a09dd2606479..fbb675563208 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -314,15 +314,13 @@ xfs_getbmap_report_one( if (isnullstartblock(got->br_startblock) || got->br_startblock == DELAYSTARTBLOCK) { /* - * Delalloc extents that start beyond EOF can occur due to - * speculative EOF allocation when the delalloc extent is larger - * than the largest freespace extent at conversion time. These - * extents cannot be converted by data writeback, so can exist - * here even if we are not supposed to be finding delalloc - * extents. + * Take the flush completion as being a point-in-time snapshot + * where there are no delalloc extents, and if any new ones + * have been created racily, just skip them as being 'after' + * the flush and so don't get reported. */ - if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip))) - ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0); + if (!(bmv->bmv_iflags & BMV_IF_DELALLOC)) + return 0; p->bmv_oflags |= BMV_OF_DELALLOC; p->bmv_block = -2; @@ -560,7 +558,9 @@ xfs_getbmap( if (!xfs_iext_next_extent(ifp, &icur, &got)) { xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST; + if (bmv->bmv_entries > 0) + out[bmv->bmv_entries - 1].bmv_oflags |= + BMV_OF_LAST; if (whichfork != XFS_ATTR_FORK && bno < end && !xfs_getbmap_full(bmv)) { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 54c774af6e1c..15d1e5a7c2d3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -286,8 +286,7 @@ xfs_buf_free_pages( if (bp->b_pages[i]) __free_page(bp->b_pages[i]); } - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += bp->b_page_count; + mm_account_reclaimed_pages(bp->b_page_count); if (bp->b_pages != bp->b_page_array) kmem_free(bp->b_pages); diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index ffa94102094d..43167f543afc 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -943,6 +943,16 @@ xlog_recover_buf_commit_pass2( if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { trace_xfs_log_recover_buf_skip(log, buf_f); xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); + + /* + * We're skipping replay of this buffer log item due to the log + * item LSN being behind the ondisk buffer. Verify the buffer + * contents since we aren't going to run the write verifier. + */ + if (bp->b_ops) { + bp->b_ops->verify_read(bp); + error = bp->b_error; + } goto out_release; } diff --git a/fs/xfs/xfs_dahash_test.c b/fs/xfs/xfs_dahash_test.c index 230651ab5ce4..0dab5941e080 100644 --- a/fs/xfs/xfs_dahash_test.c +++ b/fs/xfs/xfs_dahash_test.c @@ -9,6 +9,9 @@ #include "xfs_format.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_dir2_priv.h" #include "xfs_dahash_test.h" /* 4096 random bytes */ @@ -533,108 +536,109 @@ static struct dahash_test { uint16_t start; /* random 12 bit offset in buf */ uint16_t length; /* random 8 bit length of test */ xfs_dahash_t dahash; /* expected dahash result */ + xfs_dahash_t ascii_ci_dahash; /* expected ascii-ci dahash result */ } test[] __initdata = { - {0x0567, 0x0097, 0x96951389}, - {0x0869, 0x0055, 0x6455ab4f}, - {0x0c51, 0x00be, 0x8663afde}, - {0x044a, 0x00fc, 0x98fbe432}, - {0x0f29, 0x0079, 0x42371997}, - {0x08ba, 0x0052, 0x942be4f7}, - {0x01f2, 0x0013, 0x5262687e}, - {0x09e3, 0x00e2, 0x8ffb0908}, - {0x007c, 0x0051, 0xb3158491}, - {0x0854, 0x001f, 0x83bb20d9}, - {0x031b, 0x0008, 0x98970bdf}, - {0x0de7, 0x0027, 0xbfbf6f6c}, - {0x0f76, 0x0005, 0x906a7105}, - {0x092e, 0x00d0, 0x86631850}, - {0x0233, 0x0082, 0xdbdd914e}, - {0x04c9, 0x0075, 0x5a400a9e}, - {0x0b66, 0x0099, 0xae128b45}, - {0x000d, 0x00ed, 0xe61c216a}, - {0x0a31, 0x003d, 0xf69663b9}, - {0x00a3, 0x0052, 0x643c39ae}, - {0x0125, 0x00d5, 0x7c310b0d}, - {0x0105, 0x004a, 0x06a77e74}, - {0x0858, 0x008e, 0x265bc739}, - {0x045e, 0x0095, 0x13d6b192}, - {0x0dab, 0x003c, 0xc4498704}, - {0x00cd, 0x00b5, 0x802a4e2d}, - {0x069b, 0x008c, 0x5df60f71}, - {0x0454, 0x006c, 0x5f03d8bb}, - {0x040e, 0x0032, 0x0ce513b5}, - {0x0874, 0x00e2, 0x6a811fb3}, - {0x0521, 0x00b4, 0x93296833}, - {0x0ddc, 0x00cf, 0xf9305338}, - {0x0a70, 0x0023, 0x239549ea}, - {0x083e, 0x0027, 0x2d88ba97}, - {0x0241, 0x00a7, 0xfe0b32e1}, - {0x0dfc, 0x0096, 0x1a11e815}, - {0x023e, 0x001e, 0xebc9a1f3}, - {0x067e, 0x0066, 0xb1067f81}, - {0x09ea, 0x000e, 0x46fd7247}, - {0x036b, 0x008c, 0x1a39acdf}, - {0x078f, 0x0030, 0x964042ab}, - {0x085c, 0x008f, 0x1829edab}, - {0x02ec, 0x009f, 0x6aefa72d}, - {0x043b, 0x00ce, 0x65642ff5}, - {0x0a32, 0x00b8, 0xbd82759e}, - {0x0d3c, 0x0087, 0xf4d66d54}, - {0x09ec, 0x008a, 0x06bfa1ff}, - {0x0902, 0x0015, 0x755025d2}, - {0x08fe, 0x000e, 0xf690ce2d}, - {0x00fb, 0x00dc, 0xe55f1528}, - {0x0eaa, 0x003a, 0x0fe0a8d7}, - {0x05fb, 0x0006, 0x86281cfb}, - {0x0dd1, 0x00a7, 0x60ab51b4}, - {0x0005, 0x001b, 0xf51d969b}, - {0x077c, 0x00dd, 0xc2fed268}, - {0x0575, 0x00f5, 0x432c0b1a}, - {0x05be, 0x0088, 0x78baa04b}, - {0x0c89, 0x0068, 0xeda9e428}, - {0x0f5c, 0x0068, 0xec143c76}, - {0x06a8, 0x0009, 0xd72651ce}, - {0x060f, 0x008e, 0x765426cd}, - {0x07b1, 0x0047, 0x2cfcfa0c}, - {0x04f1, 0x0041, 0x55b172f9}, - {0x0e05, 0x00ac, 0x61efde93}, - {0x0bf7, 0x0097, 0x05b83eee}, - {0x04e9, 0x00f3, 0x9928223a}, - {0x023a, 0x0005, 0xdfada9bc}, - {0x0acb, 0x000e, 0x2217cecd}, - {0x0148, 0x0060, 0xbc3f7405}, - {0x0764, 0x0059, 0xcbc201b1}, - {0x021f, 0x0059, 0x5d6b2256}, - {0x0f1e, 0x006c, 0xdefeeb45}, - {0x071c, 0x00b9, 0xb9b59309}, - {0x0564, 0x0063, 0xae064271}, - {0x0b14, 0x0044, 0xdb867d9b}, - {0x0e5a, 0x0055, 0xff06b685}, - {0x015e, 0x00ba, 0x1115ccbc}, - {0x0379, 0x00e6, 0x5f4e58dd}, - {0x013b, 0x0067, 0x4897427e}, - {0x0e64, 0x0071, 0x7af2b7a4}, - {0x0a11, 0x0050, 0x92105726}, - {0x0109, 0x0055, 0xd0d000f9}, - {0x00aa, 0x0022, 0x815d229d}, - {0x09ac, 0x004f, 0x02f9d985}, - {0x0e1b, 0x00ce, 0x5cf92ab4}, - {0x08af, 0x00d8, 0x17ca72d1}, - {0x0e33, 0x000a, 0xda2dba6b}, - {0x0ee3, 0x006a, 0xb00048e5}, - {0x0648, 0x001a, 0x2364b8cb}, - {0x0315, 0x0085, 0x0596fd0d}, - {0x0fbb, 0x003e, 0x298230ca}, - {0x0422, 0x006a, 0x78ada4ab}, - {0x04ba, 0x0073, 0xced1fbc2}, - {0x007d, 0x0061, 0x4b7ff236}, - {0x070b, 0x00d0, 0x261cf0ae}, - {0x0c1a, 0x0035, 0x8be92ee2}, - {0x0af8, 0x0063, 0x824dcf03}, - {0x08f8, 0x006d, 0xd289710c}, - {0x021b, 0x00ee, 0x6ac1c41d}, - {0x05b5, 0x00da, 0x8e52f0e2}, + {0x0567, 0x0097, 0x96951389, 0xc153aa0d}, + {0x0869, 0x0055, 0x6455ab4f, 0xd07f69bf}, + {0x0c51, 0x00be, 0x8663afde, 0xf9add90c}, + {0x044a, 0x00fc, 0x98fbe432, 0xbf2abb76}, + {0x0f29, 0x0079, 0x42371997, 0x282588b3}, + {0x08ba, 0x0052, 0x942be4f7, 0x2e023547}, + {0x01f2, 0x0013, 0x5262687e, 0x5266287e}, + {0x09e3, 0x00e2, 0x8ffb0908, 0x1da892f3}, + {0x007c, 0x0051, 0xb3158491, 0xb67f9e63}, + {0x0854, 0x001f, 0x83bb20d9, 0x22bb21db}, + {0x031b, 0x0008, 0x98970bdf, 0x9cd70adf}, + {0x0de7, 0x0027, 0xbfbf6f6c, 0xae3f296c}, + {0x0f76, 0x0005, 0x906a7105, 0x906a7105}, + {0x092e, 0x00d0, 0x86631850, 0xa3f6ac04}, + {0x0233, 0x0082, 0xdbdd914e, 0x5d8c7aac}, + {0x04c9, 0x0075, 0x5a400a9e, 0x12f60711}, + {0x0b66, 0x0099, 0xae128b45, 0x7551310d}, + {0x000d, 0x00ed, 0xe61c216a, 0xc22d3c4c}, + {0x0a31, 0x003d, 0xf69663b9, 0x51960bf8}, + {0x00a3, 0x0052, 0x643c39ae, 0xa93c73a8}, + {0x0125, 0x00d5, 0x7c310b0d, 0xf221cbb3}, + {0x0105, 0x004a, 0x06a77e74, 0xa4ef4561}, + {0x0858, 0x008e, 0x265bc739, 0xd6c36d9b}, + {0x045e, 0x0095, 0x13d6b192, 0x5f5c1d62}, + {0x0dab, 0x003c, 0xc4498704, 0x10414654}, + {0x00cd, 0x00b5, 0x802a4e2d, 0xfbd17c9d}, + {0x069b, 0x008c, 0x5df60f71, 0x91ddca5f}, + {0x0454, 0x006c, 0x5f03d8bb, 0x5c59fce0}, + {0x040e, 0x0032, 0x0ce513b5, 0xa8cd99b1}, + {0x0874, 0x00e2, 0x6a811fb3, 0xca028316}, + {0x0521, 0x00b4, 0x93296833, 0x2c4d4880}, + {0x0ddc, 0x00cf, 0xf9305338, 0x2c94210d}, + {0x0a70, 0x0023, 0x239549ea, 0x22b561aa}, + {0x083e, 0x0027, 0x2d88ba97, 0x5cd8bb9d}, + {0x0241, 0x00a7, 0xfe0b32e1, 0x17b506b8}, + {0x0dfc, 0x0096, 0x1a11e815, 0xee4141bd}, + {0x023e, 0x001e, 0xebc9a1f3, 0x5689a1f3}, + {0x067e, 0x0066, 0xb1067f81, 0xd9952571}, + {0x09ea, 0x000e, 0x46fd7247, 0x42b57245}, + {0x036b, 0x008c, 0x1a39acdf, 0x58bf1586}, + {0x078f, 0x0030, 0x964042ab, 0xb04218b9}, + {0x085c, 0x008f, 0x1829edab, 0x9ceca89c}, + {0x02ec, 0x009f, 0x6aefa72d, 0x634cc2a7}, + {0x043b, 0x00ce, 0x65642ff5, 0x6c8a584e}, + {0x0a32, 0x00b8, 0xbd82759e, 0x0f96a34f}, + {0x0d3c, 0x0087, 0xf4d66d54, 0xb71ba5f4}, + {0x09ec, 0x008a, 0x06bfa1ff, 0x576ca80f}, + {0x0902, 0x0015, 0x755025d2, 0x517225c2}, + {0x08fe, 0x000e, 0xf690ce2d, 0xf690cf3d}, + {0x00fb, 0x00dc, 0xe55f1528, 0x707d7d92}, + {0x0eaa, 0x003a, 0x0fe0a8d7, 0x87638cc5}, + {0x05fb, 0x0006, 0x86281cfb, 0x86281cf9}, + {0x0dd1, 0x00a7, 0x60ab51b4, 0xe28ef00c}, + {0x0005, 0x001b, 0xf51d969b, 0xe71dd6d3}, + {0x077c, 0x00dd, 0xc2fed268, 0xdc30c555}, + {0x0575, 0x00f5, 0x432c0b1a, 0x81dd7d16}, + {0x05be, 0x0088, 0x78baa04b, 0xd69b433e}, + {0x0c89, 0x0068, 0xeda9e428, 0xe9b4fa0a}, + {0x0f5c, 0x0068, 0xec143c76, 0x9947067a}, + {0x06a8, 0x0009, 0xd72651ce, 0xd72651ee}, + {0x060f, 0x008e, 0x765426cd, 0x2099626f}, + {0x07b1, 0x0047, 0x2cfcfa0c, 0x1a4baa07}, + {0x04f1, 0x0041, 0x55b172f9, 0x15331a79}, + {0x0e05, 0x00ac, 0x61efde93, 0x320568cc}, + {0x0bf7, 0x0097, 0x05b83eee, 0xc72fb7a3}, + {0x04e9, 0x00f3, 0x9928223a, 0xe8c77de2}, + {0x023a, 0x0005, 0xdfada9bc, 0xdfadb9be}, + {0x0acb, 0x000e, 0x2217cecd, 0x0017d6cd}, + {0x0148, 0x0060, 0xbc3f7405, 0xf5fd6615}, + {0x0764, 0x0059, 0xcbc201b1, 0xbb089bf4}, + {0x021f, 0x0059, 0x5d6b2256, 0xa16a0a59}, + {0x0f1e, 0x006c, 0xdefeeb45, 0xfc34f9d6}, + {0x071c, 0x00b9, 0xb9b59309, 0xb645eae2}, + {0x0564, 0x0063, 0xae064271, 0x954dc6d1}, + {0x0b14, 0x0044, 0xdb867d9b, 0xdf432309}, + {0x0e5a, 0x0055, 0xff06b685, 0xa65ff257}, + {0x015e, 0x00ba, 0x1115ccbc, 0x11c365f4}, + {0x0379, 0x00e6, 0x5f4e58dd, 0x2d176d31}, + {0x013b, 0x0067, 0x4897427e, 0xc40532fe}, + {0x0e64, 0x0071, 0x7af2b7a4, 0x1fb7bf43}, + {0x0a11, 0x0050, 0x92105726, 0xb1185e51}, + {0x0109, 0x0055, 0xd0d000f9, 0x60a60bfd}, + {0x00aa, 0x0022, 0x815d229d, 0x215d379c}, + {0x09ac, 0x004f, 0x02f9d985, 0x10b90b20}, + {0x0e1b, 0x00ce, 0x5cf92ab4, 0x6a477573}, + {0x08af, 0x00d8, 0x17ca72d1, 0x385af156}, + {0x0e33, 0x000a, 0xda2dba6b, 0xda2dbb69}, + {0x0ee3, 0x006a, 0xb00048e5, 0xa9a2decc}, + {0x0648, 0x001a, 0x2364b8cb, 0x3364b1cb}, + {0x0315, 0x0085, 0x0596fd0d, 0xa651740f}, + {0x0fbb, 0x003e, 0x298230ca, 0x7fc617c7}, + {0x0422, 0x006a, 0x78ada4ab, 0xc576ae2a}, + {0x04ba, 0x0073, 0xced1fbc2, 0xaac8455b}, + {0x007d, 0x0061, 0x4b7ff236, 0x347d5739}, + {0x070b, 0x00d0, 0x261cf0ae, 0xc7fb1c10}, + {0x0c1a, 0x0035, 0x8be92ee2, 0x8be9b4e1}, + {0x0af8, 0x0063, 0x824dcf03, 0x53010388}, + {0x08f8, 0x006d, 0xd289710c, 0x30418edd}, + {0x021b, 0x00ee, 0x6ac1c41d, 0x2557e9a3}, + {0x05b5, 0x00da, 0x8e52f0e2, 0x98531012}, }; int __init @@ -644,12 +648,19 @@ xfs_dahash_test(void) unsigned int errors = 0; for (i = 0; i < ARRAY_SIZE(test); i++) { + struct xfs_name xname = { }; xfs_dahash_t hash; hash = xfs_da_hashname(test_buf + test[i].start, test[i].length); if (hash != test[i].dahash) errors++; + + xname.name = test_buf + test[i].start; + xname.len = test[i].length; + hash = xfs_ascii_ci_hashname(&xname); + if (hash != test[i].ascii_ci_dahash) + errors++; } if (errors) { diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 8fb90da89787..7f071757f278 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -798,7 +798,6 @@ xfs_qm_dqget_cache_insert( error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { /* Duplicate found! Caller must try again. */ - WARN_ON(error != -EEXIST); mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_dup(dqp); return error; diff --git a/fs/xfs/xfs_drain.c b/fs/xfs/xfs_drain.c new file mode 100644 index 000000000000..005a66be44a2 --- /dev/null +++ b/fs/xfs/xfs_drain.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_trace.h" + +/* + * Use a static key here to reduce the overhead of xfs_drain_rele. If the + * compiler supports jump labels, the static branch will be replaced by a nop + * sled when there are no xfs_drain_wait callers. Online fsck is currently + * the only caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +static DEFINE_STATIC_KEY_FALSE(xfs_drain_waiter_gate); + +void +xfs_drain_wait_disable(void) +{ + static_branch_dec(&xfs_drain_waiter_gate); +} + +void +xfs_drain_wait_enable(void) +{ + static_branch_inc(&xfs_drain_waiter_gate); +} + +void +xfs_defer_drain_init( + struct xfs_defer_drain *dr) +{ + atomic_set(&dr->dr_count, 0); + init_waitqueue_head(&dr->dr_waiters); +} + +void +xfs_defer_drain_free(struct xfs_defer_drain *dr) +{ + ASSERT(atomic_read(&dr->dr_count) == 0); +} + +/* Increase the pending intent count. */ +static inline void xfs_defer_drain_grab(struct xfs_defer_drain *dr) +{ + atomic_inc(&dr->dr_count); +} + +static inline bool has_waiters(struct wait_queue_head *wq_head) +{ + /* + * This memory barrier is paired with the one in set_current_state on + * the waiting side. + */ + smp_mb__after_atomic(); + return waitqueue_active(wq_head); +} + +/* Decrease the pending intent count, and wake any waiters, if appropriate. */ +static inline void xfs_defer_drain_rele(struct xfs_defer_drain *dr) +{ + if (atomic_dec_and_test(&dr->dr_count) && + static_branch_unlikely(&xfs_drain_waiter_gate) && + has_waiters(&dr->dr_waiters)) + wake_up(&dr->dr_waiters); +} + +/* Are there intents pending? */ +static inline bool xfs_defer_drain_busy(struct xfs_defer_drain *dr) +{ + return atomic_read(&dr->dr_count) > 0; +} + +/* + * Wait for the pending intent count for a drain to hit zero. + * + * Callers must not hold any locks that would prevent intents from being + * finished. + */ +static inline int xfs_defer_drain_wait(struct xfs_defer_drain *dr) +{ + return wait_event_killable(dr->dr_waiters, !xfs_defer_drain_busy(dr)); +} + +/* + * Get a passive reference to an AG and declare an intent to update its + * metadata. + */ +struct xfs_perag * +xfs_perag_intent_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + if (!pag) + return NULL; + + xfs_perag_intent_hold(pag); + return pag; +} + +/* + * Release our intent to update this AG's metadata, and then release our + * passive ref to the AG. + */ +void +xfs_perag_intent_put( + struct xfs_perag *pag) +{ + xfs_perag_intent_rele(pag); + xfs_perag_put(pag); +} + +/* + * Declare an intent to update AG metadata. Other threads that need exclusive + * access can decide to back off if they see declared intentions. + */ +void +xfs_perag_intent_hold( + struct xfs_perag *pag) +{ + trace_xfs_perag_intent_hold(pag, __return_address); + xfs_defer_drain_grab(&pag->pag_intents_drain); +} + +/* Release our intent to update this AG's metadata. */ +void +xfs_perag_intent_rele( + struct xfs_perag *pag) +{ + trace_xfs_perag_intent_rele(pag, __return_address); + xfs_defer_drain_rele(&pag->pag_intents_drain); +} + +/* + * Wait for the intent update count for this AG to hit zero. + * Callers must not hold any AG header buffers. + */ +int +xfs_perag_intent_drain( + struct xfs_perag *pag) +{ + trace_xfs_perag_wait_intents(pag, __return_address); + return xfs_defer_drain_wait(&pag->pag_intents_drain); +} + +/* Has anyone declared an intent to update this AG? */ +bool +xfs_perag_intent_busy( + struct xfs_perag *pag) +{ + return xfs_defer_drain_busy(&pag->pag_intents_drain); +} diff --git a/fs/xfs/xfs_drain.h b/fs/xfs/xfs_drain.h new file mode 100644 index 000000000000..50a5772a8296 --- /dev/null +++ b/fs/xfs/xfs_drain.h @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef XFS_DRAIN_H_ +#define XFS_DRAIN_H_ + +struct xfs_perag; + +#ifdef CONFIG_XFS_DRAIN_INTENTS +/* + * Passive drain mechanism. This data structure tracks a count of some items + * and contains a waitqueue for callers who would like to wake up when the + * count hits zero. + */ +struct xfs_defer_drain { + /* Number of items pending in some part of the filesystem. */ + atomic_t dr_count; + + /* Queue to wait for dri_count to go to zero */ + struct wait_queue_head dr_waiters; +}; + +void xfs_defer_drain_init(struct xfs_defer_drain *dr); +void xfs_defer_drain_free(struct xfs_defer_drain *dr); + +void xfs_drain_wait_disable(void); +void xfs_drain_wait_enable(void); + +/* + * Deferred Work Intent Drains + * =========================== + * + * When a writer thread executes a chain of log intent items, the AG header + * buffer locks will cycle during a transaction roll to get from one intent + * item to the next in a chain. Although scrub takes all AG header buffer + * locks, this isn't sufficient to guard against scrub checking an AG while + * that writer thread is in the middle of finishing a chain because there's no + * higher level locking primitive guarding allocation groups. + * + * When there's a collision, cross-referencing between data structures (e.g. + * rmapbt and refcountbt) yields false corruption events; if repair is running, + * this results in incorrect repairs, which is catastrophic. + * + * The solution is to the perag structure the count of active intents and make + * scrub wait until it has both AG header buffer locks and the intent counter + * reaches zero. It is therefore critical that deferred work threads hold the + * AGI or AGF buffers when decrementing the intent counter. + * + * Given a list of deferred work items, the deferred work manager will complete + * a work item and all the sub-items that the parent item creates before moving + * on to the next work item in the list. This is also true for all levels of + * sub-items. Writer threads are permitted to queue multiple work items + * targetting the same AG, so a deferred work item (such as a BUI) that creates + * sub-items (such as RUIs) must bump the intent counter and maintain it until + * the sub-items can themselves bump the intent counter. + * + * Therefore, the intent count tracks entire lifetimes of deferred work items. + * All functions that create work items must increment the intent counter as + * soon as the item is added to the transaction and cannot drop the counter + * until the item is finished or cancelled. + */ +struct xfs_perag *xfs_perag_intent_get(struct xfs_mount *mp, + xfs_agnumber_t agno); +void xfs_perag_intent_put(struct xfs_perag *pag); + +void xfs_perag_intent_hold(struct xfs_perag *pag); +void xfs_perag_intent_rele(struct xfs_perag *pag); + +int xfs_perag_intent_drain(struct xfs_perag *pag); +bool xfs_perag_intent_busy(struct xfs_perag *pag); +#else +struct xfs_defer_drain { /* empty */ }; + +#define xfs_defer_drain_free(dr) ((void)0) +#define xfs_defer_drain_init(dr) ((void)0) + +#define xfs_perag_intent_get(mp, agno) xfs_perag_get((mp), (agno)) +#define xfs_perag_intent_put(pag) xfs_perag_put(pag) + +static inline void xfs_perag_intent_hold(struct xfs_perag *pag) { } +static inline void xfs_perag_intent_rele(struct xfs_perag *pag) { } + +#endif /* CONFIG_XFS_DRAIN_INTENTS */ + +#endif /* XFS_DRAIN_H_ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 011b50469301..f9e36b810663 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -351,8 +351,6 @@ xfs_trans_free_extent( struct xfs_mount *mp = tp->t_mountp; struct xfs_extent *extp; uint next_extent; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, - xefi->xefi_startblock); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); int error; @@ -363,12 +361,13 @@ xfs_trans_free_extent( if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, - xefi->xefi_blockcount); + trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, + agbno, xefi->xefi_blockcount); - error = __xfs_free_extent(tp, xefi->xefi_startblock, + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + /* * Mark the transaction dirty, even on error. This ensures the * transaction is aborted, which: @@ -396,14 +395,13 @@ xfs_extent_free_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_mount *mp = priv; struct xfs_extent_free_item *ra; struct xfs_extent_free_item *rb; ra = container_of(a, struct xfs_extent_free_item, xefi_list); rb = container_of(b, struct xfs_extent_free_item, xefi_list); - return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) - - XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); + + return ra->xefi_pag->pag_agno - rb->xefi_pag->pag_agno; } /* Log a free extent to the intent item. */ @@ -462,6 +460,26 @@ xfs_extent_free_create_done( return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; } +/* Take a passive ref to the AG containing the space we're freeing. */ +void +xfs_extent_free_get_group( + struct xfs_mount *mp, + struct xfs_extent_free_item *xefi) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); + xefi->xefi_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after some freeing work. */ +static inline void +xfs_extent_free_put_group( + struct xfs_extent_free_item *xefi) +{ + xfs_perag_intent_put(xefi->xefi_pag); +} + /* Process a free extent. */ STATIC int xfs_extent_free_finish_item( @@ -476,6 +494,8 @@ xfs_extent_free_finish_item( xefi = container_of(item, struct xfs_extent_free_item, xefi_list); error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -496,6 +516,8 @@ xfs_extent_free_cancel_item( struct xfs_extent_free_item *xefi; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); } @@ -526,24 +548,21 @@ xfs_agfl_free_finish_item( struct xfs_extent *extp; struct xfs_buf *agbp; int error; - xfs_agnumber_t agno; xfs_agblock_t agbno; uint next_extent; - struct xfs_perag *pag; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); ASSERT(xefi->xefi_blockcount == 1); - agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock); agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); oinfo.oi_owner = xefi->xefi_owner; - trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, xefi->xefi_blockcount); + trace_xfs_agfl_free_deferred(mp, xefi->xefi_pag->pag_agno, 0, agbno, + xefi->xefi_blockcount); - pag = xfs_perag_get(mp, agno); - error = xfs_alloc_read_agf(pag, tp, 0, &agbp); + error = xfs_alloc_read_agf(xefi->xefi_pag, tp, 0, &agbp); if (!error) - error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo); - xfs_perag_put(pag); + error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno, + agbno, agbp, &oinfo); /* * Mark the transaction dirty, even on error. This ensures the @@ -562,6 +581,7 @@ xfs_agfl_free_finish_item( extp->ext_len = xefi->xefi_blockcount; efdp->efd_next_extent++; + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -632,7 +652,9 @@ xfs_efi_item_recover( fake.xefi_startblock = extp->ext_start; fake.xefi_blockcount = extp->ext_len; + xfs_extent_free_get_group(mp, &fake); error = xfs_trans_free_extent(tp, efdp, &fake); + xfs_extent_free_put_group(&fake); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 705250f9f90a..aede746541f8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1171,7 +1171,8 @@ xfs_file_open( { if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | + FMODE_DIO_PARALLEL_WRITE; return generic_file_open(inode, file); } @@ -1388,25 +1389,10 @@ xfs_filemap_pfn_mkwrite( return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } -static vm_fault_t -xfs_filemap_map_pages( - struct vm_fault *vmf, - pgoff_t start_pgoff, - pgoff_t end_pgoff) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - vm_fault_t ret; - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = filemap_map_pages(vmf, start_pgoff, end_pgoff); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - return ret; -} - static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, .huge_fault = xfs_filemap_huge_fault, - .map_pages = xfs_filemap_map_pages, + .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, .pfn_mkwrite = xfs_filemap_pfn_mkwrite, }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index c9a7e270a428..0f60e301eb1f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -435,18 +435,23 @@ xfs_iget_check_free_state( } /* Make all pending inactivation work start immediately. */ -static void +static bool xfs_inodegc_queue_all( struct xfs_mount *mp) { struct xfs_inodegc *gc; int cpu; + bool ret = false; for_each_online_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); - if (!llist_empty(&gc->list)) + if (!llist_empty(&gc->list)) { mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); + ret = true; + } } + + return ret; } /* @@ -767,7 +772,8 @@ again: return 0; out_error_or_again: - if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { + if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) && + error == -EAGAIN) { delay(1); goto again; } @@ -1855,6 +1861,8 @@ xfs_inodegc_worker( struct xfs_inode *ip, *n; unsigned int nofs_flag; + ASSERT(gc->cpu == smp_processor_id()); + WRITE_ONCE(gc->items, 0); if (!node) @@ -1908,24 +1916,41 @@ xfs_inodegc_flush( /* * Flush all the pending work and then disable the inode inactivation background - * workers and wait for them to stop. + * workers and wait for them to stop. Caller must hold sb->s_umount to + * coordinate changes in the inodegc_enabled state. */ void xfs_inodegc_stop( struct xfs_mount *mp) { + bool rerun; + if (!xfs_clear_inodegc_enabled(mp)) return; + /* + * Drain all pending inodegc work, including inodes that could be + * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan + * threads that sample the inodegc state just prior to us clearing it. + * The inodegc flag state prevents new threads from queuing more + * inodes, so we queue pending work items and flush the workqueue until + * all inodegc lists are empty. IOWs, we cannot use drain_workqueue + * here because it does not allow other unserialized mechanisms to + * reschedule inodegc work while this draining is in progress. + */ xfs_inodegc_queue_all(mp); - drain_workqueue(mp->m_inodegc_wq); + do { + flush_workqueue(mp->m_inodegc_wq); + rerun = xfs_inodegc_queue_all(mp); + } while (rerun); trace_xfs_inodegc_stop(mp, __return_address); } /* * Enable the inode inactivation background workers and schedule deferred inode - * inactivation work if there is any. + * inactivation work if there is any. Caller must hold sb->s_umount to + * coordinate changes in the inodegc_enabled state. */ void xfs_inodegc_start( @@ -2068,7 +2093,8 @@ xfs_inodegc_queue( queue_delay = 0; trace_xfs_inodegc_queue(mp, __return_address); - mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay); + mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, + queue_delay); put_cpu_ptr(gc); if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { @@ -2112,7 +2138,8 @@ xfs_inodegc_cpu_dead( if (xfs_is_inodegc_enabled(mp)) { trace_xfs_inodegc_queue(mp, __return_address); - mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0); + mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, + 0); } put_cpu_ptr(gc); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 6cd180721659..87910191a9dd 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -34,10 +34,13 @@ struct xfs_icwalk { /* * Flags for xfs_iget() */ -#define XFS_IGET_CREATE 0x1 -#define XFS_IGET_UNTRUSTED 0x2 -#define XFS_IGET_DONTCACHE 0x4 -#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */ +#define XFS_IGET_CREATE (1U << 0) +#define XFS_IGET_UNTRUSTED (1U << 1) +#define XFS_IGET_DONTCACHE (1U << 2) +/* don't read from disk or reinit */ +#define XFS_IGET_INCORE (1U << 3) +/* Return -EAGAIN immediately if the inode is unavailable. */ +#define XFS_IGET_NORETRY (1U << 4) int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 285885c308bd..18c8f168b153 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1006,8 +1006,9 @@ xfs_buffered_write_iomap_begin( if (eof) imap.br_startoff = end_fsb; /* fake hole until the end */ - /* We never need to allocate blocks for zeroing a hole. */ - if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + /* We never need to allocate blocks for zeroing or unsharing a hole. */ + if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) && + imap.br_startoff > offset_fsb) { xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); goto out_unlock; } diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c index 43005ce8bd48..2ddccb172fa0 100644 --- a/fs/xfs/xfs_iunlink_item.c +++ b/fs/xfs/xfs_iunlink_item.c @@ -168,9 +168,7 @@ xfs_iunlink_log_inode( iup->ip = ip; iup->next_agino = next_agino; iup->old_agino = ip->i_next_unlinked; - - atomic_inc(&pag->pag_ref); - iup->pag = pag; + iup->pag = xfs_perag_hold(pag); xfs_trans_add_item(tp, &iup->item); tp->t_flags |= XFS_TRANS_DIRTY; diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 21be93bf006d..b3275e8d47b6 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -667,11 +667,10 @@ xfs_iwalk_threaded( iwag->mp = mp; /* - * perag is being handed off to async work, so take another + * perag is being handed off to async work, so take a passive * reference for the async work to release. */ - atomic_inc(&pag->pag_ref); - iwag->pag = pag; + iwag->pag = xfs_perag_hold(pag); iwag->iwalk_fn = iwalk_fn; iwag->data = data; iwag->startino = startino; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index e88f18f85e4b..74dcb05069e8 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -80,6 +80,7 @@ typedef __u32 xfs_nlink_t; #include "xfs_cksum.h" #include "xfs_buf.h" #include "xfs_message.h" +#include "xfs_drain.h" #ifdef __BIG_ENDIAN #define XFS_NATIVE_HOST 1 diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f3269c0626f0..aaaf5ec13492 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -66,6 +66,9 @@ struct xfs_inodegc { /* approximate count of inodes in the list */ unsigned int items; unsigned int shrinker_hits; +#if defined(DEBUG) || defined(XFS_WARN) + unsigned int cpu; +#endif }; /* diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 48d771a76add..edd8587658d5 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -20,6 +20,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_ag.h" struct kmem_cache *xfs_cui_cache; struct kmem_cache *xfs_cud_cache; @@ -279,14 +280,13 @@ xfs_refcount_update_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_mount *mp = priv; struct xfs_refcount_intent *ra; struct xfs_refcount_intent *rb; ra = container_of(a, struct xfs_refcount_intent, ri_list); rb = container_of(b, struct xfs_refcount_intent, ri_list); - return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) - - XFS_FSB_TO_AGNO(mp, rb->ri_startblock); + + return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; } /* Set the phys extent flags for this reverse mapping. */ @@ -365,6 +365,26 @@ xfs_refcount_update_create_done( return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; } +/* Take a passive ref to the AG containing the space we're refcounting. */ +void +xfs_refcount_update_get_group( + struct xfs_mount *mp, + struct xfs_refcount_intent *ri) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, ri->ri_startblock); + ri->ri_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after finishing refcounting work. */ +static inline void +xfs_refcount_update_put_group( + struct xfs_refcount_intent *ri) +{ + xfs_perag_intent_put(ri->ri_pag); +} + /* Process a deferred refcount update. */ STATIC int xfs_refcount_update_finish_item( @@ -386,6 +406,8 @@ xfs_refcount_update_finish_item( ri->ri_type == XFS_REFCOUNT_DECREASE); return -EAGAIN; } + + xfs_refcount_update_put_group(ri); kmem_cache_free(xfs_refcount_intent_cache, ri); return error; } @@ -406,6 +428,8 @@ xfs_refcount_update_cancel_item( struct xfs_refcount_intent *ri; ri = container_of(item, struct xfs_refcount_intent, ri_list); + + xfs_refcount_update_put_group(ri); kmem_cache_free(xfs_refcount_intent_cache, ri); } @@ -520,9 +544,13 @@ xfs_cui_item_recover( fake.ri_startblock = pmap->pe_startblock; fake.ri_blockcount = pmap->pe_len; - if (!requeue_only) + + if (!requeue_only) { + xfs_refcount_update_get_group(mp, &fake); error = xfs_trans_log_finish_refcount_update(tp, cudp, &fake, &rcur); + xfs_refcount_update_put_group(&fake); + } if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &cuip->cui_format, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index a1619d67015f..520c7ebdfed8 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -20,6 +20,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_ag.h" struct kmem_cache *xfs_rui_cache; struct kmem_cache *xfs_rud_cache; @@ -320,14 +321,13 @@ xfs_rmap_update_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_mount *mp = priv; struct xfs_rmap_intent *ra; struct xfs_rmap_intent *rb; ra = container_of(a, struct xfs_rmap_intent, ri_list); rb = container_of(b, struct xfs_rmap_intent, ri_list); - return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) - - XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); + + return ra->ri_pag->pag_agno - rb->ri_pag->pag_agno; } /* Log rmap updates in the intent item. */ @@ -390,6 +390,26 @@ xfs_rmap_update_create_done( return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; } +/* Take a passive ref to the AG containing the space we're rmapping. */ +void +xfs_rmap_update_get_group( + struct xfs_mount *mp, + struct xfs_rmap_intent *ri) +{ + xfs_agnumber_t agno; + + agno = XFS_FSB_TO_AGNO(mp, ri->ri_bmap.br_startblock); + ri->ri_pag = xfs_perag_intent_get(mp, agno); +} + +/* Release a passive AG ref after finishing rmapping work. */ +static inline void +xfs_rmap_update_put_group( + struct xfs_rmap_intent *ri) +{ + xfs_perag_intent_put(ri->ri_pag); +} + /* Process a deferred rmap update. */ STATIC int xfs_rmap_update_finish_item( @@ -405,6 +425,8 @@ xfs_rmap_update_finish_item( error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, state); + + xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); return error; } @@ -425,6 +447,8 @@ xfs_rmap_update_cancel_item( struct xfs_rmap_intent *ri; ri = container_of(item, struct xfs_rmap_intent, ri_list); + + xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); } @@ -559,11 +583,13 @@ xfs_rui_item_recover( fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_rmap_update_get_group(mp, &fake); error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, sizeof(*map)); + xfs_rmap_update_put_group(&fake); if (error) goto abort_error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 4f814f9e12ab..7e706255f165 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1095,6 +1095,9 @@ xfs_inodegc_init_percpu( for_each_possible_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); +#if defined(DEBUG) || defined(XFS_WARN) + gc->cpu = cpu; +#endif init_llist_head(&gc->list); gc->items = 0; INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); @@ -1548,6 +1551,19 @@ xfs_fs_fill_super( #endif } + /* ASCII case insensitivity is undergoing deprecation. */ + if (xfs_has_asciici(mp)) { +#ifdef CONFIG_XFS_SUPPORT_ASCII_CI + xfs_warn_once(mp, + "Deprecated ASCII case-insensitivity feature (ascii-ci=1) will not be supported after September 2030."); +#else + xfs_warn(mp, + "Deprecated ASCII case-insensitivity feature (ascii-ci=1) not supported by kernel."); + error = -EINVAL; + goto out_free_sb; +#endif + } + /* Filesystem claims it needs repair, so refuse the mount. */ if (xfs_has_needsrepair(mp)) { xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair."); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 546a6cd96729..fade33735393 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -210,28 +210,10 @@ static struct ctl_table xfs_table[] = { {} }; -static struct ctl_table xfs_dir_table[] = { - { - .procname = "xfs", - .mode = 0555, - .child = xfs_table - }, - {} -}; - -static struct ctl_table xfs_root_table[] = { - { - .procname = "fs", - .mode = 0555, - .child = xfs_dir_table - }, - {} -}; - int xfs_sysctl_register(void) { - xfs_table_header = register_sysctl_table(xfs_root_table); + xfs_table_header = register_sysctl("fs/xfs", xfs_table); if (!xfs_table_header) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 9c0006c55fec..cd4ca5b1fcb0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -190,6 +190,7 @@ DEFINE_EVENT(xfs_perag_class, name, \ TP_ARGS(pag, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_hold); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_grab); DEFINE_PERAG_REF_EVENT(xfs_perag_grab_tag); @@ -2686,6 +2687,44 @@ DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); +DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, + void *item), + TP_ARGS(mp, dfp, item), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(void *, intent) + __field(void *, item) + __field(char, committed) + __field(int, nr) + ), + TP_fast_assign( + __entry->dev = mp ? mp->m_super->s_dev : 0; + __entry->type = dfp->dfp_type; + __entry->intent = dfp->dfp_intent; + __entry->item = item; + __entry->committed = dfp->dfp_done != NULL; + __entry->nr = dfp->dfp_count; + ), + TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->intent, + __entry->item, + __entry->committed, + __entry->nr) +) +#define DEFINE_DEFER_PENDING_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_defer_pending_item_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, \ + void *item), \ + TP_ARGS(mp, dfp, item)) + +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_add_item); +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_cancel_item); +DEFINE_DEFER_PENDING_ITEM_EVENT(xfs_defer_finish_item); + /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -4325,6 +4364,39 @@ TRACE_EVENT(xfs_force_shutdown, __entry->line_num) ); +#ifdef CONFIG_XFS_DRAIN_INTENTS +DECLARE_EVENT_CLASS(xfs_perag_intents_class, + TP_PROTO(struct xfs_perag *pag, void *caller_ip), + TP_ARGS(pag, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(long, nr_intents) + __field(void *, caller_ip) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->nr_intents = atomic_read(&pag->pag_intents_drain.dr_count); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno 0x%x intents %ld caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->nr_intents, + __entry->caller_ip) +); + +#define DEFINE_PERAG_INTENTS_EVENT(name) \ +DEFINE_EVENT(xfs_perag_intents_class, name, \ + TP_PROTO(struct xfs_perag *pag, void *caller_ip), \ + TP_ARGS(pag, caller_ip)) +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_hold); +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_intent_rele); +DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents); + +#endif /* CONFIG_XFS_DRAIN_INTENTS */ + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 7b9a0ed1b11f..43e5c219aaed 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -179,10 +179,6 @@ const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, -#ifdef CONFIG_XFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif NULL }; |