summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig6
-rw-r--r--fs/Kconfig.binfmt4
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/namei.c4
-rw-r--r--fs/afs/main.c6
-rw-r--r--fs/anon_inodes.c157
-rw-r--r--fs/binfmt_elf.c26
-rw-r--r--fs/binfmt_elf_fdpic.c27
-rw-r--r--fs/binfmt_misc.c4
-rw-r--r--fs/block_dev.c30
-rw-r--r--fs/btrfs/Makefile19
-rw-r--r--fs/btrfs/backref.c19
-rw-r--r--fs/btrfs/backref.h9
-rw-r--r--fs/btrfs/block-group.c191
-rw-r--r--fs/btrfs/block-group.h21
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/check-integrity.c10
-rw-r--r--fs/btrfs/compression.c10
-rw-r--r--fs/btrfs/ctree.c9
-rw-r--r--fs/btrfs/ctree.h22
-rw-r--r--fs/btrfs/delalloc-space.c29
-rw-r--r--fs/btrfs/delayed-inode.c2
-rw-r--r--fs/btrfs/delayed-ref.c79
-rw-r--r--fs/btrfs/delayed-ref.h28
-rw-r--r--fs/btrfs/dev-replace.c186
-rw-r--r--fs/btrfs/dev-replace.h3
-rw-r--r--fs/btrfs/discard.c6
-rw-r--r--fs/btrfs/disk-io.c186
-rw-r--r--fs/btrfs/disk-io.h6
-rw-r--r--fs/btrfs/extent-tree.c432
-rw-r--r--fs/btrfs/extent_io.c791
-rw-r--r--fs/btrfs/extent_io.h17
-rw-r--r--fs/btrfs/extent_map.c18
-rw-r--r--fs/btrfs/file-item.c22
-rw-r--r--fs/btrfs/file.c65
-rw-r--r--fs/btrfs/free-space-cache.c123
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/free-space-tree.c10
-rw-r--r--fs/btrfs/inode.c336
-rw-r--r--fs/btrfs/ioctl.c29
-rw-r--r--fs/btrfs/ordered-data.c224
-rw-r--r--fs/btrfs/ordered-data.h57
-rw-r--r--fs/btrfs/raid56.c10
-rw-r--r--fs/btrfs/ref-verify.c43
-rw-r--r--fs/btrfs/reflink.c5
-rw-r--r--fs/btrfs/relocation.c99
-rw-r--r--fs/btrfs/scrub.c145
-rw-r--r--fs/btrfs/send.c46
-rw-r--r--fs/btrfs/space-info.c365
-rw-r--r--fs/btrfs/space-info.h25
-rw-r--r--fs/btrfs/subpage.c278
-rw-r--r--fs/btrfs/subpage.h91
-rw-r--r--fs/btrfs/super.c8
-rw-r--r--fs/btrfs/sysfs.c2
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/transaction.c160
-rw-r--r--fs/btrfs/transaction.h5
-rw-r--r--fs/btrfs/tree-log.c288
-rw-r--r--fs/btrfs/volumes.c370
-rw-r--r--fs/btrfs/volumes.h19
-rw-r--r--fs/btrfs/zoned.c873
-rw-r--r--fs/btrfs/zoned.h157
-rw-r--r--fs/cachefiles/rdwr.c2
-rw-r--r--fs/ceph/addr.c2
-rw-r--r--fs/ceph/caps.c70
-rw-r--r--fs/ceph/inode.c61
-rw-r--r--fs/ceph/mds_client.c34
-rw-r--r--fs/ceph/snap.c10
-rw-r--r--fs/ceph/super.h40
-rw-r--r--fs/cifs/cifs_debug.c4
-rw-r--r--fs/cifs/cifs_dfs_ref.c12
-rw-r--r--fs/cifs/cifsfs.c4
-rw-r--r--fs/cifs/cifsproto.h6
-rw-r--r--fs/cifs/connect.c45
-rw-r--r--fs/cifs/dfs_cache.c8
-rw-r--r--fs/cifs/dir.c22
-rw-r--r--fs/cifs/fs_context.c61
-rw-r--r--fs/cifs/fs_context.h1
-rw-r--r--fs/cifs/smb2pdu.h2
-rw-r--r--fs/cifs/transport.c22
-rw-r--r--fs/compat_binfmt_elf.c7
-rw-r--r--fs/cramfs/inode.c18
-rw-r--r--fs/dax.c5
-rw-r--r--fs/dcache.c88
-rw-r--r--fs/dcookies.c356
-rw-r--r--fs/direct-io.c4
-rw-r--r--fs/ecryptfs/inode.c11
-rw-r--r--fs/erofs/namei.c4
-rw-r--r--fs/erofs/super.c4
-rw-r--r--fs/erofs/xattr.c10
-rw-r--r--fs/erofs/zmap.c10
-rw-r--r--fs/eventpoll.c4
-rw-r--r--fs/exec.c4
-rw-r--r--fs/exfat/balloc.c4
-rw-r--r--fs/exfat/exfat_fs.h2
-rw-r--r--fs/exfat/exfat_raw.h4
-rw-r--r--fs/exfat/fatent.c43
-rw-r--r--fs/exfat/file.c2
-rw-r--r--fs/exfat/super.c31
-rw-r--r--fs/ext4/fast_commit.c4
-rw-r--r--fs/ext4/file.c5
-rw-r--r--fs/ext4/fsync.c2
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c20
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/f2fs/Kconfig20
-rw-r--r--fs/f2fs/Makefile1
-rw-r--r--fs/f2fs/acl.c24
-rw-r--r--fs/f2fs/checkpoint.c177
-rw-r--r--fs/f2fs/compress.c195
-rw-r--r--fs/f2fs/data.c444
-rw-r--r--fs/f2fs/debug.c12
-rw-r--r--fs/f2fs/f2fs.h106
-rw-r--r--fs/f2fs/file.c70
-rw-r--r--fs/f2fs/gc.c8
-rw-r--r--fs/f2fs/inline.c4
-rw-r--r--fs/f2fs/namei.c8
-rw-r--r--fs/f2fs/node.c4
-rw-r--r--fs/f2fs/segment.c19
-rw-r--r--fs/f2fs/segment.h4
-rw-r--r--fs/f2fs/super.c202
-rw-r--r--fs/f2fs/sysfs.c132
-rw-r--r--fs/f2fs/trace.c165
-rw-r--r--fs/f2fs/trace.h43
-rw-r--r--fs/f2fs/xattr.c23
-rw-r--r--fs/fat/file.c2
-rw-r--r--fs/fat/misc.c23
-rw-r--r--fs/fcntl.c19
-rw-r--r--fs/file.c36
-rw-r--r--fs/fs-writeback.c132
-rw-r--r--fs/gfs2/file.c11
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hostfs/hostfs_kern.c10
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/inode.c42
-rw-r--r--fs/internal.h10
-rw-r--r--fs/io-wq.c31
-rw-r--r--fs/io-wq.h14
-rw-r--r--fs/io_uring.c2669
-rw-r--r--fs/iomap/direct-io.c78
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/isofs/inode.c9
-rw-r--r--fs/isofs/namei.c1
-rw-r--r--fs/jbd2/checkpoint.c2
-rw-r--r--fs/jbd2/commit.c4
-rw-r--r--fs/jbd2/recovery.c2
-rw-r--r--fs/jffs2/compr_rtime.c3
-rw-r--r--fs/jffs2/summary.c3
-rw-r--r--fs/jfs/jfs_dmap.c2
-rw-r--r--fs/jfs/jfs_filsys.h1
-rw-r--r--fs/jfs/jfs_mount.c10
-rw-r--r--fs/jfs/jfs_txnmgr.c35
-rw-r--r--fs/kernfs/file.c65
-rw-r--r--fs/libfs.c15
-rw-r--r--fs/lockd/svc4proc.c24
-rw-r--r--fs/lockd/svcproc.c24
-rw-r--r--fs/namei.c92
-rw-r--r--fs/nfs/blocklayout/blocklayout.c5
-rw-r--r--fs/nfs/callback_xdr.c2
-rw-r--r--fs/nfs/export.c18
-rw-r--r--fs/nfs/nfs4file.c4
-rw-r--r--fs/nfs/pnfs.c69
-rw-r--r--fs/nfs/super.c12
-rw-r--r--fs/nfs_common/Makefile2
-rw-r--r--fs/nfs_common/nfs_ssc.c2
-rw-r--r--fs/nfs_common/nfsacl.c52
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/blocklayout.c2
-rw-r--r--fs/nfsd/export.c68
-rw-r--r--fs/nfsd/export.h15
-rw-r--r--fs/nfsd/netns.h23
-rw-r--r--fs/nfsd/nfs2acl.c67
-rw-r--r--fs/nfsd/nfs3acl.c45
-rw-r--r--fs/nfsd/nfs3proc.c93
-rw-r--r--fs/nfsd/nfs3xdr.c589
-rw-r--r--fs/nfsd/nfs4proc.c12
-rw-r--r--fs/nfsd/nfs4state.c124
-rw-r--r--fs/nfsd/nfscache.c52
-rw-r--r--fs/nfsd/nfsctl.c22
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfsfh.c4
-rw-r--r--fs/nfsd/nfsfh.h5
-rw-r--r--fs/nfsd/nfsproc.c92
-rw-r--r--fs/nfsd/nfssvc.c34
-rw-r--r--fs/nfsd/nfsxdr.c350
-rw-r--r--fs/nfsd/state.h3
-rw-r--r--fs/nfsd/stats.c114
-rw-r--r--fs/nfsd/stats.h96
-rw-r--r--fs/nfsd/vfs.c4
-rw-r--r--fs/nfsd/xdr.h12
-rw-r--r--fs/nfsd/xdr3.h20
-rw-r--r--fs/nilfs2/file.c1
-rw-r--r--fs/nilfs2/segbuf.c4
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/notify/fanotify/fanotify_user.c2
-rw-r--r--fs/notify/group.c25
-rw-r--r--fs/notify/inotify/inotify_user.c4
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/open.c6
-rw-r--r--fs/orangefs/file.c5
-rw-r--r--fs/overlayfs/copy_up.c15
-rw-r--r--fs/overlayfs/dir.c2
-rw-r--r--fs/overlayfs/file.c5
-rw-r--r--fs/overlayfs/inode.c2
-rw-r--r--fs/overlayfs/overlayfs.h1
-rw-r--r--fs/overlayfs/ovl_entry.h2
-rw-r--r--fs/overlayfs/readdir.c28
-rw-r--r--fs/overlayfs/super.c38
-rw-r--r--fs/overlayfs/util.c27
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/proc/proc_sysctl.c7
-rw-r--r--fs/proc/self.c2
-rw-r--r--fs/proc/task_mmu.c9
-rw-r--r--fs/proc/thread_self.c7
-rw-r--r--fs/pstore/platform.c4
-rw-r--r--fs/pstore/zone.c2
-rw-r--r--fs/quota/quota_v2.c11
-rw-r--r--fs/read_write.c19
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/seq_file.c5
-rw-r--r--fs/splice.c53
-rw-r--r--fs/squashfs/block.c8
-rw-r--r--fs/squashfs/export.c41
-rw-r--r--fs/squashfs/id.c40
-rw-r--r--fs/squashfs/squashfs_fs_sb.h1
-rw-r--r--fs/squashfs/super.c6
-rw-r--r--fs/squashfs/xattr.h10
-rw-r--r--fs/squashfs/xattr_id.c66
-rw-r--r--fs/statfs.c5
-rw-r--r--fs/super.c15
-rw-r--r--fs/sysfs/file.c11
-rw-r--r--fs/ubifs/auth.c2
-rw-r--r--fs/ubifs/journal.c2
-rw-r--r--fs/ubifs/replay.c4
-rw-r--r--fs/ubifs/super.c4
-rw-r--r--fs/ubifs/xattr.c2
-rw-r--r--fs/udf/inode.c9
-rw-r--r--fs/udf/super.c16
-rw-r--r--fs/userfaultfd.c19
-rw-r--r--fs/verity/Makefile1
-rw-r--r--fs/verity/fsverity_private.h13
-rw-r--r--fs/verity/open.c133
-rw-r--r--fs/verity/read_metadata.c195
-rw-r--r--fs/verity/signature.c20
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c50
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_attr.c22
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c315
-rw-r--r--fs/xfs/libxfs/xfs_btree.c33
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c2
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h6
-rw-r--r--fs/xfs/libxfs/xfs_fs.h1
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c27
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h63
-rw-r--r--fs/xfs/libxfs/xfs_sb.c2
-rw-r--r--fs/xfs/scrub/common.c4
-rw-r--r--fs/xfs/xfs_bmap_item.c10
-rw-r--r--fs/xfs/xfs_bmap_util.c81
-rw-r--r--fs/xfs/xfs_buf.c30
-rw-r--r--fs/xfs/xfs_buf.h11
-rw-r--r--fs/xfs/xfs_dquot.c47
-rw-r--r--fs/xfs/xfs_error.c6
-rw-r--r--fs/xfs/xfs_file.c439
-rw-r--r--fs/xfs/xfs_fsops.c32
-rw-r--r--fs/xfs/xfs_fsops.h4
-rw-r--r--fs/xfs/xfs_globals.c7
-rw-r--r--fs/xfs/xfs_icache.c438
-rw-r--r--fs/xfs/xfs_icache.h24
-rw-r--r--fs/xfs/xfs_inode.c134
-rw-r--r--fs/xfs/xfs_ioctl.c75
-rw-r--r--fs/xfs/xfs_iomap.c82
-rw-r--r--fs/xfs/xfs_iops.c28
-rw-r--r--fs/xfs/xfs_iwalk.c5
-rw-r--r--fs/xfs/xfs_linux.h3
-rw-r--r--fs/xfs/xfs_log.c142
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_mount.c43
-rw-r--r--fs/xfs/xfs_mount.h10
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_pwork.c25
-rw-r--r--fs/xfs/xfs_pwork.h4
-rw-r--r--fs/xfs/xfs_qm.c116
-rw-r--r--fs/xfs/xfs_quota.h49
-rw-r--r--fs/xfs/xfs_reflink.c103
-rw-r--r--fs/xfs/xfs_rtalloc.c5
-rw-r--r--fs/xfs/xfs_super.c84
-rw-r--r--fs/xfs/xfs_super.h6
-rw-r--r--fs/xfs/xfs_symlink.c15
-rw-r--r--fs/xfs/xfs_sysctl.c15
-rw-r--r--fs/xfs/xfs_sysctl.h3
-rw-r--r--fs/xfs/xfs_trace.c1
-rw-r--r--fs/xfs/xfs_trace.h72
-rw-r--r--fs/xfs/xfs_trans.c195
-rw-r--r--fs/xfs/xfs_trans.h13
-rw-r--r--fs/xfs/xfs_trans_dquot.c71
-rw-r--r--fs/zonefs/Makefile2
-rw-r--r--fs/zonefs/super.c27
-rw-r--r--fs/zonefs/trace.h104
302 files changed, 12029 insertions, 6318 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index aa4c12282301..462253ae483a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -203,7 +203,7 @@ config TMPFS_XATTR
config TMPFS_INODE64
bool "Use 64-bit ino_t by default in tmpfs"
- depends on TMPFS && 64BIT
+ depends on TMPFS && 64BIT && !(S390 || ALPHA)
default n
help
tmpfs has historically used only inode numbers as wide as an unsigned
@@ -333,6 +333,10 @@ config NFS_COMMON
depends on NFSD || NFS_FS || LOCKD
default y
+config NFS_V4_2_SSC_HELPER
+ tristate
+ default y if NFS_V4=y || NFS_FS=y
+
source "net/sunrpc/Kconfig"
source "fs/ceph/Kconfig"
source "fs/cifs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 885da6d983b4..c6f1c8c1934e 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -29,7 +29,7 @@ config BINFMT_ELF
latest version).
config COMPAT_BINFMT_ELF
- bool
+ def_bool y
depends on COMPAT && BINFMT_ELF
select ELFCORE
@@ -45,7 +45,7 @@ config ARCH_USE_GNU_PROPERTY
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
- depends on (ARM || (SUPERH && !MMU) || C6X)
+ depends on (ARM || (SUPERH && !MMU))
select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
diff --git a/fs/Makefile b/fs/Makefile
index 999d1a23f036..3215fe205256 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -64,7 +64,6 @@ obj-$(CONFIG_SYSFS) += sysfs/
obj-$(CONFIG_CONFIGFS_FS) += configfs/
obj-y += devpts/
-obj-$(CONFIG_PROFILING) += dcookies.o
obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 9ad22befce28..bcab18956b4f 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -463,8 +463,10 @@ affs_xrename(struct inode *old_dir, struct dentry *old_dentry,
return -EIO;
bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino);
- if (!bh_new)
+ if (!bh_new) {
+ affs_brelse(bh_old);
return -EIO;
+ }
/* Remove old header from its parent directory. */
affs_lock_dir(old_dir);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index accdd8970e7c..b2975256dadb 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -193,7 +193,7 @@ static int __init afs_init(void)
goto error_cache;
#endif
- ret = register_pernet_subsys(&afs_net_ops);
+ ret = register_pernet_device(&afs_net_ops);
if (ret < 0)
goto error_net;
@@ -213,7 +213,7 @@ static int __init afs_init(void)
error_proc:
afs_fs_exit();
error_fs:
- unregister_pernet_subsys(&afs_net_ops);
+ unregister_pernet_device(&afs_net_ops);
error_net:
#ifdef CONFIG_AFS_FSCACHE
fscache_unregister_netfs(&afs_cache_netfs);
@@ -244,7 +244,7 @@ static void __exit afs_exit(void)
proc_remove(afs_proc_symlink);
afs_fs_exit();
- unregister_pernet_subsys(&afs_net_ops);
+ unregister_pernet_device(&afs_net_ops);
#ifdef CONFIG_AFS_FSCACHE
fscache_unregister_netfs(&afs_cache_netfs);
#endif
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 89714308c25b..a280156138ed 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -55,61 +55,79 @@ static struct file_system_type anon_inode_fs_type = {
.kill_sb = kill_anon_super,
};
-/**
- * anon_inode_getfile - creates a new file instance by hooking it up to an
- * anonymous inode, and a dentry that describe the "class"
- * of the file
- *
- * @name: [in] name of the "class" of the new file
- * @fops: [in] file operations for the new file
- * @priv: [in] private data for the new file (will be file's private_data)
- * @flags: [in] flags
- *
- * Creates a new file by hooking it on a single inode. This is useful for files
- * that do not need to have a full-fledged inode in order to operate correctly.
- * All the files created with anon_inode_getfile() will share a single inode,
- * hence saving memory and avoiding code duplication for the file/inode/dentry
- * setup. Returns the newly created file* or an error pointer.
- */
-struct file *anon_inode_getfile(const char *name,
- const struct file_operations *fops,
- void *priv, int flags)
+static struct inode *anon_inode_make_secure_inode(
+ const char *name,
+ const struct inode *context_inode)
{
- struct file *file;
+ struct inode *inode;
+ const struct qstr qname = QSTR_INIT(name, strlen(name));
+ int error;
+
+ inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+ if (IS_ERR(inode))
+ return inode;
+ inode->i_flags &= ~S_PRIVATE;
+ error = security_inode_init_security_anon(inode, &qname, context_inode);
+ if (error) {
+ iput(inode);
+ return ERR_PTR(error);
+ }
+ return inode;
+}
- if (IS_ERR(anon_inode_inode))
- return ERR_PTR(-ENODEV);
+static struct file *__anon_inode_getfile(const char *name,
+ const struct file_operations *fops,
+ void *priv, int flags,
+ const struct inode *context_inode,
+ bool secure)
+{
+ struct inode *inode;
+ struct file *file;
if (fops->owner && !try_module_get(fops->owner))
return ERR_PTR(-ENOENT);
- /*
- * We know the anon_inode inode count is always greater than zero,
- * so ihold() is safe.
- */
- ihold(anon_inode_inode);
- file = alloc_file_pseudo(anon_inode_inode, anon_inode_mnt, name,
+ if (secure) {
+ inode = anon_inode_make_secure_inode(name, context_inode);
+ if (IS_ERR(inode)) {
+ file = ERR_CAST(inode);
+ goto err;
+ }
+ } else {
+ inode = anon_inode_inode;
+ if (IS_ERR(inode)) {
+ file = ERR_PTR(-ENODEV);
+ goto err;
+ }
+ /*
+ * We know the anon_inode inode count is always
+ * greater than zero, so ihold() is safe.
+ */
+ ihold(inode);
+ }
+
+ file = alloc_file_pseudo(inode, anon_inode_mnt, name,
flags & (O_ACCMODE | O_NONBLOCK), fops);
if (IS_ERR(file))
- goto err;
+ goto err_iput;
- file->f_mapping = anon_inode_inode->i_mapping;
+ file->f_mapping = inode->i_mapping;
file->private_data = priv;
return file;
+err_iput:
+ iput(inode);
err:
- iput(anon_inode_inode);
module_put(fops->owner);
return file;
}
-EXPORT_SYMBOL_GPL(anon_inode_getfile);
/**
- * anon_inode_getfd - creates a new file instance by hooking it up to an
- * anonymous inode, and a dentry that describe the "class"
- * of the file
+ * anon_inode_getfile - creates a new file instance by hooking it up to an
+ * anonymous inode, and a dentry that describe the "class"
+ * of the file
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
@@ -118,12 +136,23 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile);
*
* Creates a new file by hooking it on a single inode. This is useful for files
* that do not need to have a full-fledged inode in order to operate correctly.
- * All the files created with anon_inode_getfd() will share a single inode,
+ * All the files created with anon_inode_getfile() will share a single inode,
* hence saving memory and avoiding code duplication for the file/inode/dentry
- * setup. Returns new descriptor or an error code.
+ * setup. Returns the newly created file* or an error pointer.
*/
-int anon_inode_getfd(const char *name, const struct file_operations *fops,
- void *priv, int flags)
+struct file *anon_inode_getfile(const char *name,
+ const struct file_operations *fops,
+ void *priv, int flags)
+{
+ return __anon_inode_getfile(name, fops, priv, flags, NULL, false);
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfile);
+
+static int __anon_inode_getfd(const char *name,
+ const struct file_operations *fops,
+ void *priv, int flags,
+ const struct inode *context_inode,
+ bool secure)
{
int error, fd;
struct file *file;
@@ -133,7 +162,8 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
return error;
fd = error;
- file = anon_inode_getfile(name, fops, priv, flags);
+ file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
+ secure);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_put_unused_fd;
@@ -146,8 +176,55 @@ err_put_unused_fd:
put_unused_fd(fd);
return error;
}
+
+/**
+ * anon_inode_getfd - creates a new file instance by hooking it up to
+ * an anonymous inode and a dentry that describe
+ * the "class" of the file
+ *
+ * @name: [in] name of the "class" of the new file
+ * @fops: [in] file operations for the new file
+ * @priv: [in] private data for the new file (will be file's private_data)
+ * @flags: [in] flags
+ *
+ * Creates a new file by hooking it on a single inode. This is
+ * useful for files that do not need to have a full-fledged inode in
+ * order to operate correctly. All the files created with
+ * anon_inode_getfd() will use the same singleton inode, reducing
+ * memory use and avoiding code duplication for the file/inode/dentry
+ * setup. Returns a newly created file descriptor or an error code.
+ */
+int anon_inode_getfd(const char *name, const struct file_operations *fops,
+ void *priv, int flags)
+{
+ return __anon_inode_getfd(name, fops, priv, flags, NULL, false);
+}
EXPORT_SYMBOL_GPL(anon_inode_getfd);
+/**
+ * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new
+ * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls
+ * the inode_init_security_anon() LSM hook. This allows the inode to have its
+ * own security context and for a LSM to reject creation of the inode.
+ *
+ * @name: [in] name of the "class" of the new file
+ * @fops: [in] file operations for the new file
+ * @priv: [in] private data for the new file (will be file's private_data)
+ * @flags: [in] flags
+ * @context_inode:
+ * [in] the logical relationship with the new inode (optional)
+ *
+ * The LSM may use @context_inode in inode_init_security_anon(), but a
+ * reference to it is not held.
+ */
+int anon_inode_getfd_secure(const char *name, const struct file_operations *fops,
+ void *priv, int flags,
+ const struct inode *context_inode)
+{
+ return __anon_inode_getfd(name, fops, priv, flags, context_inode, true);
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfd_secure);
+
static int __init anon_inode_init(void)
{
anon_inode_mnt = kern_mount(&anon_inode_fs_type);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 950bc177238a..b12ba98ae9f5 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -186,6 +186,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
unsigned char k_rand_bytes[16];
int items;
elf_addr_t *elf_info;
+ elf_addr_t flags = 0;
int ei_index;
const struct cred *cred = current_cred();
struct vm_area_struct *vma;
@@ -260,7 +261,9 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
NEW_AUX_ENT(AT_BASE, interp_load_addr);
- NEW_AUX_ENT(AT_FLAGS, 0);
+ if (bprm->interp_flags & BINPRM_FLAGS_PRESERVE_ARGV0)
+ flags |= AT_FLAGS_PRESERVE_ARGV0;
+ NEW_AUX_ENT(AT_FLAGS, flags);
NEW_AUX_ENT(AT_ENTRY, e_entry);
NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid));
NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid));
@@ -1495,7 +1498,7 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
* fill up all the fields in prstatus from the given task struct, except
* registers which need to be filled up separately.
*/
-static void fill_prstatus(struct elf_prstatus *prstatus,
+static void fill_prstatus(struct elf_prstatus_common *prstatus,
struct task_struct *p, long signr)
{
prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
@@ -1717,11 +1720,11 @@ static void do_thread_regset_writeback(struct task_struct *task,
}
#ifndef PRSTATUS_SIZE
-#define PRSTATUS_SIZE(S, R) sizeof(S)
+#define PRSTATUS_SIZE sizeof(struct elf_prstatus)
#endif
#ifndef SET_PR_FPVALID
-#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V))
+#define SET_PR_FPVALID(S) ((S)->pr_fpvalid = 1)
#endif
static int fill_thread_core_info(struct elf_thread_core_info *t,
@@ -1729,7 +1732,6 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
long signr, size_t *total)
{
unsigned int i;
- int regset0_size;
/*
* NT_PRSTATUS is the one special case, because the regset data
@@ -1737,14 +1739,12 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
* than being the whole note contents. We fill the reset in here.
* We assume that regset 0 is NT_PRSTATUS.
*/
- fill_prstatus(&t->prstatus, t->task, signr);
- regset0_size = regset_get(t->task, &view->regsets[0],
+ fill_prstatus(&t->prstatus.common, t->task, signr);
+ regset_get(t->task, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- if (regset0_size < 0)
- return 0;
fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
- PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus);
+ PRSTATUS_SIZE, &t->prstatus);
*total += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1772,7 +1772,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
continue;
if (is_fpreg)
- SET_PR_FPVALID(&t->prstatus, 1, regset0_size);
+ SET_PR_FPVALID(&t->prstatus);
fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX",
note_type, ret, data);
@@ -1961,7 +1961,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
struct task_struct *p = t->thread;
t->num_notes = 0;
- fill_prstatus(&t->prstatus, p, signr);
+ fill_prstatus(&t->prstatus.common, p, signr);
elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
@@ -2040,7 +2040,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(info->prstatus, current, siginfo->si_signo);
+ fill_prstatus(&info->prstatus->common, current, siginfo->si_signo);
elf_core_copy_regs(&info->prstatus->pr_reg, regs);
/* Set up header */
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index be4062b8ba75..3cfd6cd46f26 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -506,6 +506,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
char __user *u_platform, *u_base_platform, *p;
int loop;
int nr; /* reset for each csp adjustment */
+ unsigned long flags = 0;
#ifdef CONFIG_MMU
/* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
@@ -648,7 +649,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
NEW_AUX_ENT(AT_PHNUM, exec_params->hdr.e_phnum);
NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr);
- NEW_AUX_ENT(AT_FLAGS, 0);
+ if (bprm->interp_flags & BINPRM_FLAGS_PRESERVE_ARGV0)
+ flags |= AT_FLAGS_PRESERVE_ARGV0;
+ NEW_AUX_ENT(AT_FLAGS, flags);
NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr);
NEW_AUX_ENT(AT_UID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->uid));
NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid));
@@ -1191,18 +1194,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
struct elf_prstatus_fdpic
{
- struct elf_siginfo pr_info; /* Info associated with signal */
- short pr_cursig; /* Current signal */
- unsigned long pr_sigpend; /* Set of pending signals */
- unsigned long pr_sighold; /* Set of held signals */
- pid_t pr_pid;
- pid_t pr_ppid;
- pid_t pr_pgrp;
- pid_t pr_sid;
- struct __kernel_old_timeval pr_utime; /* User time */
- struct __kernel_old_timeval pr_stime; /* System time */
- struct __kernel_old_timeval pr_cutime; /* Cumulative user time */
- struct __kernel_old_timeval pr_cstime; /* Cumulative system time */
+ struct elf_prstatus_common common;
elf_gregset_t pr_reg; /* GP registers */
/* When using FDPIC, the loadmap addresses need to be communicated
* to GDB in order for GDB to do the necessary relocations. The
@@ -1301,7 +1293,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
* fill up all the fields in prstatus from the given task struct, except
* registers which need to be filled up separately.
*/
-static void fill_prstatus(struct elf_prstatus_fdpic *prstatus,
+static void fill_prstatus(struct elf_prstatus_common *prstatus,
struct task_struct *p, long signr)
{
prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
@@ -1332,9 +1324,6 @@ static void fill_prstatus(struct elf_prstatus_fdpic *prstatus,
}
prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime);
prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime);
-
- prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
- prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
}
static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
@@ -1405,7 +1394,9 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
if (!t)
return t;
- fill_prstatus(&t->prstatus, p, signr);
+ fill_prstatus(&t->prstatus.common, p, signr);
+ t->prstatus.pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
+ t->prstatus.pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
regset_get(p, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 3880a82da1dc..c457334de43f 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -153,7 +153,9 @@ static int load_misc_binary(struct linux_binprm *bprm)
if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
goto ret;
- if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
+ if (fmt->flags & MISC_FMT_PRESERVE_ARGV0) {
+ bprm->interp_flags |= BINPRM_FLAGS_PRESERVE_ARGV0;
+ } else {
retval = remove_arg_zero(bprm);
if (retval)
goto ret;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3b8963e228a1..ec26179c8062 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -126,11 +126,18 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode,
bd_abort_claiming(bdev, truncate_bdev_range);
return 0;
}
-EXPORT_SYMBOL(truncate_bdev_range);
static void set_init_blocksize(struct block_device *bdev)
{
- bdev->bd_inode->i_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+ unsigned int bsize = bdev_logical_block_size(bdev);
+ loff_t size = i_size_read(bdev->bd_inode);
+
+ while (bsize < PAGE_SIZE) {
+ if (size & bsize)
+ break;
+ bsize <<= 1;
+ }
+ bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
int set_blocksize(struct block_device *bdev, int size)
@@ -416,7 +423,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size;
- nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES);
if (!nr_pages) {
bool polled = false;
@@ -481,9 +488,10 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
int nr_pages;
- nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
- if (!nr_pages)
+ if (!iov_iter_count(iter))
return 0;
+
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1);
if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
@@ -680,7 +688,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
* i_mutex and doing so causes performance issues with concurrent
* O_SYNC writers to a block device.
*/
- error = blkdev_issue_flush(bdev, GFP_KERNEL);
+ error = blkdev_issue_flush(bdev);
if (error == -EOPNOTSUPP)
error = 0;
@@ -1800,13 +1808,11 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return error;
/*
- * Invalidate again; if someone wandered in and dirtied a page,
- * the caller will be given -EBUSY. The third argument is
- * inclusive, so the rounding here is safe.
+ * Invalidate the page cache again; if someone wandered in and dirtied
+ * a page, we just discard it - userspace has no way of knowing whether
+ * the write happened before or after discard completing...
*/
- return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
- start >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
+ return truncate_bdev_range(bdev, file->f_mode, start, end);
}
const struct file_operations def_blk_fops = {
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9f1b1a88e317..b634c42115ea 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,5 +1,21 @@
# SPDX-License-Identifier: GPL-2.0
+# Subset of W=1 warnings
+subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter
+subdir-ccflags-y += -Wmissing-declarations
+subdir-ccflags-y += -Wmissing-format-attribute
+subdir-ccflags-y += -Wmissing-prototypes
+subdir-ccflags-y += -Wold-style-definition
+subdir-ccflags-y += -Wmissing-include-dirs
+subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable)
+subdir-ccflags-y += $(call cc-option, -Wunused-const-variable)
+subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned)
+subdir-ccflags-y += $(call cc-option, -Wstringop-truncation)
+# The following turn off the warnings enabled by -Wextra
+subdir-ccflags-y += -Wno-missing-field-initializers
+subdir-ccflags-y += -Wno-sign-compare
+subdir-ccflags-y += -Wno-type-limits
+
obj-$(CONFIG_BTRFS_FS) := btrfs.o
btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
@@ -11,7 +27,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
- block-rsv.o delalloc-space.o block-group.o discard.o reflink.o
+ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
+ subpage.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 02d7d7b2563b..f47c1528eb9a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1501,7 +1501,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
}
/**
- * btrfs_check_shared - tell us whether an extent is shared
+ * Check if an extent is shared or not
+ *
+ * @root: root inode belongs to
+ * @inum: inode number of the inode whose extent we are checking
+ * @bytenr: logical bytenr of the extent we are checking
+ * @roots: list of roots this extent is shared among
+ * @tmp: temporary list used for iteration
*
* btrfs_check_shared uses the backref walking code but will short
* circuit as soon as it finds a root or inode that doesn't match the
@@ -2541,13 +2547,6 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
list_del(&edge->list[UPPER]);
btrfs_backref_free_edge(cache, edge);
- if (RB_EMPTY_NODE(&upper->rb_node)) {
- BUG_ON(!list_empty(&node->upper));
- btrfs_backref_drop_node(cache, node);
- node = upper;
- node->lowest = 1;
- continue;
- }
/*
* Add the node to leaf node list if no other child block
* cached.
@@ -2624,7 +2623,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
/* Only reloc backref cache cares about a specific root */
if (cache->is_reloc) {
root = find_reloc_root(cache->fs_info, cur->bytenr);
- if (WARN_ON(!root))
+ if (!root)
return -ENOENT;
cur->root = root;
} else {
@@ -3117,7 +3116,7 @@ void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache,
list_del_init(&lower->list);
if (lower == node)
node = NULL;
- btrfs_backref_free_node(cache, lower);
+ btrfs_backref_drop_node(cache, lower);
}
btrfs_backref_cleanup_node(cache, node);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ff705cc564a9..17abde7f794c 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -296,6 +296,9 @@ static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node)
{
if (node) {
+ ASSERT(list_empty(&node->list));
+ ASSERT(list_empty(&node->lower));
+ ASSERT(node->eb == NULL);
cache->nr_nodes--;
btrfs_put_root(node->root);
kfree(node);
@@ -340,11 +343,11 @@ static inline void btrfs_backref_drop_node_buffer(
static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
struct btrfs_backref_node *node)
{
- BUG_ON(!list_empty(&node->upper));
+ ASSERT(list_empty(&node->upper));
btrfs_backref_drop_node_buffer(node);
- list_del(&node->list);
- list_del(&node->lower);
+ list_del_init(&node->list);
+ list_del_init(&node->lower);
if (!RB_EMPTY_NODE(&node->rb_node))
rb_erase(&node->rb_node, &tree->rb_root);
btrfs_backref_free_node(tree, node);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 52f2198d44c9..5064be59dac5 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -15,6 +15,7 @@
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
+#include "zoned.h"
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -673,7 +674,15 @@ static noinline void caching_thread(struct btrfs_work *work)
wake_up(&caching_ctl->wait);
}
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ /*
+ * If we are in the transaction that populated the free space tree we
+ * can't actually cache from the free space tree as our commit root and
+ * real root are the same, so we could change the contents of the blocks
+ * while caching. Instead do the slow caching in this case, and after
+ * the transaction has committed we will be safe.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
ret = load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
@@ -716,6 +725,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
+ /* Allocator for zoned filesystems does not use the cache at all */
+ if (btrfs_is_zoned(fs_info))
+ return 0;
+
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
if (!caching_ctl)
return -ENOMEM;
@@ -888,6 +901,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
+ btrfs_clear_treelog_bg(block_group);
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -1000,12 +1015,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
WARN_ON(block_group->space_info->total_bytes
< block_group->length);
WARN_ON(block_group->space_info->bytes_readonly
- < block_group->length);
+ < block_group->length - block_group->zone_unusable);
+ WARN_ON(block_group->space_info->bytes_zone_unusable
+ < block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
}
block_group->space_info->total_bytes -= block_group->length;
- block_group->space_info->bytes_readonly -= block_group->length;
+ block_group->space_info->bytes_readonly -=
+ (block_group->length - block_group->zone_unusable);
+ block_group->space_info->bytes_zone_unusable -=
+ block_group->zone_unusable;
block_group->space_info->disk_total -= block_group->length * factor;
spin_unlock(&block_group->space_info->lock);
@@ -1149,7 +1169,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
}
num_bytes = cache->length - cache->reserved - cache->pinned -
- cache->bytes_super - cache->used;
+ cache->bytes_super - cache->zone_unusable - cache->used;
/*
* Data never overcommits, even in mixed mode, so do just the straight
@@ -1180,6 +1200,12 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
if (!ret) {
sinfo->bytes_readonly += num_bytes;
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes to readonly */
+ sinfo->bytes_readonly += cache->zone_unusable;
+ sinfo->bytes_zone_unusable -= cache->zone_unusable;
+ cache->zone_unusable = 0;
+ }
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
}
@@ -1254,6 +1280,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip deletion if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex))
+ return;
+
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
int trimming;
@@ -1273,8 +1306,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
-
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
@@ -1363,9 +1394,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
-block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -block_group->pinned,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned);
block_group->pinned = 0;
spin_unlock(&block_group->lock);
@@ -1381,8 +1410,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
goto flip_async;
- /* DISCARD can flip during remount */
- trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
+ /*
+ * DISCARD can flip during remount. On zoned filesystems, we
+ * need to reset sequential-required zones.
+ */
+ trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_is_zoned(fs_info);
/* Implicit trim during transaction commit. */
if (trimming)
@@ -1420,11 +1453,11 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
end_trans:
btrfs_end_transaction(trans);
next:
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
return;
flip_async:
@@ -1553,8 +1586,11 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
/**
- * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+ * Map a physical disk address to a list of logical addresses
+ *
+ * @fs_info: the filesystem
* @chunk_start: logical address of block group
+ * @bdev: physical device to resolve, can be NULL to indicate any device
* @physical: physical address to map to logical addresses
* @logical: return array of logical addresses which map to @physical
* @naddrs: length of @logical
@@ -1564,9 +1600,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
* Used primarily to exclude those portions of a block group that contain super
* block copies.
*/
-EXPORT_FOR_TESTS
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len)
+ struct block_device *bdev, u64 physical, u64 **logical,
+ int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
@@ -1584,6 +1620,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
map = em->map_lookup;
data_stripe_length = em->orig_block_len;
io_stripe_size = map->stripe_len;
+ chunk_start = em->start;
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -1598,14 +1635,18 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
for (i = 0; i < map->num_stripes; i++) {
bool already_inserted = false;
u64 stripe_nr;
+ u64 offset;
int j;
if (!in_range(physical, map->stripes[i].physical,
data_stripe_length))
continue;
+ if (bdev && map->stripes[i].dev->bdev != bdev)
+ continue;
+
stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+ stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
@@ -1619,7 +1660,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
* instead of map->stripe_len
*/
- bytenr = chunk_start + stripe_nr * io_stripe_size;
+ bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
/* Ensure we don't add duplicate addresses */
for (j = 0; j < nr; j++) {
@@ -1661,7 +1702,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(fs_info, cache->start,
+ ret = btrfs_rmap_block(fs_info, cache->start, NULL,
bytenr, &logical, &nr, &stripe_len);
if (ret)
return ret;
@@ -1797,24 +1838,8 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
return ret;
}
-static void read_block_group_item(struct btrfs_block_group *cache,
- struct btrfs_path *path,
- const struct btrfs_key *key)
-{
- struct extent_buffer *leaf = path->nodes[0];
- struct btrfs_block_group_item bgi;
- int slot = path->slots[0];
-
- cache->length = key->offset;
-
- read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
- sizeof(bgi));
- cache->used = btrfs_stack_block_group_used(&bgi);
- cache->flags = btrfs_stack_block_group_flags(&bgi);
-}
-
static int read_one_block_group(struct btrfs_fs_info *info,
- struct btrfs_path *path,
+ struct btrfs_block_group_item *bgi,
const struct btrfs_key *key,
int need_clear)
{
@@ -1829,7 +1854,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
if (!cache)
return -ENOMEM;
- read_block_group_item(cache, path, key);
+ cache->length = key->offset;
+ cache->used = btrfs_stack_block_group_used(bgi);
+ cache->flags = btrfs_stack_block_group_flags(bgi);
set_free_space_tree_thresholds(cache);
@@ -1856,6 +1883,13 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
+ ret = btrfs_load_block_group_zone_info(cache, false);
+ if (ret) {
+ btrfs_err(info, "zoned: failed to load zone info of bg %llu",
+ cache->start);
+ goto error;
+ }
+
/*
* We need to exclude the super stripes now so that the space info has
* super bytes accounted for, otherwise we'll think we have more space
@@ -1869,12 +1903,20 @@ static int read_one_block_group(struct btrfs_fs_info *info,
}
/*
- * Check for two cases, either we are full, and therefore don't need
- * to bother with the caching work since we won't find any space, or we
- * are empty, and we can just add all the space in and be done with it.
- * This saves us _a_lot_ of time, particularly in the full case.
+ * For zoned filesystem, space after the allocation offset is the only
+ * free space for a block group. So, we don't need any caching work.
+ * btrfs_calc_zone_unusable() will set the amount of free space and
+ * zone_unusable space.
+ *
+ * For regular filesystem, check for two cases, either we are full, and
+ * therefore don't need to bother with the caching work since we won't
+ * find any space, or we are empty, and we can just add all the space
+ * in and be done with it. This saves us _a_lot_ of time, particularly
+ * in the full case.
*/
- if (cache->length == cache->used) {
+ if (btrfs_is_zoned(info)) {
+ btrfs_calc_zone_unusable(cache);
+ } else if (cache->length == cache->used) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
@@ -1893,7 +1935,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
}
trace_btrfs_add_block_group(info, cache, 0);
btrfs_update_space_info(info, cache->flags, cache->length,
- cache->used, cache->bytes_super, &space_info);
+ cache->used, cache->bytes_super,
+ cache->zone_unusable, &space_info);
cache->space_info = space_info;
@@ -1949,7 +1992,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
break;
}
btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
- 0, &space_info);
+ 0, 0, &space_info);
bg->space_info = space_info;
link_block_group(bg);
@@ -1988,19 +2031,29 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
need_clear = 1;
while (1) {
+ struct btrfs_block_group_item bgi;
+ struct extent_buffer *leaf;
+ int slot;
+
ret = find_first_block_group(info, path, &key);
if (ret > 0)
break;
if (ret != 0)
goto error;
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- ret = read_one_block_group(info, path, &key, need_clear);
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_release_path(path);
+ ret = read_one_block_group(info, &bgi, &key, need_clear);
if (ret < 0)
goto error;
key.objectid += key.offset;
key.offset = 0;
- btrfs_release_path(path);
}
btrfs_release_path(path);
@@ -2132,6 +2185,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
cache->cached = BTRFS_CACHE_FINISHED;
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
cache->needs_free_space = 1;
+
+ ret = btrfs_load_block_group_zone_info(cache, true);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
@@ -2173,7 +2233,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, &cache->space_info);
+ cache->bytes_super, 0, &cache->space_info);
btrfs_update_global_block_rsv(fs_info);
link_block_group(cache);
@@ -2281,8 +2341,15 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
spin_lock(&cache->lock);
if (!--cache->ro) {
num_bytes = cache->length - cache->reserved -
- cache->pinned - cache->bytes_super - cache->used;
+ cache->pinned - cache->bytes_super -
+ cache->zone_unusable - cache->used;
sinfo->bytes_readonly -= num_bytes;
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes back */
+ cache->zone_unusable = cache->alloc_offset - cache->used;
+ sinfo->bytes_zone_unusable += cache->zone_unusable;
+ sinfo->bytes_readonly -= cache->zone_unusable;
+ }
list_del_init(&cache->ro_list);
}
spin_unlock(&cache->lock);
@@ -2556,8 +2623,10 @@ again:
if (!path) {
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
/*
@@ -2651,16 +2720,14 @@ again:
btrfs_put_block_group(cache);
if (drop_reserve)
btrfs_delayed_refs_rsv_release(fs_info, 1);
-
- if (ret)
- break;
-
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
* removed.
*/
mutex_unlock(&trans->transaction->cache_write_mutex);
+ if (ret)
+ goto out;
mutex_lock(&trans->transaction->cache_write_mutex);
}
mutex_unlock(&trans->transaction->cache_write_mutex);
@@ -2669,7 +2736,8 @@ again:
* Go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
- ret = btrfs_run_delayed_refs(trans, 0);
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2683,7 +2751,12 @@ again:
goto again;
}
spin_unlock(&cur_trans->dirty_bgs_lock);
- } else if (ret < 0) {
+ }
+out:
+ if (ret < 0) {
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&dirty, &cur_trans->dirty_bgs);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
@@ -2887,10 +2960,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(
- &cache->space_info->total_bytes_pinned,
- num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(cache->space_info,
+ num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 8f74a96074f7..29678426247d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -95,6 +95,8 @@ struct btrfs_block_group {
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
+ unsigned int to_copy:1;
+ unsigned int relocating_repair:1;
int disk_cache_state;
@@ -181,8 +183,19 @@ struct btrfs_block_group {
*/
int needs_free_space;
+ /* Flag indicating this block group is placed on a sequential zone */
+ bool seq_zone;
+
/* Record locked full stripes for RAID5/6 block group */
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
+
+ /*
+ * Allocation offset for the block group to implement sequential
+ * allocation. This is used only on a zoned filesystem.
+ */
+ u64 alloc_offset;
+ u64 zone_unusable;
+ u64 meta_write_pointer;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -270,6 +283,9 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
struct btrfs_caching_control *caching_ctl);
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ struct block_device *bdev, u64 physical, u64 **logical,
+ int *naddrs, int *stripe_len);
static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
{
@@ -296,9 +312,4 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
void btrfs_freeze_block_group(struct btrfs_block_group *cache);
void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len);
-#endif
-
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9bf53d9ff90..28e202e89660 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -325,7 +325,8 @@ struct btrfs_dio_private {
struct inode *inode;
u64 logical_offset;
u64 disk_bytenr;
- u64 bytes;
+ /* Used for bio::bi_size */
+ u32 bytes;
/*
* References to this structure. There is one reference per in-flight
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 6ff44e53814c..113cb85c1fd4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2674,7 +2674,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
mutex_lock(&btrfsic_mutex);
/* since btrfsic_submit_bio() is also called before
* btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno);
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
unsigned int i = 0;
@@ -2690,9 +2690,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
bio_is_patched = 0;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n",
+ pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_disk);
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
mapped_datav = kmalloc_array(segs,
sizeof(*mapped_datav), GFP_NOFS);
@@ -2721,8 +2721,8 @@ static void __btrfsic_submit_bio(struct bio *bio)
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_disk);
+ pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, bio->bi_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 5ae3fa0386b7..6d203acfdeb3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -542,13 +542,19 @@ static noinline int add_ra_bio_pages(struct inode *inode,
goto next;
}
- end = last_offset + PAGE_SIZE - 1;
/*
* at this point, we have a locked page in the page cache
* for these bytes in the file. But, we have to make
* sure they map to this compressed extent on disk.
*/
- set_page_extent_mapped(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ break;
+ }
+
+ end = last_offset + PAGE_SIZE - 1;
lock_extent(tree, last_offset, end);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cc89b63d65a4..d56730a67885 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -221,9 +221,12 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
ret = btrfs_inc_ref(trans, root, cow, 1);
else
ret = btrfs_inc_ref(trans, root, cow, 0);
-
- if (ret)
+ if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ btrfs_abort_transaction(trans, ret);
return ret;
+ }
btrfs_mark_buffer_dirty(cow);
*cow_ret = cow;
@@ -1494,6 +1497,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
return ret;
}
+ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
/*
* helper function for defrag to decide if two blocks pointed to by a
@@ -2821,6 +2825,7 @@ done:
btrfs_release_path(p);
return ret;
}
+ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
/*
* Like btrfs_search_slot, this looks for a key in the given tree. It uses the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9c0b43853cd2..bd659354d043 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -298,7 +298,8 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
- BTRFS_FEATURE_INCOMPAT_RAID1C34)
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED)
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -563,6 +564,9 @@ enum {
/* Indicate that we need to cleanup space cache v1 */
BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
+
+ /* Indicate that we can't trust the free space tree for caching yet */
+ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
};
/*
@@ -794,7 +798,7 @@ struct btrfs_fs_info {
/* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes;
- struct percpu_counter dio_bytes;
+ struct percpu_counter ordered_bytes;
s32 dirty_metadata_batch;
s32 delalloc_batch;
@@ -930,6 +934,7 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
struct work_struct async_data_reclaim_work;
+ struct work_struct preempt_reclaim_work;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
@@ -971,6 +976,9 @@ struct btrfs_fs_info {
/* Max size to emit ZONE_APPEND write command */
u64 max_zone_append_size;
+ struct mutex zoned_meta_io_lock;
+ spinlock_t treelog_bg_lock;
+ u64 treelog_bg;
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
@@ -1101,7 +1109,7 @@ struct btrfs_root {
u32 type;
- u64 highest_objectid;
+ u64 free_objectid;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -2737,6 +2745,7 @@ enum btrfs_flush_state {
ALLOC_CHUNK_FORCE = 8,
RUN_DELAYED_IPUTS = 9,
COMMIT_TRANS = 10,
+ FORCE_COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
@@ -3097,15 +3106,14 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- u64 new_dirid);
+ struct btrfs_root *parent_root);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
@@ -3116,6 +3124,8 @@ void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
unsigned long bio_flags);
+bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
+ unsigned int size);
void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index bacee09b7bfd..56642ca7af10 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -191,12 +191,14 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
}
/**
- * btrfs_inode_rsv_release - release any excessive reservation.
- * @inode - the inode we need to release from.
- * @qgroup_free - free or convert qgroup meta.
- * Unlike normal operation, qgroup meta reservation needs to know if we are
- * freeing qgroup reservation or just converting it into per-trans. Normally
- * @qgroup_free is true for error handling, and false for normal release.
+ * Release any excessive reservation
+ *
+ * @inode: the inode we need to release from
+ * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup
+ * meta reservation needs to know if we are freeing qgroup
+ * reservation or just converting it into per-trans. Normally
+ * @qgroup_free is true for error handling, and false for normal
+ * release.
*
* This is the same as btrfs_block_rsv_release, except that it handles the
* tracepoint for the reservation.
@@ -361,7 +363,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
}
/**
- * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * Release a metadata reservation for an inode
+ *
* @inode: the inode to release the reservation for.
* @num_bytes: the number of bytes we are releasing.
* @qgroup_free: free qgroup reservation or convert it to per-trans reservation
@@ -455,11 +458,13 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
}
/**
- * btrfs_delalloc_release_space - release data and metadata space for delalloc
- * @inode: inode we're releasing space for
- * @start: start position of the space already reserved
- * @len: the len of the space already reserved
- * @release_bytes: the len of the space we consumed or didn't use
+ * Release data and metadata space for delalloc
+ *
+ * @inode: inode we're releasing space for
+ * @reserved: list of changed/reserved ranges
+ * @start: start position of the space already reserved
+ * @len: length of the space already reserved
+ * @qgroup_free: should qgroup reserved-space also be freed
*
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 70c0340d839c..ec0b50b8c5d6 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1154,7 +1154,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
delayed_root = fs_info->delayed_root;
curr_node = btrfs_first_delayed_node(delayed_root);
- while (curr_node && (!count || (count && nr--))) {
+ while (curr_node && (!count || nr--)) {
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
if (ret) {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 353cc2994d10..63be7d01a9a3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -69,9 +69,10 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
}
/**
- * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
- * @fs_info - the fs_info for our fs.
- * @nr - the number of items to drop.
+ * Release a ref head's reservation
+ *
+ * @fs_info: the filesystem
+ * @nr: number of items to drop
*
* This drops the delayed ref head's count from the delayed refs rsv and frees
* any excess reservation we had.
@@ -114,10 +115,11 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
}
/**
- * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
- * @fs_info - the fs info for our fs.
- * @src - the source block rsv to transfer from.
- * @num_bytes - the number of bytes to transfer.
+ * Transfer bytes to our delayed refs rsv
+ *
+ * @fs_info: the filesystem
+ * @src: source block rsv to transfer from
+ * @num_bytes: number of bytes to transfer
*
* This transfers up to the num_bytes amount from the src rsv to the
* delayed_refs_rsv. Any extra bytes are returned to the space info.
@@ -162,9 +164,10 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
}
/**
- * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
- * @fs_info - the fs_info for our fs.
- * @flush - control how we can flush for this reservation.
+ * Refill based on our delayed refs usage
+ *
+ * @fs_info: the filesystem
+ * @flush: control how we can flush for this reservation.
*
* This will refill the delayed block_rsv up to 1 items size worth of space and
* will return -ENOSPC if we can't make the reservation.
@@ -648,12 +651,12 @@ inserted:
*/
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *existing,
- struct btrfs_delayed_ref_head *update,
- int *old_ref_mod_ret)
+ struct btrfs_delayed_ref_head *update)
{
struct btrfs_delayed_ref_root *delayed_refs =
&trans->transaction->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 flags = btrfs_ref_head_to_space_flags(existing);
int old_ref_mod;
BUG_ON(existing->is_data != update->is_data);
@@ -701,8 +704,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
* currently, for refs we just added we know we're a-ok.
*/
old_ref_mod = existing->total_ref_mod;
- if (old_ref_mod_ret)
- *old_ref_mod_ret = old_ref_mod;
existing->ref_mod += update->ref_mod;
existing->total_ref_mod += update->ref_mod;
@@ -724,6 +725,27 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates += csum_leaves;
}
}
+
+ /*
+ * This handles the following conditions:
+ *
+ * 1. We had a ref mod of 0 or more and went negative, indicating that
+ * we may be freeing space, so add our space to the
+ * total_bytes_pinned counter.
+ * 2. We were negative and went to 0 or positive, so no longer can say
+ * that the space would be pinned, decrement our counter from the
+ * total_bytes_pinned counter.
+ * 3. We are now at 0 and have ->must_insert_reserved set, which means
+ * this was a new allocation and then we dropped it, and thus must
+ * add our space to the total_bytes_pinned counter.
+ */
+ if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
+ btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
+ else if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
+ btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes);
+ else if (existing->total_ref_mod == 0 && existing->must_insert_reserved)
+ btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
+
spin_unlock(&existing->lock);
}
@@ -798,8 +820,7 @@ static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_qgroup_extent_record *qrecord,
- int action, int *qrecord_inserted_ret,
- int *old_ref_mod, int *new_ref_mod)
+ int action, int *qrecord_inserted_ret)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -821,8 +842,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- update_existing_head_ref(trans, existing, head_ref,
- old_ref_mod);
+ update_existing_head_ref(trans, existing, head_ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
@@ -830,14 +850,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
- if (old_ref_mod)
- *old_ref_mod = 0;
+ u64 flags = btrfs_ref_head_to_space_flags(head_ref);
+
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_updates +=
btrfs_csum_bytes_to_leaves(trans->fs_info,
head_ref->num_bytes);
}
+ if (head_ref->ref_mod < 0)
+ btrfs_mod_total_bytes_pinned(trans->fs_info, flags,
+ head_ref->num_bytes);
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
@@ -845,8 +868,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
- if (new_ref_mod)
- *new_ref_mod = head_ref->total_ref_mod;
return head_ref;
}
@@ -909,8 +930,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
*/
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op,
- int *old_ref_mod, int *new_ref_mod)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_tree_ref *ref;
@@ -977,8 +997,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* the spin lock
*/
head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted,
- old_ref_mod, new_ref_mod);
+ action, &qrecord_inserted);
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
@@ -1006,8 +1025,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
*/
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- u64 reserved, int *old_ref_mod,
- int *new_ref_mod)
+ u64 reserved)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_data_ref *ref;
@@ -1073,8 +1091,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
* the spin lock
*/
head_ref = add_delayed_ref_head(trans, head_ref, record,
- action, &qrecord_inserted,
- old_ref_mod, new_ref_mod);
+ action, &qrecord_inserted);
ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
spin_unlock(&delayed_refs->lock);
@@ -1117,7 +1134,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
spin_lock(&delayed_refs->lock);
add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
- NULL, NULL, NULL);
+ NULL);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 1c977e6d45dc..e22fba272e4f 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -135,6 +135,11 @@ struct btrfs_delayed_data_ref {
u64 offset;
};
+enum btrfs_delayed_ref_flags {
+ /* Indicate that we are flushing delayed refs for the commit */
+ BTRFS_DELAYED_REFS_FLUSHING,
+};
+
struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root_cached href_root;
@@ -158,12 +163,7 @@ struct btrfs_delayed_ref_root {
u64 pending_csums;
- /*
- * set when the tree is flushing before a transaction commit,
- * used by the throttling code to decide if new updates need
- * to be run right away
- */
- int flushing;
+ unsigned long flags;
u64 run_delayed_start;
@@ -326,6 +326,16 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
}
}
+static inline u64 btrfs_ref_head_to_space_flags(
+ struct btrfs_delayed_ref_head *head_ref)
+{
+ if (head_ref->is_data)
+ return BTRFS_BLOCK_GROUP_DATA;
+ else if (head_ref->is_system)
+ return BTRFS_BLOCK_GROUP_SYSTEM;
+ return BTRFS_BLOCK_GROUP_METADATA;
+}
+
static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head)
{
if (refcount_dec_and_test(&head->refs))
@@ -334,12 +344,10 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- struct btrfs_delayed_extent_op *extent_op,
- int *old_ref_mod, int *new_ref_mod);
+ struct btrfs_delayed_extent_op *extent_op);
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref,
- u64 reserved, int *old_ref_mod,
- int *new_ref_mod);
+ u64 reserved);
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 324f646d6e5e..3a9c1e046ebe 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -22,6 +22,7 @@
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
+#include "block-group.h"
/*
* Device replace overview
@@ -459,6 +460,185 @@ static char* btrfs_dev_name(struct btrfs_device *device)
return rcu_str_deref(device->name);
}
+static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *src_dev)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_block_group *cache;
+ struct btrfs_trans_handle *trans;
+ int ret = 0;
+ u64 chunk_offset;
+
+ /* Do not use "to_copy" on non zoned filesystem for now */
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ mutex_lock(&fs_info->chunk_mutex);
+
+ /* Ensure we don't have pending new block group */
+ spin_lock(&fs_info->trans_lock);
+ while (fs_info->running_transaction &&
+ !list_empty(&fs_info->running_transaction->dev_update_list)) {
+ spin_unlock(&fs_info->trans_lock);
+ mutex_unlock(&fs_info->chunk_mutex);
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ mutex_lock(&fs_info->chunk_mutex);
+ if (ret == -ENOENT) {
+ spin_lock(&fs_info->trans_lock);
+ continue;
+ } else {
+ goto unlock;
+ }
+ }
+
+ ret = btrfs_commit_transaction(trans);
+ mutex_lock(&fs_info->chunk_mutex);
+ if (ret)
+ goto unlock;
+
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ path->reada = READA_FORWARD;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ key.objectid = src_dev->devid;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto free_path;
+ if (ret > 0) {
+ if (path->slots[0] >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto free_path;
+ if (ret > 0) {
+ ret = 0;
+ goto free_path;
+ }
+ } else {
+ ret = 0;
+ }
+ }
+
+ while (1) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+ if (found_key.objectid != src_dev->devid)
+ break;
+
+ if (found_key.type != BTRFS_DEV_EXTENT_KEY)
+ break;
+
+ if (found_key.offset < key.offset)
+ break;
+
+ dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
+
+ chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ if (!cache)
+ goto skip;
+
+ spin_lock(&cache->lock);
+ cache->to_copy = 1;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+
+skip:
+ ret = btrfs_next_item(root, path);
+ if (ret != 0) {
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+ }
+
+free_path:
+ btrfs_free_path(path);
+unlock:
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ return ret;
+}
+
+bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
+ struct btrfs_block_group *cache,
+ u64 physical)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 chunk_offset = cache->start;
+ int num_extents, cur_extent;
+ int i;
+
+ /* Do not use "to_copy" on non zoned filesystem for now */
+ if (!btrfs_is_zoned(fs_info))
+ return true;
+
+ spin_lock(&cache->lock);
+ if (cache->removed) {
+ spin_unlock(&cache->lock);
+ return true;
+ }
+ spin_unlock(&cache->lock);
+
+ em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(!IS_ERR(em));
+ map = em->map_lookup;
+
+ num_extents = cur_extent = 0;
+ for (i = 0; i < map->num_stripes; i++) {
+ /* We have more device extent to copy */
+ if (srcdev != map->stripes[i].dev)
+ continue;
+
+ num_extents++;
+ if (physical == map->stripes[i].physical)
+ cur_extent = i;
+ }
+
+ free_extent_map(em);
+
+ if (num_extents > 1 && cur_extent < num_extents - 1) {
+ /*
+ * Has more stripes on this device. Keep this block group
+ * readonly until we finish all the stripes.
+ */
+ return false;
+ }
+
+ /* Last stripe on this device */
+ spin_lock(&cache->lock);
+ cache->to_copy = 0;
+ spin_unlock(&cache->lock);
+
+ return true;
+}
+
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
@@ -500,6 +680,10 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
return ret;
+ ret = mark_block_group_to_copy(fs_info, src_device);
+ if (ret)
+ return ret;
+
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -715,7 +899,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 60b70dacc299..3911049a5f23 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -18,5 +18,8 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
+ struct btrfs_block_group *cache,
+ u64 physical);
#endif
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 2b8383d41144..306ff20af70f 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -185,10 +185,12 @@ static struct btrfs_block_group *find_next_block_group(
}
/**
- * peek_discard_list - wrap find_next_block_group()
- * @discard_ctl: discard control
+ * Wrap find_next_block_group()
+ *
+ * @discard_ctl: discard control
* @discard_state: the discard_state of the block_group after state management
* @discard_index: the discard_index of the block_group after state management
+ * @now: time when discard was invoked, in ns
*
* This wraps find_next_block_group() and sets the block_group to be in use.
* discard_state's control flow is managed here. Variables related to
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6b35b7e88136..41b718cfea40 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -459,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
return 0;
found_start = btrfs_header_bytenr(eb);
+
+ if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
+ WARN_ON(found_start != 0);
+ return 0;
+ }
+
/*
* Please do not consolidate these warnings into a single if.
* It is useful to know what went wrong.
@@ -591,6 +597,59 @@ out:
return ret;
}
+static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
+ int mirror)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct extent_buffer *eb;
+ bool reads_done;
+ int ret = 0;
+
+ /*
+ * We don't allow bio merge for subpage metadata read, so we should
+ * only get one eb for each endio hook.
+ */
+ ASSERT(end == start + fs_info->nodesize - 1);
+ ASSERT(PagePrivate(page));
+
+ eb = find_extent_buffer(fs_info, start);
+ /*
+ * When we are reading one tree block, eb must have been inserted into
+ * the radix tree. If not, something is wrong.
+ */
+ ASSERT(eb);
+
+ reads_done = atomic_dec_and_test(&eb->io_pages);
+ /* Subpage read must finish in page read */
+ ASSERT(reads_done);
+
+ eb->read_mirror = mirror;
+ if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+ ret = -EIO;
+ goto err;
+ }
+ ret = validate_extent_buffer(eb);
+ if (ret < 0)
+ goto err;
+
+ if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+ btree_readahead_hook(eb, ret);
+
+ set_extent_buffer_uptodate(eb);
+
+ free_extent_buffer(eb);
+ return ret;
+err:
+ /*
+ * end_bio_extent_readpage decrements io_pages in case of error,
+ * make sure it has something to decrement.
+ */
+ atomic_inc(&eb->io_pages);
+ clear_extent_buffer_uptodate(eb);
+ free_extent_buffer(eb);
+ return ret;
+}
+
int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
struct page *page, u64 start, u64 end,
int mirror)
@@ -600,6 +659,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
int reads_done;
ASSERT(page->private);
+
+ if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ return validate_subpage_buffer(page, start, end, mirror);
+
eb = (struct extent_buffer *)page->private;
/*
@@ -646,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio)
fs_info = end_io_wq->info;
end_io_wq->status = bio->bi_status;
- if (bio_op(bio) == REQ_OP_WRITE) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
wq = fs_info->endio_meta_write_workers;
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
@@ -808,6 +871,8 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
static int check_async_write(struct btrfs_fs_info *fs_info,
struct btrfs_inode *bi)
{
+ if (btrfs_is_zoned(fs_info))
+ return 0;
if (atomic_read(&bi->sync_writers))
return 0;
if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
@@ -822,7 +887,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
int async = check_async_write(fs_info, BTRFS_I(inode));
blk_status_t ret;
- if (bio_op(bio) != REQ_OP_WRITE) {
+ if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
/*
* called for a read, do the setup so that checksum validation
* can happen in the async kernel threads
@@ -1016,7 +1081,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->orphan_cleanup_state = 0;
root->last_trans = 0;
- root->highest_objectid = 0;
+ root->free_objectid = 0;
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
@@ -1189,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
- struct extent_buffer *leaf;
root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
if (!root)
@@ -1199,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+ return root;
+}
+
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct extent_buffer *leaf;
+
/*
* DON'T set SHAREABLE bit for log trees.
*
@@ -1211,16 +1283,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
- if (IS_ERR(leaf)) {
- btrfs_put_root(root);
- return ERR_CAST(leaf);
- }
+ if (IS_ERR(leaf))
+ return PTR_ERR(leaf);
root->node = leaf;
btrfs_mark_buffer_dirty(root->node);
btrfs_tree_unlock(root->node);
- return root;
+
+ return 0;
}
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -1231,6 +1302,16 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
+
+ if (!btrfs_is_zoned(fs_info)) {
+ int ret = btrfs_alloc_log_tree_node(trans, log_root);
+
+ if (ret) {
+ btrfs_put_root(log_root);
+ return ret;
+ }
+ }
+
WARN_ON(fs_info->log_root_tree);
fs_info->log_root_tree = log_root;
return 0;
@@ -1242,11 +1323,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log_root;
struct btrfs_inode_item *inode_item;
+ int ret;
log_root = alloc_log_tree(trans, fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
+ ret = btrfs_alloc_log_tree_node(trans, log_root);
+ if (ret) {
+ btrfs_put_root(log_root);
+ return ret;
+ }
+
log_root->last_trans = trans->transid;
log_root->root_key.offset = root->root_key.objectid;
@@ -1367,14 +1455,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
}
mutex_lock(&root->objectid_mutex);
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(root);
if (ret) {
mutex_unlock(&root->objectid_mutex);
goto fail;
}
- ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+ ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
mutex_unlock(&root->objectid_mutex);
@@ -1470,7 +1557,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
- percpu_counter_destroy(&fs_info->dio_bytes);
+ percpu_counter_destroy(&fs_info->ordered_bytes);
percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
btrfs_free_csum_hash(fs_info);
btrfs_free_stripe_hash_table(fs_info);
@@ -2427,13 +2514,21 @@ static int validate_super(struct btrfs_fs_info *fs_info,
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
ret = -EINVAL;
}
- /* Only PAGE SIZE is supported yet */
- if (sectorsize != PAGE_SIZE) {
+
+ /*
+ * For 4K page size, we only support 4K sector size.
+ * For 64K page size, we support read-write for 64K sector size, and
+ * read-only for 4K sector size.
+ */
+ if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
+ (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
+ sectorsize != SZ_64K))) {
btrfs_err(fs_info,
- "sectorsize %llu not supported yet, only support %lu",
+ "sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
ret = -EINVAL;
}
+
if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
@@ -2646,14 +2741,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
* No need to hold btrfs_root::objectid_mutex since the fs
* hasn't been fully initialised and we are the only user
*/
- ret = btrfs_find_highest_objectid(tree_root,
- &tree_root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(tree_root);
if (ret < 0) {
handle_error = true;
continue;
}
- ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+ ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
ret = btrfs_read_roots(fs_info);
if (ret < 0) {
@@ -2695,11 +2789,13 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
+ spin_lock_init(&fs_info->treelog_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
+ mutex_init(&fs_info->zoned_meta_io_lock);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2804,7 +2900,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
- ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -3044,6 +3140,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
+ fs_info->csum_size = btrfs_super_csum_size(disk_super);
+
ret = btrfs_init_csum_hash(fs_info, csum_type);
if (ret) {
err = ret;
@@ -3138,8 +3236,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
btrfs_info(fs_info, "has skinny extents");
- fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED);
-
/*
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
@@ -3161,7 +3257,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
- fs_info->csum_size = btrfs_super_csum_size(disk_super);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
@@ -3193,6 +3288,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
+ /* For 4K sector size support, it's only read-only */
+ if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
+ if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
+ btrfs_err(fs_info,
+ "subpage sectorsize %u only supported read-only for page size %lu",
+ sectorsize, PAGE_SIZE);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+ }
+
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3260,6 +3366,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_tree_roots;
/*
+ * Get zone type information of zoned block devices. This will also
+ * handle emulation of a zoned filesystem if a regular device has the
+ * zoned incompat feature flag set.
+ */
+ ret = btrfs_get_dev_zone_info_all_devices(fs_info);
+ if (ret) {
+ btrfs_err(fs_info,
+ "zoned: failed to read device zone info: %d",
+ ret);
+ goto fail_block_groups;
+ }
+
+ /*
* If we have a uuid root and we're not being told to rescan we need to
* check the generation here so we can set the
* BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
@@ -4113,6 +4232,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
+ cancel_work_sync(&fs_info->preempt_reclaim_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4165,9 +4285,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
- if (percpu_counter_sum(&fs_info->dio_bytes))
+ if (percpu_counter_sum(&fs_info->ordered_bytes))
btrfs_info(fs_info, "at unmount dio bytes count %lld",
- percpu_counter_sum(&fs_info->dio_bytes));
+ percpu_counter_sum(&fs_info->ordered_bytes));
btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -4688,6 +4808,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
+ btrfs_free_redirty_list(cur_trans);
+
cur_trans->state =TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
}
@@ -4745,7 +4867,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_init_root_free_objectid(struct btrfs_root *root)
{
struct btrfs_path *path;
int ret;
@@ -4769,10 +4891,10 @@ int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
slot = path->slots[0] - 1;
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
- *objectid = max_t(u64, found_key.objectid,
- BTRFS_FIRST_FREE_OBJECTID - 1);
+ root->free_objectid = max_t(u64, found_key.objectid + 1,
+ BTRFS_FIRST_FREE_OBJECTID);
} else {
- *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+ root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
}
ret = 0;
error:
@@ -4780,12 +4902,12 @@ error:
return ret;
}
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
{
int ret;
mutex_lock(&root->objectid_mutex);
- if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
btrfs_warn(root->fs_info,
"the objectid of root %llu reaches its highest value",
root->root_key.objectid);
@@ -4793,7 +4915,7 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
goto out;
}
- *objectid = ++root->highest_objectid;
+ *objectid = root->free_objectid++;
ret = 0;
out:
mutex_unlock(&root->objectid_mutex);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index e45057c0c016..0e7e9526b6a8 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -120,6 +120,8 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
@@ -133,8 +135,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
+int btrfs_init_root_free_objectid(struct btrfs_root *root);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d79b8369e6aa..78ad31a59e59 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,6 +34,8 @@
#include "block-group.h"
#include "discard.h"
#include "rcu-string.h"
+#include "zoned.h"
+#include "dev-replace.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -82,41 +84,6 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
EXTENT_UPTODATE);
}
-static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
-{
- if (ref->type == BTRFS_REF_METADATA) {
- if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
- return BTRFS_BLOCK_GROUP_SYSTEM;
- else
- return BTRFS_BLOCK_GROUP_METADATA;
- }
- return BTRFS_BLOCK_GROUP_DATA;
-}
-
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *ref)
-{
- struct btrfs_space_info *space_info;
- u64 flags = generic_ref_to_space_flags(ref);
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
-static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *ref)
-{
- struct btrfs_space_info *space_info;
- u64 flags = generic_ref_to_space_flags(ref);
-
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
@@ -1299,6 +1266,46 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
return ret;
}
+static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
+{
+ struct btrfs_device *dev = stripe->dev;
+ struct btrfs_fs_info *fs_info = dev->fs_info;
+ struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+ u64 phys = stripe->physical;
+ u64 len = stripe->length;
+ u64 discarded = 0;
+ int ret = 0;
+
+ /* Zone reset on a zoned filesystem */
+ if (btrfs_can_zone_reset(dev, phys, len)) {
+ u64 src_disc;
+
+ ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
+ if (ret)
+ goto out;
+
+ if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
+ dev != dev_replace->srcdev)
+ goto out;
+
+ src_disc = discarded;
+
+ /* Send to replace target as well */
+ ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
+ &discarded);
+ discarded += src_disc;
+ } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
+ ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
+ } else {
+ ret = 0;
+ *bytes = 0;
+ }
+
+out:
+ *bytes = discarded;
+ return ret;
+}
+
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes)
{
@@ -1333,20 +1340,13 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
stripe = bbio->stripes;
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
- struct request_queue *req_q;
if (!stripe->dev->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
- req_q = bdev_get_queue(stripe->dev->bdev);
- if (!blk_queue_discard(req_q))
- continue;
- ret = btrfs_issue_discard(stripe->dev->bdev,
- stripe->physical,
- stripe->length,
- &bytes);
+ ret = do_discard_extent(stripe, &bytes);
if (!ret) {
discarded_bytes += bytes;
} else if (ret != -EOPNOTSUPP) {
@@ -1388,7 +1388,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int old_ref_mod, new_ref_mod;
int ret;
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
@@ -1397,17 +1396,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
- ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
- NULL, &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
else
- ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
btrfs_ref_tree_mod(fs_info, generic_ref);
- if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
- sub_pinned_bytes(fs_info, generic_ref);
-
return ret;
}
@@ -1795,34 +1789,28 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
{
int nr_items = 1; /* Dropping this ref head update. */
- if (head->total_ref_mod < 0) {
- struct btrfs_space_info *space_info;
- u64 flags;
+ /*
+ * We had csum deletions accounted for in our delayed refs rsv, we need
+ * to drop the csum leaves for this update from our delayed_refs_rsv.
+ */
+ if (head->total_ref_mod < 0 && head->is_data) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= head->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
+ }
- if (head->is_data)
- flags = BTRFS_BLOCK_GROUP_DATA;
- else if (head->is_system)
- flags = BTRFS_BLOCK_GROUP_SYSTEM;
- else
- flags = BTRFS_BLOCK_GROUP_METADATA;
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -head->num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ /*
+ * We were dropping refs, or had a new ref and dropped it, and thus must
+ * adjust down our total_bytes_pinned, the space may or may not have
+ * been pinned and so is accounted for properly in the pinned space by
+ * now.
+ */
+ if (head->total_ref_mod < 0 ||
+ (head->total_ref_mod == 0 && head->must_insert_reserved)) {
+ u64 flags = btrfs_ref_head_to_space_flags(head);
- /*
- * We had csum deletions accounted for in our delayed refs rsv,
- * we need to drop the csum leaves for this update from our
- * delayed_refs_rsv.
- */
- if (head->is_data) {
- spin_lock(&delayed_refs->lock);
- delayed_refs->pending_csums -= head->num_bytes;
- spin_unlock(&delayed_refs->lock);
- nr_items += btrfs_csum_bytes_to_leaves(fs_info,
- head->num_bytes);
- }
+ btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
}
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
@@ -2160,7 +2148,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
if (count == 0)
- count = atomic_read(&delayed_refs->num_entries) * 2;
+ count = delayed_refs->num_heads_ready;
again:
#ifdef SCRAMBLE_DELAYED_REFS
@@ -2572,8 +2560,7 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
- num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
@@ -2602,8 +2589,6 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
struct btrfs_block_group *cache;
int ret;
- btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes);
-
cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
if (!cache)
return -EINVAL;
@@ -2615,11 +2600,19 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
* the pinned extents.
*/
btrfs_cache_block_group(cache, 1);
+ /*
+ * Make sure we wait until the cache is completely built in case it is
+ * missing or is invalid and therefore needs to be rebuilt.
+ */
+ ret = btrfs_wait_block_group_cache_done(cache);
+ if (ret)
+ goto out;
pin_down_extent(trans, cache, bytenr, num_bytes, 0);
/* remove us from the free space cache (if we're there at all) */
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
+out:
btrfs_put_block_group(cache);
return ret;
}
@@ -2629,50 +2622,22 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
{
int ret;
struct btrfs_block_group *block_group;
- struct btrfs_caching_control *caching_ctl;
block_group = btrfs_lookup_block_group(fs_info, start);
if (!block_group)
return -EINVAL;
- btrfs_cache_block_group(block_group, 0);
- caching_ctl = btrfs_get_caching_control(block_group);
-
- if (!caching_ctl) {
- /* Logic error */
- BUG_ON(!btrfs_block_group_done(block_group));
- ret = btrfs_remove_free_space(block_group, start, num_bytes);
- } else {
- /*
- * We must wait for v1 caching to finish, otherwise we may not
- * remove our space.
- */
- btrfs_wait_space_cache_v1_finished(block_group, caching_ctl);
- mutex_lock(&caching_ctl->mutex);
-
- if (start >= caching_ctl->progress) {
- ret = btrfs_add_excluded_extent(fs_info, start,
- num_bytes);
- } else if (start + num_bytes <= caching_ctl->progress) {
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- } else {
- num_bytes = caching_ctl->progress - start;
- ret = btrfs_remove_free_space(block_group,
- start, num_bytes);
- if (ret)
- goto out_lock;
+ btrfs_cache_block_group(block_group, 1);
+ /*
+ * Make sure we wait until the cache is completely built in case it is
+ * missing or is invalid and therefore needs to be rebuilt.
+ */
+ ret = btrfs_wait_block_group_cache_done(block_group);
+ if (ret)
+ goto out;
- num_bytes = (start + num_bytes) -
- caching_ctl->progress;
- start = caching_ctl->progress;
- ret = btrfs_add_excluded_extent(fs_info, start,
- num_bytes);
- }
-out_lock:
- mutex_unlock(&caching_ctl->mutex);
- btrfs_put_caching_control(caching_ctl);
- }
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+out:
btrfs_put_block_group(block_group);
return ret;
}
@@ -2806,11 +2771,14 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
cache->pinned -= len;
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
space_info->max_extent_size = 0;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(space_info, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
+ } else if (btrfs_is_zoned(fs_info)) {
+ /* Need reset before reusing in a zoned block group */
+ space_info->bytes_zone_unusable += len;
+ readonly = true;
}
spin_unlock(&cache->lock);
if (!readonly && return_free_space &&
@@ -2863,9 +2831,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
- if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
- clear_extent_bits(&fs_info->excluded_extents, start,
- end, EXTENT_UPTODATE);
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
@@ -3343,7 +3308,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ref generic_ref = { 0 };
- int pin = 1;
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
@@ -3352,13 +3316,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
root->root_key.objectid);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- int old_ref_mod, new_ref_mod;
-
btrfs_ref_tree_mod(fs_info, &generic_ref);
- ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
BUG_ON(ret); /* -ENOMEM */
- pin = old_ref_mod >= 0 && new_ref_mod < 0;
}
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
@@ -3366,11 +3326,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
- if (!ret)
+ if (!ret) {
+ btrfs_redirty_list_add(trans->transaction, buf);
goto out;
+ }
}
- pin = 0;
cache = btrfs_lookup_block_group(fs_info, buf->start);
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -3379,6 +3340,13 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
+ if (btrfs_is_zoned(fs_info)) {
+ btrfs_redirty_list_add(trans->transaction, buf);
+ pin_down_extent(trans, cache, buf->start, buf->len, 1);
+ btrfs_put_block_group(cache);
+ goto out;
+ }
+
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
@@ -3387,9 +3355,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
}
out:
- if (pin)
- add_pinned_bytes(fs_info, &generic_ref);
-
if (last_ref) {
/*
* Deleting the buffer, clear the corrupt flag since it doesn't
@@ -3403,7 +3368,6 @@ out:
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int old_ref_mod, new_ref_mod;
int ret;
if (btrfs_is_testing(fs_info))
@@ -3419,14 +3383,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
- old_ref_mod = new_ref_mod = 0;
ret = 0;
} else if (ref->type == BTRFS_REF_METADATA) {
- ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
} else {
- ret = btrfs_add_delayed_data_ref(trans, ref, 0,
- &old_ref_mod, &new_ref_mod);
+ ret = btrfs_add_delayed_data_ref(trans, ref, 0);
}
if (!((ref->type == BTRFS_REF_METADATA &&
@@ -3435,9 +3396,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
- if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
- add_pinned_bytes(fs_info, ref);
-
return ret;
}
@@ -3514,6 +3472,7 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
+ /*
+ * Sequential allocation handled by do_allocation_zoned(); selected in
+ * find_free_extent() when btrfs_is_zoned(fs_info) is true.
+ */
+ BTRFS_EXTENT_ALLOC_ZONED,
};
/*
@@ -3538,6 +3497,9 @@ struct find_free_extent_ctl {
bool have_caching_bg;
bool orig_have_caching_bg;
+ /* Allocation is called for tree-log */
+ bool for_treelog;
+
/* RAID index, converted from flags */
int index;
@@ -3766,6 +3728,118 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
return find_free_extent_unclustered(block_group, ffe_ctl);
}
+/*
+ * Tree-log block group locking
+ * ============================
+ *
+ * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
+ * indicates the starting address of a block group, which is reserved only
+ * for tree-log metadata.
+ *
+ * Lock nesting
+ * ============
+ *
+ * space_info::lock
+ * block_group::lock
+ * fs_info::treelog_bg_lock
+ */
+
+/*
+ * Simple allocator for sequential-only block group. It only allows sequential
+ * allocation. No need to play with trees. This function also reserves the
+ * bytes as in btrfs_add_reserved_bytes.
+ *
+ * NOTE(review): the body below only advances block_group->alloc_offset and
+ * decrements ctl->free_space; confirm where the reservation accounting
+ * mentioned above is actually updated.
+ *
+ * @block_group: candidate block group to allocate from
+ * @ffe_ctl: allocation request; on success found_offset and search_start are
+ * set to the allocated address, and when there is not enough room
+ * max_extent_size / total_free_space may be raised to the space left
+ * @bg_ret: not used by this function
+ *
+ * Return 0 when ffe_ctl->num_bytes were allocated, 1 when this block group
+ * cannot satisfy the request.
+ *
+ * Any failure that reaches the out label while allocating for the tree-log
+ * also resets fs_info->treelog_bg to 0. The early "skip" return does not.
+ */
+static int do_allocation_zoned(struct btrfs_block_group *block_group,
+ struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_block_group **bg_ret)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_space_info *space_info = block_group->space_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ u64 start = block_group->start;
+ u64 num_bytes = ffe_ctl->num_bytes;
+ u64 avail;
+ /* Same value as start; used only for the tree-log comparison below */
+ u64 bytenr = block_group->start;
+ u64 log_bytenr;
+ int ret = 0;
+ bool skip;
+
+ ASSERT(btrfs_is_zoned(block_group->fs_info));
+
+ /*
+ * Do not allow non-tree-log blocks in the dedicated tree-log block
+ * group, and vice versa.
+ *
+ * This is a quick check under treelog_bg_lock alone; the lock is
+ * dropped before the full lock nest is taken below.
+ */
+ spin_lock(&fs_info->treelog_bg_lock);
+ log_bytenr = fs_info->treelog_bg;
+ skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+ (!ffe_ctl->for_treelog && bytenr == log_bytenr));
+ spin_unlock(&fs_info->treelog_bg_lock);
+ if (skip)
+ return 1;
+
+ /* Lock order: space_info->lock, block_group->lock, treelog_bg_lock */
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ spin_lock(&fs_info->treelog_bg_lock);
+
+ ASSERT(!ffe_ctl->for_treelog ||
+ block_group->start == fs_info->treelog_bg ||
+ fs_info->treelog_bg == 0);
+
+ if (block_group->ro) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Do not allow a block group that is already in use (has used or
+ * reserved bytes) to become the dedicated tree-log block group.
+ */
+ if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ /* Space left between the current allocation offset and the end */
+ avail = block_group->length - block_group->alloc_offset;
+ if (avail < num_bytes) {
+ if (ffe_ctl->max_extent_size < avail) {
+ /*
+ * With sequential allocator, free space is always
+ * contiguous
+ */
+ ffe_ctl->max_extent_size = avail;
+ ffe_ctl->total_free_space = avail;
+ }
+ ret = 1;
+ goto out;
+ }
+
+ /* First tree-log allocation: this group becomes the tree-log group */
+ if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
+ fs_info->treelog_bg = block_group->start;
+
+ ffe_ctl->found_offset = start + block_group->alloc_offset;
+ block_group->alloc_offset += num_bytes;
+ spin_lock(&ctl->tree_lock);
+ ctl->free_space -= num_bytes;
+ spin_unlock(&ctl->tree_lock);
+
+ /*
+ * We do not check if found_offset is aligned to stripesize. The
+ * address is anyway rewritten when using zone append writing.
+ */
+
+ ffe_ctl->search_start = ffe_ctl->found_offset;
+
+out:
+ /* A failed tree-log allocation clears the dedicated tree-log group */
+ if (ret && ffe_ctl->for_treelog)
+ fs_info->treelog_bg = 0;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
+
static int do_allocation(struct btrfs_block_group *block_group,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_block_group **bg_ret)
@@ -3773,6 +3847,8 @@ static int do_allocation(struct btrfs_block_group *block_group,
switch (ffe_ctl->policy) {
case BTRFS_EXTENT_ALLOC_CLUSTERED:
return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
default:
BUG();
}
@@ -3787,6 +3863,9 @@ static void release_block_group(struct btrfs_block_group *block_group,
ffe_ctl->retry_clustered = false;
ffe_ctl->retry_unclustered = false;
break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+ break;
default:
BUG();
}
@@ -3815,6 +3894,9 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
case BTRFS_EXTENT_ALLOC_CLUSTERED:
found_extent_clustered(ffe_ctl, ins);
break;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Nothing to do */
+ break;
default:
BUG();
}
@@ -3830,6 +3912,9 @@ static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
*/
ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
return 0;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /* Give up here */
+ return -ENOSPC;
default:
BUG();
}
@@ -3998,6 +4083,14 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
case BTRFS_EXTENT_ALLOC_CLUSTERED:
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ }
+ return 0;
default:
BUG();
}
@@ -4040,6 +4133,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool full_search = false;
+ bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
WARN_ON(num_bytes < fs_info->sectorsize);
@@ -4053,6 +4147,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
ffe_ctl.orig_have_caching_bg = false;
ffe_ctl.found_offset = 0;
ffe_ctl.hint_byte = hint_byte_orig;
+ ffe_ctl.for_treelog = for_treelog;
ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
/* For clustered allocation */
@@ -4061,6 +4156,9 @@ static noinline int find_free_extent(struct btrfs_root *root,
ffe_ctl.last_ptr = NULL;
ffe_ctl.use_cluster = true;
+ if (btrfs_is_zoned(fs_info))
+ ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;
+
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -4124,8 +4222,11 @@ search:
struct btrfs_block_group *bg_ret;
/* If the block group is read-only, we can skip it entirely. */
- if (unlikely(block_group->ro))
+ if (unlikely(block_group->ro)) {
+ if (for_treelog)
+ btrfs_clear_treelog_bg(block_group);
continue;
+ }
btrfs_grab_block_group(block_group, delalloc);
ffe_ctl.search_start = block_group->start;
@@ -4203,20 +4304,21 @@ have_block_group:
/* move on to the next group */
if (ffe_ctl.search_start + num_bytes >
block_group->start + block_group->length) {
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- num_bytes);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl.found_offset, num_bytes);
goto loop;
}
if (ffe_ctl.found_offset < ffe_ctl.search_start)
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- ffe_ctl.search_start - ffe_ctl.found_offset);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl.found_offset,
+ ffe_ctl.search_start - ffe_ctl.found_offset);
ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
num_bytes, delalloc);
if (ret == -EAGAIN) {
- btrfs_add_free_space(block_group, ffe_ctl.found_offset,
- num_bytes);
+ btrfs_add_free_space_unused(block_group,
+ ffe_ctl.found_offset, num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
@@ -4310,6 +4412,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
+ bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
flags = get_alloc_profile_by_root(root, is_data);
again:
@@ -4333,8 +4436,8 @@ again:
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
- "allocation failed flags %llu, wanted %llu",
- flags, num_bytes);
+ "allocation failed flags %llu, wanted %llu tree-log %d",
+ flags, num_bytes, for_treelog);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
@@ -4516,7 +4619,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
}
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
btrfs_set_extent_inline_ref_type(leaf, iref,
BTRFS_SHARED_BLOCK_REF_KEY);
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
@@ -4553,7 +4655,6 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins)
{
struct btrfs_ref generic_ref = { 0 };
- int ret;
BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
@@ -4561,9 +4662,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ins->objectid, ins->offset, 0);
btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
- ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
- ram_bytes, NULL, NULL);
- return ret;
+
+ return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}
/*
@@ -4645,6 +4745,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
__btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
+ clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
set_extent_buffer_uptodate(buf);
@@ -4755,8 +4856,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
generic_ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&generic_ref, level, root_objectid);
btrfs_ref_tree_mod(fs_info, &generic_ref);
- ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
- extent_op, NULL, NULL);
+ ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
goto out_free_delayed;
}
@@ -5549,7 +5649,15 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_free;
}
- trans = btrfs_start_transaction(tree_root, 0);
+ /*
+ * Use join to avoid potential EINTR from transaction
+ * start. See wait_reserve_ticket and the whole
+ * reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c9cee458e001..4dfb3ead1175 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -24,6 +24,9 @@
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
+#include "subpage.h"
+#include "zoned.h"
+#include "block-group.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -389,16 +392,16 @@ do_insert:
}
/**
- * __etree_search - searche @tree for an entry that contains @offset. Such
- * entry would have entry->start <= offset && entry->end >= offset.
+ * Search @tree for an entry that contains @offset. Such entry would have
+ * entry->start <= offset && entry->end >= offset.
*
- * @tree - the tree to search
- * @offset - offset that should fall within an entry in @tree
- * @next_ret - pointer to the first entry whose range ends after @offset
- * @prev - pointer to the first entry whose range begins before @offset
- * @p_ret - pointer where new node should be anchored (used when inserting an
- * entry in the tree)
- * @parent_ret - points to entry which would have been the parent of the entry,
+ * @tree: the tree to search
+ * @offset: offset that should fall within an entry in @tree
+ * @next_ret: pointer to the first entry whose range ends after @offset
+ * @prev_ret: pointer to the first entry whose range begins before @offset
+ * @p_ret: pointer where new node should be anchored (used when inserting an
+ * entry in the tree)
+ * @parent_ret: points to entry which would have been the parent of the entry,
* containing @offset
*
* This function returns a pointer to the entry that contains @offset byte
@@ -1588,12 +1591,13 @@ out:
}
/**
- * find_contiguous_extent_bit: find a contiguous area of bits
- * @tree - io tree to check
- * @start - offset to start the search from
- * @start_ret - the first offset we found with the bits set
- * @end_ret - the final contiguous range of the bits that were set
- * @bits - bits to look for
+ * Find a contiguous area of bits
+ *
+ * @tree: io tree to check
+ * @start: offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret: the final contiguous range of the bits that were set
+ * @bits: bits to look for
*
* set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
* to set bits appropriately, and then merge them again. During this time it
@@ -1625,14 +1629,14 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
}
/**
- * find_first_clear_extent_bit - find the first range that has @bits not set.
- * This range could start before @start.
+ * Find the first range that has @bits not set. This range could start before
+ * @start.
*
- * @tree - the tree to search
- * @start - the offset at/after which the found extent should start
- * @start_ret - records the beginning of the range
- * @end_ret - records the end of the range (inclusive)
- * @bits - the set of bits which must be unset
+ * @tree: the tree to search
+ * @start: offset at/after which the found extent should start
+ * @start_ret: records the beginning of the range
+ * @end_ret: records the end of the range (inclusive)
+ * @bits: the set of bits which must be unset
*
* Since unallocated range is also considered one which doesn't have the bits
* set it's possible that @end_ret contains -1, this happens in case the range
@@ -1975,10 +1979,10 @@ static int __process_pages_contig(struct address_space *mapping,
pages_processed++;
continue;
}
- if (page_ops & PAGE_CLEAR_DIRTY)
+ if (page_ops & PAGE_START_WRITEBACK) {
clear_page_dirty_for_io(pages[i]);
- if (page_ops & PAGE_SET_WRITEBACK)
set_page_writeback(pages[i]);
+ }
if (page_ops & PAGE_SET_ERROR)
SetPageError(pages[i]);
if (page_ops & PAGE_END_WRITEBACK)
@@ -2256,6 +2260,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_repair_one_zone(fs_info, logical);
+
bio = btrfs_io_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
@@ -2732,6 +2739,7 @@ static void end_bio_extent_writepage(struct bio *bio)
u64 start;
u64 end;
struct bvec_iter_all iter_all;
+ bool first_bvec = true;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
@@ -2758,6 +2766,11 @@ static void end_bio_extent_writepage(struct bio *bio)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
+ if (first_bvec) {
+ btrfs_record_physical_zoned(inode, start, bio);
+ first_bvec = false;
+ }
+
end_extent_writepage(page, error, start, end);
end_page_writeback(page);
}
@@ -2775,7 +2788,7 @@ struct processed_extent {
struct btrfs_inode *inode;
/* Start of the range in @inode */
u64 start;
- /* End of the range in in @inode */
+ /* End of the range in @inode */
u64 end;
bool uptodate;
};
@@ -2838,15 +2851,38 @@ update:
processed->uptodate = uptodate;
}
-static void endio_readpage_update_page_status(struct page *page, bool uptodate)
+/*
+ * Prepare a locked @page for a read.
+ *
+ * Nothing to do when the sector size equals PAGE_SIZE. Otherwise the page must
+ * already have private data attached, and btrfs_subpage_start_reader() is
+ * called for the whole page range.
+ */
+static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
+ ASSERT(PageLocked(page));
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page));
+ btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+}
+
+/*
+ * Finish reading the range [@start, @start + @len) of @page.
+ *
+ * The range must lie inside the page. When @uptodate is true,
+ * btrfs_page_set_uptodate() is called on the range; otherwise
+ * btrfs_page_clear_uptodate() and btrfs_page_set_error() are called on it.
+ * With sectorsize == PAGE_SIZE the page is unlocked here; otherwise, for data
+ * inodes only, btrfs_subpage_end_reader() is called for the range.
+ */
+static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+
if (uptodate) {
- SetPageUptodate(page);
+ btrfs_page_set_uptodate(fs_info, page, start, len);
} else {
- ClearPageUptodate(page);
- SetPageError(page);
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
}
- unlock_page(page);
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ unlock_page(page);
+ else if (is_data_inode(page->mapping->host))
+ /*
+ * For subpage data, unlock the page if we're the last reader.
+ * For subpage metadata, page lock is not utilized for read.
+ */
+ btrfs_subpage_end_reader(fs_info, page, start, len);
}
/*
@@ -2983,7 +3019,7 @@ readpage_ok:
bio_offset += len;
/* Update page status and unlock */
- endio_readpage_update_page_status(page, uptodate);
+ end_page_read(page, uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
start, end, uptodate);
}
@@ -3058,14 +3094,67 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
return bio;
}
+/**
+ * Try to append a page to an existing bio
+ *
+ * @bio:            bio to extend
+ * @page:           page to add to the bio
+ * @disk_bytenr:    disk offset of the data in @page, checked for contiguity
+ *                  against what @bio already covers
+ * @size:           number of bytes of @page to add
+ * @pg_offset:      starting offset inside @page
+ * @prev_bio_flags: flags @bio was built with
+ * @bio_flags:      flags of the current request, must equal @prev_bio_flags
+ *
+ * The page is rejected when the flags differ, when @disk_bytenr is not
+ * contiguous with @bio (for EXTENT_BIO_COMPRESSED it must match the bio's
+ * start sector, otherwise the bio's end sector), when
+ * btrfs_bio_fits_in_stripe() returns non-zero, or, for REQ_OP_ZONE_APPEND
+ * bios, when btrfs_bio_fits_in_ordered_extent() returns false.
+ *
+ * Return: true if all @size bytes were added to @bio, false otherwise.
+ */
+static bool btrfs_bio_add_page(struct bio *bio, struct page *page,
+			       u64 disk_bytenr, unsigned int size,
+			       unsigned int pg_offset,
+			       unsigned long prev_bio_flags,
+			       unsigned long bio_flags)
+{
+	const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
+	sector_t expected;
+	int added;
+
+	if (prev_bio_flags != bio_flags)
+		return false;
+
+	/* Sector the new page has to start at to count as contiguous */
+	if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
+		expected = bio->bi_iter.bi_sector;
+	else
+		expected = bio_end_sector(bio);
+	if (expected != sector)
+		return false;
+
+	if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags))
+		return false;
+
+	if (bio_op(bio) != REQ_OP_ZONE_APPEND) {
+		added = bio_add_page(bio, page, size, pg_offset);
+		return added == size;
+	}
+
+	/* Zone append: the ordered extent check uses the bio's first page */
+	if (!btrfs_bio_fits_in_ordered_extent(bio_first_bvec_all(bio)->bv_page,
+					      bio, size))
+		return false;
+
+	added = bio_add_zone_append_page(bio, page, size, pg_offset);
+	return added == size;
+}
+
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
* @page: page to add to the bio
+ * @disk_bytenr: logical bytenr where the write will be; also used to check
+ * whether we are adding a page contiguous to the previous one
+ * @size: portion of page that we want to write to
- * @pg_offset: offset of the new bio or to check whether we are adding
- * a contiguous page to the previous one
- * @size: portion of page that we want to write
- * @offset: starting offset in the page
+ * @pg_offset: starting offset in the page
* @bio_ret: must be valid pointer, newly allocated bio will be stored there
* @end_io_func: end_io callback for new bio
* @mirror_num: desired mirror to read/write
@@ -3074,7 +3163,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
*/
static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
- struct page *page, u64 offset,
+ struct page *page, u64 disk_bytenr,
size_t size, unsigned long pg_offset,
struct bio **bio_ret,
bio_end_io_t end_io_func,
@@ -3086,27 +3175,17 @@ static int submit_extent_page(unsigned int opf,
int ret = 0;
struct bio *bio;
size_t io_size = min_t(size_t, size, PAGE_SIZE);
- sector_t sector = offset >> 9;
- struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct extent_io_tree *tree = &inode->io_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
ASSERT(bio_ret);
if (*bio_ret) {
- bool contig;
- bool can_merge = true;
-
bio = *bio_ret;
- if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
- contig = bio->bi_iter.bi_sector == sector;
- else
- contig = bio_end_sector(bio) == sector;
-
- if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags))
- can_merge = false;
-
- if (prev_bio_flags != bio_flags || !contig || !can_merge ||
- force_bio_submit ||
- bio_add_page(bio, page, io_size, pg_offset) < io_size) {
+ if (force_bio_submit ||
+ !btrfs_bio_add_page(bio, page, disk_bytenr, io_size,
+ pg_offset, prev_bio_flags, bio_flags)) {
ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
if (ret < 0) {
*bio_ret = NULL;
@@ -3120,7 +3199,7 @@ static int submit_extent_page(unsigned int opf,
}
}
- bio = btrfs_bio_alloc(offset);
+ bio = btrfs_bio_alloc(disk_bytenr);
bio_add_page(bio, page, io_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
@@ -3129,20 +3208,39 @@ static int submit_extent_page(unsigned int opf,
if (wbc) {
struct block_device *bdev;
- bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
+ bdev = fs_info->fs_devices->latest_bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
wbc_account_cgroup_owner(wbc, page, io_size);
}
+ if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct extent_map *em;
+ struct map_lookup *map;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ map = em->map_lookup;
+ /* We only support single profile for now */
+ ASSERT(map->num_stripes == 1);
+ btrfs_io_bio(bio)->device = map->stripes[0].dev;
+
+ free_extent_map(em);
+ }
*bio_ret = bio;
return ret;
}
-static void attach_extent_buffer_page(struct extent_buffer *eb,
- struct page *page)
+static int attach_extent_buffer_page(struct extent_buffer *eb,
+ struct page *page,
+ struct btrfs_subpage *prealloc)
{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ int ret = 0;
+
/*
* If the page is mapped to btree inode, we should hold the private
* lock to prevent race.
@@ -3152,16 +3250,62 @@ static void attach_extent_buffer_page(struct extent_buffer *eb,
if (page->mapping)
lockdep_assert_held(&page->mapping->private_lock);
- if (!PagePrivate(page))
- attach_page_private(page, eb);
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ if (!PagePrivate(page))
+ attach_page_private(page, eb);
+ else
+ WARN_ON(page->private != (unsigned long)eb);
+ return 0;
+ }
+
+ /* Already mapped, just free prealloc */
+ if (PagePrivate(page)) {
+ btrfs_free_subpage(prealloc);
+ return 0;
+ }
+
+ if (prealloc)
+ /* Has preallocated memory for subpage */
+ attach_page_private(page, prealloc);
else
- WARN_ON(page->private != (unsigned long)eb);
+ /* Do new allocation to attach subpage */
+ ret = btrfs_attach_subpage(fs_info, page,
+ BTRFS_SUBPAGE_METADATA);
+ return ret;
+}
+
+/*
+ * Make sure @page carries btrfs private data.
+ *
+ * A page that already has PagePrivate set is left untouched. Otherwise, when
+ * the sector size is smaller than PAGE_SIZE, btrfs_attach_subpage() is called
+ * with BTRFS_SUBPAGE_DATA; in the regular case the page is tagged with
+ * EXTENT_PAGE_PRIVATE.
+ *
+ * Return 0 on success, or the return value of btrfs_attach_subpage().
+ */
+int set_page_extent_mapped(struct page *page)
+{
+	ASSERT(page->mapping);
+
+	if (!PagePrivate(page)) {
+		struct btrfs_fs_info *fs_info;
+
+		fs_info = btrfs_sb(page->mapping->host->i_sb);
+		if (fs_info->sectorsize < PAGE_SIZE)
+			return btrfs_attach_subpage(fs_info, page,
+						    BTRFS_SUBPAGE_DATA);
+
+		attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+	}
+
+	return 0;
}
-void set_page_extent_mapped(struct page *page)
+/*
+ * Counterpart of set_page_extent_mapped(): drop the btrfs private data
+ * attached to @page, if any.
+ *
+ * A page without PagePrivate is left alone. When the sector size is smaller
+ * than PAGE_SIZE, btrfs_detach_subpage() is called; otherwise
+ * detach_page_private().
+ */
+void clear_page_extent_mapped(struct page *page)
{
+ struct btrfs_fs_info *fs_info;
+
+ ASSERT(page->mapping);
+
if (!PagePrivate(page))
- attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ return;
+
+ fs_info = btrfs_sb(page->mapping->host->i_sb);
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ /* Plain call: a void function must not return an expression */
+ btrfs_detach_subpage(fs_info, page);
+ return;
+ }
+
+ detach_page_private(page);
}
static struct extent_map *
@@ -3202,6 +3346,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 start = page_offset(page);
const u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
@@ -3218,12 +3363,19 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
unsigned long this_bio_flag = 0;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- set_page_extent_mapped(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_extent(tree, start, end);
+ btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
+ unlock_page(page);
+ goto out;
+ }
if (!PageUptodate(page)) {
if (cleancache_get_page(page) == 0) {
BUG_ON(blocksize != PAGE_SIZE);
unlock_extent(tree, start, end);
+ unlock_page(page);
goto out;
}
}
@@ -3240,9 +3392,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
kunmap_atomic(userpage);
}
}
+ begin_page_read(fs_info, page);
while (cur <= end) {
bool force_bio_submit = false;
- u64 offset;
+ u64 disk_bytenr;
if (cur >= last_byte) {
char *userpage;
@@ -3257,13 +3410,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
cur + iosize - 1, &cached);
+ end_page_read(page, true, cur, iosize);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
- SetPageError(page);
unlock_extent(tree, cur, end);
+ end_page_read(page, false, cur, end + 1 - cur);
break;
}
extent_offset = cur - em->start;
@@ -3280,9 +3434,9 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
cur_end = min(extent_map_end(em) - 1, end);
iosize = ALIGN(iosize, blocksize);
if (this_bio_flag & EXTENT_BIO_COMPRESSED)
- offset = em->block_start;
+ disk_bytenr = em->block_start;
else
- offset = em->block_start + extent_offset;
+ disk_bytenr = em->block_start + extent_offset;
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
@@ -3346,6 +3500,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
cur + iosize - 1, &cached);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3355,6 +3510,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1);
+ end_page_read(page, true, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3363,15 +3519,15 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* to date. Error out
*/
if (block_start == EXTENT_MAP_INLINE) {
- SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1);
+ end_page_read(page, false, cur, iosize);
cur = cur + iosize;
pg_offset += iosize;
continue;
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- page, offset, iosize,
+ page, disk_bytenr, iosize,
pg_offset, bio,
end_bio_extent_readpage, 0,
*bio_flags,
@@ -3381,19 +3537,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
nr++;
*bio_flags = this_bio_flag;
} else {
- SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1);
+ end_page_read(page, false, cur, iosize);
goto out;
}
cur = cur + iosize;
pg_offset += iosize;
}
out:
- if (!nr) {
- if (!PageError(page))
- SetPageUptodate(page);
- unlock_page(page);
- }
return ret;
}
@@ -3513,23 +3664,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
unsigned long nr_written,
int *nr_ret)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
- u64 end;
+ u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
u64 extent_offset;
u64 block_start;
- u64 iosize;
struct extent_map *em;
- size_t pg_offset = 0;
- size_t blocksize;
int ret = 0;
int nr = 0;
+ u32 opf = REQ_OP_WRITE;
const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;
- ret = btrfs_writepage_cow_fixup(page, start, page_end);
+ ret = btrfs_writepage_cow_fixup(page, start, end);
if (ret) {
/* Fixup worker will requeue */
redirty_page_for_writepage(wbc, page);
@@ -3544,16 +3693,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
*/
update_nr_written(wbc, nr_written + 1);
- end = page_end;
- blocksize = inode->vfs_inode.i_sb->s_blocksize;
-
while (cur <= end) {
+ u64 disk_bytenr;
u64 em_end;
- u64 offset;
+ u32 iosize;
if (cur >= i_size) {
- btrfs_writepage_endio_finish_ordered(page, cur,
- page_end, 1);
+ btrfs_writepage_endio_finish_ordered(page, cur, end, 1);
break;
}
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
@@ -3565,13 +3711,20 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
extent_offset = cur - em->start;
em_end = extent_map_end(em);
- BUG_ON(em_end <= cur);
- BUG_ON(end < cur);
- iosize = min(em_end - cur, end - cur + 1);
- iosize = ALIGN(iosize, blocksize);
- offset = em->block_start + extent_offset;
+ ASSERT(cur <= em_end);
+ ASSERT(cur < end);
+ ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
+ ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
block_start = em->block_start;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ disk_bytenr = em->block_start + extent_offset;
+
+ /* Note that em_end from extent_map_end() is exclusive */
+ iosize = min(em_end, end + 1) - cur;
+
+ if (btrfs_use_zone_append(inode, em))
+ opf = REQ_OP_ZONE_APPEND;
+
free_extent_map(em);
em = NULL;
@@ -3587,7 +3740,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
btrfs_writepage_endio_finish_ordered(page, cur,
cur + iosize - 1, 1);
cur += iosize;
- pg_offset += iosize;
continue;
}
@@ -3598,9 +3750,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
page->index, cur, end);
}
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- page, offset, iosize, pg_offset,
- &epd->bio,
+ ret = submit_extent_page(opf | write_flags, wbc, page,
+ disk_bytenr, iosize,
+ cur - page_offset(page), &epd->bio,
end_bio_extent_writepage,
0, 0, 0, false);
if (ret) {
@@ -3609,8 +3761,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
end_page_writeback(page);
}
- cur = cur + iosize;
- pg_offset += iosize;
+ cur += iosize;
nr++;
}
*nr_ret = nr;
@@ -3663,7 +3814,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
flush_dcache_page(page);
}
- set_page_extent_mapped(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ SetPageError(page);
+ goto done;
+ }
if (!epd->extent_locked) {
ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
@@ -3923,7 +4078,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
- u64 offset = eb->start;
+ u64 disk_bytenr = eb->start;
u32 nritems;
int i, num_pages;
unsigned long start, end;
@@ -3956,7 +4111,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- p, offset, PAGE_SIZE, 0,
+ p, disk_bytenr, PAGE_SIZE, 0,
&epd->bio,
end_bio_extent_buffer_writepage,
0, 0, 0, false);
@@ -3969,7 +4124,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
ret = -EIO;
break;
}
- offset += PAGE_SIZE;
+ disk_bytenr += PAGE_SIZE;
update_nr_written(wbc, 1);
unlock_page(p);
}
@@ -4010,6 +4165,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
struct extent_buffer **eb_context)
{
struct address_space *mapping = page->mapping;
+ struct btrfs_block_group *cache = NULL;
struct extent_buffer *eb;
int ret;
@@ -4042,13 +4198,31 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
if (!ret)
return 0;
+ if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
+ /*
+ * If for_sync, this hole will be filled with
+ * transaction commit.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
+ ret = -EAGAIN;
+ else
+ ret = 0;
+ free_extent_buffer(eb);
+ return ret;
+ }
+
*eb_context = eb;
ret = lock_extent_buffer_for_io(eb, epd);
if (ret <= 0) {
+ btrfs_revert_meta_write_pointer(cache, eb);
+ if (cache)
+ btrfs_put_block_group(cache);
free_extent_buffer(eb);
return ret;
}
+ if (cache)
+ btrfs_put_block_group(cache);
ret = write_one_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
@@ -4094,6 +4268,7 @@ int btree_write_cache_pages(struct address_space *mapping,
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
+ btrfs_zoned_meta_io_lock(fs_info);
retry:
if (wbc->sync_mode == WB_SYNC_ALL)
tag_pages_for_writeback(mapping, index, end);
@@ -4134,7 +4309,7 @@ retry:
}
if (ret < 0) {
end_write_bio(&epd, ret);
- return ret;
+ goto out;
}
/*
* If something went wrong, don't allow any metadata write bio to be
@@ -4169,14 +4344,17 @@ retry:
ret = -EROFS;
end_write_bio(&epd, ret);
}
+out:
+ btrfs_zoned_meta_io_unlock(fs_info);
return ret;
}
/**
- * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * Walk the list of dirty pages of the given address space and write all of them.
+ *
* @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @data: data passed to __extent_writepage function
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @epd: holds context for the write, namely the bio
*
* If a page is already under I/O, write_cache_pages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
@@ -4975,25 +5153,39 @@ int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-/*
- * Release all pages attached to the extent buffer.
- */
-static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
{
- int i;
- int num_pages;
- int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+ struct btrfs_subpage *subpage;
- BUG_ON(extent_buffer_under_io(eb));
+ lockdep_assert_held(&page->mapping->private_lock);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *page = eb->pages[i];
+ if (PagePrivate(page)) {
+ subpage = (struct btrfs_subpage *)page->private;
+ if (atomic_read(&subpage->eb_refs))
+ return true;
+ }
+ return false;
+}
- if (!page)
- continue;
+static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
+
+ /*
+ * For mapped eb, we're going to change the page private, which should
+ * be done under the private_lock.
+ */
+ if (mapped)
+ spin_lock(&page->mapping->private_lock);
+
+ if (!PagePrivate(page)) {
if (mapped)
- spin_lock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->private_lock);
+ return;
+ }
+
+ if (fs_info->sectorsize == PAGE_SIZE) {
/*
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
@@ -5012,9 +5204,49 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
*/
detach_page_private(page);
}
-
if (mapped)
spin_unlock(&page->mapping->private_lock);
+ return;
+ }
+
+ /*
+ * For subpage, we can have dummy eb with page private. In this case,
+ * we can directly detach the private as such page is only attached to
+ * one dummy eb, no sharing.
+ */
+ if (!mapped) {
+ btrfs_detach_subpage(fs_info, page);
+ return;
+ }
+
+ btrfs_page_dec_eb_refs(fs_info, page);
+
+ /*
+ * We can only detach the page private if there are no other ebs in the
+ * page range.
+ */
+ if (!page_range_has_eb(fs_info, page))
+ btrfs_detach_subpage(fs_info, page);
+
+ spin_unlock(&page->mapping->private_lock);
+}
+
+/* Release all pages attached to the extent buffer */
+static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
+{
+ int i;
+ int num_pages;
+
+ ASSERT(!extent_buffer_under_io(eb));
+
+ num_pages = num_extent_pages(eb);
+ for (i = 0; i < num_pages; i++) {
+ struct page *page = eb->pages[i];
+
+ if (!page)
+ continue;
+
+ detach_extent_buffer_page(eb, page);
/* One for when we allocated the page */
put_page(page);
@@ -5046,6 +5278,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
&fs_info->allocated_ebs);
+ INIT_LIST_HEAD(&eb->release_list);
spin_lock_init(&eb->refs_lock);
atomic_set(&eb->refs, 1);
@@ -5067,21 +5300,32 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
if (new == NULL)
return NULL;
+ /*
+ * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
+ * btrfs_release_extent_buffer() has different behavior for
+ * UNMAPPED subpage extent buffers.
+ */
+ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+
for (i = 0; i < num_pages; i++) {
+ int ret;
+
p = alloc_page(GFP_NOFS);
if (!p) {
btrfs_release_extent_buffer(new);
return NULL;
}
- attach_extent_buffer_page(new, p);
+ ret = attach_extent_buffer_page(new, p, NULL);
+ if (ret < 0) {
+ put_page(p);
+ btrfs_release_extent_buffer(new);
+ return NULL;
+ }
WARN_ON(PageDirty(p));
- SetPageUptodate(p);
new->pages[i] = p;
copy_page(page_address(p), page_address(src->pages[i]));
}
-
- set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
- set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
+ set_extent_buffer_uptodate(new);
return new;
}
@@ -5099,9 +5343,14 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
+ int ret;
+
eb->pages[i] = alloc_page(GFP_NOFS);
if (!eb->pages[i])
goto err;
+ ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
+ if (ret < 0)
+ goto err;
}
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
@@ -5109,8 +5358,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
err:
- for (; i > 0; i--)
+ for (; i > 0; i--) {
+ detach_extent_buffer_page(eb, eb->pages[i - 1]);
__free_page(eb->pages[i - 1]);
+ }
__free_extent_buffer(eb);
return NULL;
}
@@ -5252,6 +5503,38 @@ free_eb:
}
#endif
+static struct extent_buffer *grab_extent_buffer(
+ struct btrfs_fs_info *fs_info, struct page *page)
+{
+ struct extent_buffer *exists;
+
+ /*
+ * For subpage case, we completely rely on radix tree to ensure we
+ * don't try to insert two ebs for the same bytenr. So here we always
+ * return NULL and just continue.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return NULL;
+
+ /* Page not yet attached to an extent buffer */
+ if (!PagePrivate(page))
+ return NULL;
+
+ /*
+ * We could have already allocated an eb for this page and attached one
+ * so lets see if we can get a ref on the existing eb, and if we can we
+ * know it's good and we can just return that one, else we know we can
+ * just overwrite page->private.
+ */
+ exists = (struct extent_buffer *)page->private;
+ if (atomic_inc_not_zero(&exists->refs))
+ return exists;
+
+ WARN_ON(PageDirty(page));
+ detach_page_private(page);
+ return NULL;
+}
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
@@ -5290,36 +5573,58 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++, index++) {
+ struct btrfs_subpage *prealloc = NULL;
+
p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
if (!p) {
exists = ERR_PTR(-ENOMEM);
goto free_eb;
}
- spin_lock(&mapping->private_lock);
- if (PagePrivate(p)) {
- /*
- * We could have already allocated an eb for this page
- * and attached one so lets see if we can get a ref on
- * the existing eb, and if we can we know it's good and
- * we can just return that one, else we know we can just
- * overwrite page->private.
- */
- exists = (struct extent_buffer *)p->private;
- if (atomic_inc_not_zero(&exists->refs)) {
- spin_unlock(&mapping->private_lock);
- unlock_page(p);
- put_page(p);
- mark_extent_buffer_accessed(exists, p);
- goto free_eb;
- }
- exists = NULL;
+ /*
+ * Preallocate page->private for subpage case, so that we won't
+ * allocate memory with private_lock held. The memory will be
+ * freed by attach_extent_buffer_page() or freed manually if
+ * we exit earlier.
+ *
+ * Although we have ensured one subpage eb can only have one
+ * page, this may change in the future for 16K page size
+ * support, so we still preallocate the memory in the loop.
+ */
+ ret = btrfs_alloc_subpage(fs_info, &prealloc,
+ BTRFS_SUBPAGE_METADATA);
+ if (ret < 0) {
+ unlock_page(p);
+ put_page(p);
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
- WARN_ON(PageDirty(p));
- detach_page_private(p);
+ spin_lock(&mapping->private_lock);
+ exists = grab_extent_buffer(fs_info, p);
+ if (exists) {
+ spin_unlock(&mapping->private_lock);
+ unlock_page(p);
+ put_page(p);
+ mark_extent_buffer_accessed(exists, p);
+ btrfs_free_subpage(prealloc);
+ goto free_eb;
}
- attach_extent_buffer_page(eb, p);
+ /* Should not fail, as we have preallocated the memory */
+ ret = attach_extent_buffer_page(eb, p, prealloc);
+ ASSERT(!ret);
+ /*
+ * Record that we have an extra eb under allocation, so that
+ * detach_extent_buffer_page() won't release the page private
+ * when the eb hasn't yet been inserted into radix tree.
+ *
+ * The ref will be dropped when the eb releases the page, in
+ * detach_extent_buffer_page().
+ * Thus the error path needs no special handling.
+ */
+ btrfs_page_inc_eb_refs(fs_info, p);
spin_unlock(&mapping->private_lock);
+
WARN_ON(PageDirty(p));
eb->pages[i] = p;
if (!PageUptodate(p))
@@ -5525,31 +5830,101 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
- int i;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page;
int num_pages;
+ int i;
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (page)
- ClearPageUptodate(page);
+ btrfs_page_clear_uptodate(fs_info, page,
+ eb->start, eb->len);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
- int i;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page;
int num_pages;
+ int i;
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
- SetPageUptodate(page);
+ btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
+ }
+}
+
+static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
+ int mirror_num)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct extent_io_tree *io_tree;
+ struct page *page = eb->pages[0];
+ struct bio *bio = NULL;
+ int ret = 0;
+
+ ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
+ ASSERT(PagePrivate(page));
+ io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
+
+ if (wait == WAIT_NONE) {
+ ret = try_lock_extent(io_tree, eb->start,
+ eb->start + eb->len - 1);
+ if (ret <= 0)
+ return ret;
+ } else {
+ ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = 0;
+ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
+ PageUptodate(page) ||
+ btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
+ set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+ return ret;
+ }
+
+ clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
+ eb->read_mirror = 0;
+ atomic_set(&eb->io_pages, 1);
+ check_buffer_tree_ref(eb);
+ btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
+
+ ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start,
+ eb->len, eb->start - page_offset(page), &bio,
+ end_bio_extent_readpage, mirror_num, 0, 0,
+ true);
+ if (ret) {
+ /*
+ * In the endio function, if we hit something wrong we will
+ * increase the io_pages, so here we need to decrease it for
+ * error path.
+ */
+ atomic_dec(&eb->io_pages);
+ }
+ if (bio) {
+ int tmp;
+
+ tmp = submit_one_bio(bio, mirror_num, 0);
+ if (tmp < 0)
+ return tmp;
}
+ if (ret || wait != WAIT_COMPLETE)
+ return ret;
+
+ wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
+ ret = -EIO;
+ return ret;
}
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
@@ -5568,10 +5943,20 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
+ if (eb->fs_info->sectorsize < PAGE_SIZE)
+ return read_extent_buffer_subpage(eb, wait, mirror_num);
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (wait == WAIT_NONE) {
+ /*
+ * WAIT_NONE is only utilized by readahead. If we can't
+ * acquire the lock atomically it means either the eb
+ * is being read out or under modification.
+ * Either way the eb will be or has been cached,
+ * readahead can exit safely.
+ */
if (!trylock_page(page))
goto unlock_exit;
} else {
@@ -5823,6 +6208,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
char *src = (char *)srcv;
unsigned long i = get_eb_page_index(start);
+ WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
+
if (check_eb_range(eb, start, len))
return;
@@ -6169,13 +6556,115 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
+static struct extent_buffer *get_next_extent_buffer(
+ struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
+{
+ struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
+ struct extent_buffer *found = NULL;
+ u64 page_start = page_offset(page);
+ int ret;
+ int i;
+
+ ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
+ ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
+ lockdep_assert_held(&fs_info->buffer_lock);
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
+ bytenr >> fs_info->sectorsize_bits,
+ PAGE_SIZE / fs_info->nodesize);
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ break;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ break;
+ }
+ }
+ return found;
+}
+
+static int try_release_subpage_extent_buffer(struct page *page)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ u64 cur = page_offset(page);
+ const u64 end = page_offset(page) + PAGE_SIZE;
+ int ret;
+
+ while (cur < end) {
+ struct extent_buffer *eb = NULL;
+
+ /*
+ * Unlike try_release_extent_buffer() which uses page->private
+ * to grab buffer, for subpage case we rely on radix tree, thus
+ * we need to ensure radix tree consistency.
+ *
+ * We also want an atomic snapshot of the radix tree, thus go
+ * with spinlock rather than RCU.
+ */
+ spin_lock(&fs_info->buffer_lock);
+ eb = get_next_extent_buffer(fs_info, page, cur);
+ if (!eb) {
+ /* No more eb in the page range after or at cur */
+ spin_unlock(&fs_info->buffer_lock);
+ break;
+ }
+ cur = eb->start + eb->len;
+
+ /*
+ * The same as try_release_extent_buffer(), to ensure the eb
+ * won't disappear out from under us.
+ */
+ spin_lock(&eb->refs_lock);
+ if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ spin_unlock(&eb->refs_lock);
+ spin_unlock(&fs_info->buffer_lock);
+ break;
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ /*
+ * If tree ref isn't set then we know the ref on this eb is a
+ * real ref, so just return, this eb will likely be freed soon
+ * anyway.
+ */
+ if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+ spin_unlock(&eb->refs_lock);
+ break;
+ }
+
+ /*
+ * Here we don't care about the return value, we will always
+ * check the page private at the end. And
+ * release_extent_buffer() will release the refs_lock.
+ */
+ release_extent_buffer(eb);
+ }
+ /*
+ * Finally check if we have cleared page private: if we have released
+ * all ebs in the page, the page private should be cleared by now.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page))
+ ret = 1;
+ else
+ ret = 0;
+ spin_unlock(&page->mapping->private_lock);
+ return ret;
+
+}
+
int try_release_extent_buffer(struct page *page)
{
struct extent_buffer *eb;
+ if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ return try_release_subpage_extent_buffer(page);
+
/*
- * We need to make sure nobody is attaching this page to an eb right
- * now.
+ * We need to make sure nobody is changing page->private, as we rely on
+ * page->private as the pointer to extent buffer.
*/
spin_lock(&page->mapping->private_lock);
if (!PagePrivate(page)) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 19221095c635..824640cb0ace 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -31,16 +31,17 @@ enum {
EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
+ EXTENT_BUFFER_NO_CHECK,
};
/* these are flags for __process_pages_contig */
#define PAGE_UNLOCK (1 << 0)
-#define PAGE_CLEAR_DIRTY (1 << 1)
-#define PAGE_SET_WRITEBACK (1 << 2)
-#define PAGE_END_WRITEBACK (1 << 3)
-#define PAGE_SET_PRIVATE2 (1 << 4)
-#define PAGE_SET_ERROR (1 << 5)
-#define PAGE_LOCK (1 << 6)
+/* Page starts writeback, clear dirty bit and set writeback bit */
+#define PAGE_START_WRITEBACK (1 << 1)
+#define PAGE_END_WRITEBACK (1 << 2)
+#define PAGE_SET_PRIVATE2 (1 << 3)
+#define PAGE_SET_ERROR (1 << 4)
+#define PAGE_LOCK (1 << 5)
/*
* page->private values. Every page that is controlled by the extent
@@ -93,6 +94,7 @@ struct extent_buffer {
struct rw_semaphore lock;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+ struct list_head release_list;
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
@@ -178,7 +180,8 @@ int btree_write_cache_pages(struct address_space *mapping,
void extent_readahead(struct readahead_control *rac);
int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
-void set_page_extent_mapped(struct page *page);
+int set_page_extent_mapped(struct page *page);
+void clear_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index bd6229fb2b6f..4a8e02f7b6c7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -385,9 +385,12 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
}
/**
- * add_extent_mapping - add new extent map to the extent tree
+ * Add new extent map to the extent tree
+ *
* @tree: tree to insert new map in
* @em: map to insert
+ * @modified: indicate whether the given @em should be added to the
+ * modified list, which indicates the extent needs to be logged
*
* Insert @em into @tree or perform a simple forward/backward merge with
* existing mappings. The extent_map struct passed in will be inserted
@@ -574,12 +577,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
}
/**
- * btrfs_add_extent_mapping - add extent mapping into em_tree
- * @fs_info - used for tracepoint
- * @em_tree - the extent tree into which we want to insert the extent mapping
- * @em_in - extent we are inserting
- * @start - start of the logical range btrfs_get_extent() is requesting
- * @len - length of the logical range btrfs_get_extent() is requesting
+ * Add extent mapping into em_tree
+ *
+ * @fs_info: the filesystem
+ * @em_tree: extent tree into which we want to insert the extent mapping
+ * @em_in: extent we are inserting
+ * @start: start of the logical range btrfs_get_extent() is requesting
+ * @len: length of the logical range btrfs_get_extent() is requesting
*
* Note that @em_in's range may be different from [start, start+len),
* but they must be overlapped.
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6ccfc019ad90..47cd3a6dc635 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -24,8 +24,10 @@
PAGE_SIZE))
/**
- * @inode - the inode we want to update the disk_i_size for
- * @new_i_size - the i_size we want to set to, 0 if we use i_size
+ * Set inode's size according to filesystem options
+ *
+ * @inode: inode we want to update the disk_i_size for
+ * @new_i_size: i_size we want to set to, 0 if we use i_size
*
* With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read()
* returns as it is perfectly fine with a file that has holes without hole file
@@ -62,9 +64,11 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
}
/**
- * @inode - the inode we're modifying
- * @start - the start file offset of the file extent we've inserted
- * @len - the logical length of the file extent item
+ * Mark range within a file as having a new extent inserted
+ *
+ * @inode: inode being modified
+ * @start: start file offset of the file extent we've inserted
+ * @len: logical length of the file extent item
*
* Call when we are inserting a new file extent where there was none before.
* Does not need to call this in the case where we're replacing an existing file
@@ -88,9 +92,11 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
}
/**
- * @inode - the inode we're modifying
- * @start - the start file offset of the file extent we've inserted
- * @len - the logical length of the file extent item
+ * Marks an inode range as not having a backing extent
+ *
+ * @inode: inode being modified
+ * @start: start file offset of the file extent being dropped
+ * @len: logical length of the file extent item
*
* Called when we drop a file extent, for example when we truncate. Doesn't
* need to be called for cases where we're replacing a file extent, like when
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0e41459b8de6..be9e3900cce8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -453,12 +453,11 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
}
/*
- * after copy_from_user, pages need to be dirtied and we need to make
- * sure holes are created between the current EOF and the start of
- * any next extents (if required).
- *
- * this also makes the decision about creating an inline extent vs
- * doing real data extents, marking pages dirty and delalloc as required.
+ * After btrfs_copy_from_user(), update the following things for delalloc:
+ * - Mark newly dirtied pages as DELALLOC in the io tree.
+ * Used to advise which range is to be written back.
+ * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
+ * - Update inode size for past EOF write
*/
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
@@ -1370,6 +1369,12 @@ again:
goto fail;
}
+ err = set_page_extent_mapped(pages[i]);
+ if (err < 0) {
+ faili = i;
+ goto fail;
+ }
+
if (i == 0)
err = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
@@ -1454,23 +1459,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
/*
- * It's possible the pages are dirty right now, but we don't want
- * to clean them yet because copy_from_user may catch a page fault
- * and we might have to fall back to one page at a time. If that
- * happens, we'll unlock these pages and we'd have a window where
- * reclaim could sneak in and drop the once-dirty page on the floor
- * without writing it.
- *
- * We have the pages locked and the extent range locked, so there's
- * no way someone can start IO on any dirty pages in this range.
- *
- * We'll call btrfs_dirty_pages() later on, and that will flip around
- * delalloc bits and dirty the pages as required.
+ * We should be called after prepare_pages() which should have locked
+ * all pages in the range.
*/
- for (i = 0; i < num_pages; i++) {
- set_page_extent_mapped(pages[i]);
+ for (i = 0; i < num_pages; i++)
WARN_ON(!PageLocked(pages[i]));
- }
return ret;
}
@@ -1949,8 +1942,8 @@ relock:
goto buffered;
}
- dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops,
- &btrfs_dio_ops, is_sync_kiocb(iocb));
+ dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+ 0);
btrfs_inode_unlock(inode, ilock_flags);
@@ -1997,9 +1990,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(file));
ssize_t num_written = 0;
const bool sync = iocb->ki_flags & IOCB_DSYNC;
@@ -2008,7 +1999,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* have opened a file as writable, we have to stop this write operation
* to ensure consistency.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
return -EROFS;
if (!(iocb->ki_flags & IOCB_DIRECT) &&
@@ -2016,7 +2007,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
return -EOPNOTSUPP;
if (sync)
- atomic_inc(&BTRFS_I(inode)->sync_writers);
+ atomic_inc(&inode->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT)
num_written = btrfs_direct_write(iocb, from);
@@ -2028,14 +2019,14 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* otherwise subsequent syncs to a file that's been synced in this
* transaction will appear to have already occurred.
*/
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
- spin_unlock(&BTRFS_I(inode)->lock);
+ spin_lock(&inode->lock);
+ inode->last_sub_trans = inode->root->log_transid;
+ spin_unlock(&inode->lock);
if (num_written > 0)
num_written = generic_write_sync(iocb, num_written);
if (sync)
- atomic_dec(&BTRFS_I(inode)->sync_writers);
+ atomic_dec(&inode->sync_writers);
current->backing_dev_info = NULL;
return num_written;
@@ -2177,8 +2168,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* commit waits for their completion, to avoid data loss if we fsync,
* the current transaction commits before the ordered extents complete
* and a power failure happens right after that.
+ *
+ * For a zoned filesystem, if a write IO uses a ZONE_APPEND command, the
+ * logical address recorded in the ordered extent may change. We need
+ * to wait for the IO to complete so that the logical address is stable.
*/
- if (full_sync) {
+ if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
} else {
/*
@@ -2241,6 +2236,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = PTR_ERR(trans);
goto out_release_extents;
}
+ trans->in_fsync = true;
ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
btrfs_release_log_ctx_extents(&ctx);
@@ -3622,8 +3618,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
return 0;
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
- is_sync_kiocb(iocb));
+ ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4d8897879c9c..5400294bd271 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -198,7 +198,7 @@ int create_free_space_inode(struct btrfs_trans_handle *trans,
int ret;
u64 ino;
- ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
+ ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
if (ret < 0)
return ret;
@@ -431,11 +431,22 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
int i;
for (i = 0; i < io_ctl->num_pages; i++) {
+ int ret;
+
page = find_or_create_page(inode->i_mapping, i, mask);
if (!page) {
io_ctl_drop_pages(io_ctl);
return -ENOMEM;
}
+
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ io_ctl_drop_pages(io_ctl);
+ return ret;
+ }
+
io_ctl->pages[i] = page;
if (uptodate && !PageUptodate(page)) {
btrfs_readpage(NULL, page);
@@ -455,10 +466,8 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
}
}
- for (i = 0; i < io_ctl->num_pages; i++) {
+ for (i = 0; i < io_ctl->num_pages; i++)
clear_page_dirty_for_io(io_ctl->pages[i]);
- set_page_extent_mapped(io_ctl->pages[i]);
- }
return 0;
}
@@ -775,8 +784,10 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
while (num_entries) {
e = kmem_cache_zalloc(btrfs_free_space_cachep,
GFP_NOFS);
- if (!e)
+ if (!e) {
+ ret = -ENOMEM;
goto free_cache;
+ }
ret = io_ctl_read_entry(&io_ctl, e, &type);
if (ret) {
@@ -785,6 +796,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
}
if (!e->bytes) {
+ ret = -1;
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
@@ -805,6 +817,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
e->bitmap = kmem_cache_zalloc(
btrfs_free_space_bitmap_cachep, GFP_NOFS);
if (!e->bitmap) {
+ ret = -ENOMEM;
kmem_cache_free(
btrfs_free_space_cachep, e);
goto free_cache;
@@ -1295,11 +1308,14 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
}
/**
- * __btrfs_write_out_cache - write out cached info to an inode
- * @root - the root the inode belongs to
- * @ctl - the free space cache we are going to write out
- * @block_group - the block_group for this cache if it belongs to a block_group
- * @trans - the trans handle
+ * Write out cached info to an inode
+ *
+ * @root: root the inode belongs to
+ * @inode: freespace inode we are writing out
+ * @ctl: free space cache we are going to write out
+ * @block_group: block_group for this cache if it belongs to a block_group
+ * @io_ctl: holds context for the io
+ * @trans: the trans handle
*
* This function writes out a free space cache struct to disk for quick recovery
* on mount. This will return 0 if it was successful in writing the cache out,
@@ -2461,6 +2477,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
int ret = 0;
u64 filter_bytes = bytes;
+ ASSERT(!btrfs_is_zoned(fs_info));
+
info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
if (!info)
return -ENOMEM;
@@ -2518,11 +2536,49 @@ out:
return ret;
}
+/*
+ * Account a freed range of a zoned block group.
+ *
+ * @block_group: block group the range belongs to
+ * @bytenr:      logical start of the range
+ * @size:        length of the range in bytes
+ * @used:        false when the range is handed back without having been used
+ *
+ * The part of the range at or beyond the allocation offset is added to the
+ * free space counter, the rest is added to zone_unusable. When @used is false
+ * the whole range counts as free and alloc_offset is moved back by @size.
+ *
+ * Always returns 0.
+ */
+static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
+					u64 bytenr, u64 size, bool used)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	const u64 offset = bytenr - block_group->start;
+	u64 to_free;
+
+	spin_lock(&ctl->tree_lock);
+	if (!used || offset >= block_group->alloc_offset) {
+		/* Unused range, or range starting at/after the alloc offset */
+		to_free = size;
+	} else if (offset + size <= block_group->alloc_offset) {
+		/* Range ends at or before the alloc offset */
+		to_free = 0;
+	} else {
+		/* Range straddles the alloc offset: only the tail is free */
+		to_free = offset + size - block_group->alloc_offset;
+	}
+
+	ctl->free_space += to_free;
+	block_group->zone_unusable += size - to_free;
+	spin_unlock(&ctl->tree_lock);
+
+	if (!used) {
+		spin_lock(&block_group->lock);
+		block_group->alloc_offset -= size;
+		spin_unlock(&block_group->lock);
+	}
+
+	/* All the region is now unusable. Mark it as unused and reclaim */
+	if (block_group->zone_unusable == block_group->length)
+		btrfs_mark_bg_unused(block_group);
+
+	return 0;
+}
+
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size)
{
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ true);
+
if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
@@ -2531,6 +2587,16 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group,
bytenr, size, trim_state);
}
+/*
+ * Add back a range that was never used.
+ *
+ * On a zoned filesystem the range goes through the zoned accounting helper
+ * with used == false; otherwise this is the same as btrfs_add_free_space().
+ */
+int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
+				u64 bytenr, u64 size)
+{
+	if (!btrfs_is_zoned(block_group->fs_info))
+		return btrfs_add_free_space(block_group, bytenr, size);
+
+	return __btrfs_add_free_space_zoned(block_group, bytenr, size, false);
+}
+
/*
* This is a subtle distinction because when adding free space back in general,
* we want it to be added as untrimmed for async. But in the case where we add
@@ -2541,6 +2607,10 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
{
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ if (btrfs_is_zoned(block_group->fs_info))
+ return __btrfs_add_free_space_zoned(block_group, bytenr, size,
+ true);
+
if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
trim_state = BTRFS_TRIM_STATE_TRIMMED;
@@ -2558,6 +2628,23 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group,
int ret;
bool re_search = false;
+ if (btrfs_is_zoned(block_group->fs_info)) {
+ /*
+ * This can happen with conventional zones when replaying the log.
+ * Since the allocation info of tree-log nodes is not recorded
+ * in the extent tree, calculate_alloc_pointer() fails to
+ * advance the allocation pointer past the last allocated tree-log
+ * node blocks.
+ *
+ * This function is called from
+ * btrfs_pin_extent_for_log_replay() when replaying the log.
+ * Advance the pointer not to overwrite the tree-log nodes.
+ */
+ if (block_group->alloc_offset < offset + bytes)
+ block_group->alloc_offset = offset + bytes;
+ return 0;
+ }
+
spin_lock(&ctl->tree_lock);
again:
@@ -2652,6 +2739,16 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
struct rb_node *n;
int count = 0;
+ /*
+ * Zoned btrfs does not use free space tree and cluster. Just print
+ * out the free space after the allocation offset.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ btrfs_info(fs_info, "free space %llu",
+ block_group->length - block_group->alloc_offset);
+ return;
+ }
+
spin_lock(&ctl->tree_lock);
for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
@@ -2845,6 +2942,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 align_gap_len = 0;
enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
spin_lock(&ctl->tree_lock);
entry = find_free_space(ctl, &offset, &bytes_search,
block_group->full_stripe_len, max_extent_size);
@@ -2976,6 +3075,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct rb_node *node;
u64 ret = 0;
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
spin_lock(&cluster->lock);
if (bytes > cluster->max_size)
goto out;
@@ -3752,6 +3853,8 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
int ret;
u64 rem = 0;
+ ASSERT(!btrfs_is_zoned(block_group->fs_info));
+
*trimmed = 0;
spin_lock(&block_group->lock);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index ecb09a02d544..1f23088d43f9 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -107,6 +107,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
enum btrfs_trim_state trim_state);
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
+int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size);
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index e33a65bd9a0c..a33bca94d133 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1150,6 +1150,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
return PTR_ERR(trans);
set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
free_space_root = btrfs_create_tree(trans,
BTRFS_FREE_SPACE_TREE_OBJECTID);
if (IS_ERR(free_space_root)) {
@@ -1171,11 +1172,18 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ ret = btrfs_commit_transaction(trans);
- return btrfs_commit_transaction(trans);
+ /*
+ * Now that we've committed the transaction any reading of our commit
+ * root will be safe, so we can cache from the free space tree now.
+ */
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
+ return ret;
abort:
clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
+ clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c0b11db98e5e..2e1c282c202d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -50,6 +50,7 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
+#include "zoned.h"
struct btrfs_iget_args {
u64 ino;
@@ -692,8 +693,7 @@ cont:
NULL,
clear_flags,
PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
@@ -917,7 +917,6 @@ retry:
ins.objectid,
async_extent->ram_size,
ins.offset,
- BTRFS_ORDERED_COMPRESSED,
async_extent->compress_type);
if (ret) {
btrfs_drop_extent_cache(inode, async_extent->start,
@@ -934,8 +933,7 @@ retry:
async_extent->start +
async_extent->ram_size - 1,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK);
+ PAGE_UNLOCK | PAGE_START_WRITEBACK);
if (btrfs_submit_compressed_write(inode, async_extent->start,
async_extent->ram_size,
ins.objectid,
@@ -971,9 +969,8 @@ out_free:
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
- PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
- PAGE_SET_ERROR);
+ PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
kfree(async_extent);
goto again;
@@ -1071,8 +1068,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
- PAGE_END_WRITEBACK);
+ PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
@@ -1127,7 +1123,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
free_extent_map(em);
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
- ram_size, cur_alloc_size, 0);
+ ram_size, cur_alloc_size,
+ BTRFS_ORDERED_REGULAR);
if (ret)
goto out_drop_extent_cache;
@@ -1194,8 +1191,7 @@ out_reserve:
out_unlock:
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
- PAGE_END_WRITEBACK;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
* If we reserved an extent for our delalloc range (or a subrange) and
* failed to create the respective ordered extent, then it means that
@@ -1320,9 +1316,8 @@ static int cow_file_range_async(struct btrfs_inode *inode,
unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING;
- unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
- PAGE_SET_ERROR;
+ unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR;
extent_clear_unlock_delalloc(inode, start, end, locked_page,
clear_bits, page_ops);
@@ -1399,6 +1394,29 @@ static int cow_file_range_async(struct btrfs_inode *inode,
return 0;
}
+/*
+ * Delalloc handler used by btrfs_run_delalloc_range() on zoned filesystems.
+ *
+ * Runs cow_file_range() with its last argument set to 0. If that succeeds and
+ * *page_started is still clear, @locked_page is redirtied and the range is
+ * written out with extent_write_locked_range() in WB_SYNC_ALL mode, after
+ * which *page_started is set to 1.
+ *
+ * Returns the error from cow_file_range(), or 0.
+ */
+static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
+				       struct page *locked_page, u64 start,
+				       u64 end, int *page_started,
+				       unsigned long *nr_written)
+{
+	const int ret = cow_file_range(inode, locked_page, start, end,
+				       page_started, nr_written, 0);
+
+	/* Either an error, or nothing more to do here (ret is 0 then) */
+	if (ret || *page_started)
+		return ret;
+
+	__set_page_dirty_nobuffers(locked_page);
+	account_page_redirty(locked_page);
+	extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+	*page_started = 1;
+
+	return 0;
+}
+
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes)
{
@@ -1519,8 +1537,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
return -ENOMEM;
}
@@ -1842,8 +1859,7 @@ error:
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK |
+ PAGE_START_WRITEBACK |
PAGE_END_WRITEBACK);
btrfs_free_path(path);
return ret;
@@ -1878,17 +1894,24 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
{
int ret;
int force_cow = need_force_cow(inode, start, end);
+ const bool zoned = btrfs_is_zoned(inode->root->fs_info);
if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
+ ASSERT(!zoned);
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
} else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+ ASSERT(!zoned);
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
} else if (!inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
- ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ if (zoned)
+ ret = run_delalloc_zoned(inode, locked_page, start, end,
+ page_started, nr_written);
+ else
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1);
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
ret = cow_file_range_async(inode, wbc, locked_page, start, end,
@@ -2183,9 +2206,10 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 logical = bio->bi_iter.bi_sector << 9;
+ struct extent_map *em;
u64 length = 0;
u64 map_length;
- int ret;
+ int ret = 0;
struct btrfs_io_geometry geom;
if (bio_flags & EXTENT_BIO_COMPRESSED)
@@ -2193,14 +2217,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
length = bio->bi_iter.bi_size;
map_length = length;
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
- &geom);
+ em = btrfs_get_chunk_map(fs_info, logical, map_length);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical,
+ map_length, &geom);
if (ret < 0)
- return ret;
+ goto out;
if (geom.len < length + size)
- return 1;
- return 0;
+ ret = 1;
+out:
+ free_extent_map(em);
+ return ret;
}
/*
@@ -2217,6 +2246,119 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
+/*
+ * Check whether adding @size more bytes to a zone append @bio keeps it inside
+ * the ordered extent found at the file offset of @page.
+ *
+ * Returns true if no ordered extent exists there yet, or if the on-disk end
+ * of the grown bio does not exceed the on-disk end of the ordered extent.
+ */
+bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
+				      unsigned int size)
+{
+	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_ordered_extent *ordered;
+	u64 len = bio->bi_iter.bi_size + size;
+	bool fits;
+
+	ASSERT(btrfs_is_zoned(fs_info));
+	ASSERT(fs_info->max_zone_append_size > 0);
+	ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND);
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
+	/* Ordered extent not yet created, so we're good */
+	if (!ordered)
+		return true;
+
+	fits = (bio->bi_iter.bi_sector << SECTOR_SHIFT) + len <=
+	       ordered->disk_bytenr + ordered->disk_num_bytes;
+
+	btrfs_put_ordered_extent(ordered);
+
+	return fits;
+}
+
+/*
+ * Split the ordered extent found at @file_offset around the on-disk range of
+ * @bio and create an extent map for the bio's range.
+ *
+ * Nothing is done when the bio length already equals the ordered extent's
+ * disk_num_bytes. Returns BLK_STS_IOERR if no ordered extent is found,
+ * otherwise the blk_status_t translation of 0 or of the errno hit on the way.
+ */
+static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
+					   struct bio *bio, loff_t file_offset)
+{
+	struct extent_map_tree *em_tree = &inode->extent_tree;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_map *em = NULL;
+	struct extent_map *em_new;
+	const u64 bio_start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	const u64 bio_len = bio->bi_iter.bi_size;
+	const u64 bio_end = bio_start + bio_len;
+	u64 ordered_end;
+	u64 pre, post;
+	int ret = 0;
+
+	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+	if (WARN_ON_ONCE(!ordered))
+		return BLK_STS_IOERR;
+
+	/* No need to split */
+	if (ordered->disk_num_bytes == bio_len)
+		goto out;
+
+	/* We cannot split an ordered extent that was already end_bio'd */
+	if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
+		goto invalid;
+
+	/* We cannot split a compressed ordered extent */
+	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
+		goto invalid;
+
+	ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
+	/* bio must be in one ordered extent */
+	if (WARN_ON_ONCE(bio_start < ordered->disk_bytenr ||
+			 bio_end > ordered_end))
+		goto invalid;
+
+	/* Checksum list should be empty */
+	if (WARN_ON_ONCE(!list_empty(&ordered->list)))
+		goto invalid;
+
+	/* Bytes of the ordered extent before and after the bio's range */
+	pre = bio_start - ordered->disk_bytenr;
+	post = ordered_end - bio_end;
+
+	ret = btrfs_split_ordered_extent(ordered, pre, post);
+	if (ret)
+		goto out;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, ordered->file_offset, bio_len);
+	read_unlock(&em_tree->lock);
+	if (!em) {
+		ret = -EIO;
+		goto out;
+	}
+
+	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+	/*
+	 * Create a new extent map covering just the bio's range, as
+	 * unpin_extent_cache() expects the start of the extent map to be the
+	 * logical offset of the file, which does not hold true anymore after
+	 * splitting.
+	 */
+	em_new = create_io_em(inode, em->start + pre, bio_len,
+			      em->start + pre, em->block_start + pre, bio_len,
+			      bio_len, bio_len, BTRFS_COMPRESS_NONE,
+			      BTRFS_ORDERED_REGULAR);
+	if (IS_ERR(em_new)) {
+		ret = PTR_ERR(em_new);
+		goto out;
+	}
+	free_extent_map(em_new);
+	goto out;
+
+invalid:
+	ret = -EINVAL;
+out:
+	free_extent_map(em);
+	btrfs_put_ordered_extent(ordered);
+
+	return errno_to_blk_status(ret);
+}
+
/*
* extent_io.c submission hook. This does the right thing for csum calculation
* on write, or reading the csums from the tree before a read.
@@ -2252,7 +2394,16 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
if (btrfs_is_free_space_inode(BTRFS_I(inode)))
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
- if (bio_op(bio) != REQ_OP_WRITE) {
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct page *page = bio_first_bvec_all(bio)->bv_page;
+ loff_t file_offset = page_offset(page);
+
+ ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+ if (ret)
+ goto out;
+ }
+
+ if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
if (ret)
goto out;
@@ -2754,6 +2905,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
+ if (ordered_extent->disk)
+ btrfs_rewrite_logical_zoned(ordered_extent);
+
btrfs_free_io_failure_record(inode, start, end);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
@@ -3103,14 +3257,16 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
}
/**
- * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
- * @fs_info - the fs_info for this fs
- * @return - EINTR if we were killed, 0 if nothing's pending
+ * Wait for flushing all delayed iputs
+ *
+ * @fs_info: the filesystem
*
* This will wait on any delayed iputs that are currently running with KILLABLE
* set. Once they are all done running we will return, unless we are killed in
* which case we return EINTR. This helps in user operations like fallocate etc
* that might get blocked on the iputs.
+ *
+ * Return EINTR if we were killed, 0 if nothing's pending
*/
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
@@ -4720,6 +4876,9 @@ again:
ret = -ENOMEM;
goto out;
}
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
if (!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
@@ -4737,7 +4896,6 @@ again:
wait_on_page_writeback(page);
lock_extent_bits(io_tree, block_start, block_end, &cached_state);
- set_page_extent_mapped(page);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
@@ -5011,6 +5169,15 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_end_transaction(trans);
} else {
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
+ if (btrfs_is_zoned(fs_info)) {
+ ret = btrfs_wait_ordered_range(inode,
+ ALIGN(newsize, fs_info->sectorsize),
+ (u64)-1);
+ if (ret)
+ return ret;
+ }
/*
* We're truncating a file that used to have good data down to
@@ -6373,7 +6540,7 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -6437,7 +6604,7 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -6582,7 +6749,7 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_fail;
@@ -7106,9 +7273,6 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
* @strict: if true, omit optimizations that might force us into unnecessary
* cow. e.g., don't trust generation number.
*
- * This function will flush ordered extents in the range to ensure proper
- * nocow checks for (nowait == false) case.
- *
* Return:
* >0 and update @len if we can do nocow write
* 0 if we can't do nocow write
@@ -7616,6 +7780,9 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->bdev = fs_info->fs_devices->latest_bdev;
iomap->length = len;
+ if (write && btrfs_use_zone_append(BTRFS_I(inode), em))
+ iomap->flags |= IOMAP_F_ZONE_APPEND;
+
free_extent_map(em);
return 0;
@@ -7685,7 +7852,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
if (!refcount_dec_and_test(&dip->refs))
return;
- if (bio_op(dip->dio_bio) == REQ_OP_WRITE) {
+ if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
dip->logical_offset,
dip->bytes,
@@ -7800,10 +7967,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
NULL);
btrfs_queue_work(wq, &ordered->work);
}
- /*
- * If btrfs_dec_test_ordered_pending does not find any ordered
- * extent in the range, we can exit.
- */
+
+ /* No ordered extent found in the range, exit */
if (ordered_offset == last_offset)
return;
/*
@@ -7844,6 +8009,8 @@ static void btrfs_end_dio_bio(struct bio *bio)
if (err)
dip->dio_bio->bi_status = err;
+ btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+
bio_put(bio);
btrfs_dio_private_put(dip);
}
@@ -7853,7 +8020,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
- bool write = bio_op(bio) == REQ_OP_WRITE;
+ bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
blk_status_t ret;
/* Check btrfs_submit_bio_hook() for rules about async submit. */
@@ -7903,7 +8070,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
struct inode *inode,
loff_t file_offset)
{
- const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
size_t dip_size;
struct btrfs_dio_private *dip;
@@ -7933,7 +8100,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
struct bio *dio_bio, loff_t file_offset)
{
- const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
BTRFS_BLOCK_GROUP_RAID56_MASK);
@@ -7944,10 +8111,12 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
u64 submit_len;
int clone_offset = 0;
int clone_len;
+ u64 logical;
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
struct btrfs_dio_data *dio_data = iomap->private;
+ struct extent_map *em = NULL;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
@@ -7976,12 +8145,18 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
submit_len = dio_bio->bi_iter.bi_size;
do {
- ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio),
- start_sector << 9, submit_len,
- &geom);
+ logical = start_sector << 9;
+ em = btrfs_get_chunk_map(fs_info, logical, submit_len);
+ if (IS_ERR(em)) {
+ status = errno_to_blk_status(PTR_ERR(em));
+ em = NULL;
+ goto out_err_em;
+ }
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
+ logical, submit_len, &geom);
if (ret) {
status = errno_to_blk_status(ret);
- goto out_err;
+ goto out_err_em;
}
ASSERT(geom.len <= INT_MAX);
@@ -7996,6 +8171,19 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
+ WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) &&
+ fs_info->max_zone_append_size &&
+ bio_op(bio) != REQ_OP_ZONE_APPEND);
+
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+			status = extract_ordered_extent(BTRFS_I(inode), bio,
+							file_offset);
+			if (status) {
+				bio_put(bio);
+				/*
+				 * The chunk map looked up for this iteration
+				 * is still held, so leave through the label
+				 * that drops it instead of leaking it.
+				 */
+				goto out_err_em;
+			}
+		}
+
ASSERT(submit_len >= clone_len);
submit_len -= clone_len;
@@ -8026,19 +8214,24 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
bio_put(bio);
if (submit_len > 0)
refcount_dec(&dip->refs);
- goto out_err;
+ goto out_err_em;
}
dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
+
+ free_extent_map(em);
} while (submit_len > 0);
return BLK_QC_T_NONE;
+out_err_em:
+ free_extent_map(em);
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
+
return BLK_QC_T_NONE;
}
@@ -8120,7 +8313,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
int ret = try_release_extent_mapping(page, gfp_flags);
if (ret == 1)
- detach_page_private(page);
+ clear_page_extent_mapped(page);
return ret;
}
@@ -8189,8 +8382,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
-again:
+
start = page_start;
+again:
ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordered) {
found_ordered = true;
@@ -8279,7 +8473,7 @@ again:
}
ClearPageChecked(page);
- detach_page_private(page);
+ clear_page_extent_mapped(page);
}
/*
@@ -8358,7 +8552,12 @@ again:
wait_on_page_writeback(page);
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
- set_page_extent_mapped(page);
+ ret2 = set_page_extent_mapped(page);
+ if (ret2 < 0) {
+ ret = vmf_error(ret2);
+ unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ goto out_unlock;
+ }
/*
* we can't set the delalloc bits if there are pending ordered
@@ -8595,15 +8794,18 @@ out:
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
- struct btrfs_root *parent_root,
- u64 new_dirid)
+ struct btrfs_root *parent_root)
{
struct inode *inode;
int err;
u64 index = 0;
+ u64 ino;
+
+ err = btrfs_get_free_objectid(new_root, &ino);
+ if (err < 0)
+ return err;
- inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
- new_dirid, new_dirid,
+ inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
S_IFDIR | (~current_umask() & S_IRWXUGO),
&index);
if (IS_ERR(inode))
@@ -9083,7 +9285,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
u64 objectid;
u64 index;
- ret = btrfs_find_free_objectid(root, &objectid);
+ ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
return ret;
@@ -9490,11 +9692,11 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
return start_delalloc_inodes(root, &wbc, true, false);
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context)
{
struct writeback_control wbc = {
- .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr,
+ .nr_to_write = nr,
.sync_mode = WB_SYNC_NONE,
.range_start = 0,
.range_end = LLONG_MAX,
@@ -9511,12 +9713,12 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
- while (!list_empty(&splice) && nr) {
+ while (!list_empty(&splice)) {
/*
* Reset nr_to_write here so we know that we're doing a full
* flush.
*/
- if (nr == U64_MAX)
+ if (nr == LONG_MAX)
wbc.nr_to_write = LONG_MAX;
root = list_first_entry(&splice, struct btrfs_root,
@@ -9579,7 +9781,7 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -9915,7 +10117,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_find_free_objectid(root, &objectid);
+ ret = btrfs_get_free_objectid(root, &objectid);
if (ret)
goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 56f53d692fa2..072e77726e94 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -528,6 +528,14 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
return -EPERM;
/*
+ * btrfs_trim_block_group() depends on space cache, which is not
+ * available in zoned filesystem. So, disallow fitrim on a zoned
+ * filesystem for now.
+ */
+ if (btrfs_is_zoned(fs_info))
+ return -EOPNOTSUPP;
+
+ /*
* If the fs is mounted with nologreplay, which requires it to be
* mounted in RO mode as well, we can not allow discard on free space
* inside block groups, because log trees refer to extents that are not
@@ -606,14 +614,13 @@ static noinline int create_subvol(struct inode *dir,
int err;
dev_t anon_dev = 0;
u64 objectid;
- u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
return -ENOMEM;
- ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
+ ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
goto fail_free;
@@ -693,7 +700,7 @@ static noinline int create_subvol(struct inode *dir,
free_extent_buffer(leaf);
leaf = NULL;
- btrfs_set_root_dirid(root_item, new_dirid);
+ btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
key.objectid = objectid;
key.offset = 0;
@@ -716,7 +723,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_record_root_in_trans(trans, new_root);
- ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
+ ret = btrfs_create_subvol_root(trans, new_root, root);
btrfs_put_root(new_root);
if (ret) {
/* We potentially lose an unused inode item here */
@@ -724,10 +731,6 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
- mutex_lock(&new_root->objectid_mutex);
- new_root->highest_objectid = new_dirid;
- mutex_unlock(&new_root->objectid_mutex);
-
/*
* insert the directory item
*/
@@ -1320,6 +1323,13 @@ again:
if (!page)
break;
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ break;
+ }
+
page_start = page_offset(page);
page_end = page_start + PAGE_SIZE - 1;
while (1) {
@@ -1441,7 +1451,6 @@ again:
for (i = 0; i < i_done; i++) {
clear_page_dirty_for_io(pages[i]);
ClearPageChecked(pages[i]);
- set_page_extent_mapped(pages[i]);
set_page_dirty(pages[i]);
unlock_page(pages[i]);
put_page(pages[i]);
@@ -4954,7 +4963,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
+ ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 79d366a36223..985a21558437 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -199,14 +199,21 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = ret;
- if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
- set_bit(type, &entry->flags);
+ entry->physical = (u64)-1;
+ entry->disk = NULL;
+ entry->partno = (u8)-1;
- if (dio) {
- percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes,
- fs_info->delalloc_batch);
+ ASSERT(type == BTRFS_ORDERED_REGULAR ||
+ type == BTRFS_ORDERED_NOCOW ||
+ type == BTRFS_ORDERED_PREALLOC ||
+ type == BTRFS_ORDERED_COMPRESSED);
+ set_bit(type, &entry->flags);
+
+ percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
+ fs_info->delalloc_batch);
+
+ if (dio)
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
- }
/* one ref for the tree */
refcount_set(&entry->refs, 1);
@@ -256,6 +263,9 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type)
{
+ ASSERT(type == BTRFS_ORDERED_REGULAR ||
+ type == BTRFS_ORDERED_NOCOW ||
+ type == BTRFS_ORDERED_PREALLOC);
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
num_bytes, disk_num_bytes, type, 0,
BTRFS_COMPRESS_NONE);
@@ -265,6 +275,9 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
u64 disk_num_bytes, int type)
{
+ ASSERT(type == BTRFS_ORDERED_REGULAR ||
+ type == BTRFS_ORDERED_NOCOW ||
+ type == BTRFS_ORDERED_PREALLOC);
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
num_bytes, disk_num_bytes, type, 1,
BTRFS_COMPRESS_NONE);
@@ -272,11 +285,12 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type,
- int compress_type)
+ u64 disk_num_bytes, int compress_type)
{
+ ASSERT(compress_type != BTRFS_COMPRESS_NONE);
return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 0,
+ num_bytes, disk_num_bytes,
+ BTRFS_ORDERED_COMPRESSED, 0,
compress_type);
}
@@ -297,26 +311,33 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
}
/*
- * this is used to account for finished IO across a given range
- * of the file. The IO may span ordered extents. If
- * a given ordered_extent is completely done, 1 is returned, otherwise
- * 0.
+ * Finish IO for one ordered extent across a given range. The range can
+ * contain several ordered extents.
+ *
+ * @finished_ret: Return the finished ordered extent
+ * @file_offset: File offset for the finished IO
+ * Will also be updated to one byte past the range that is
+ * recorded as finished. This allows the caller to walk forward.
+ * @io_size: Length of the finish IO range
+ * @uptodate: If the IO finished without problem
*
- * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
- * to make sure this function only returns 1 once for a given ordered extent.
+ * Return true if any ordered extent is finished in the range, and update
+ * @found_ret and @file_offset.
+ * Return false otherwise.
*
- * file_offset is updated to one byte past the range that is recorded as
- * complete. This allows you to walk forward in the file.
+ * NOTE: Although the range can cross multiple ordered extents, only one
+ * ordered extent will be updated during one call. The caller is responsible for
+ * iterating over all ordered extents in the range.
*/
-int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **cached,
+bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **finished_ret,
u64 *file_offset, u64 io_size, int uptodate)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- int ret;
+ bool finished = false;
unsigned long flags;
u64 dec_end;
u64 dec_start;
@@ -324,16 +345,12 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, *file_offset);
- if (!node) {
- ret = 1;
+ if (!node)
goto out;
- }
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!offset_in_entry(entry, *file_offset)) {
- ret = 1;
+ if (!offset_in_entry(entry, *file_offset))
goto out;
- }
dec_start = max(*file_offset, entry->file_offset);
dec_end = min(*file_offset + io_size,
@@ -354,39 +371,50 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
if (entry->bytes_left == 0) {
- ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Ensure only one caller can set the flag and finished_ret
+ * accordingly
+ */
+ finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
/* test_and_set_bit implies a barrier */
cond_wake_up_nomb(&entry->wait);
- } else {
- ret = 1;
}
out:
- if (!ret && cached && entry) {
- *cached = entry;
+ if (finished && finished_ret && entry) {
+ *finished_ret = entry;
refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
- return ret == 0;
+ return finished;
}
/*
- * this is used to account for finished IO across a given range
- * of the file. The IO should not span ordered extents. If
- * a given ordered_extent is completely done, 1 is returned, otherwise
- * 0.
+ * Finish IO for one ordered extent across a given range. The range can only
+ * contain one ordered extent.
+ *
+ * @cached: The cached ordered extent. If not NULL, we can skip the tree
+ * search and use the ordered extent directly.
+ * Will be also used to store the finished ordered extent.
+ * @file_offset: File offset for the finished IO
+ * @io_size: Length of the finish IO range
+ * @uptodate: If the IO finishes without problem
*
- * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
- * to make sure this function only returns 1 once for a given ordered extent.
+ * Return true if the ordered extent is finished in the range, and update
+ * @cached.
+ * Return false otherwise.
+ *
+ * NOTE: The range can NOT cross multiple ordered extents.
+ * Thus caller should ensure the range doesn't cross ordered extents.
*/
-int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size, int uptodate)
+bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size, int uptodate)
{
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
- int ret;
+ bool finished = false;
spin_lock_irqsave(&tree->lock, flags);
if (cached && *cached) {
@@ -395,41 +423,39 @@ int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
}
node = tree_search(tree, file_offset);
- if (!node) {
- ret = 1;
+ if (!node)
goto out;
- }
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
- if (!offset_in_entry(entry, file_offset)) {
- ret = 1;
+ if (!offset_in_entry(entry, file_offset))
goto out;
- }
- if (io_size > entry->bytes_left) {
+ if (io_size > entry->bytes_left)
btrfs_crit(inode->root->fs_info,
"bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
- }
+
entry->bytes_left -= io_size;
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
if (entry->bytes_left == 0) {
- ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Ensure only one caller can set the flag and update @cached
+ * accordingly
+ */
+ finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
/* test_and_set_bit implies a barrier */
cond_wake_up_nomb(&entry->wait);
- } else {
- ret = 1;
}
out:
- if (!ret && cached && entry) {
+ if (finished && cached && entry) {
*cached = entry;
refcount_inc(&entry->refs);
}
spin_unlock_irqrestore(&tree->lock, flags);
- return ret == 0;
+ return finished;
}
/*
@@ -480,9 +506,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
false);
- if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes,
- fs_info->delalloc_batch);
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
+ fs_info->delalloc_batch);
tree = &btrfs_inode->ordered_tree;
spin_lock_irq(&tree->lock);
@@ -745,9 +770,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
+ unsigned long flags;
tree = &inode->ordered_tree;
- spin_lock_irq(&tree->lock);
+ spin_lock_irqsave(&tree->lock, flags);
node = tree_search(tree, file_offset);
if (!node)
goto out;
@@ -758,7 +784,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
if (entry)
refcount_inc(&entry->refs);
out:
- spin_unlock_irq(&tree->lock);
+ spin_unlock_irqrestore(&tree->lock, flags);
return entry;
}
@@ -898,6 +924,84 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
}
}
+/*
+ * Add a new ordered extent that covers a sub-range derived from @ordered.
+ *
+ * @ordered: Ordered extent whose inode, flags and compression type are reused
+ * @pos:     Byte offset added to both the file offset and the disk bytenr of
+ *           @ordered to get the start of the new extent
+ * @len:     Length of the new extent, used as both num_bytes and
+ *           disk_num_bytes
+ *
+ * The type handed to the add helper is the lowest bit set in @ordered->flags
+ * once BTRFS_ORDERED_DIRECT has been masked out (0 if no bit is left).  More
+ * than one remaining bit triggers a one-time warning.
+ *
+ * Return the result of the btrfs_add_ordered_extent*() helper that was called.
+ */
+static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
+				u64 len)
+{
+	struct btrfs_inode *binode = BTRFS_I(ordered->inode);
+	const u64 new_file_offset = ordered->file_offset + pos;
+	const u64 new_disk_bytenr = ordered->disk_bytenr + pos;
+	unsigned long type_bits = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT);
+	int type = 0;
+
+	WARN_ON_ONCE(hweight_long(type_bits) > 1);
+	if (type_bits)
+		type = __ffs(type_bits);
+
+	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
+		/* Cloning a compressed ordered extent always warns (once). */
+		WARN_ON_ONCE(1);
+		return btrfs_add_ordered_extent_compress(binode,
+				new_file_offset, new_disk_bytenr, len, len,
+				ordered->compress_type);
+	}
+
+	if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
+		return btrfs_add_ordered_extent_dio(binode, new_file_offset,
+				new_disk_bytenr, len, len, type);
+
+	return btrfs_add_ordered_extent(binode, new_file_offset,
+			new_disk_bytenr, len, len, type);
+}
+
+/*
+ * Shrink @ordered in place and add new ordered extents for the cut-off ends.
+ *
+ * @ordered: The ordered extent to shrink
+ * @pre:     Number of bytes to cut off the front of @ordered
+ * @post:    Number of bytes to cut off the tail of @ordered
+ *
+ * Under the tree lock, @ordered is erased from the inode's ordered tree, its
+ * file_offset/disk_bytenr are advanced by @pre, its num_bytes, disk_num_bytes
+ * and bytes_left are reduced by @pre + @post, and it is inserted again.  After
+ * the lock is dropped, one new ordered extent is added for each non-zero
+ * @pre and @post through clone_ordered_extent().
+ *
+ * Return 0 on success, or the error of the first clone_ordered_extent() call
+ * that failed; the @post clone is not attempted if the @pre clone failed.
+ *
+ * NOTE(review): clone_ordered_extent() adds its pos argument to
+ * ordered->file_offset and ordered->disk_bytenr, which have already been
+ * advanced by @pre when it is called here - confirm the resulting ranges are
+ * the intended ones for a non-zero @pre.
+ */
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
+				u64 post)
+{
+	struct inode *inode = ordered->inode;
+	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+	struct rb_node *node;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	int ret = 0;
+
+	spin_lock_irq(&tree->lock);
+	/* Remove from tree once */
+	node = &ordered->rb_node;
+	rb_erase(node, &tree->tree);
+	RB_CLEAR_NODE(node);
+	/* Do not leave the tree's cached node pointing at the erased node */
+	if (tree->last == node)
+		tree->last = NULL;
+
+	ordered->file_offset += pre;
+	ordered->disk_bytenr += pre;
+	ordered->num_bytes -= (pre + post);
+	ordered->disk_num_bytes -= (pre + post);
+	ordered->bytes_left -= (pre + post);
+
+	/* Re-insert the node */
+	node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
+	if (node)
+		btrfs_panic(fs_info, -EEXIST,
+			"zoned: inconsistency in ordered tree at offset %llu",
+			ordered->file_offset);
+
+	spin_unlock_irq(&tree->lock);
+
+	if (pre)
+		ret = clone_ordered_extent(ordered, 0, pre);
+	/* Keep an error from the pre clone instead of overwriting it */
+	if (!ret && post)
+		ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
+					   post);
+
+	return ret;
+}
+
int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 0bfa82b58e23..99e0853e4d3b 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -27,7 +27,7 @@ struct btrfs_ordered_sum {
};
/*
- * bits for the flags field:
+ * Bits for btrfs_ordered_extent::flags.
*
* BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
* It is used to make sure metadata is inserted into the tree only once
@@ -38,24 +38,36 @@ struct btrfs_ordered_sum {
* IO is done and any metadata is inserted into the tree.
*/
enum {
+ /*
+ * Different types for ordered extents, one and only one of the 4 types
+ * must be set when creating an ordered extent.
+ *
+ * REGULAR: For regular non-compressed COW write
+ * NOCOW: For NOCOW write into existing non-hole extent
+ * PREALLOC: For NOCOW write into preallocated extent
+ * COMPRESSED: For compressed COW write
+ */
+ BTRFS_ORDERED_REGULAR,
+ BTRFS_ORDERED_NOCOW,
+ BTRFS_ORDERED_PREALLOC,
+ BTRFS_ORDERED_COMPRESSED,
+
+ /*
+ * Extra bit for direct io, can only be set for
+ * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent.
+ */
+ BTRFS_ORDERED_DIRECT,
+
+ /* Extra status bits for ordered extents */
+
/* set when all the pages are written */
BTRFS_ORDERED_IO_DONE,
/* set when removed from the tree */
BTRFS_ORDERED_COMPLETE,
- /* set when we want to write in place */
- BTRFS_ORDERED_NOCOW,
- /* writing a zlib compressed extent */
- BTRFS_ORDERED_COMPRESSED,
- /* set when writing to preallocated extent */
- BTRFS_ORDERED_PREALLOC,
- /* set when we're doing DIO with this extent */
- BTRFS_ORDERED_DIRECT,
/* We had an io error when writing this out */
BTRFS_ORDERED_IOERR,
/* Set when we have to truncate an extent */
BTRFS_ORDERED_TRUNCATED,
- /* Regular IO for COW */
- BTRFS_ORDERED_REGULAR,
/* Used during fsync to track already logged extents */
BTRFS_ORDERED_LOGGED,
/* We have already logged all the csums of the ordered extent */
@@ -127,6 +139,14 @@ struct btrfs_ordered_extent {
struct completion completion;
struct btrfs_work flush_work;
struct list_head work_list;
+
+ /*
+ * Used to reverse-map physical address returned from ZONE_APPEND write
+ * command in a workqueue context
+ */
+ u64 physical;
+ struct gendisk *disk;
+ u8 partno;
};
/*
@@ -152,11 +172,11 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **cached,
- u64 file_offset, u64 io_size, int uptodate);
-int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **cached,
+bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **cached,
+ u64 file_offset, u64 io_size, int uptodate);
+bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent **finished_ret,
u64 *file_offset, u64 io_size,
int uptodate);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
@@ -167,8 +187,7 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
u64 disk_num_bytes, int type);
int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type,
- int compress_type);
+ u64 disk_num_bytes, int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
@@ -190,6 +209,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
+int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
+ u64 post);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 93fbf87bdc8d..8ec34ecb6d68 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -233,8 +233,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
}
x = cmpxchg(&info->stripe_hash_table, NULL, table);
- if (x)
- kvfree(x);
+ kvfree(x);
return 0;
}
@@ -1105,8 +1104,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
* devices or if they are not contiguous
*/
if (last_end == disk_start && !last->bi_status &&
- last->bi_disk == stripe->dev->bdev->bd_disk &&
- last->bi_partno == stripe->dev->bdev->bd_partno) {
+ last->bi_bdev == stripe->dev->bdev) {
ret = bio_add_page(last, page, PAGE_SIZE, 0);
if (ret == PAGE_SIZE)
return 0;
@@ -1357,9 +1355,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
for (i = 0; i < rbio->bbio->num_stripes; i++) {
stripe = &rbio->bbio->stripes[i];
if (in_range(physical, stripe->physical, rbio->stripe_len) &&
- stripe->dev->bdev &&
- bio->bi_disk == stripe->dev->bdev->bd_disk &&
- bio->bi_partno == stripe->dev->bdev->bd_partno) {
+ stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
}
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 4b9b6c52a83b..2b490becbe67 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -495,14 +495,15 @@ static int process_extent_item(struct btrfs_fs_info *fs_info,
}
static int process_leaf(struct btrfs_root *root,
- struct btrfs_path *path, u64 *bytenr, u64 *num_bytes)
+ struct btrfs_path *path, u64 *bytenr, u64 *num_bytes,
+ int *tree_block_level)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_extent_data_ref *dref;
struct btrfs_shared_data_ref *sref;
u32 count;
- int i = 0, tree_block_level = 0, ret = 0;
+ int i = 0, ret = 0;
struct btrfs_key key;
int nritems = btrfs_header_nritems(leaf);
@@ -515,15 +516,15 @@ static int process_leaf(struct btrfs_root *root,
case BTRFS_METADATA_ITEM_KEY:
*bytenr = key.objectid;
ret = process_extent_item(fs_info, path, &key, i,
- &tree_block_level);
+ tree_block_level);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
ret = add_tree_block(fs_info, key.offset, 0,
- key.objectid, tree_block_level);
+ key.objectid, *tree_block_level);
break;
case BTRFS_SHARED_BLOCK_REF_KEY:
ret = add_tree_block(fs_info, 0, key.offset,
- key.objectid, tree_block_level);
+ key.objectid, *tree_block_level);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = btrfs_item_ptr(leaf, i,
@@ -549,7 +550,8 @@ static int process_leaf(struct btrfs_root *root,
/* Walk down to the leaf from the given level */
static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
- int level, u64 *bytenr, u64 *num_bytes)
+ int level, u64 *bytenr, u64 *num_bytes,
+ int *tree_block_level)
{
struct extent_buffer *eb;
int ret = 0;
@@ -565,7 +567,8 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
path->slots[level-1] = 0;
path->locks[level-1] = BTRFS_READ_LOCK;
} else {
- ret = process_leaf(root, path, bytenr, num_bytes);
+ ret = process_leaf(root, path, bytenr, num_bytes,
+ tree_block_level);
if (ret)
break;
}
@@ -666,18 +669,18 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root;
- u64 owner;
- u64 offset;
+ u64 ref_root = 0;
+ u64 owner = 0;
+ u64 offset = 0;
if (!btrfs_test_opt(fs_info, REF_VERIFY))
return 0;
if (generic_ref->type == BTRFS_REF_METADATA) {
- ref_root = generic_ref->tree_ref.root;
+ if (!parent)
+ ref_root = generic_ref->tree_ref.root;
owner = generic_ref->tree_ref.level;
- offset = 0;
- } else {
+ } else if (!parent) {
ref_root = generic_ref->data_ref.ref_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
@@ -693,13 +696,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
goto out;
}
- if (parent) {
- ref->parent = parent;
- } else {
- ref->root_objectid = ref_root;
- ref->owner = owner;
- ref->offset = offset;
- }
+ ref->parent = parent;
+ ref->owner = owner;
+ ref->root_objectid = ref_root;
+ ref->offset = offset;
ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1;
memcpy(&ra->ref, ref, sizeof(struct ref_entry));
@@ -974,6 +974,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_path *path;
struct extent_buffer *eb;
+ int tree_block_level = 0;
u64 bytenr = 0, num_bytes = 0;
int ret, level;
@@ -998,7 +999,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
* different leaf from the original extent item.
*/
ret = walk_down_tree(fs_info->extent_root, path, level,
- &bytenr, &num_bytes);
+ &bytenr, &num_bytes, &tree_block_level);
if (ret)
break;
ret = walk_up_tree(path, &level);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index b03e7891394e..b24396cf2f99 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -81,7 +81,10 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
goto out_unlock;
}
- set_page_extent_mapped(page);
+ ret = set_page_extent_mapped(page);
+ if (ret < 0)
+ goto out_unlock;
+
clear_extent_bit(&inode->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index df63ef64c5c0..232d5da7b7be 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -97,6 +97,7 @@ struct tree_block {
struct rb_node rb_node;
u64 bytenr;
}; /* Use rb_simple_node for search/insert */
+ u64 owner;
struct btrfs_key key;
unsigned int level:8;
unsigned int key_ready:1;
@@ -668,9 +669,7 @@ static void __del_reloc_root(struct btrfs_root *root)
RB_CLEAR_NODE(&node->rb_node);
}
spin_unlock(&rc->reloc_root_tree.lock);
- if (!node)
- return;
- BUG_ON((struct btrfs_root *)node->data != root);
+ ASSERT(!node || (struct btrfs_root *)node->data == root);
}
/*
@@ -2393,8 +2392,8 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
{
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, block->bytenr, 0, block->key.offset,
- block->level, NULL);
+ eb = read_tree_block(fs_info, block->bytenr, block->owner,
+ block->key.offset, block->level, NULL);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
} else if (!extent_buffer_uptodate(eb)) {
@@ -2493,7 +2492,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Kick in readahead for tree blocks with missing keys */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready)
- btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0,
+ btrfs_readahead_tree_block(fs_info, block->bytenr,
+ block->owner, 0,
block->level);
}
@@ -2553,6 +2553,31 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
return ret;
+ /*
+ * On a zoned filesystem, we cannot preallocate the file region.
+ * Instead, we dirty and fiemap_write the region.
+ */
+ if (btrfs_is_zoned(inode->root->fs_info)) {
+ struct btrfs_root *root = inode->root;
+ struct btrfs_trans_handle *trans;
+
+ end = cluster->end - offset + 1;
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+ i_size_write(&inode->vfs_inode, end);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ return btrfs_end_transaction(trans);
+ }
+
inode_lock(&inode->vfs_inode);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
@@ -2615,7 +2640,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
/*
* Allow error injection to test balance cancellation
*/
-int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
+noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
{
return atomic_read(&fs_info->balance_cancel_req) ||
fatal_signal_pending(current);
@@ -2679,6 +2704,15 @@ static int relocate_file_extent_cluster(struct inode *inode,
goto out;
}
}
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ PAGE_SIZE, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ unlock_page(page);
+ put_page(page);
+ goto out;
+ }
if (PageReadahead(page)) {
page_cache_async_readahead(inode->i_mapping,
@@ -2706,8 +2740,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
- set_page_extent_mapped(page);
-
if (nr < cluster->nr &&
page_start + offset == cluster->boundary[nr]) {
set_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -2749,6 +2781,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
}
}
WARN_ON(nr != cluster->nr);
+ if (btrfs_is_zoned(fs_info) && !ret)
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
out:
kfree(ra);
return ret;
@@ -2801,21 +2835,58 @@ static int add_tree_block(struct reloc_control *rc,
u32 item_size;
int level = -1;
u64 generation;
+ u64 owner = 0;
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, path->slots[0]);
if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
item_size >= sizeof(*ei) + sizeof(*bi)) {
+ unsigned long ptr = 0, end;
+
ei = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_extent_item);
+ end = (unsigned long)ei + item_size;
if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
bi = (struct btrfs_tree_block_info *)(ei + 1);
level = btrfs_tree_block_level(eb, bi);
+ ptr = (unsigned long)(bi + 1);
} else {
level = (int)extent_key->offset;
+ ptr = (unsigned long)(ei + 1);
}
generation = btrfs_extent_generation(eb, ei);
+
+ /*
+ * We're reading random blocks without knowing their owner ahead
+ * of time. This is ok most of the time, as all reloc roots and
+ * fs roots have the same lock type. However normal trees do
+ * not, and the only way to know ahead of time is to read the
+ * inline ref offset. We know it's an fs root if
+ *
+ * 1. There's more than one ref.
+ * 2. There's a SHARED_DATA_REF_KEY set.
+ * 3. FULL_BACKREF is set on the flags.
+ *
+ * Otherwise it's safe to assume that the ref offset == the
+ * owner of this block, so we can use that when calling
+ * read_tree_block.
+ */
+ if (btrfs_extent_refs(eb, ei) == 1 &&
+ !(btrfs_extent_flags(eb, ei) &
+ BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
+ ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_get_extent_inline_ref_type(eb, iref,
+ BTRFS_REF_TYPE_BLOCK);
+ if (type == BTRFS_REF_TYPE_INVALID)
+ return -EINVAL;
+ if (type == BTRFS_TREE_BLOCK_REF_KEY)
+ owner = btrfs_extent_inline_ref_offset(eb, iref);
+ }
} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
btrfs_print_v0_err(eb->fs_info);
btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
@@ -2837,6 +2908,7 @@ static int add_tree_block(struct reloc_control *rc,
block->key.offset = generation;
block->level = level;
block->key_ready = 0;
+ block->owner = owner;
rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
if (rb_node)
@@ -3389,8 +3461,12 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
+ u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
int ret;
+ if (btrfs_is_zoned(trans->fs_info))
+ flags &= ~BTRFS_INODE_PREALLOC;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -3405,8 +3481,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
- BTRFS_INODE_PREALLOC);
+ btrfs_set_inode_flags(leaf, item, flags);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
@@ -3434,7 +3509,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
return ERR_CAST(trans);
}
- err = btrfs_find_free_objectid(root, &objectid);
+ err = btrfs_get_free_objectid(root, &objectid);
if (err)
goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5f4f88a4d2c8..582df11d298a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -166,6 +166,7 @@ struct scrub_ctx {
int pages_per_rd_bio;
int is_dev_replace;
+ u64 write_pointer;
struct scrub_bio *wr_curr_bio;
struct mutex wr_lock;
@@ -856,6 +857,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
have_csum = sblock_to_check->pagev[0]->have_csum;
dev = sblock_to_check->pagev[0]->dev;
+ if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace)
+ return btrfs_repair_one_zone(fs_info, logical);
+
/*
* We must use GFP_NOFS because the scrub task might be waiting for a
* worker task executing this function and in turn a transaction commit
@@ -1619,6 +1623,28 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
return scrub_add_page_to_wr_bio(sblock->sctx, spage);
}
+static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
+{
+ int ret = 0;
+ u64 length;
+
+ if (!btrfs_is_zoned(sctx->fs_info))
+ return 0;
+
+ if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
+ return 0;
+
+ if (sctx->write_pointer < physical) {
+ length = physical - sctx->write_pointer;
+
+ ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
+ sctx->write_pointer, length);
+ if (!ret)
+ sctx->write_pointer = physical;
+ }
+ return ret;
+}
+
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
@@ -1641,6 +1667,13 @@ again:
if (sbio->page_count == 0) {
struct bio *bio;
+ ret = fill_writer_pointer_gap(sctx,
+ spage->physical_for_dev_replace);
+ if (ret) {
+ mutex_unlock(&sctx->wr_lock);
+ return ret;
+ }
+
sbio->physical = spage->physical_for_dev_replace;
sbio->logical = spage->logical;
sbio->dev = sctx->wr_tgtdev;
@@ -1695,13 +1728,16 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
sbio = sctx->wr_curr_bio;
sctx->wr_curr_bio = NULL;
- WARN_ON(!sbio->bio->bi_disk);
+ WARN_ON(!sbio->bio->bi_bdev);
scrub_pending_bio_inc(sctx);
/* process all writes in a single worker thread. Then the block layer
* orders the requests before sending them to the driver which
* doubled the write performance on spinning disks when measured
* with Linux 3.5 */
btrfsic_submit_bio(sbio->bio);
+
+ if (btrfs_is_zoned(sctx->fs_info))
+ sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE;
}
static void scrub_wr_bio_end_io(struct bio *bio)
@@ -3025,6 +3061,46 @@ out:
return ret < 0 ? ret : 0;
}
+static void sync_replace_for_zoned(struct scrub_ctx *sctx)
+{
+ if (!btrfs_is_zoned(sctx->fs_info))
+ return;
+
+ sctx->flush_all_writes = true;
+ scrub_submit(sctx);
+ mutex_lock(&sctx->wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_lock);
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+}
+
+static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
+ u64 physical, u64 physical_end)
+{
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ int ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
+
+ mutex_lock(&sctx->wr_lock);
+ if (sctx->write_pointer < physical_end) {
+ ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
+ physical,
+ sctx->write_pointer);
+ if (ret)
+ btrfs_err(fs_info,
+ "zoned: failed to recover write pointer");
+ }
+ mutex_unlock(&sctx->wr_lock);
+ btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
+
+ return ret;
+}
+
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
@@ -3165,6 +3241,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
blk_start_plug(&plug);
+ if (sctx->is_dev_replace &&
+ btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
+ mutex_lock(&sctx->wr_lock);
+ sctx->write_pointer = physical;
+ mutex_unlock(&sctx->wr_lock);
+ sctx->flush_all_writes = true;
+ }
+
/*
* now find all extents for each stripe and scrub them
*/
@@ -3353,6 +3437,9 @@ again:
if (ret)
goto out;
+ if (sctx->is_dev_replace)
+ sync_replace_for_zoned(sctx);
+
if (extent_logical + extent_len <
key.objectid + bytes) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -3420,6 +3507,17 @@ out:
blk_finish_plug(&plug);
btrfs_free_path(path);
btrfs_free_path(ppath);
+
+ if (sctx->is_dev_replace && ret >= 0) {
+ int ret2;
+
+ ret2 = sync_write_pointer_for_zoned(sctx, base + offset,
+ map->stripes[num].physical,
+ physical_end);
+ if (ret2)
+ ret = ret2;
+ }
+
return ret < 0 ? ret : 0;
}
@@ -3475,6 +3573,25 @@ out:
return ret;
}
+static int finish_extent_writes_for_zoned(struct btrfs_root *root,
+ struct btrfs_block_group *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_trans_handle *trans;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ btrfs_wait_block_group_reservations(cache);
+ btrfs_wait_nocow_writers(cache);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ return btrfs_commit_transaction(trans);
+}
+
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev, u64 start, u64 end)
@@ -3561,6 +3678,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
+ spin_lock(&cache->lock);
+ if (!cache->to_copy) {
+ spin_unlock(&cache->lock);
+ ro_set = 0;
+ goto done;
+ }
+ spin_unlock(&cache->lock);
+ }
+
/*
* Make sure that while we are scrubbing the corresponding block
* group doesn't get its logical address and its device extents
@@ -3619,6 +3746,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* group is not RO.
*/
ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
+ if (!ret && sctx->is_dev_replace) {
+ ret = finish_extent_writes_for_zoned(root, cache);
+ if (ret) {
+ btrfs_dec_block_group_ro(cache);
+ scrub_pause_off(fs_info);
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+
if (ret == 0) {
ro_set = 1;
} else if (ret == -ENOSPC && !sctx->is_dev_replace) {
@@ -3692,6 +3829,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
+ if (sctx->is_dev_replace &&
+ !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
+ cache, found_key.offset))
+ ro_set = 0;
+
+done:
down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ae97f4dbaff3..f87878274e9f 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1191,9 +1191,6 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
- /* data offset in the file extent item */
- u64 data_offset;
-
/* Just to check for bugs in backref resolving */
int found_itself;
};
@@ -1401,19 +1398,6 @@ static int find_extent_clone(struct send_ctx *sctx,
backref_ctx->cur_offset = data_offset;
backref_ctx->found_itself = 0;
backref_ctx->extent_len = num_bytes;
- /*
- * For non-compressed extents iterate_extent_inodes() gives us extent
- * offsets that already take into account the data offset, but not for
- * compressed extents, since the offset is logical and not relative to
- * the physical extent locations. We must take this into account to
- * avoid sending clone offsets that go beyond the source file's size,
- * which would result in the clone ioctl failing with -EINVAL on the
- * receiving end.
- */
- if (compressed == BTRFS_COMPRESS_NONE)
- backref_ctx->data_offset = 0;
- else
- backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
/*
* The last extent of a file may be too large due to page alignment.
@@ -5512,6 +5496,21 @@ static int clone_range(struct send_ctx *sctx,
break;
offset += clone_len;
clone_root->offset += clone_len;
+
+ /*
+ * If we are cloning from the file we are currently processing,
+ * and using the send root as the clone root, we must stop once
+ * the current clone offset reaches the current eof of the file
+ * at the receiver, otherwise we would issue an invalid clone
+ * operation (source range going beyond eof) and cause the
+ * receiver to fail. So if we reach the current eof, bail out
+ * and fallback to a regular write.
+ */
+ if (clone_root->root == sctx->send_root &&
+ clone_root->ino == sctx->cur_ino &&
+ clone_root->offset >= sctx->cur_inode_next_write_offset)
+ break;
+
data_offset += clone_len;
next:
path->slots[0]++;
@@ -6592,10 +6591,9 @@ static int changed_cb(struct btrfs_path *left_path,
struct btrfs_path *right_path,
struct btrfs_key *key,
enum btrfs_compare_tree_result result,
- void *ctx)
+ struct send_ctx *sctx)
{
int ret = 0;
- struct send_ctx *sctx = ctx;
if (result == BTRFS_COMPARE_TREE_SAME) {
if (key->type == BTRFS_INODE_REF_KEY ||
@@ -6800,7 +6798,7 @@ static int tree_compare_item(struct btrfs_path *left_path,
* If it detects a change, it aborts immediately.
*/
static int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root, void *ctx)
+ struct btrfs_root *right_root, struct send_ctx *sctx)
{
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
@@ -6952,7 +6950,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
- ctx);
+ sctx);
if (ret < 0)
goto out;
}
@@ -6963,7 +6961,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
- ctx);
+ sctx);
if (ret < 0)
goto out;
}
@@ -6977,7 +6975,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = changed_cb(left_path, right_path,
&left_key,
BTRFS_COMPARE_TREE_NEW,
- ctx);
+ sctx);
if (ret < 0)
goto out;
advance_left = ADVANCE;
@@ -6985,7 +6983,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = changed_cb(left_path, right_path,
&right_key,
BTRFS_COMPARE_TREE_DELETED,
- ctx);
+ sctx);
if (ret < 0)
goto out;
advance_right = ADVANCE;
@@ -7000,7 +6998,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
else
result = BTRFS_COMPARE_TREE_SAME;
ret = changed_cb(left_path, right_path,
- &left_key, result, ctx);
+ &left_key, result, sctx);
if (ret < 0)
goto out;
advance_left = ADVANCE;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index e8347461c8dd..2da6177f4b0b 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -140,6 +140,12 @@
* be freed, plus any delayed work we may not have gotten rid of in the case
* of metadata.
*
+ * FORCE_COMMIT_TRANS
+ * For use by the preemptive flusher. We use this to bypass the ticketing
+ * checks in may_commit_transaction, as we have more information about the
+ * overall state of the system and may want to commit the transaction ahead
+ * of actual ENOSPC conditions.
+ *
* OVERCOMMIT
*
* Because we hold so many reservations for metadata we will allow you to
@@ -163,6 +169,7 @@ u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
ASSERT(s_info);
return s_info->bytes_used + s_info->bytes_reserved +
s_info->bytes_pinned + s_info->bytes_readonly +
+ s_info->bytes_zone_unusable +
(may_use_included ? s_info->bytes_may_use : 0);
}
@@ -206,6 +213,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->ro_bgs);
INIT_LIST_HEAD(&space_info->tickets);
INIT_LIST_HEAD(&space_info->priority_tickets);
+ space_info->clamp = 1;
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
@@ -257,7 +265,7 @@ out:
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly,
+ u64 bytes_readonly, u64 bytes_zone_unusable,
struct btrfs_space_info **space_info)
{
struct btrfs_space_info *found;
@@ -273,6 +281,7 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
found->bytes_readonly += bytes_readonly;
+ found->bytes_zone_unusable += bytes_zone_unusable;
if (total_bytes > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
@@ -422,10 +431,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
info->total_bytes - btrfs_space_info_used(info, true),
info->full ? "" : "not ");
btrfs_info(fs_info,
- "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
+ "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
info->total_bytes, info->bytes_used, info->bytes_pinned,
info->bytes_reserved, info->bytes_may_use,
- info->bytes_readonly);
+ info->bytes_readonly, info->bytes_zone_unusable);
DUMP_BLOCK_RSV(fs_info, global_block_rsv);
DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
@@ -454,9 +463,10 @@ again:
list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
btrfs_info(fs_info,
- "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
+ "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s",
cache->start, cache->length, cache->used, cache->pinned,
- cache->reserved, cache->ro ? "[readonly]" : "");
+ cache->reserved, cache->zone_unusable,
+ cache->ro ? "[readonly]" : "");
spin_unlock(&cache->lock);
btrfs_dump_free_space(cache, bytes);
}
@@ -489,7 +499,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
{
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
- u64 dio_bytes;
+ u64 ordered_bytes;
u64 items;
long time_left;
int loops;
@@ -513,26 +523,22 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
- dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
- if (delalloc_bytes == 0 && dio_bytes == 0) {
- if (trans)
- return;
- if (wait_ordered)
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ if (delalloc_bytes == 0 && ordered_bytes == 0)
return;
- }
/*
* If we are doing more ordered than delalloc we need to just wait on
* ordered extents, otherwise we'll waste time trying to flush delalloc
* that likely won't give us the space back we need.
*/
- if (dio_bytes > delalloc_bytes)
+ if (ordered_bytes > delalloc_bytes)
wait_ordered = true;
loops = 0;
- while ((delalloc_bytes || dio_bytes) && loops < 3) {
- u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+ while ((delalloc_bytes || ordered_bytes) && loops < 3) {
+ u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+ long nr_pages = min_t(u64, temp, LONG_MAX);
btrfs_start_delalloc_roots(fs_info, nr_pages, true);
@@ -555,15 +561,16 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
- dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
+ ordered_bytes = percpu_counter_sum_positive(
+ &fs_info->ordered_bytes);
}
}
/**
- * maybe_commit_transaction - possibly commit the transaction if its ok to
- * @root - the root we're allocating for
- * @bytes - the number of bytes we want to reserve
- * @force - force the commit
+ * Possibly commit the transaction if it's ok to
+ *
+ * @fs_info: the filesystem
+ * @space_info: space_info we are checking for commit, either data or metadata
*
* This will check to make sure that committing the transaction will actually
* get us somewhere and then commit the transaction if it does. Otherwise it
@@ -669,7 +676,7 @@ enospc:
*/
static void flush_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 num_bytes,
- int state)
+ enum btrfs_flush_state state, bool for_preempt)
{
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_trans_handle *trans;
@@ -738,13 +745,21 @@ static void flush_space(struct btrfs_fs_info *fs_info,
case COMMIT_TRANS:
ret = may_commit_transaction(fs_info, space_info);
break;
+ case FORCE_COMMIT_TRANS:
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
+ ret = btrfs_commit_transaction(trans);
+ break;
default:
ret = -ENOSPC;
break;
}
trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
- ret);
+ ret, for_preempt);
return;
}
@@ -754,7 +769,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
{
u64 used;
u64 avail;
- u64 expected;
u64 to_reclaim = space_info->reclaim_size;
lockdep_assert_held(&space_info->lock);
@@ -772,43 +786,88 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
if (space_info->total_bytes + avail < used)
to_reclaim += used - (space_info->total_bytes + avail);
- if (to_reclaim)
- return to_reclaim;
-
- to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL))
- return 0;
-
- used = btrfs_space_info_used(space_info, true);
-
- if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
- BTRFS_RESERVE_FLUSH_ALL))
- expected = div_factor_fine(space_info->total_bytes, 95);
- else
- expected = div_factor_fine(space_info->total_bytes, 90);
-
- if (used > expected)
- to_reclaim = used - expected;
- else
- to_reclaim = 0;
- to_reclaim = min(to_reclaim, space_info->bytes_may_use +
- space_info->bytes_reserved);
return to_reclaim;
}
-static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 used)
+static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
{
+ u64 ordered, delalloc;
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+ u64 used;
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
- return 0;
+ return false;
- if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
- return 0;
+ /*
+ * We have tickets queued, bail so we don't compete with the async
+ * flushers.
+ */
+ if (space_info->reclaim_size)
+ return false;
+
+ /*
+ * If we have over half of the free space occupied by reservations or
+ * pinned then we want to start flushing.
+ *
+ * We do not do the traditional thing here, which is to say
+ *
+ * if (used >= ((total_bytes + avail) / 2))
+ * return 1;
+ *
+ * because this doesn't quite work how we want. If we had more than 50%
+ * of the space_info used by bytes_used and we had 0 available we'd just
+ * constantly run the background flusher. Instead we want it to kick in
+ * if our reclaimable space exceeds our clamped free space.
+ *
+ * Our clamping range is 2^1 -> 2^8. Practically speaking that means
+ * the following:
+ *
+ * Amount of RAM Minimum threshold Maximum threshold
+ *
+ * 256GiB 1GiB 128GiB
+ * 128GiB 512MiB 64GiB
+ * 64GiB 256MiB 32GiB
+ * 32GiB 128MiB 16GiB
+ * 16GiB 64MiB 8GiB
+ *
+ * These are the range our thresholds will fall in, corresponding to how
+ * much delalloc we need for the background flusher to kick in.
+ */
+
+ thresh = calc_available_free_space(fs_info, space_info,
+ BTRFS_RESERVE_FLUSH_ALL);
+ thresh += (space_info->total_bytes - space_info->bytes_used -
+ space_info->bytes_reserved - space_info->bytes_readonly);
+ thresh >>= space_info->clamp;
+
+ used = space_info->bytes_pinned;
+
+ /*
+ * If we have more ordered bytes than delalloc bytes then we're either
+ * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
+ * around. Preemptive flushing is only useful in that it can free up
+ * space before tickets need to wait for things to finish. In the case
+ * of ordered extents, preemptively waiting on ordered extents gets us
+ * nothing, if our reservations are tied up in ordered extents we'll
+ * simply have to slow down writers by forcing them to wait on ordered
+ * extents.
+ *
+ * In the case that ordered is larger than delalloc, only include the
+ * block reserves that we would actually be able to directly reclaim
+ * from. In this case if we're heavy on metadata operations this will
+ * clearly be heavy enough to warrant preemptive flushing. In the case
+ * of heavy DIO or ordered reservations, preemptive flushing will just
+ * waste time and cause us to slow down.
+ */
+ ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ if (ordered >= delalloc)
+ used += fs_info->delayed_refs_rsv.reserved +
+ fs_info->delayed_block_rsv.reserved;
+ else
+ used += space_info->bytes_may_use;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
@@ -922,7 +981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 to_reclaim;
- int flush_state;
+ enum btrfs_flush_state flush_state;
int commit_cycles = 0;
u64 last_tickets_id;
@@ -941,7 +1000,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
flush_state = FLUSH_DELAYED_ITEMS_NR;
do {
- flush_space(fs_info, space_info, to_reclaim, flush_state);
+ flush_space(fs_info, space_info, to_reclaim, flush_state, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
@@ -990,6 +1049,105 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
}
/*
+ * This handles pre-flushing of metadata space before we get to the point that
+ * we need to start blocking threads on tickets. The logic here is different
+ * from the other flush paths because it doesn't rely on tickets to tell us how
+ * much we need to flush, instead it attempts to keep us below the 80% full
+ * watermark of space by flushing whichever reservation pool is currently the
+ * largest.
+ */
+static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *delayed_block_rsv;
+ struct btrfs_block_rsv *delayed_refs_rsv;
+ struct btrfs_block_rsv *global_rsv;
+ struct btrfs_block_rsv *trans_rsv;
+ int loops = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info,
+ preempt_reclaim_work);
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ delayed_block_rsv = &fs_info->delayed_block_rsv;
+ delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ global_rsv = &fs_info->global_block_rsv;
+ trans_rsv = &fs_info->trans_block_rsv;
+
+ spin_lock(&space_info->lock);
+ while (need_preemptive_reclaim(fs_info, space_info)) {
+ enum btrfs_flush_state flush;
+ u64 delalloc_size = 0;
+ u64 to_reclaim, block_rsv_size;
+ u64 global_rsv_size = global_rsv->reserved;
+
+ loops++;
+
+ /*
+ * We don't have a precise counter for the metadata being
+ * reserved for delalloc, so we'll approximate it by subtracting
+ * out the block rsv's space from the bytes_may_use. If that
+ * amount is higher than the individual reserves, then we can
+ * assume it's tied up in delalloc reservations.
+ */
+ block_rsv_size = global_rsv_size +
+ delayed_block_rsv->reserved +
+ delayed_refs_rsv->reserved +
+ trans_rsv->reserved;
+ if (block_rsv_size < space_info->bytes_may_use)
+ delalloc_size = space_info->bytes_may_use - block_rsv_size;
+ spin_unlock(&space_info->lock);
+
+ /*
+ * We don't want to include the global_rsv in our calculation,
+ * because that's space we can't touch. Subtract it from the
+ * block_rsv_size for the next checks.
+ */
+ block_rsv_size -= global_rsv_size;
+
+ /*
+ * We really want to avoid flushing delalloc too much, as it
+ * could result in poor allocation patterns, so only flush it if
+ * it's larger than the rest of the pools combined.
+ */
+ if (delalloc_size > block_rsv_size) {
+ to_reclaim = delalloc_size;
+ flush = FLUSH_DELALLOC;
+ } else if (space_info->bytes_pinned >
+ (delayed_block_rsv->reserved +
+ delayed_refs_rsv->reserved)) {
+ to_reclaim = space_info->bytes_pinned;
+ flush = FORCE_COMMIT_TRANS;
+ } else if (delayed_block_rsv->reserved >
+ delayed_refs_rsv->reserved) {
+ to_reclaim = delayed_block_rsv->reserved;
+ flush = FLUSH_DELAYED_ITEMS_NR;
+ } else {
+ to_reclaim = delayed_refs_rsv->reserved;
+ flush = FLUSH_DELAYED_REFS_NR;
+ }
+
+ /*
+ * We don't want to reclaim everything, just a portion, so scale
+ * down the to_reclaim by 1/4. If it takes us down to 0,
+ * reclaim 1 item's worth.
+ */
+ to_reclaim >>= 2;
+ if (!to_reclaim)
+ to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
+ flush_space(fs_info, space_info, to_reclaim, flush, true);
+ cond_resched();
+ spin_lock(&space_info->lock);
+ }
+
+ /* We only went through once, back off our clamping. */
+ if (loops == 1 && !space_info->reclaim_size)
+ space_info->clamp = max(1, space_info->clamp - 1);
+ trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
+ spin_unlock(&space_info->lock);
+}
+
+/*
* FLUSH_DELALLOC_WAIT:
* Space is freed from flushing delalloc in one of two ways.
*
@@ -1054,7 +1212,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 last_tickets_id;
- int flush_state = 0;
+ enum btrfs_flush_state flush_state = 0;
fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
space_info = fs_info->data_sinfo;
@@ -1069,7 +1227,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
spin_unlock(&space_info->lock);
while (!space_info->full) {
- flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
@@ -1082,7 +1240,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
while (flush_state < ARRAY_SIZE(data_flush_states)) {
flush_space(fs_info, space_info, U64_MAX,
- data_flush_states[flush_state]);
+ data_flush_states[flush_state], false);
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
@@ -1115,6 +1273,8 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
+ INIT_WORK(&fs_info->preempt_reclaim_work,
+ btrfs_preempt_reclaim_metadata_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
@@ -1153,7 +1313,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
flush_state = 0;
do {
- flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
+ flush_space(fs_info, space_info, to_reclaim, states[flush_state],
+ false);
flush_state++;
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
@@ -1169,7 +1330,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
struct reserve_ticket *ticket)
{
while (!space_info->full) {
- flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
spin_lock(&space_info->lock);
if (ticket->bytes == 0) {
spin_unlock(&space_info->lock);
@@ -1214,11 +1375,14 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
}
/**
- * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
- * @fs_info - the fs
- * @space_info - the space_info for the reservation
- * @ticket - the ticket for the reservation
- * @flush - how much we can flush
+ * Do the appropriate flushing and waiting for a ticket
+ *
+ * @fs_info: the filesystem
+ * @space_info: space info for the reservation
+ * @ticket: ticket for the reservation
+ * @start_ns: timestamp when the reservation started
+ * @orig_bytes: amount of bytes originally reserved
+ * @flush: how much we can flush
*
* This does the work of figuring out how to flush for the ticket, waiting for
* the reservation, and returning the appropriate error if there is one.
@@ -1226,6 +1390,7 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket,
+ u64 start_ns, u64 orig_bytes,
enum btrfs_reserve_flush_enum flush)
{
int ret;
@@ -1281,6 +1446,8 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
* space wasn't reserved at all).
*/
ASSERT(!(ticket->bytes == 0 && ticket->error));
+ trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
+ start_ns, flush, ticket->error);
return ret;
}
@@ -1294,12 +1461,31 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
}
+static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info)
+{
+ u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+ u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+
+ /*
+ * If we're heavy on ordered operations then clamping won't help us. We
+ * need to clamp specifically to keep up with dirty'ing buffered
+ * writers, because there's not a 1:1 correlation of writing delalloc
+ * and freeing space, like there is with flushing delayed refs or
+ * delayed nodes. If we're already more ordered than delalloc then
+ * we're keeping up, otherwise we aren't and should probably clamp.
+ */
+ if (ordered < delalloc)
+ space_info->clamp = min(space_info->clamp + 1, 8);
+}
+
/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @space_info - the space info we want to allocate from
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
+ * Try to reserve bytes from the block_rsv's space
+ *
+ * @fs_info: the filesystem
+ * @space_info: space info we want to allocate from
+ * @orig_bytes: number of bytes we want
+ * @flush: whether or not we can flush to make our reservation
*
* This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
@@ -1314,6 +1500,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
{
struct work_struct *async_work;
struct reserve_ticket ticket;
+ u64 start_ns = 0;
u64 used;
int ret = 0;
bool pending_tickets;
@@ -1366,6 +1553,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
space_info->reclaim_size += ticket.bytes;
init_waitqueue_head(&ticket.wait);
ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+ if (trace_btrfs_reserve_ticket_enabled())
+ start_ns = ktime_get_ns();
+
if (flush == BTRFS_RESERVE_FLUSH_ALL ||
flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
flush == BTRFS_RESERVE_FLUSH_DATA) {
@@ -1382,6 +1572,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
list_add_tail(&ticket.list,
&space_info->priority_tickets);
}
+
+ /*
+ * We were forced to add a reserve ticket, so our preemptive
+ * flushing is unable to keep up. Clamp down on the threshold
+ * for the preemptive flushing in order to keep up with the
+ * workload.
+ */
+ maybe_clamp_preempt(fs_info, space_info);
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
/*
@@ -1390,27 +1588,29 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
- need_do_async_reclaim(fs_info, space_info, used) &&
- !work_busy(&fs_info->async_reclaim_work)) {
+ need_preemptive_reclaim(fs_info, space_info) &&
+ !work_busy(&fs_info->preempt_reclaim_work)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
+ &fs_info->preempt_reclaim_work);
}
}
spin_unlock(&space_info->lock);
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
return ret;
- return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
+ return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
+ orig_bytes, flush);
}
/**
- * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
- * @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
- * @orig_bytes - the number of bytes we want
- * @flush - whether or not we can flush to make our reservation
+ * Try to reserve metadata bytes from the block_rsv's space
+ *
+ * @root: the root we're allocating for
+ * @block_rsv: block_rsv we're allocating for
+ * @orig_bytes: number of bytes we want
+ * @flush: whether or not we can flush to make our reservation
*
* This will reserve orig_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
@@ -1448,10 +1648,11 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
}
/**
- * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation
- * @fs_info - the filesystem
- * @bytes - the number of bytes we need
- * @flush - how we are allowed to flush
+ * Try to reserve data bytes for an allocation
+ *
+ * @fs_info: the filesystem
+ * @bytes: number of bytes we need
+ * @flush: how we are allowed to flush
*
* This will reserve bytes from the data space info. If there is not enough
* space then we will attempt to flush space as specified by flush.
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 5646393b928c..b1a8ffb03b3e 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -17,11 +17,17 @@ struct btrfs_space_info {
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
+ u64 bytes_zone_unusable; /* total bytes that are unusable until
+ resetting the device zone */
u64 max_extent_size; /* This will hold the maximum extent size of
the space info if we had an ENOSPC in the
allocator. */
+ int clamp; /* Used to scale our threshold for preemptive
+ flushing. The value is >> clamp, so turns
+ out to be a 2^clamp divisor. */
+
unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
@@ -119,7 +125,7 @@ DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
- u64 bytes_readonly,
+ u64 bytes_readonly, u64 bytes_zone_unusable,
struct btrfs_space_info **space_info);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
@@ -152,4 +158,21 @@ static inline void btrfs_space_info_free_bytes_may_use(
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
+/*
+ * Add @mod (signed, so it may be negative) to the total_bytes_pinned per-cpu
+ * counter of @space_info, using BTRFS_TOTAL_BYTES_PINNED_BATCH as the batch
+ * size.
+ */
+static inline void __btrfs_mod_total_bytes_pinned(
+ struct btrfs_space_info *space_info,
+ s64 mod)
+{
+ percpu_counter_add_batch(&space_info->total_bytes_pinned, mod,
+ BTRFS_TOTAL_BYTES_PINNED_BATCH);
+}
+
+/*
+ * Look up the space_info matching @flags and add @mod to its
+ * total_bytes_pinned counter. The lookup result is asserted to be non-NULL.
+ */
+static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info,
+ u64 flags, s64 mod)
+{
+ struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags);
+
+ ASSERT(space_info);
+ __btrfs_mod_total_bytes_pinned(space_info, mod);
+}
+
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
new file mode 100644
index 000000000000..c69049e7daa9
--- /dev/null
+++ b/fs/btrfs/subpage.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include "ctree.h"
+#include "subpage.h"
+
+/*
+ * Allocate a btrfs_subpage of the given @type and attach it to @page as its
+ * private data.
+ *
+ * Does nothing and returns 0 when sectorsize equals PAGE_SIZE or when the
+ * page already has private data attached. Otherwise returns 0 on success or
+ * the negative error from btrfs_alloc_subpage().
+ */
+int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page, enum btrfs_subpage_type type)
+{
+ struct btrfs_subpage *subpage = NULL;
+ int ret;
+
+ /*
+ * We have cases like a dummy extent buffer page, which is not mapped
+ * and doesn't need to be locked.
+ */
+ if (page->mapping)
+ ASSERT(PageLocked(page));
+ /* Either not subpage, or the page already has private attached */
+ if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
+ return 0;
+
+ ret = btrfs_alloc_subpage(fs_info, &subpage, type);
+ if (ret < 0)
+ return ret;
+ attach_page_private(page, subpage);
+ return 0;
+}
+
+/*
+ * Detach the btrfs_subpage stored as @page's private data and free it.
+ *
+ * Does nothing when sectorsize equals PAGE_SIZE or when the page has no
+ * private data attached.
+ */
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ /* Either not subpage, or already detached */
+ if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
+ return;
+
+ subpage = (struct btrfs_subpage *)detach_page_private(page);
+ ASSERT(subpage);
+ btrfs_free_subpage(subpage);
+}
+
+/*
+ * Allocate and initialize a zeroed btrfs_subpage, returned through @ret.
+ *
+ * When sectorsize equals PAGE_SIZE nothing is allocated, *@ret is left
+ * untouched and 0 is returned. Otherwise returns 0 on success or -ENOMEM if
+ * the allocation fails.
+ *
+ * @type selects which counter is initialized: eb_refs for
+ * BTRFS_SUBPAGE_METADATA, readers for anything else.
+ */
+int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ struct btrfs_subpage **ret,
+ enum btrfs_subpage_type type)
+{
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return 0;
+
+ *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
+ if (!*ret)
+ return -ENOMEM;
+ spin_lock_init(&(*ret)->lock);
+ if (type == BTRFS_SUBPAGE_METADATA)
+ atomic_set(&(*ret)->eb_refs, 0);
+ else
+ atomic_set(&(*ret)->readers, 0);
+ return 0;
+}
+
+/* Free a btrfs_subpage; a thin wrapper around kfree(). */
+void btrfs_free_subpage(struct btrfs_subpage *subpage)
+{
+ kfree(subpage);
+}
+
+/*
+ * Increase the eb_refs of current subpage.
+ *
+ * This is important for eb allocation, to prevent race with last eb freeing
+ * of the same page.
+ * With the eb_refs increased before the eb inserted into radix tree,
+ * detach_extent_buffer_page() won't detach the page private while we're still
+ * allocating the extent buffer.
+ *
+ * The page must be mapped and have private data attached (asserted), and the
+ * caller must hold page->mapping->private_lock (checked by lockdep).
+ */
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ /* Not subpage, there is no eb_refs to update */
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->mapping);
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ subpage = (struct btrfs_subpage *)page->private;
+ atomic_inc(&subpage->eb_refs);
+}
+
+/*
+ * Decrease the eb_refs of the subpage attached to @page.
+ *
+ * Does nothing when sectorsize equals PAGE_SIZE. Otherwise the page must be
+ * mapped and have private data attached (asserted), the caller must hold
+ * page->mapping->private_lock (checked by lockdep), and eb_refs is asserted
+ * to be non-zero before the decrement.
+ */
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page)
+{
+ struct btrfs_subpage *subpage;
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return;
+
+ ASSERT(PagePrivate(page) && page->mapping);
+ lockdep_assert_held(&page->mapping->private_lock);
+
+ subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(atomic_read(&subpage->eb_refs));
+ atomic_dec(&subpage->eb_refs);
+}
+
+/*
+ * Sanity checks shared by the subpage helpers below: @page must have private
+ * data attached, @start and @len must be sectorsize aligned, and for mapped
+ * pages the range [@start, @start + @len) must lie inside the page.
+ */
+static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ /* Basic checks */
+ ASSERT(PagePrivate(page) && page->private);
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
+ /*
+ * The range check only works for mapped page, we can still have
+ * unmapped page like dummy extent buffer pages.
+ */
+ if (page->mapping)
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+}
+
+/*
+ * Account the sectors of [@start, @start + @len) as being read by adding
+ * their count to the subpage's readers counter.
+ *
+ * The result of the add is asserted to equal the number of sectors added,
+ * i.e. the counter is expected to be zero on entry.
+ */
+void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ /* Number of sectors covered by the range */
+ const int nbits = len >> fs_info->sectorsize_bits;
+ int ret;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ ret = atomic_add_return(nbits, &subpage->readers);
+ ASSERT(ret == nbits);
+}
+
+/*
+ * Drop the reader accounting for the sectors of [@start, @start + @len).
+ *
+ * The sector count is subtracted from the subpage's readers counter; when
+ * the counter reaches zero the page is unlocked.
+ */
+void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ /* Number of sectors covered by the range */
+ const int nbits = len >> fs_info->sectorsize_bits;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+ ASSERT(atomic_read(&subpage->readers) >= nbits);
+ if (atomic_sub_and_test(nbits, &subpage->readers))
+ unlock_page(page);
+}
+
+/*
+ * Convert the [start, start + len) range into a u16 bitmap
+ *
+ * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
+ *
+ * Each bit stands for one sector of the page: the lowest set bit is the
+ * sector index of @start within the page, and one bit is set per sector
+ * covered by @len.
+ */
+static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
+ const int nbits = len >> fs_info->sectorsize_bits;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ /*
+ * Here nbits can be 16, thus can go beyond u16 range. We make the
+ * first left shift to be calculated in unsigned long (at least u32),
+ * then truncate the result to u16.
+ */
+ return (u16)(((1UL << nbits) - 1) << bit_start);
+}
+
+/*
+ * Mark the sectors of [@start, @start + @len) uptodate in the subpage
+ * bitmap, under subpage->lock with interrupts saved.
+ *
+ * The page-level Uptodate flag is set only once the whole bitmap reads
+ * U16_MAX.
+ * NOTE(review): that condition needs all 16 bits set, which presumably
+ * assumes 16 sectors per page - confirm for other page/sector size ratios.
+ */
+void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->uptodate_bitmap |= tmp;
+ if (subpage->uptodate_bitmap == U16_MAX)
+ SetPageUptodate(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+/*
+ * Clear the uptodate bits for the sectors of [@start, @start + @len), under
+ * subpage->lock with interrupts saved.
+ *
+ * The page-level Uptodate flag is always cleared, whatever bits remain set.
+ */
+void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->uptodate_bitmap &= ~tmp;
+ ClearPageUptodate(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+/*
+ * Set the error bits for the sectors of [@start, @start + @len), under
+ * subpage->lock with interrupts saved.
+ *
+ * The page-level Error flag is always set.
+ */
+void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->error_bitmap |= tmp;
+ SetPageError(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+/*
+ * Clear the error bits for the sectors of [@start, @start + @len), under
+ * subpage->lock with interrupts saved.
+ *
+ * The page-level Error flag is cleared only when no error bit remains set.
+ */
+void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->error_bitmap &= ~tmp;
+ if (subpage->error_bitmap == 0)
+ ClearPageError(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+/*
+ * Unlike set/clear which is dependent on each page status, for test all bits
+ * are tested in the same way.
+ *
+ * Generates btrfs_subpage_test_<name>(), which returns true only when every
+ * bit of the [start, start + len) range is set in <name>_bitmap. The bitmap
+ * is read under subpage->lock with interrupts saved.
+ */
+#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \
+bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
+ unsigned long flags; \
+ bool ret; \
+ \
+ spin_lock_irqsave(&subpage->lock, flags); \
+ ret = ((subpage->name##_bitmap & tmp) == tmp); \
+ spin_unlock_irqrestore(&subpage->lock, flags); \
+ return ret; \
+}
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
+
+/*
+ * Note that, in selftests (extent-io-tests), we can have empty fs_info passed
+ * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
+ * back to regular sectorsize branch.
+ *
+ * Generates btrfs_page_{set,clear,test}_<name>(). Each one applies the given
+ * page-level function when fs_info is NULL or sectorsize equals PAGE_SIZE,
+ * and otherwise forwards to the matching btrfs_subpage_*_<name>() helper.
+ */
+#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \
+ test_page_func) \
+void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ set_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_set_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ clear_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clear_##name(fs_info, page, start, len); \
+} \
+bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
+ return test_page_func(page); \
+ return btrfs_subpage_test_##name(fs_info, page, start, len); \
+}
+IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
+ PageUptodate);
+IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
new file mode 100644
index 000000000000..b86a4881475d
--- /dev/null
+++ b/fs/btrfs/subpage.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_SUBPAGE_H
+#define BTRFS_SUBPAGE_H
+
+#include <linux/spinlock.h>
+
+/*
+ * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap
+ * is sufficient. Regular bitmap_* is not used due to size reasons.
+ */
+#define BTRFS_SUBPAGE_BITMAP_SIZE 16
+
+/*
+ * Structure to trace status of each sector inside a page, attached to
+ * page::private for both data and metadata inodes.
+ */
+struct btrfs_subpage {
+ /* Common members for both data and metadata pages */
+ /* Protects the two bitmaps below */
+ spinlock_t lock;
+ /* One bit per sector of the page; set when that sector is uptodate */
+ u16 uptodate_bitmap;
+ /* One bit per sector of the page; set when that sector has an error */
+ u16 error_bitmap;
+ union {
+ /*
+ * Structures only used by metadata
+ *
+ * @eb_refs should only be operated under private_lock, as it
+ * manages whether the subpage can be detached.
+ */
+ atomic_t eb_refs;
+ /* Structures only used by data */
+ struct {
+ /* Number of sectors currently accounted as being read */
+ atomic_t readers;
+ };
+ };
+};
+
+/*
+ * Kind of page a btrfs_subpage is allocated for. btrfs_alloc_subpage() uses
+ * it to pick which member of the union to initialize (eb_refs for metadata,
+ * readers for data).
+ */
+enum btrfs_subpage_type {
+ BTRFS_SUBPAGE_METADATA,
+ BTRFS_SUBPAGE_DATA,
+};
+
+int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
+/* Allocate additional data where page represents more than one sector */
+int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ struct btrfs_subpage **ret,
+ enum btrfs_subpage_type type);
+void btrfs_free_subpage(struct btrfs_subpage *subpage);
+
+void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
+ struct page *page);
+
+void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
+/*
+ * Template for subpage related operations.
+ *
+ * btrfs_subpage_*() are for call sites where the page has subpage attached and
+ * the range is ensured to be inside the page.
+ *
+ * btrfs_page_*() are for call sites where the page can either be subpage
+ * specific or regular page. The function will handle both cases.
+ * But the range still needs to be inside the page.
+ */
+#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
+void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len);
+
+DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
+DECLARE_BTRFS_SUBPAGE_OPS(error);
+
+#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 12d7d3be7cd4..f8435641b912 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -48,7 +48,6 @@
#include "tests/btrfs-tests.h"
#include "block-group.h"
#include "discard.h"
-
#include "qgroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@@ -2028,6 +2027,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
ret = -EINVAL;
goto restore;
}
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ btrfs_warn(fs_info,
+ "read-write mount is not yet allowed for sectorsize %u page size %lu",
+ fs_info->sectorsize, PAGE_SIZE);
+ ret = -EINVAL;
+ goto restore;
+ }
/*
* NOTE: when remounting with a change that does writes, don't
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 19b9fffa2c9c..6eb1c50fa98c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -666,6 +666,7 @@ SPACE_INFO_ATTR(bytes_pinned);
SPACE_INFO_ATTR(bytes_reserved);
SPACE_INFO_ATTR(bytes_may_use);
SPACE_INFO_ATTR(bytes_readonly);
+SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
BTRFS_ATTR(space_info, total_bytes_pinned,
@@ -679,6 +680,7 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_reserved),
BTRFS_ATTR_PTR(space_info, bytes_may_use),
BTRFS_ATTR_PTR(space_info, bytes_readonly),
+ BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 57379e96ccc9..c0aefe6dee0b 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -507,7 +507,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
goto out_free;
}
- ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+ ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1),
&logical, &out_ndaddrs, &out_stripe_len);
if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
test_err("didn't rmap anything but expected %d",
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8e0f7a1029c6..acff6bb49a97 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -21,6 +21,7 @@
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
+#include "zoned.h"
#define BTRFS_ROOT_TRANS_TAG 0
@@ -107,6 +108,11 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
__TRANS_JOIN |
__TRANS_JOIN_NOLOCK |
__TRANS_JOIN_NOSTART),
+ [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START |
+ __TRANS_ATTACH |
+ __TRANS_JOIN |
+ __TRANS_JOIN_NOLOCK |
+ __TRANS_JOIN_NOSTART),
[TRANS_STATE_COMPLETED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
@@ -375,6 +381,8 @@ loop:
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
spin_lock_init(&cur_trans->dropped_roots_lock);
+ INIT_LIST_HEAD(&cur_trans->releasing_ebs);
+ spin_lock_init(&cur_trans->releasing_ebs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
@@ -826,10 +834,11 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
return trans;
}
-/* wait for a transaction commit to be fully complete */
-static noinline void wait_for_commit(struct btrfs_transaction *commit)
+/* Wait for a transaction commit to reach at least the given state. */
+static noinline void wait_for_commit(struct btrfs_transaction *commit,
+ const enum btrfs_trans_state min_state)
{
- wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
+ wait_event(commit->commit_wait, commit->state >= min_state);
}
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
@@ -884,7 +893,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
goto out; /* nothing committing|committed */
}
- wait_for_commit(cur_trans);
+ wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
btrfs_put_transaction(cur_trans);
out:
return ret;
@@ -909,9 +918,8 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
- smp_mb();
if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
- cur_trans->delayed_refs.flushing)
+ test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
return true;
return should_end_transaction(trans);
@@ -1230,10 +1238,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- return ret;
-
ret = btrfs_run_dev_stats(trans);
if (ret)
return ret;
@@ -1248,10 +1252,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
if (ret)
return ret;
- /* run_qgroups might have added some more refs */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- return ret;
again:
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
struct btrfs_root *root;
@@ -1266,15 +1266,24 @@ again:
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- return ret;
}
+ /* Now flush any delayed refs generated by updating all of the roots */
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ if (ret)
+ return ret;
+
while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
ret = btrfs_write_dirty_block_groups(trans);
if (ret)
return ret;
+
+ /*
+ * We're writing the dirty block groups, which could generate
+ * delayed refs, which could generate more dirty block groups,
+ * so we want to keep this flushing in this loop to make sure
+ * everything gets run.
+ */
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
@@ -1319,7 +1328,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
struct btrfs_root *gang[8];
int i;
int ret;
- int err = 0;
spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
@@ -1331,6 +1339,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
break;
for (i = 0; i < ret; i++) {
struct btrfs_root *root = gang[i];
+ int ret2;
+
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
@@ -1350,17 +1360,17 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
root->node);
}
- err = btrfs_update_root(trans, fs_info->tree_root,
+ ret2 = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key,
&root->root_item);
+ if (ret2)
+ return ret2;
spin_lock(&fs_info->fs_roots_radix_lock);
- if (err)
- break;
btrfs_qgroup_free_meta_all_pertrans(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
- return err;
+ return 0;
}
/*
@@ -1433,6 +1443,23 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
record_root_in_trans(trans, src, 1);
/*
+ * btrfs_qgroup_inherit relies on a consistent view of the usage for the
+ * src root, so we must run the delayed refs here.
+ *
+ * However this isn't particularly fool proof, because there's no
+ * synchronization keeping us from changing the tree after this point
+ * before we do the qgroup_inherit, or even from making changes while
+ * we're doing the qgroup_inherit. But that's a problem for the future,
+ * for now flush the delayed refs to narrow the race window where the
+ * qgroup counters could end up wrong.
+ */
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ /*
* We are going to commit transaction, see btrfs_commit_transaction()
* comment for reason locking tree_log_mutex
*/
@@ -1525,7 +1552,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ASSERT(pending->root_item);
new_root_item = pending->root_item;
- pending->error = btrfs_find_free_objectid(tree_root, &objectid);
+ pending->error = btrfs_get_free_objectid(tree_root, &objectid);
if (pending->error)
goto no_free_objectid;
@@ -1685,12 +1712,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
/*
* Do special qgroup accounting for snapshot, as we do some qgroup
* snapshot hack to do fast snapshot.
@@ -1738,12 +1759,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
}
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
- }
-
fail:
pending->error = ret;
dir_item_existed:
@@ -2042,32 +2057,25 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
- /* make a pass through all the delayed refs we have so far
- * any runnings procs may add more while we are here
- */
- ret = btrfs_run_delayed_refs(trans, 0);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
-
- cur_trans = trans->transaction;
-
/*
- * set the flushing flag so procs in this transaction have to
- * start sending their work down.
+ * We only want one transaction commit doing the flushing so we do not
+ * waste a bunch of time on lock contention on the extent root node.
*/
- cur_trans->delayed_refs.flushing = 1;
- smp_wmb();
+ if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
+ &cur_trans->delayed_refs.flags)) {
+ /*
+ * Make a pass through all the delayed refs we have so far.
+ * Any running threads may add more while we are here.
+ */
+ ret = btrfs_run_delayed_refs(trans, 0);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+ }
btrfs_create_pending_block_groups(trans);
- ret = btrfs_run_delayed_refs(trans, 0);
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
- }
-
if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
int run_it = 0;
@@ -2101,11 +2109,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
spin_lock(&fs_info->trans_lock);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+ enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
+
spin_unlock(&fs_info->trans_lock);
refcount_inc(&cur_trans->use_count);
- ret = btrfs_end_transaction(trans);
- wait_for_commit(cur_trans);
+ if (trans->in_fsync)
+ want_state = TRANS_STATE_SUPER_COMMITTED;
+ ret = btrfs_end_transaction(trans);
+ wait_for_commit(cur_trans, want_state);
if (TRANS_ABORTED(cur_trans))
ret = cur_trans->aborted;
@@ -2119,13 +2131,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_blocked_wait);
if (cur_trans->list.prev != &fs_info->trans_list) {
+ enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
+
+ if (trans->in_fsync)
+ want_state = TRANS_STATE_SUPER_COMMITTED;
+
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
- if (prev_trans->state != TRANS_STATE_COMPLETED) {
+ if (prev_trans->state < want_state) {
refcount_inc(&prev_trans->use_count);
spin_unlock(&fs_info->trans_lock);
- wait_for_commit(prev_trans);
+ wait_for_commit(prev_trans, want_state);
+
ret = READ_ONCE(prev_trans->aborted);
btrfs_put_transaction(prev_trans);
@@ -2265,14 +2283,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_free_log_root_tree(trans, fs_info);
/*
- * commit_fs_roots() can call btrfs_save_ino_cache(), which generates
- * new delayed refs. Must handle them or qgroup can be wrong.
- */
- ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
- if (ret)
- goto unlock_tree_log;
-
- /*
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
@@ -2343,6 +2353,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
goto scrub_continue;
}
+ /*
+ * At this point, we should have written all the tree blocks allocated
+ * in this transaction. So it's now safe to free the redirtied extent
+ * buffers.
+ */
+ btrfs_free_redirty_list(cur_trans);
+
ret = write_all_supers(fs_info, 0);
/*
* the super is written, we can safely allow the tree-loggers
@@ -2352,6 +2369,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto scrub_continue;
+ /*
+ * We needn't acquire the lock here because there is no other task
+ * which can change it.
+ */
+ cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
+ wake_up(&cur_trans->commit_wait);
+
btrfs_finish_extent_commit(trans);
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 31ca81bad822..6335716e513f 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -16,6 +16,7 @@ enum btrfs_trans_state {
TRANS_STATE_COMMIT_START,
TRANS_STATE_COMMIT_DOING,
TRANS_STATE_UNBLOCKED,
+ TRANS_STATE_SUPER_COMMITTED,
TRANS_STATE_COMPLETED,
TRANS_STATE_MAX,
};
@@ -92,6 +93,9 @@ struct btrfs_transaction {
*/
atomic_t pending_ordered;
wait_queue_head_t pending_wait;
+
+ spinlock_t releasing_ebs_lock;
+ struct list_head releasing_ebs;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -133,6 +137,7 @@ struct btrfs_trans_handle {
bool can_flush_pending_bgs;
bool reloc_reserved;
bool dirty;
+ bool in_fsync;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
struct list_head new_bgs;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 254c2ee43aae..d90695c1ab6c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -19,6 +19,7 @@
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
+#include "zoned.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -104,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
u64 dirid, int del_all);
+static void wait_log_commit(struct btrfs_root *root, int transid);
/*
* tree logging is a special write ahead log used to make sure that
@@ -139,7 +141,9 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *tree_root = fs_info->tree_root;
+ const bool zoned = btrfs_is_zoned(fs_info);
int ret = 0;
+ bool created = false;
/*
* First check if the log root tree was already created. If not, create
@@ -149,8 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_lock(&tree_root->log_mutex);
if (!fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, fs_info);
- if (!ret)
+ if (!ret) {
set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
+ created = true;
+ }
}
mutex_unlock(&tree_root->log_mutex);
if (ret)
@@ -159,12 +165,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
if (btrfs_need_log_full_commit(trans)) {
ret = -EAGAIN;
goto out;
}
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
+
if (!root->log_start_pid) {
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
@@ -172,6 +186,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
} else {
+ /*
+ * This means fs_info->log_root_tree was already created
+ * for some other FS trees. Do a full commit so that nodes
+ * from multiple log transactions are not mixed, which keeps
+ * the writes sequential.
+ */
+ if (zoned && !created) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
ret = btrfs_add_log_tree(trans, root);
if (ret)
goto out;
@@ -200,14 +225,22 @@ out:
*/
static int join_running_log_trans(struct btrfs_root *root)
{
+ const bool zoned = btrfs_is_zoned(root->fs_info);
int ret = -ENOENT;
if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
return ret;
mutex_lock(&root->log_mutex);
+again:
if (root->log_root) {
+ int index = (root->log_transid + 1) % 2;
+
ret = 0;
+ if (zoned && atomic_read(&root->log_commit[index])) {
+ wait_log_commit(root, root->log_transid - 1);
+ goto again;
+ }
atomic_inc(&root->log_writers);
}
mutex_unlock(&root->log_mutex);
@@ -2752,6 +2785,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
return ret;
}
+ btrfs_redirty_list_add(
+ trans->transaction, next);
} else {
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
clear_extent_buffer_dirty(next);
@@ -3085,6 +3120,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
blk_start_plug(&plug);
ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
+ /*
+ * -EAGAIN happens when someone, e.g., a concurrent transaction
+ * commit, writes a dirty extent in this tree-log commit. This
+ * concurrent write will create a hole writing out the extents,
+ * and we cannot proceed on a zoned filesystem, requiring
+ * sequential writing. We could bail out to a full commit
+ * here, but instead we continue, hoping the concurrent write
+ * fills the hole.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
+ ret = 0;
if (ret) {
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, ret);
@@ -3127,6 +3173,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
+ /*
+ * On a zoned filesystem the log root tree node may not have been
+ * allocated yet. Allocate it here, serialized by
+ * tree_root->log_mutex.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ mutex_lock(&fs_info->tree_root->log_mutex);
+ if (!log_root_tree->node) {
+ ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
+ if (ret) {
+ /*
+ * Release the mutex taken just above. Unlocking
+ * fs_info->tree_log_mutex here would drop a lock
+ * this path never acquired and would leave
+ * tree_root->log_mutex held.
+ */
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out;
+ }
+ }
+ mutex_unlock(&fs_info->tree_root->log_mutex);
+ }
+
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
@@ -3194,7 +3253,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
&log_root_tree->dirty_log_pages,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
- if (ret) {
+ /*
+ * As described above, -EAGAIN indicates a hole in the extents. We
+ * cannot wait for these writeouts since waiting would cause a
+ * deadlock. Bail out to the full commit instead.
+ */
+ if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
+ btrfs_set_log_full_commit(trans);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ } else if (ret) {
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3285,17 +3354,22 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
.process_func = process_one_buffer
};
- ret = walk_log_tree(trans, log, &wc);
- if (ret) {
- if (trans)
- btrfs_abort_transaction(trans, ret);
- else
- btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ if (log->node) {
+ ret = walk_log_tree(trans, log, &wc);
+ if (ret) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
}
clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
extent_io_tree_release(&log->log_csum_range);
+
+ if (trans && log->node)
+ btrfs_redirty_list_add(trans->transaction, log->node);
btrfs_put_root(log);
}
@@ -3379,7 +3453,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
int ret;
int err = 0;
- int bytes_del = 0;
u64 dir_ino = btrfs_ino(dir);
if (!inode_logged(trans, dir))
@@ -3406,7 +3479,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
}
if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
- bytes_del += name_len;
if (ret) {
err = ret;
goto fail;
@@ -3421,46 +3493,17 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
}
if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
- bytes_del += name_len;
if (ret) {
err = ret;
goto fail;
}
}
- /* update the directory size in the log to reflect the names
- * we have removed
+ /*
+ * We do not need to update the size field of the directory's inode item
+ * because on log replay we update the field to reflect all existing
+ * entries in the directory (see overwrite_item()).
*/
- if (bytes_del) {
- struct btrfs_key key;
-
- key.objectid = dir_ino;
- key.offset = 0;
- key.type = BTRFS_INODE_ITEM_KEY;
- btrfs_release_path(path);
-
- ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
- if (ret < 0) {
- err = ret;
- goto fail;
- }
- if (ret == 0) {
- struct btrfs_inode_item *item;
- u64 i_size;
-
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_inode_item);
- i_size = btrfs_inode_size(path->nodes[0], item);
- if (i_size > bytes_del)
- i_size -= bytes_del;
- else
- i_size = 0;
- btrfs_set_inode_size(path->nodes[0], item, i_size);
- btrfs_mark_buffer_dirty(path->nodes[0]);
- } else
- ret = 0;
- btrfs_release_path(path);
- }
fail:
btrfs_free_path(path);
out_unlock:
@@ -3889,7 +3932,14 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_timespec_nsec(&token, &item->ctime,
inode->i_ctime.tv_nsec);
- btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
+ /*
+ * We do not need to set the nbytes field, in fact during a fast fsync
+ * its value may not even be correct, since a fast fsync does not wait
+ * for ordered extent completion, which is where we update nbytes, it
+ * only waits for writeback to complete. During log replay as we find
+ * file extent items and replay them, we adjust the nbytes field of the
+ * inode item in subvolume tree as needed (see overwrite_item()).
+ */
btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
btrfs_set_token_inode_transid(&token, item, trans->transid);
@@ -5290,12 +5340,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+ * This is for cases where logging a directory could result in losing a
+ * file after replaying the log. For example, if we move a file from a
+ * directory A to a directory B, then fsync directory A, we have no way
+ * to know the file was moved from A to B, so logging just A would
+ * result in losing the file after a log replay.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode) &&
+ inode_only == LOG_INODE_ALL &&
+ inode->last_unlink_trans >= trans->transid) {
+ btrfs_set_log_full_commit(trans);
+ err = 1;
+ goto out_unlock;
+ }
+
+ /*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) {
int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
@@ -5452,96 +5518,31 @@ out_unlock:
}
/*
- * Check if we must fallback to a transaction commit when logging an inode.
- * This must be called after logging the inode and is used only in the context
- * when fsyncing an inode requires the need to log some other inode - in which
- * case we can't lock the i_mutex of each other inode we need to log as that
- * can lead to deadlocks with concurrent fsync against other inodes (as we can
- * log inodes up or down in the hierarchy) or rename operations for example. So
- * we take the log_mutex of the inode after we have logged it and then check for
- * its last_unlink_trans value - this is safe because any task setting
- * last_unlink_trans must take the log_mutex and it must do this before it does
- * the actual unlink operation, so if we do this check before a concurrent task
- * sets last_unlink_trans it means we've logged a consistent version/state of
- * all the inode items, otherwise we are not sure and must do a transaction
- * commit (the concurrent task might have only updated last_unlink_trans before
- * we logged the inode or it might have also done the unlink).
+ * Check if we need to log an inode. This is used in contexts where while
+ * logging an inode we need to log another inode (either that it exists or in
+ * full mode). This is used instead of btrfs_inode_in_log() because the latter
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
*/
-static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+static bool need_log_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
{
- bool ret = false;
-
- mutex_lock(&inode->log_mutex);
- if (inode->last_unlink_trans >= trans->transid) {
- /*
- * Make sure any commits to the log are forced to be full
- * commits.
- */
- btrfs_set_log_full_commit(trans);
- ret = true;
- }
- mutex_unlock(&inode->log_mutex);
-
- return ret;
-}
-
-/*
- * follow the dentry parent pointers up the chain and see if any
- * of the directories in it require a full commit before they can
- * be logged. Returns zero if nothing special needs to be done or 1 if
- * a full commit is required.
- */
-static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode,
- struct dentry *parent,
- struct super_block *sb)
-{
- int ret = 0;
- struct dentry *old_parent = NULL;
-
/*
- * for regular files, if its inode is already on disk, we don't
- * have to worry about the parents at all. This is because
- * we can use the last_unlink_trans field to record renames
- * and other fun in this file.
+ * If this inode does not have new/updated/deleted xattrs since the last
+ * time it was logged and is flagged as logged in the current transaction,
+ * we can skip logging it. As for new/deleted names, those are updated in
+ * the log by link/unlink/rename operations.
+ * In case the inode was logged and then evicted and reloaded, its
+ * logged_trans will be 0, in which case we have to fully log it since
+ * logged_trans is a transient field, not persisted.
*/
- if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation < trans->transid &&
- inode->last_unlink_trans < trans->transid)
- goto out;
-
- if (!S_ISDIR(inode->vfs_inode.i_mode)) {
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- goto out;
- inode = BTRFS_I(d_inode(parent));
- }
-
- while (1) {
- if (btrfs_must_commit_transaction(trans, inode)) {
- ret = 1;
- break;
- }
-
- if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
- break;
-
- if (IS_ROOT(parent)) {
- inode = BTRFS_I(d_inode(parent));
- if (btrfs_must_commit_transaction(trans, inode))
- ret = 1;
- break;
- }
-
- parent = dget_parent(parent);
- dput(old_parent);
- old_parent = parent;
- inode = BTRFS_I(d_inode(parent));
+ if (inode->logged_trans == trans->transid &&
+ !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+ return false;
- }
- dput(old_parent);
-out:
- return ret;
+ return true;
}
struct btrfs_dir_list {
@@ -5671,7 +5672,7 @@ process_leaf:
goto next_dir_inode;
}
- if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
+ if (!need_log_inode(trans, BTRFS_I(di_inode))) {
btrfs_add_delayed_iput(di_inode);
break;
}
@@ -5681,9 +5682,6 @@ process_leaf:
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
log_mode, ctx);
- if (!ret &&
- btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
- ret = 1;
btrfs_add_delayed_iput(di_inode);
if (ret)
goto next_dir_inode;
@@ -5821,13 +5819,15 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
goto out;
}
+ if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
+ btrfs_add_delayed_iput(dir_inode);
+ continue;
+ }
+
if (ctx)
ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
- if (!ret &&
- btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
- ret = 1;
if (!ret && ctx && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
@@ -5872,7 +5872,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation >= trans->transid)
+ if (BTRFS_I(inode)->generation >= trans->transid &&
+ need_log_inode(trans, BTRFS_I(inode)))
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
@@ -5926,7 +5927,8 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (root != inode->root)
break;
- if (inode->generation >= trans->transid) {
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode)) {
ret = btrfs_log_inode(trans, root, inode,
LOG_INODE_EXISTS, ctx);
if (ret)
@@ -6041,12 +6043,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct super_block *sb;
int ret = 0;
bool log_dentries = false;
- sb = inode->vfs_inode.i_sb;
-
if (btrfs_test_opt(fs_info, NOTREELOG)) {
ret = 1;
goto end_no_trans;
@@ -6057,10 +6056,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- ret = check_parent_dirs_for_sync(trans, inode, parent, sb);
- if (ret)
- goto end_no_trans;
-
/*
* Skip already logged inodes or inodes corresponding to tmpfiles
* (since logging them is pointless, a link count of 0 means they
@@ -6307,8 +6302,7 @@ again:
* root->objectid_mutex is not acquired as log replay
* could only happen during mount.
*/
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
+ ret = btrfs_init_root_free_objectid(root);
}
wc.replay_dest->log_root = NULL;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b62be84833e9..bc3b33efddc5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -421,7 +421,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
* Preallocate a bio that's always going to be used for flushing device
* barriers and matches the device lifespan
*/
- dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
+ dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
if (!dev->flush_bio) {
kfree(dev);
return ERR_PTR(-ENOMEM);
@@ -433,7 +433,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
- btrfs_device_data_ordered_init(dev, fs_info);
+ btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
extent_io_tree_init(fs_info, &dev->alloc_state,
@@ -669,10 +669,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
device->mode = flags;
- ret = btrfs_get_dev_zone_info(device);
- if (ret != 0)
- goto error_free_page;
-
fs_devices->open_devices++;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -1418,11 +1414,62 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
* make sure to start at an offset of at least 1MB.
*/
return max_t(u64, start, SZ_1M);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ /*
+ * Unlike the regular allocator, we don't care about the
+ * starting region, because we use/reserve the first two zones
+ * for superblock logging anyway.
+ */
+ return ALIGN(start, device->zone_info->zone_size);
default:
BUG();
}
}
+static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
+ u64 *hole_start, u64 *hole_size,
+ u64 num_bytes)
+{
+ u64 zone_size = device->zone_info->zone_size;
+ u64 pos;
+ int ret;
+ bool changed = false;
+
+ ASSERT(IS_ALIGNED(*hole_start, zone_size));
+
+ while (*hole_size > 0) {
+ pos = btrfs_find_allocatable_zones(device, *hole_start,
+ *hole_start + *hole_size,
+ num_bytes);
+ if (pos != *hole_start) {
+ *hole_size = *hole_start + *hole_size - pos;
+ *hole_start = pos;
+ changed = true;
+ if (*hole_size < num_bytes)
+ break;
+ }
+
+ ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
+
+ /* Range is ensured to be empty */
+ if (!ret)
+ return changed;
+
+ /* Given hole range was invalid (outside of device) */
+ if (ret == -ERANGE) {
+ *hole_start += *hole_size;
+ *hole_size = 0;
+ return 1;
+ }
+
+ *hole_start += zone_size;
+ *hole_size -= zone_size;
+ changed = true;
+ }
+
+ return changed;
+}
+
/**
* dev_extent_hole_check - check if specified hole is suitable for allocation
* @device: the device which we have the hole
@@ -1430,7 +1477,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
* @hole_size: the size of the hole
* @num_bytes: the size of the free space that we need
*
- * This function may modify @hole_start and @hole_end to reflect the suitable
+ * This function may modify @hole_start and @hole_size to reflect the suitable
* position for allocation. Returns 1 if hole position is updated, 0 otherwise.
*/
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
@@ -1439,24 +1486,39 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
bool changed = false;
u64 hole_end = *hole_start + *hole_size;
- /*
- * Check before we set max_hole_start, otherwise we could end up
- * sending back this offset anyway.
- */
- if (contains_pending_extent(device, hole_start, *hole_size)) {
- if (hole_end >= *hole_start)
- *hole_size = hole_end - *hole_start;
- else
- *hole_size = 0;
- changed = true;
- }
+ for (;;) {
+ /*
+ * Check before we set max_hole_start, otherwise we could end up
+ * sending back this offset anyway.
+ */
+ if (contains_pending_extent(device, hole_start, *hole_size)) {
+ if (hole_end >= *hole_start)
+ *hole_size = hole_end - *hole_start;
+ else
+ *hole_size = 0;
+ changed = true;
+ }
+
+ switch (device->fs_devices->chunk_alloc_policy) {
+ case BTRFS_CHUNK_ALLOC_REGULAR:
+ /* No extra check */
+ break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ if (dev_extent_hole_check_zoned(device, hole_start,
+ hole_size, num_bytes)) {
+ changed = true;
+ /*
+ * The changed hole can contain pending extent.
+ * Loop again to check that.
+ */
+ continue;
+ }
+ break;
+ default:
+ BUG();
+ }
- switch (device->fs_devices->chunk_alloc_policy) {
- case BTRFS_CHUNK_ALLOC_REGULAR:
- /* No extra check */
break;
- default:
- BUG();
}
return changed;
@@ -1509,6 +1571,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
search_start = dev_extent_search_start(device, search_start);
+ WARN_ON(device->zone_info &&
+ !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -4317,6 +4382,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
+ btrfs_release_path(path);
+
mutex_lock(&fs_info->balance_mutex);
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
@@ -4666,11 +4733,10 @@ again:
}
ret = btrfs_previous_item(root, path, 0, key.type);
- if (ret)
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
- if (ret < 0)
- goto done;
if (ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ if (ret < 0)
+ goto done;
ret = 0;
btrfs_release_path(path);
break;
@@ -4902,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular(
ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
}
+static void init_alloc_chunk_ctl_policy_zoned(
+ struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+{
+ u64 zone_size = fs_devices->fs_info->zone_size;
+ u64 limit;
+ int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
+ int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
+ u64 min_chunk_size = min_data_stripes * zone_size;
+ u64 type = ctl->type;
+
+ ctl->max_stripe_size = zone_size;
+ if (type & BTRFS_BLOCK_GROUP_DATA) {
+ ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
+ zone_size);
+ } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+ ctl->max_chunk_size = ctl->max_stripe_size;
+ } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+ ctl->max_chunk_size = 2 * ctl->max_stripe_size;
+ ctl->devs_max = min_t(int, ctl->devs_max,
+ BTRFS_MAX_DEVS_SYS_CHUNK);
+ }
+
+ /* We don't want a chunk larger than 10% of writable space */
+ limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
+ zone_size),
+ min_chunk_size);
+ ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
+ ctl->dev_extent_min = zone_size * ctl->dev_stripes;
+}
+
static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl)
{
@@ -4922,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
case BTRFS_CHUNK_ALLOC_REGULAR:
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
break;
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
+ break;
default:
BUG();
}
@@ -5048,6 +5148,38 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
return 0;
}
+static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
+ struct btrfs_device_info *devices_info)
+{
+ u64 zone_size = devices_info[0].dev->zone_info->zone_size;
+ /* Number of stripes that count for block group size */
+ int data_stripes;
+
+ /*
+ * It should hold because:
+ * dev_extent_min == dev_extent_want == zone_size * dev_stripes
+ */
+ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+
+ ctl->stripe_size = zone_size;
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+
+ /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
+ if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
+ ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
+ ctl->stripe_size) + ctl->nparity,
+ ctl->dev_stripes);
+ ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+ data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+ ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
+ }
+
+ ctl->chunk_size = ctl->stripe_size * data_stripes;
+
+ return 0;
+}
+
static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
@@ -5075,6 +5207,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
switch (fs_devices->chunk_alloc_policy) {
case BTRFS_CHUNK_ALLOC_REGULAR:
return decide_stripe_size_regular(ctl, devices_info);
+ case BTRFS_CHUNK_ALLOC_ZONED:
+ return decide_stripe_size_zoned(ctl, devices_info);
default:
BUG();
}
@@ -5839,9 +5973,29 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
return ret;
}
+static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
+{
+ struct btrfs_block_group *cache;
+ bool ret;
+
+ /* Non zoned filesystem does not use "to_copy" flag */
+ if (!btrfs_is_zoned(fs_info))
+ return false;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+
+ spin_lock(&cache->lock);
+ ret = cache->to_copy;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
struct btrfs_bio **bbio_ret,
struct btrfs_dev_replace *dev_replace,
+ u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
struct btrfs_bio *bbio = *bbio_ret;
@@ -5855,6 +6009,13 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
int index_where_to_add;
/*
+ * A block group which has "to_copy" set will eventually be
+ * copied by the dev-replace process. We can avoid cloning IO here.
+ */
+ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
+ return;
+
+ /*
* duplicate the write operations while the dev replace
* procedure is running. Since the copying of the old disk to
* the new disk takes place at run time while the filesystem is
@@ -5939,23 +6100,24 @@ static bool need_full_stripe(enum btrfs_map_op op)
}
/*
- * btrfs_get_io_geometry - calculates the geomery of a particular (address, len)
- * tuple. This information is used to calculate how big a
- * particular bio can get before it straddles a stripe.
+ * Calculate the geometry of a particular (address, len) tuple. This
+ * information is used to calculate how big a particular bio can get before it
+ * straddles a stripe.
*
- * @fs_info - the filesystem
- * @logical - address that we want to figure out the geometry of
- * @len - the length of IO we are going to perform, starting at @logical
- * @op - type of operation - write or read
- * @io_geom - pointer used to return values
+ * @fs_info: the filesystem
+ * @em: mapping containing the logical extent
+ * @op: type of operation - write or read
+ * @logical: address that we want to figure out the geometry of
+ * @len: the length of IO we are going to perform, starting at @logical
+ * @io_geom: pointer used to return values
*
* Returns < 0 in case a chunk for the given logical address cannot be found,
* usually shouldn't happen unless @logical is corrupted, 0 otherwise.
*/
-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
+ enum btrfs_map_op op, u64 logical, u64 len,
+ struct btrfs_io_geometry *io_geom)
{
- struct extent_map *em;
struct map_lookup *map;
u64 offset;
u64 stripe_offset;
@@ -5963,14 +6125,9 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
- int ret = 0;
ASSERT(op != BTRFS_MAP_DISCARD);
- em = btrfs_get_chunk_map(fs_info, logical, len);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
map = em->map_lookup;
/* Offset of this logical address in the chunk */
offset = logical - em->start;
@@ -5984,8 +6141,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
stripe_offset, offset, em->start, logical, stripe_len);
- ret = -EINVAL;
- goto out;
+ return -EINVAL;
}
/* stripe_offset is the offset of this block in its stripe */
@@ -6032,10 +6188,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
io_geom->stripe_offset = stripe_offset;
io_geom->raid56_stripe_offset = raid56_full_stripe_start;
-out:
- /* once for us */
- free_extent_map(em);
- return ret;
+ return 0;
}
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
@@ -6068,12 +6221,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
ASSERT(bbio_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
- ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
+ em = btrfs_get_chunk_map(fs_info, logical, *length);
+ ASSERT(!IS_ERR(em));
+
+ ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom);
if (ret < 0)
return ret;
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- ASSERT(!IS_ERR(em));
map = em->map_lookup;
*length = geom.len;
@@ -6249,8 +6403,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
- &max_errors);
+ handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
+ &num_stripes, &max_errors);
}
*bbio_ret = bbio;
@@ -6321,7 +6475,7 @@ static void btrfs_end_bio(struct bio *bio)
struct btrfs_device *dev = btrfs_io_bio(bio)->device;
ASSERT(dev->bdev);
- if (bio_op(bio) == REQ_OP_WRITE)
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else if (!(bio->bi_opf & REQ_RAHEAD))
@@ -6373,6 +6527,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
btrfs_io_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
+ /*
+ * For zone append writing, bi_sector must point to the beginning of
+ * the zone.
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ if (btrfs_dev_is_sequential(dev, physical)) {
+ u64 zone_start = round_down(physical, fs_info->zone_size);
+
+ bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
+ } else {
+ bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
+ bio->bi_opf |= REQ_OP_WRITE;
+ }
+ }
btrfs_debug_in_rcu(fs_info,
"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
@@ -6434,10 +6602,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
- ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
+ ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
- if (bio_op(bio) == REQ_OP_WRITE) {
+ if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
ret = raid56_parity_write(fs_info, bio, bbio,
map_length);
} else {
@@ -6460,7 +6628,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
&dev->dev_state) ||
- (bio_op(first_bio) == REQ_OP_WRITE &&
+ (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bbio_error(bbio, first_bio, logical);
continue;
@@ -7642,6 +7810,20 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
ret = -EUCLEAN;
goto out;
}
+
+ if (dev->zone_info) {
+ u64 zone_size = dev->zone_info->zone_size;
+
+ if (!IS_ALIGNED(physical_offset, zone_size) ||
+ !IS_ALIGNED(physical_len, zone_size)) {
+ btrfs_err(fs_info,
+"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
+ devid, physical_offset, physical_len);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+
out:
free_extent_map(em);
return ret;
@@ -7798,3 +7980,75 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
spin_unlock(&fs_info->swapfile_pins_lock);
return node != NULL;
}
+
+static int relocating_repair_kthread(void *data)
+{
+ struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ u64 target;
+ int ret = 0;
+
+ target = cache->start;
+ btrfs_put_block_group(cache);
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ btrfs_info(fs_info,
+ "zoned: skip relocating block group %llu to repair: EBUSY",
+ target);
+ return -EBUSY;
+ }
+
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
+
+ /* Ensure block group still exists */
+ cache = btrfs_lookup_block_group(fs_info, target);
+ if (!cache)
+ goto out;
+
+ if (!cache->relocating_repair)
+ goto out;
+
+ ret = btrfs_may_alloc_data_chunk(fs_info, target);
+ if (ret < 0)
+ goto out;
+
+ btrfs_info(fs_info,
+ "zoned: relocating block group %llu to repair IO failure",
+ target);
+ ret = btrfs_relocate_chunk(fs_info, target);
+
+out:
+ if (cache)
+ btrfs_put_block_group(cache);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ btrfs_exclop_finish(fs_info);
+
+ return ret;
+}
+
+int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
+{
+ struct btrfs_block_group *cache;
+
+ /* Do not attempt to repair in degraded state */
+ if (btrfs_test_opt(fs_info, DEGRADED))
+ return 0;
+
+ cache = btrfs_lookup_block_group(fs_info, logical);
+ if (!cache)
+ return 0;
+
+ spin_lock(&cache->lock);
+ if (cache->relocating_repair) {
+ spin_unlock(&cache->lock);
+ btrfs_put_block_group(cache);
+ return 0;
+ }
+ cache->relocating_repair = 1;
+ spin_unlock(&cache->lock);
+
+ kthread_run(relocating_repair_kthread, cache,
+ "btrfs-relocating-repair");
+
+ return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1997a4649a66..d4c3e0dd32b8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -39,10 +39,10 @@ struct btrfs_io_geometry {
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __BTRFS_NEED_DEVICE_DATA_ORDERED
-#define btrfs_device_data_ordered_init(device, info) \
- seqcount_mutex_init(&device->data_seqcount, &info->chunk_mutex)
+#define btrfs_device_data_ordered_init(device) \
+ seqcount_init(&device->data_seqcount)
#else
-#define btrfs_device_data_ordered_init(device, info) do { } while (0)
+#define btrfs_device_data_ordered_init(device) do { } while (0)
#endif
#define BTRFS_DEV_STATE_WRITEABLE (0)
@@ -76,8 +76,7 @@ struct btrfs_device {
blk_status_t last_flush_error;
#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
- /* A seqcount_t with associated chunk_mutex (for lockdep) */
- seqcount_mutex_t data_seqcount;
+ seqcount_t data_seqcount;
#endif
/* the internal btrfs device id */
@@ -168,9 +167,11 @@ btrfs_device_get_##name(const struct btrfs_device *dev) \
static inline void \
btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
{ \
+ preempt_disable(); \
write_seqcount_begin(&dev->data_seqcount); \
dev->name = size; \
write_seqcount_end(&dev->data_seqcount); \
+ preempt_enable(); \
}
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
#define BTRFS_DEVICE_GETSET_FUNCS(name) \
@@ -213,6 +214,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
enum btrfs_chunk_allocation_policy {
BTRFS_CHUNK_ALLOC_REGULAR,
+ BTRFS_CHUNK_ALLOC_ZONED,
};
/*
@@ -422,6 +424,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
case REQ_OP_DISCARD:
return BTRFS_MAP_DISCARD;
case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
return BTRFS_MAP_WRITE;
default:
WARN_ON_ONCE(1);
@@ -439,8 +442,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret);
-int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
+ enum btrfs_map_op op, u64 logical, u64 len,
+ struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
@@ -595,5 +599,6 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
+int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
#endif
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index c38846659019..d0eb0c8d6269 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1,14 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
+#include "disk-io.h"
+#include "block-group.h"
+#include "transaction.h"
+#include "dev-replace.h"
+#include "space-info.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
+/* Invalid allocation pointer value for missing devices */
+#define WP_MISSING_DEV ((u64)-1)
+/* Pseudo write pointer value for conventional zone */
+#define WP_CONVENTIONAL ((u64)-2)
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
@@ -119,6 +130,36 @@ static inline u32 sb_zone_number(int shift, int mirror)
return 0;
}
+/*
+ * Fake a zone report for a regular (non-zoned) block device.
+ *
+ * Starting at byte offset @pos, the device is cut into fixed-size slices of
+ * fs_info->zone_size bytes and each slice is described in @zones as a
+ * conventional zone whose write pointer sits at the end of the zone. At most
+ * @nr_zones entries are filled; filling stops early once an emitted zone
+ * reaches or passes the end of the device.
+ *
+ * Returns the number of entries written to @zones.
+ */
+static int emulate_report_zones(struct btrfs_device *device, u64 pos,
+				struct blk_zone *zones, unsigned int nr_zones)
+{
+	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
+	const sector_t dev_sectors = bdev_nr_sectors(device->bdev);
+	const u64 first_sector = pos >> SECTOR_SHIFT;
+	unsigned int filled = 0;
+
+	while (filled < nr_zones) {
+		struct blk_zone *z = &zones[filled];
+
+		z->start = filled * zone_sectors + first_sector;
+		z->len = zone_sectors;
+		z->capacity = zone_sectors;
+		z->wp = z->start + zone_sectors;
+		z->type = BLK_ZONE_TYPE_CONVENTIONAL;
+		z->cond = BLK_ZONE_COND_NOT_WP;
+		filled++;
+
+		/* The zone just emitted already covers the tail of the device */
+		if (z->wp >= dev_sectors)
+			break;
+	}
+
+	return filled;
+}
+
static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
struct blk_zone *zones, unsigned int *nr_zones)
{
@@ -127,6 +168,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
if (!*nr_zones)
return 0;
+ if (!bdev_is_zoned(device->bdev)) {
+ ret = emulate_report_zones(device, pos, zones, *nr_zones);
+ *nr_zones = ret;
+ return 0;
+ }
+
ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
copy_zone_info_cb, zones);
if (ret < 0) {
@@ -143,8 +190,78 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
return 0;
}
+/*
+ * Determine the emulated zone size from the size of a device extent.
+ *
+ * The device tree is searched for key (1, BTRFS_DEV_EXTENT_KEY, 0); the item
+ * at the resulting slot (or the next item when the slot is past the end of
+ * the leaf) is read as a dev extent and its length is stored in
+ * fs_info->zone_size.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, -EUCLEAN if
+ * there is no next item, or a negative error from the tree search.
+ */
+static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_key key = {
+		.objectid = 1,
+		.type = BTRFS_DEV_EXTENT_KEY,
+		.offset = 0,
+	};
+	struct btrfs_dev_extent *dev_extent;
+	struct extent_buffer *leaf;
+	struct btrfs_path *path;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+		ret = btrfs_next_item(dev_root, path);
+		if (ret) {
+			/* No dev extents at all? Not good */
+			if (ret > 0)
+				ret = -EUCLEAN;
+			goto out;
+		}
+	}
+
+	leaf = path->nodes[0];
+	dev_extent = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_dev_extent);
+	fs_info->zone_size = btrfs_dev_extent_length(leaf, dev_extent);
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Read zone information for every present device of the filesystem.
+ *
+ * Does nothing unless the ZONED incompat flag is set. The device list is
+ * walked under device_list_mutex and the walk stops at the first device whose
+ * zone info cannot be loaded.
+ *
+ * Returns 0 on success or the error from btrfs_get_dev_zone_info().
+ */
+int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *devs = fs_info->fs_devices;
+	struct btrfs_device *dev;
+	int err = 0;
+
+	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
+	if (!btrfs_fs_incompat(fs_info, ZONED))
+		return 0;
+
+	mutex_lock(&devs->device_list_mutex);
+	list_for_each_entry(dev, &devs->devices, dev_list) {
+		/* Missing devices have no bdev, so there is nothing to read */
+		if (dev->bdev) {
+			err = btrfs_get_dev_zone_info(dev);
+			if (err)
+				break;
+		}
+	}
+	mutex_unlock(&devs->device_list_mutex);
+
+	return err;
+}
+
int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
+ struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
struct request_queue *queue = bdev_get_queue(bdev);
@@ -153,9 +270,14 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
struct blk_zone *zones = NULL;
unsigned int i, nreported = 0, nr_zones;
unsigned int zone_sectors;
+ char *model, *emulated;
int ret;
- if (!bdev_is_zoned(bdev))
+ /*
+ * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
+ * yet be set.
+ */
+ if (!btrfs_fs_incompat(fs_info, ZONED))
return 0;
if (device->zone_info)
@@ -165,8 +287,20 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (!zone_info)
return -ENOMEM;
+ if (!bdev_is_zoned(bdev)) {
+ if (!fs_info->zone_size) {
+ ret = calculate_emulated_zone_size(fs_info);
+ if (ret)
+ goto out;
+ }
+
+ ASSERT(fs_info->zone_size);
+ zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
+ } else {
+ zone_sectors = bdev_zone_sectors(bdev);
+ }
+
nr_sectors = bdev_nr_sectors(bdev);
- zone_sectors = bdev_zone_sectors(bdev);
/* Check if it's power of 2 (see is_power_of_2) */
ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
@@ -272,20 +406,42 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
device->zone_info = zone_info;
- /* device->fs_info is not safe to use for printing messages */
- btrfs_info_in_rcu(NULL,
- "host-%s zoned block device %s, %u zones of %llu bytes",
- bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware",
- rcu_str_deref(device->name), zone_info->nr_zones,
- zone_info->zone_size);
+ switch (bdev_zoned_model(bdev)) {
+ case BLK_ZONED_HM:
+ model = "host-managed zoned";
+ emulated = "";
+ break;
+ case BLK_ZONED_HA:
+ model = "host-aware zoned";
+ emulated = "";
+ break;
+ case BLK_ZONED_NONE:
+ model = "regular";
+ emulated = "emulated ";
+ break;
+ default:
+ /* Just in case */
+ btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
+ bdev_zoned_model(bdev),
+ rcu_str_deref(device->name));
+ ret = -EOPNOTSUPP;
+ goto out_free_zone_info;
+ }
+
+ btrfs_info_in_rcu(fs_info,
+ "%s block device %s, %u %szones of %llu bytes",
+ model, rcu_str_deref(device->name), zone_info->nr_zones,
+ emulated, zone_info->zone_size);
return 0;
out:
kfree(zones);
+out_free_zone_info:
bitmap_free(zone_info->empty_zones);
bitmap_free(zone_info->seq_zones);
kfree(zone_info);
+ device->zone_info = NULL;
return ret;
}
@@ -324,7 +480,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
u64 nr_devices = 0;
u64 zone_size = 0;
u64 max_zone_append_size = 0;
- const bool incompat_zoned = btrfs_is_zoned(fs_info);
+ const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
int ret = 0;
/* Count zoned devices */
@@ -335,9 +491,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
continue;
model = bdev_zoned_model(device->bdev);
+ /*
+ * A Host-Managed zoned device must be used as a zoned device.
+ * A Host-Aware zoned device and a non-zoned device can be
+ * treated as a zoned device, if ZONED flag is enabled in the
+ * superblock.
+ */
if (model == BLK_ZONED_HM ||
- (model == BLK_ZONED_HA && incompat_zoned)) {
- struct btrfs_zoned_device_info *zone_info;
+ (model == BLK_ZONED_HA && incompat_zoned) ||
+ (model == BLK_ZONED_NONE && incompat_zoned)) {
+ struct btrfs_zoned_device_info *zone_info =
+ device->zone_info;
zone_info = device->zone_info;
zoned_devices++;
@@ -406,6 +570,15 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
fs_info->zone_size = zone_size;
fs_info->max_zone_append_size = max_zone_append_size;
+ fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
+
+ /*
+ * Check mount options here, because we might change fs_info->zoned
+ * from fs_info->zone_size.
+ */
+ ret = btrfs_check_mountopts_zoned(fs_info);
+ if (ret)
+ goto out;
btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
out:
@@ -488,7 +661,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
unsigned int zone_sectors;
u32 sb_zone;
int ret;
- u64 zone_size;
u8 zone_sectors_shift;
sector_t nr_sectors;
u32 nr_zones;
@@ -503,7 +675,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
zone_sectors = bdev_zone_sectors(bdev);
if (!is_power_of_2(zone_sectors))
return -EINVAL;
- zone_size = zone_sectors << SECTOR_SHIFT;
zone_sectors_shift = ilog2(zone_sectors);
nr_sectors = bdev_nr_sectors(bdev);
nr_zones = nr_sectors >> zone_sectors_shift;
@@ -529,7 +700,13 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
struct btrfs_zoned_device_info *zinfo = device->zone_info;
u32 zone_num;
- if (!zinfo) {
+ /*
+ * For a zoned filesystem on a non-zoned block device, use the same
+ * super block locations as regular filesystem. Doing so, the super
+ * block can always be retrieved and the zoned flag of the volume
+ * detected from the super block information.
+ */
+ if (!bdev_is_zoned(device->bdev)) {
*bytenr_ret = btrfs_sb_offset(mirror);
return 0;
}
@@ -614,3 +791,671 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
sb_zone << zone_sectors_shift,
zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}
+
+/**
+ * btrfs_find_allocatable_zones - find allocatable zones within a given region
+ *
+ * @device:	the device to allocate a region on
+ * @hole_start:	the position of the hole to allocate the region
+ * @hole_end:	the end of the hole
+ * @num_bytes:	size of wanted region
+ *
+ * Allocatable region should not contain any superblock locations: neither a
+ * superblock log zone nor a regular superblock offset. When the candidate
+ * starts on a sequential zone, all of its zones must also be marked empty.
+ *
+ * Return: position of allocatable zones. @hole_end is returned when the
+ * candidate would extend past the last zone of the device; if the search
+ * position reaches @hole_end, that position is returned.
+ */
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+				 u64 hole_end, u64 num_bytes)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	const u8 shift = zinfo->zone_size_shift;
+	const u64 nzones = num_bytes >> shift;
+	u64 pos = hole_start;
+
+	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
+	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+	while (pos < hole_end) {
+		const u64 begin = pos >> shift;
+		const u64 end = begin + nzones;
+		int mirror;
+
+		if (end > zinfo->nr_zones)
+			return hole_end;
+
+		/* Check if zones in the region are all empty */
+		if (btrfs_dev_is_sequential(device, pos) &&
+		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
+			pos += zinfo->zone_size;
+			continue;
+		}
+
+		for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
+			const u32 sb_zone = sb_zone_number(shift, mirror);
+			u64 sb_pos;
+
+			/* Candidate overlaps the superblock log zones: skip past them */
+			if (end > sb_zone &&
+			    sb_zone + BTRFS_NR_SB_LOG_ZONES > begin) {
+				pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
+				break;
+			}
+
+			/* We also need to exclude regular superblock positions */
+			sb_pos = btrfs_sb_offset(mirror);
+			if (pos + num_bytes > sb_pos &&
+			    sb_pos + BTRFS_SUPER_INFO_SIZE > pos) {
+				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
+					    zinfo->zone_size);
+				break;
+			}
+		}
+
+		/* Every mirror was checked without a collision: region found */
+		if (mirror == BTRFS_SUPER_MIRROR_MAX)
+			break;
+	}
+
+	return pos;
+}
+
+/*
+ * Reset the zones covering [@physical, @physical + @length) on @device.
+ *
+ * A REQ_OP_ZONE_RESET is issued for the whole range; on success every zone in
+ * the range is marked empty in the device's zone info.
+ *
+ * @bytes is set to 0 up front and to @length once the reset has succeeded.
+ *
+ * Returns 0 on success or the error from blkdev_zone_mgmt().
+ */
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+			    u64 length, u64 *bytes)
+{
+	u64 zone_start = physical;
+	u64 remaining = length;
+	int ret;
+
+	*bytes = 0;
+	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
+			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
+			       GFP_NOFS);
+	if (ret)
+		return ret;
+
+	*bytes = length;
+	while (remaining) {
+		const u64 zone_size = device->zone_info->zone_size;
+
+		btrfs_dev_set_zone_empty(device, zone_start);
+		zone_start += zone_size;
+		remaining -= zone_size;
+	}
+
+	return 0;
+}
+
+/*
+ * Make sure the zones in [@start, @start + @size) on @device can be allocated.
+ *
+ * Conventional zones need no preparation. A sequential zone that is not marked
+ * empty is reset, with a warning, because free regions are expected to be
+ * empty already.
+ *
+ * Returns 0 on success, -ERANGE if the range extends past the last zone of the
+ * device, or the error from btrfs_reset_device_zone().
+ */
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	const u8 shift = zinfo->zone_size_shift;
+	unsigned long begin = start >> shift;
+	unsigned long end = (start + size) >> shift;
+	u64 pos;
+	int ret;
+
+	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
+	ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+
+	if (end > zinfo->nr_zones)
+		return -ERANGE;
+
+	/*
+	 * The bitmap search helpers take (addr, size, offset): search the
+	 * bits [begin, end) by passing @end as the size and @begin as the
+	 * starting offset.
+	 */
+
+	/* All the zones are conventional */
+	if (find_next_bit(zinfo->seq_zones, end, begin) == end)
+		return 0;
+
+	/* All the zones are sequential and empty */
+	if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
+	    find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
+		return 0;
+
+	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
+		u64 reset_bytes;
+
+		if (!btrfs_dev_is_sequential(device, pos) ||
+		    btrfs_dev_is_empty_zone(device, pos))
+			continue;
+
+		/* Free regions should be empty */
+		btrfs_warn_in_rcu(
+			device->fs_info,
+		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
+			rcu_str_deref(device->name), device->devid, pos >> shift);
+		WARN_ON_ONCE(1);
+
+		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
+					      &reset_bytes);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Work out the allocation offset of a block group made of conventional zones
+ * from the extent tree, since such zones have no write pointer to read.
+ *
+ * The offset is the end of the highest addressed extent item inside the block
+ * group, relative to the start of the block group, and is stored in
+ * @offset_ret. It is 0 when btrfs_previous_extent_item() returns 1
+ * (presumably: no extent item within the block group).
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, -EUCLEAN if the
+ * search key unexpectedly exists or the found extent is not contained in the
+ * block group, or a negative error from the tree operations.
+ */
+static int calculate_alloc_pointer(struct btrfs_block_group *cache,
+				   u64 *offset_ret)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	const u64 bg_start = cache->start;
+	const u64 bg_end = cache->start + cache->length;
+	struct btrfs_key search_key = {
+		.objectid = bg_end,
+		.type = 0,
+		.offset = 0,
+	};
+	struct btrfs_key found;
+	struct btrfs_path *path;
+	u64 extent_len;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(NULL, extent_root, &search_key, path, 0, 0);
+	if (ret <= 0) {
+		/* We should not find the exact match */
+		if (ret == 0)
+			ret = -EUCLEAN;
+		goto out;
+	}
+
+	ret = btrfs_previous_extent_item(extent_root, path, bg_start);
+	if (ret == 1) {
+		*offset_ret = 0;
+		ret = 0;
+		goto out;
+	}
+	if (ret)
+		goto out;
+
+	btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);
+
+	/* Non-EXTENT_ITEM keys carry no length; one tree node is assumed */
+	extent_len = (found.type == BTRFS_EXTENT_ITEM_KEY) ?
+		     found.offset : fs_info->nodesize;
+
+	if (found.objectid < bg_start ||
+	    found.objectid + extent_len > bg_end) {
+		ret = -EUCLEAN;
+		goto out;
+	}
+
+	*offset_ret = found.objectid + extent_len - bg_start;
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Load the zone state backing a block group and derive its allocation offset.
+ *
+ * @cache: the block group to set up
+ * @new:   true when the block group is being created; the extent tree is then
+ *         not consulted for conventional zones and the offset starts at 0
+ *
+ * For every stripe of the chunk mapping the per-stripe offset is recorded as
+ * WP_MISSING_DEV (no bdev, unreadable zone, or offline/read-only zone),
+ * WP_CONVENTIONAL (conventional zone), or the write pointer offset reported
+ * by the device. Sequential zones are marked non-empty, also on the
+ * dev-replace target while a replace is ongoing. cache->seq_zone is set when
+ * at least one stripe sits on a sequential zone.
+ *
+ * Only the single profile is supported. On success cache->alloc_offset and
+ * cache->meta_write_pointer are set.
+ *
+ * Returns 0 on success (or when the filesystem is not zoned), -EIO for an
+ * unaligned block group, an unrecoverable write pointer or a write pointer
+ * behind the last allocated extent, -EINVAL for a missing chunk mapping or an
+ * unsupported profile, -ENOMEM on allocation failure, or another negative
+ * error from the helpers called.
+ */
+int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
+{
+	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct btrfs_device *device;
+	u64 logical = cache->start;
+	u64 length = cache->length;
+	u64 physical = 0;
+	int ret;
+	int i;
+	unsigned int nofs_flag;
+	u64 *alloc_offsets = NULL;
+	u64 last_alloc = 0;
+	u32 num_sequential = 0, num_conventional = 0;
+
+	if (!btrfs_is_zoned(fs_info))
+		return 0;
+
+	/* Sanity check */
+	if (!IS_ALIGNED(length, fs_info->zone_size)) {
+		btrfs_err(fs_info,
+		"zoned: block group %llu len %llu unaligned to zone size %llu",
+			  logical, length, fs_info->zone_size);
+		return -EIO;
+	}
+
+	/* Get the chunk mapping */
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, length);
+	read_unlock(&em_tree->lock);
+
+	if (!em)
+		return -EINVAL;
+
+	map = em->map_lookup;
+
+	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
+	if (!alloc_offsets) {
+		free_extent_map(em);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < map->num_stripes; i++) {
+		bool is_sequential;
+		struct blk_zone zone;
+		struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+		int dev_replace_is_ongoing = 0;
+
+		device = map->stripes[i].dev;
+		physical = map->stripes[i].physical;
+
+		if (device->bdev == NULL) {
+			alloc_offsets[i] = WP_MISSING_DEV;
+			continue;
+		}
+
+		is_sequential = btrfs_dev_is_sequential(device, physical);
+		if (is_sequential)
+			num_sequential++;
+		else
+			num_conventional++;
+
+		if (!is_sequential) {
+			alloc_offsets[i] = WP_CONVENTIONAL;
+			continue;
+		}
+
+		/*
+		 * This zone will be used for allocation, so mark this zone
+		 * non-empty.
+		 */
+		btrfs_dev_clear_zone_empty(device, physical);
+
+		down_read(&dev_replace->rwsem);
+		dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+		if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
+			btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
+		up_read(&dev_replace->rwsem);
+
+		/*
+		 * The group is mapped to a sequential zone. Get the zone write
+		 * pointer to determine the allocation offset within the zone.
+		 */
+		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
+		nofs_flag = memalloc_nofs_save();
+		ret = btrfs_get_dev_zone(device, physical, &zone);
+		memalloc_nofs_restore(nofs_flag);
+		if (ret == -EIO || ret == -EOPNOTSUPP) {
+			/* Treat a device that cannot report as a missing one */
+			ret = 0;
+			alloc_offsets[i] = WP_MISSING_DEV;
+			continue;
+		} else if (ret) {
+			goto out;
+		}
+
+		switch (zone.cond) {
+		case BLK_ZONE_COND_OFFLINE:
+		case BLK_ZONE_COND_READONLY:
+			btrfs_err(fs_info,
+		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
+				  physical >> device->zone_info->zone_size_shift,
+				  rcu_str_deref(device->name), device->devid);
+			alloc_offsets[i] = WP_MISSING_DEV;
+			break;
+		case BLK_ZONE_COND_EMPTY:
+			alloc_offsets[i] = 0;
+			break;
+		case BLK_ZONE_COND_FULL:
+			alloc_offsets[i] = fs_info->zone_size;
+			break;
+		default:
+			/* Partially used zone */
+			alloc_offsets[i] =
+					((zone.wp - zone.start) << SECTOR_SHIFT);
+			break;
+		}
+	}
+
+	if (num_sequential > 0)
+		cache->seq_zone = true;
+
+	if (num_conventional > 0) {
+		/*
+		 * Avoid calling calculate_alloc_pointer() for new BG. It
+		 * is no use for new BG. It must be always 0.
+		 *
+		 * Also, we have a lock chain of extent buffer lock ->
+		 * chunk mutex.  For new BG, this function is called from
+		 * btrfs_make_block_group() which is already taking the
+		 * chunk mutex. Thus, we cannot call
+		 * calculate_alloc_pointer() which takes extent buffer
+		 * locks to avoid deadlock.
+		 */
+		if (new) {
+			cache->alloc_offset = 0;
+			goto out;
+		}
+		ret = calculate_alloc_pointer(cache, &last_alloc);
+		if (ret || map->num_stripes == num_conventional) {
+			if (!ret)
+				cache->alloc_offset = last_alloc;
+			else
+				btrfs_err(fs_info,
+			"zoned: failed to determine allocation offset of bg %llu",
+					  cache->start);
+			goto out;
+		}
+	}
+
+	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+	case 0: /* single */
+		/*
+		 * WP_MISSING_DEV is a sentinel, not an offset. With a single
+		 * stripe there is no other copy to take the write pointer
+		 * from, so fail instead of storing (u64)-1 as alloc_offset.
+		 */
+		if (alloc_offsets[0] == WP_MISSING_DEV) {
+			btrfs_err(fs_info,
+			"zoned: cannot recover write pointer for zone %llu",
+				  physical);
+			ret = -EIO;
+			goto out;
+		}
+		cache->alloc_offset = alloc_offsets[0];
+		break;
+	case BTRFS_BLOCK_GROUP_DUP:
+	case BTRFS_BLOCK_GROUP_RAID1:
+	case BTRFS_BLOCK_GROUP_RAID0:
+	case BTRFS_BLOCK_GROUP_RAID10:
+	case BTRFS_BLOCK_GROUP_RAID5:
+	case BTRFS_BLOCK_GROUP_RAID6:
+		/* non-single profiles are not supported yet */
+	default:
+		btrfs_err(fs_info, "zoned: profile %s not yet supported",
+			  btrfs_bg_type_to_raid_name(map->type));
+		ret = -EINVAL;
+		goto out;
+	}
+
+out:
+	/* An extent is allocated after the write pointer */
+	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
+		btrfs_err(fs_info,
+			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
+			  logical, last_alloc, cache->alloc_offset);
+		ret = -EIO;
+	}
+
+	if (!ret)
+		cache->meta_write_pointer = cache->alloc_offset + cache->start;
+
+	kfree(alloc_offsets);
+	free_extent_map(em);
+
+	return ret;
+}
+
+/*
+ * Initialize the space accounting of a block group on a zoned filesystem.
+ *
+ * The bytes between the used amount and the allocation offset are recorded as
+ * zone_unusable, and the bytes after the allocation offset become the free
+ * space of the block group. The block group is marked as fully cached.
+ * No-op when the filesystem is not zoned.
+ */
+void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
+{
+	if (!btrfs_is_zoned(cache->fs_info))
+		return;
+
+	WARN_ON(cache->bytes_super != 0);
+
+	/* We only need ->free_space in ALLOC_SEQ block groups */
+	cache->last_byte_to_unpin = (u64)-1;
+	cache->cached = BTRFS_CACHE_FINISHED;
+	cache->free_space_ctl->free_space = cache->length - cache->alloc_offset;
+	cache->zone_unusable = cache->alloc_offset - cache->used;
+
+	/* Should not have any excluded extents. Just in case, though */
+	btrfs_free_excluded_extents(cache);
+}
+
+/*
+ * Re-dirty an extent buffer and queue it on the transaction's releasing list.
+ *
+ * Nothing is done when the filesystem is not zoned, when the buffer already
+ * carries BTRFS_HEADER_FLAG_WRITTEN, or when it is already on a release list.
+ * Otherwise the buffer is marked dirty, its range is recorded in
+ * trans->dirty_pages, its content is zeroed, it is flagged
+ * EXTENT_BUFFER_NO_CHECK, and an extra reference is taken for the list.
+ */
+void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+			    struct extent_buffer *eb)
+{
+	struct btrfs_fs_info *fs_info = eb->fs_info;
+	const u64 eb_last = eb->start + eb->len - 1;
+
+	if (!btrfs_is_zoned(fs_info))
+		return;
+	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
+		return;
+	if (!list_empty(&eb->release_list))
+		return;
+
+	set_extent_buffer_dirty(eb);
+	set_extent_bits_nowait(&trans->dirty_pages, eb->start, eb_last,
+			       EXTENT_DIRTY);
+	memzero_extent_buffer(eb, 0, eb->len);
+	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
+
+	spin_lock(&trans->releasing_ebs_lock);
+	list_add_tail(&eb->release_list, &trans->releasing_ebs);
+	spin_unlock(&trans->releasing_ebs_lock);
+
+	/* Reference held on behalf of the releasing_ebs list */
+	atomic_inc(&eb->refs);
+}
+
+/*
+ * Drain trans->releasing_ebs: unlink every queued extent buffer and drop it
+ * with free_extent_buffer(), all under releasing_ebs_lock.
+ */
+void btrfs_free_redirty_list(struct btrfs_transaction *trans)
+{
+	struct list_head *queue = &trans->releasing_ebs;
+	struct extent_buffer *eb;
+
+	spin_lock(&trans->releasing_ebs_lock);
+	for (;;) {
+		if (list_empty(queue))
+			break;
+
+		eb = list_first_entry(queue, struct extent_buffer,
+				      release_list);
+		list_del_init(&eb->release_list);
+		free_extent_buffer(eb);
+	}
+	spin_unlock(&trans->releasing_ebs_lock);
+}
+
+/*
+ * Decide whether a write to @em on @inode should be issued as a zone append.
+ *
+ * Returns true only when the filesystem is zoned, a maximum zone append size
+ * is set, @inode is a data inode, and the block group containing
+ * em->block_start has seq_zone set.
+ */
+bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
+{
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_block_group *bg;
+	bool on_seq_zone;
+
+	if (!btrfs_is_zoned(fs_info) ||
+	    !fs_info->max_zone_append_size ||
+	    !is_data_inode(&inode->vfs_inode))
+		return false;
+
+	bg = btrfs_lookup_block_group(fs_info, em->block_start);
+	ASSERT(bg);
+	if (!bg)
+		return false;
+
+	on_seq_zone = bg->seq_zone;
+	btrfs_put_block_group(bg);
+
+	return on_seq_zone;
+}
+
+/*
+ * Remember where a zone append bio was placed.
+ *
+ * For a REQ_OP_ZONE_APPEND @bio, the byte position taken from
+ * bio->bi_iter.bi_sector, the gendisk and the partition number are stored in
+ * the ordered extent covering @file_offset of @inode. Other bio types are
+ * ignored. A missing ordered extent triggers a warning.
+ */
+void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
+				 struct bio *bio)
+{
+	const u64 written_at = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	struct btrfs_ordered_extent *oe;
+
+	if (bio_op(bio) != REQ_OP_ZONE_APPEND)
+		return;
+
+	oe = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
+	if (WARN_ON(!oe))
+		return;
+
+	oe->physical = written_at;
+	oe->disk = bio->bi_bdev->bd_disk;
+	oe->partno = bio->bi_bdev->bd_partno;
+
+	btrfs_put_ordered_extent(oe);
+}
+
+/*
+ * Fix up the logical address of an ordered extent from its recorded physical
+ * position.
+ *
+ * ordered->physical is mapped back to a logical address with
+ * btrfs_rmap_block(). If that differs from ordered->disk_bytenr, the ordered
+ * extent, the extent map covering its file range and every checksum entry on
+ * ordered->list are shifted to the new logical address.
+ */
+void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
+{
+	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct extent_map_tree *em_tree = &inode->extent_tree;
+	const u64 old_logical = ordered->disk_bytenr;
+	struct btrfs_ordered_sum *sum;
+	struct block_device *bdev;
+	struct extent_map *em;
+	u64 *logical = NULL;
+	u64 new_logical;
+	int nr, stripe_len;
+	int ret;
+
+	/* Zoned devices should not have partitions. So, we can assume it is 0 */
+	ASSERT(ordered->partno == 0);
+	bdev = bdgrab(ordered->disk->part0);
+	if (WARN_ON(!bdev))
+		return;
+
+	ret = btrfs_rmap_block(fs_info, old_logical, bdev, ordered->physical,
+			       &logical, &nr, &stripe_len);
+	if (WARN_ON(ret))
+		goto out;
+
+	WARN_ON(nr != 1);
+
+	new_logical = *logical;
+	if (new_logical == old_logical)
+		goto out;
+
+	ordered->disk_bytenr = new_logical;
+
+	write_lock(&em_tree->lock);
+	em = search_extent_mapping(em_tree, ordered->file_offset,
+				   ordered->num_bytes);
+	em->block_start = new_logical;
+	free_extent_map(em);
+	write_unlock(&em_tree->lock);
+
+	/* Move the checksum entries by the same distance as the extent */
+	list_for_each_entry(sum, &ordered->list, list) {
+		if (new_logical < old_logical)
+			sum->bytenr -= old_logical - new_logical;
+		else
+			sum->bytenr += new_logical - old_logical;
+	}
+
+out:
+	kfree(logical);
+	bdput(bdev);
+}
+
+/*
+ * Check that @eb starts at the metadata write pointer of its block group and,
+ * if so, advance the pointer past @eb.
+ *
+ * @cache_ret carries a block group reference between calls. A block group
+ * that does not contain eb->start is released and looked up again. On a match
+ * the block group is returned through @cache_ret; on a mismatch the reference
+ * is dropped and @cache_ret is set to NULL.
+ *
+ * Returns false only when a block group was found and its write pointer does
+ * not equal eb->start; true otherwise, including on non-zoned filesystems.
+ */
+bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+				    struct extent_buffer *eb,
+				    struct btrfs_block_group **cache_ret)
+{
+	struct btrfs_block_group *bg;
+
+	if (!btrfs_is_zoned(fs_info))
+		return true;
+
+	bg = *cache_ret;
+
+	/* Drop a cached block group that does not cover this buffer */
+	if (bg && (eb->start < bg->start ||
+		   bg->start + bg->length <= eb->start)) {
+		btrfs_put_block_group(bg);
+		bg = NULL;
+		*cache_ret = NULL;
+	}
+
+	if (!bg) {
+		bg = btrfs_lookup_block_group(fs_info, eb->start);
+		if (!bg)
+			return true;
+	}
+
+	if (bg->meta_write_pointer != eb->start) {
+		btrfs_put_block_group(bg);
+		*cache_ret = NULL;
+		return false;
+	}
+
+	bg->meta_write_pointer = eb->start + eb->len;
+	*cache_ret = bg;
+
+	return true;
+}
+
+/*
+ * Undo the advance done by btrfs_check_meta_write_pointer() for @eb: move the
+ * metadata write pointer of @cache back to eb->start. No-op on non-zoned
+ * filesystems or when @cache is NULL.
+ */
+void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
+				     struct extent_buffer *eb)
+{
+	if (!btrfs_is_zoned(eb->fs_info))
+		return;
+	if (!cache)
+		return;
+
+	/* The pointer must currently sit right after this buffer */
+	ASSERT(cache->meta_write_pointer == eb->start + eb->len);
+	cache->meta_write_pointer = eb->start;
+}
+
+/*
+ * Write zeros to @length bytes at @physical on @device.
+ *
+ * Returns -EOPNOTSUPP when @physical is not in a sequential zone, otherwise
+ * the result of blkdev_issue_zeroout().
+ */
+int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
+{
+	const bool sequential = btrfs_dev_is_sequential(device, physical);
+
+	if (sequential)
+		return blkdev_issue_zeroout(device->bdev,
+					    physical >> SECTOR_SHIFT,
+					    length >> SECTOR_SHIFT, GFP_NOFS, 0);
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Read the zone descriptor backing @logical into @zone.
+ *
+ * The logical address is mapped to its stripes and the stripes are tried in
+ * order: stripes on a missing device are skipped, and so are stripes whose
+ * zone report fails with -EIO or -EOPNOTSUPP. The first other result ends
+ * the search.
+ *
+ * Returns the result of the last btrfs_get_dev_zone() call, -EIO if the
+ * mapping fails or is shorter than a page, or -EINVAL for RAID5/6 mappings.
+ * If every stripe is on a missing device no zone is read and the mapping's
+ * return value (0) is passed through.
+ */
+static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
+			  struct blk_zone *zone)
+{
+	struct btrfs_bio *bbio = NULL;
+	u64 mapped_length = PAGE_SIZE;
+	unsigned int nofs_flag;
+	int nmirrors;
+	int i, ret;
+
+	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
+			       &mapped_length, &bbio);
+	if (ret || !bbio || mapped_length < PAGE_SIZE) {
+		ret = -EIO;
+		goto out_put_bbio;
+	}
+
+	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+		ret = -EINVAL;
+		goto out_put_bbio;
+	}
+
+	nofs_flag = memalloc_nofs_save();
+	nmirrors = (int)bbio->num_stripes;
+	for (i = 0; i < nmirrors; i++) {
+		u64 physical = bbio->stripes[i].physical;
+		struct btrfs_device *dev = bbio->stripes[i].dev;
+
+		/* Missing device */
+		if (!dev->bdev)
+			continue;
+
+		ret = btrfs_get_dev_zone(dev, physical, zone);
+		/* Failing device */
+		if (ret == -EIO || ret == -EOPNOTSUPP)
+			continue;
+		break;
+	}
+	memalloc_nofs_restore(nofs_flag);
+
+out_put_bbio:
+	/* Drop the mapping on every exit path so it is never leaked */
+	btrfs_put_bbio(bbio);
+	return ret;
+}
+
+/*
+ * Bring the write pointer of a dev-replace target zone in line with the
+ * source.
+ *
+ * The zone backing @logical is read, and its write pointer offset is applied
+ * to @physical_start on @tgt_dev. The gap between @physical_pos and that
+ * position is filled with zeros.
+ *
+ * Returns 0 when @physical_pos is not in a sequential zone or already matches
+ * the write pointer, -EUCLEAN when @physical_pos is beyond the write pointer,
+ * otherwise the result of read_zone_info() or btrfs_zoned_issue_zeroout().
+ */
+int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+				    u64 physical_start, u64 physical_pos)
+{
+	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
+	struct blk_zone zone;
+	u64 target_wp;
+	int ret;
+
+	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
+		return 0;
+
+	ret = read_zone_info(fs_info, logical, &zone);
+	if (ret)
+		return ret;
+
+	target_wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
+
+	/* Already in sync, or the target ran ahead of the source */
+	if (physical_pos >= target_wp)
+		return physical_pos == target_wp ? 0 : -EUCLEAN;
+
+	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos,
+					 target_wp - physical_pos);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 8abe2f83272b..61e969652fe1 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -7,6 +7,7 @@
#include <linux/blkdev.h>
#include "volumes.h"
#include "disk-io.h"
+#include "block-group.h"
struct btrfs_zoned_device_info {
/*
@@ -25,6 +26,7 @@ struct btrfs_zoned_device_info {
#ifdef CONFIG_BLK_DEV_ZONED
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone);
+int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
int btrfs_get_dev_zone_info(struct btrfs_device *device);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
@@ -35,6 +37,28 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
u64 *bytenr_ret);
void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+ u64 hole_end, u64 num_bytes);
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+ u64 length, u64 *bytes);
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
+int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
+void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
+void btrfs_redirty_list_add(struct btrfs_transaction *trans,
+ struct extent_buffer *eb);
+void btrfs_free_redirty_list(struct btrfs_transaction *trans);
+bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em);
+void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
+ struct bio *bio);
+void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered);
+bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ struct btrfs_block_group **cache_ret);
+void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
+ struct extent_buffer *eb);
+int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length);
+int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+ u64 physical_start, u64 physical_pos);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -42,6 +66,11 @@ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
return 0;
}
+/* !CONFIG_BLK_DEV_ZONED: there is no zone info to load, report success */
+static inline int
+btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
+{
+	return 0;
+}
+
static inline int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
return 0;
@@ -85,6 +114,78 @@ static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror
return 0;
}
+/* !CONFIG_BLK_DEV_ZONED: the hole start is returned unchanged */
+static inline u64
+btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+			     u64 hole_end, u64 num_bytes)
+{
+	return hole_start;
+}
+
+/* !CONFIG_BLK_DEV_ZONED: nothing is reset, zero bytes are reported */
+static inline int
+btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+			u64 length, u64 *bytes)
+{
+	*bytes = 0;
+	return 0;
+}
+
+/* !CONFIG_BLK_DEV_ZONED: always succeeds */
+static inline int
+btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
+{
+	return 0;
+}
+
+/* !CONFIG_BLK_DEV_ZONED: always succeeds */
+static inline int
+btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
+{
+	return 0;
+}
+
+/* !CONFIG_BLK_DEV_ZONED: no-op */
+static inline void
+btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
+{
+}
+
+/* !CONFIG_BLK_DEV_ZONED: no-op */
+static inline void
+btrfs_redirty_list_add(struct btrfs_transaction *trans,
+		       struct extent_buffer *eb)
+{
+}
+
+/* !CONFIG_BLK_DEV_ZONED: no-op */
+static inline void
+btrfs_free_redirty_list(struct btrfs_transaction *trans)
+{
+}
+
+/* !CONFIG_BLK_DEV_ZONED: zone append is never used */
+static inline bool
+btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
+{
+	return false;
+}
+
+/* !CONFIG_BLK_DEV_ZONED: no-op */
+static inline void
+btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
+			    struct bio *bio)
+{
+}
+
+/* !CONFIG_BLK_DEV_ZONED: no-op */
+static inline void
+btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
+{
+}
+
+/* !CONFIG_BLK_DEV_ZONED: the check always passes */
+static inline bool
+btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
+			       struct extent_buffer *eb,
+			       struct btrfs_block_group **cache_ret)
+{
+	return true;
+}
+
+/* !CONFIG_BLK_DEV_ZONED: no-op */
+static inline void
+btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
+				struct extent_buffer *eb)
+{
+}
+
+/* !CONFIG_BLK_DEV_ZONED: not supported */
+static inline int
+btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical,
+			  u64 length)
+{
+	return -EOPNOTSUPP;
+}
+
+/* !CONFIG_BLK_DEV_ZONED: not supported */
+static inline int
+btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
+			      u64 physical_start, u64 physical_pos)
+{
+	return -EOPNOTSUPP;
+}
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
@@ -136,12 +237,16 @@ static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 p
static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info,
struct block_device *bdev)
{
- u64 zone_size;
-
if (btrfs_is_zoned(fs_info)) {
- zone_size = bdev_zone_sectors(bdev) << SECTOR_SHIFT;
- /* Do not allow non-zoned device */
- return bdev_is_zoned(bdev) && fs_info->zone_size == zone_size;
+ /*
+ * We can allow a regular device on a zoned filesystem, because
+ * we will emulate the zoned capabilities.
+ */
+ if (!bdev_is_zoned(bdev))
+ return true;
+
+ return fs_info->zone_size ==
+ (bdev_zone_sectors(bdev) << SECTOR_SHIFT);
}
/* Do not allow Host Manged zoned device */
@@ -157,4 +262,46 @@ static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 p
return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos);
}
+static inline bool btrfs_can_zone_reset(struct btrfs_device *device,
+ u64 physical, u64 length)
+{
+ u64 zone_size;
+
+ if (!btrfs_dev_is_sequential(device, physical))
+ return false;
+
+ zone_size = device->zone_info->zone_size;
+ if (!IS_ALIGNED(physical, zone_size) || !IS_ALIGNED(length, zone_size))
+ return false;
+
+ return true;
+}
+
+static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return;
+ mutex_lock(&fs_info->zoned_meta_io_lock);
+}
+
+static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_is_zoned(fs_info))
+ return;
+ mutex_unlock(&fs_info->zoned_meta_io_lock);
+}
+
+static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg == bg->start)
+ fs_info->treelog_bg = 0;
+ spin_unlock(&fs_info->treelog_bg_lock);
+}
+
#endif
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 8bda092e60c5..e027c718ca01 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -413,7 +413,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
inode = d_backing_inode(object->backer);
ASSERT(S_ISREG(inode->i_mode));
- ASSERT(inode->i_mapping->a_ops->readpages);
/* calculate the shift required to use bmap */
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
@@ -713,7 +712,6 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
inode = d_backing_inode(object->backer);
ASSERT(S_ISREG(inode->i_mode));
- ASSERT(inode->i_mapping->a_ops->readpages);
/* calculate the shift required to use bmap */
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 950552944436..26e66436f005 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1662,7 +1662,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n",
inode, off, len, ceph_cap_string(got), ret);
- ceph_put_cap_refs(ci, got);
+ ceph_put_cap_refs_async(ci, got);
out_free:
ceph_restore_sigs(&oldset);
sb_end_pagefault(inode->i_sb);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 255a512f1277..570731c4d019 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3027,6 +3027,12 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
return 0;
}
+enum put_cap_refs_mode {
+ PUT_CAP_REFS_SYNC = 0,
+ PUT_CAP_REFS_NO_CHECK,
+ PUT_CAP_REFS_ASYNC,
+};
+
/*
* Release cap refs.
*
@@ -3037,10 +3043,11 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
* cap_snap, and wake up any waiters.
*/
static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
- bool skip_checking_caps)
+ enum put_cap_refs_mode mode)
{
struct inode *inode = &ci->vfs_inode;
int last = 0, put = 0, flushsnaps = 0, wake = 0;
+ bool check_flushsnaps = false;
spin_lock(&ci->i_ceph_lock);
if (had & CEPH_CAP_PIN)
@@ -3057,26 +3064,17 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
if (had & CEPH_CAP_FILE_BUFFER) {
if (--ci->i_wb_ref == 0) {
last++;
+ /* put the ref held by ceph_take_cap_refs() */
put++;
+ check_flushsnaps = true;
}
dout("put_cap_refs %p wb %d -> %d (?)\n",
inode, ci->i_wb_ref+1, ci->i_wb_ref);
}
- if (had & CEPH_CAP_FILE_WR)
+ if (had & CEPH_CAP_FILE_WR) {
if (--ci->i_wr_ref == 0) {
last++;
- if (__ceph_have_pending_cap_snap(ci)) {
- struct ceph_cap_snap *capsnap =
- list_last_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap,
- ci_item);
- capsnap->writing = 0;
- if (ceph_try_drop_cap_snap(ci, capsnap))
- put++;
- else if (__ceph_finish_cap_snap(ci, capsnap))
- flushsnaps = 1;
- wake = 1;
- }
+ check_flushsnaps = true;
if (ci->i_wrbuffer_ref_head == 0 &&
ci->i_dirty_caps == 0 &&
ci->i_flushing_caps == 0) {
@@ -3088,15 +3086,42 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
drop_inode_snap_realm(ci);
}
+ }
+ if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+
+ capsnap->writing = 0;
+ if (ceph_try_drop_cap_snap(ci, capsnap))
+ /* put the ref held by ceph_queue_cap_snap() */
+ put++;
+ else if (__ceph_finish_cap_snap(ci, capsnap))
+ flushsnaps = 1;
+ wake = 1;
+ }
spin_unlock(&ci->i_ceph_lock);
dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
last ? " last" : "", put ? " put" : "");
- if (last && !skip_checking_caps)
- ceph_check_caps(ci, 0, NULL);
- else if (flushsnaps)
- ceph_flush_snaps(ci, NULL);
+ switch (mode) {
+ case PUT_CAP_REFS_SYNC:
+ if (last)
+ ceph_check_caps(ci, 0, NULL);
+ else if (flushsnaps)
+ ceph_flush_snaps(ci, NULL);
+ break;
+ case PUT_CAP_REFS_ASYNC:
+ if (last)
+ ceph_queue_check_caps(inode);
+ else if (flushsnaps)
+ ceph_queue_flush_snaps(inode);
+ break;
+ default:
+ break;
+ }
if (wake)
wake_up_all(&ci->i_cap_wq);
while (put-- > 0)
@@ -3105,12 +3130,17 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
{
- __ceph_put_cap_refs(ci, had, false);
+ __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC);
+}
+
+void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had)
+{
+ __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC);
}
void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
{
- __ceph_put_cap_refs(ci, had, true);
+ __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK);
}
/*
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d6ece56d40e8..156f849f5385 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1816,60 +1816,17 @@ void ceph_async_iput(struct inode *inode)
}
}
-/*
- * Write back inode data in a worker thread. (This can't be done
- * in the message handler context.)
- */
-void ceph_queue_writeback(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask);
-
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ci->i_work)) {
- dout("ceph_queue_writeback %p\n", inode);
- } else {
- dout("ceph_queue_writeback %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
- iput(inode);
- }
-}
-
-/*
- * queue an async invalidation
- */
-void ceph_queue_invalidate(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask);
-
- ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ceph_inode(inode)->i_work)) {
- dout("ceph_queue_invalidate %p\n", inode);
- } else {
- dout("ceph_queue_invalidate %p already queued, mask=%lx\n",
- inode, ci->i_work_mask);
- iput(inode);
- }
-}
-
-/*
- * Queue an async vmtruncate. If we fail to queue work, we will handle
- * the truncation the next time we call __ceph_do_pending_vmtruncate.
- */
-void ceph_queue_vmtruncate(struct inode *inode)
+void ceph_queue_inode_work(struct inode *inode, int work_bit)
{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
- set_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask);
+ set_bit(work_bit, &ci->i_work_mask);
ihold(inode);
- if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- &ci->i_work)) {
- dout("ceph_queue_vmtruncate %p\n", inode);
+ if (queue_work(fsc->inode_wq, &ci->i_work)) {
+ dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
} else {
- dout("ceph_queue_vmtruncate %p already queued, mask=%lx\n",
+ dout("queue_inode_work %p already queued, mask=%lx\n",
inode, ci->i_work_mask);
iput(inode);
}
@@ -2008,6 +1965,12 @@ static void ceph_inode_work(struct work_struct *work)
if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
__ceph_do_pending_vmtruncate(inode);
+ if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
+ ceph_check_caps(ci, 0, NULL);
+
+ if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
+ ceph_flush_snaps(ci, NULL);
+
iput(inode);
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 840587037b59..d87bd852ed96 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5038,7 +5038,7 @@ bad:
return;
}
-static struct ceph_connection *con_get(struct ceph_connection *con)
+static struct ceph_connection *mds_get_con(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
@@ -5047,7 +5047,7 @@ static struct ceph_connection *con_get(struct ceph_connection *con)
return NULL;
}
-static void con_put(struct ceph_connection *con)
+static void mds_put_con(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
@@ -5058,7 +5058,7 @@ static void con_put(struct ceph_connection *con)
* if the client is unresponsive for long enough, the mds will kill
* the session entirely.
*/
-static void peer_reset(struct ceph_connection *con)
+static void mds_peer_reset(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
@@ -5067,7 +5067,7 @@ static void peer_reset(struct ceph_connection *con)
send_mds_reconnect(mdsc, s);
}
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
@@ -5125,8 +5125,8 @@ out:
* Note: returned pointer is the address of a structure that's
* managed separately. Caller must *not* attempt to free it.
*/
-static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
- int *proto, int force_new)
+static struct ceph_auth_handshake *
+mds_get_authorizer(struct ceph_connection *con, int *proto, int force_new)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
@@ -5142,7 +5142,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
return auth;
}
-static int add_authorizer_challenge(struct ceph_connection *con,
+static int mds_add_authorizer_challenge(struct ceph_connection *con,
void *challenge_buf, int challenge_buf_len)
{
struct ceph_mds_session *s = con->private;
@@ -5153,7 +5153,7 @@ static int add_authorizer_challenge(struct ceph_connection *con,
challenge_buf, challenge_buf_len);
}
-static int verify_authorizer_reply(struct ceph_connection *con)
+static int mds_verify_authorizer_reply(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
@@ -5165,7 +5165,7 @@ static int verify_authorizer_reply(struct ceph_connection *con)
NULL, NULL, NULL, NULL);
}
-static int invalidate_authorizer(struct ceph_connection *con)
+static int mds_invalidate_authorizer(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
@@ -5288,15 +5288,15 @@ static int mds_check_message_signature(struct ceph_msg *msg)
}
static const struct ceph_connection_operations mds_con_ops = {
- .get = con_get,
- .put = con_put,
- .dispatch = dispatch,
- .get_authorizer = get_authorizer,
- .add_authorizer_challenge = add_authorizer_challenge,
- .verify_authorizer_reply = verify_authorizer_reply,
- .invalidate_authorizer = invalidate_authorizer,
- .peer_reset = peer_reset,
+ .get = mds_get_con,
+ .put = mds_put_con,
.alloc_msg = mds_alloc_msg,
+ .dispatch = mds_dispatch,
+ .peer_reset = mds_peer_reset,
+ .get_authorizer = mds_get_authorizer,
+ .add_authorizer_challenge = mds_add_authorizer_challenge,
+ .verify_authorizer_reply = mds_verify_authorizer_reply,
+ .invalidate_authorizer = mds_invalidate_authorizer,
.sign_message = mds_sign_message,
.check_message_signature = mds_check_message_signature,
.get_auth_request = mds_get_auth_request,
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index b611f829cb61..0728b01d4d43 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -623,6 +623,16 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
return 0;
}
+ /* Fb cap still in use, delay it */
+ if (ci->i_wb_ref) {
+ dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
+ "used WRBUFFER, delaying\n", inode, capsnap,
+ capsnap->context, capsnap->context->seq,
+ ceph_cap_string(capsnap->dirty), capsnap->size);
+ capsnap->writing = 1;
+ return 0;
+ }
+
ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
inode, capsnap, capsnap->context,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1ef0a2a15817..c48bb30c8d70 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -562,9 +562,11 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
/*
* Masks of ceph inode work.
*/
-#define CEPH_I_WORK_WRITEBACK 0 /* writeback */
-#define CEPH_I_WORK_INVALIDATE_PAGES 1 /* invalidate pages */
-#define CEPH_I_WORK_VMTRUNCATE 2 /* vmtruncate */
+#define CEPH_I_WORK_WRITEBACK 0
+#define CEPH_I_WORK_INVALIDATE_PAGES 1
+#define CEPH_I_WORK_VMTRUNCATE 2
+#define CEPH_I_WORK_CHECK_CAPS 3
+#define CEPH_I_WORK_FLUSH_SNAPS 4
/*
* We set the ERROR_WRITE bit when we start seeing write errors on an inode
@@ -962,11 +964,36 @@ extern int ceph_inode_holds_cap(struct inode *inode, int mask);
extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
extern void __ceph_do_pending_vmtruncate(struct inode *inode);
-extern void ceph_queue_vmtruncate(struct inode *inode);
-extern void ceph_queue_invalidate(struct inode *inode);
-extern void ceph_queue_writeback(struct inode *inode);
+
extern void ceph_async_iput(struct inode *inode);
+void ceph_queue_inode_work(struct inode *inode, int work_bit);
+
+static inline void ceph_queue_vmtruncate(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE);
+}
+
+static inline void ceph_queue_invalidate(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES);
+}
+
+static inline void ceph_queue_writeback(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK);
+}
+
+static inline void ceph_queue_check_caps(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_CHECK_CAPS);
+}
+
+static inline void ceph_queue_flush_snaps(struct inode *inode)
+{
+ ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
+}
+
extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
int mask, bool force);
static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
@@ -1109,6 +1136,7 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
bool snap_rwsem_locked);
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had);
extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci,
int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index b231dcf1d1f9..aa697ccfa9dc 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -197,14 +197,14 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
cfile = list_entry(tmp2, struct cifsFileInfo,
tlist);
seq_printf(m,
- "0x%x 0x%llx 0x%x %d %d %d %s",
+ "0x%x 0x%llx 0x%x %d %d %d %pd",
tcon->tid,
cfile->fid.persistent_fid,
cfile->f_flags,
cfile->count,
cfile->pid,
from_kuid(&init_user_ns, cfile->uid),
- cfile->dentry->d_name.name);
+ cfile->dentry);
#ifdef CONFIG_CIFS_DEBUG2
seq_printf(m, " 0x%llx\n", cfile->fid.mid);
#else
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index e4c6ae47a796..6b1ce4efb591 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -133,8 +133,9 @@ cifs_build_devname(char *nodename, const char *prepath)
* Caller is responsible for freeing returned value if it is not error.
*/
char *cifs_compose_mount_options(const char *sb_mountdata,
- const char *fullpath,
- const struct dfs_info3_param *ref)
+ const char *fullpath,
+ const struct dfs_info3_param *ref,
+ char **devname)
{
int rc;
char *name;
@@ -231,7 +232,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
strcat(mountdata, "ip=");
strcat(mountdata, srvIP);
- kfree(name);
+ if (devname)
+ *devname = name;
+ else
+ kfree(name);
/*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
/*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
@@ -278,7 +282,7 @@ static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt,
/* strip first '\' from fullpath */
mountdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options,
- fullpath + 1, NULL);
+ fullpath + 1, NULL, NULL);
if (IS_ERR(mountdata)) {
kfree(devname);
return (struct vfsmount *)mountdata;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 39e51dcf796f..38534e066cc7 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -470,7 +470,7 @@ cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
static int cifs_show_devname(struct seq_file *m, struct dentry *root)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
- char *devname = kstrdup(cifs_sb->ctx->UNC, GFP_KERNEL);
+ char *devname = kstrdup(cifs_sb->ctx->source, GFP_KERNEL);
if (devname == NULL)
seq_puts(m, "none");
@@ -823,7 +823,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
goto out;
}
- rc = cifs_setup_volume_info(cifs_sb->ctx);
+ rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC);
if (rc) {
root = ERR_PTR(rc);
goto out;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 340ff81ee87b..32f7a013402e 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -78,7 +78,8 @@ extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx,
int add_treename);
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
extern char *cifs_compose_mount_options(const char *sb_mountdata,
- const char *fullpath, const struct dfs_info3_param *ref);
+ const char *fullpath, const struct dfs_info3_param *ref,
+ char **devname);
/* extern void renew_parental_timestamps(struct dentry *direntry);*/
extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
struct TCP_Server_Info *server);
@@ -89,6 +90,7 @@ extern void cifs_wake_up_task(struct mid_q_entry *mid);
extern int cifs_handle_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
extern int smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx);
+extern int smb3_parse_opt(const char *options, const char *key, char **val);
extern bool cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs);
extern int cifs_discard_remaining_data(struct TCP_Server_Info *server);
extern int cifs_call_async(struct TCP_Server_Info *server,
@@ -549,7 +551,7 @@ extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
unsigned char *p24);
extern int
-cifs_setup_volume_info(struct smb3_fs_context *ctx);
+cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname);
extern struct TCP_Server_Info *
cifs_find_tcp_session(struct smb3_fs_context *ctx);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5d39129406ea..4bb9decbbf27 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2195,7 +2195,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)
tcon->nohandlecache = ctx->nohandlecache;
else
- tcon->nohandlecache = 1;
+ tcon->nohandlecache = true;
tcon->nodelete = ctx->nodelete;
tcon->local_lease = ctx->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
@@ -2628,7 +2628,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
} else if (ctx)
tcon->unix_ext = 1; /* Unix Extensions supported */
- if (tcon->unix_ext == 0) {
+ if (!tcon->unix_ext) {
cifs_dbg(FYI, "Unix extensions disabled so not set on reconnect\n");
return;
}
@@ -2756,6 +2756,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
cifs_sb->prepath = kstrdup(ctx->prepath, GFP_KERNEL);
if (cifs_sb->prepath == NULL)
return -ENOMEM;
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
}
return 0;
@@ -2972,17 +2973,28 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
ref_path, &referral, NULL);
if (!rc) {
+ char *fake_devname = NULL;
+
mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options,
- full_path + 1, &referral);
+ full_path + 1, &referral,
+ &fake_devname);
free_dfs_info_param(&referral);
if (IS_ERR(mdata)) {
rc = PTR_ERR(mdata);
mdata = NULL;
} else {
- smb3_cleanup_fs_context_contents(ctx);
- rc = cifs_setup_volume_info(ctx);
+ /*
+ * We can not clear out the whole structure since we
+ * no longer have an explicit function to parse
+ * a mount-string. Instead we need to clear out the
+ * individual fields that are no longer valid.
+ */
+ kfree(ctx->prepath);
+ ctx->prepath = NULL;
+ rc = cifs_setup_volume_info(ctx, mdata, fake_devname);
}
+ kfree(fake_devname);
kfree(cifs_sb->ctx->mount_options);
cifs_sb->ctx->mount_options = mdata;
}
@@ -3036,6 +3048,7 @@ static int setup_dfs_tgt_conn(const char *path, const char *full_path,
struct dfs_info3_param ref = {0};
char *mdata = NULL;
struct smb3_fs_context fake_ctx = {NULL};
+ char *fake_devname = NULL;
cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path);
@@ -3044,16 +3057,18 @@ static int setup_dfs_tgt_conn(const char *path, const char *full_path,
return rc;
mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options,
- full_path + 1, &ref);
+ full_path + 1, &ref,
+ &fake_devname);
free_dfs_info_param(&ref);
if (IS_ERR(mdata)) {
rc = PTR_ERR(mdata);
mdata = NULL;
} else
- rc = cifs_setup_volume_info(&fake_ctx);
+ rc = cifs_setup_volume_info(&fake_ctx, mdata, fake_devname);
kfree(mdata);
+ kfree(fake_devname);
if (!rc) {
/*
@@ -3122,10 +3137,24 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_
* we should pass a clone of the original context?
*/
int
-cifs_setup_volume_info(struct smb3_fs_context *ctx)
+cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname)
{
int rc = 0;
+ smb3_parse_devname(devname, ctx);
+
+ if (mntopts) {
+ char *ip;
+
+ cifs_dbg(FYI, "%s: mntopts=%s\n", __func__, mntopts);
+ rc = smb3_parse_opt(mntopts, "ip", &ip);
+ if (!rc && !cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip,
+ strlen(ip))) {
+ cifs_dbg(VFS, "%s: failed to convert ip address\n", __func__);
+ return -EINVAL;
+ }
+ }
+
if (ctx->nullauth) {
cifs_dbg(FYI, "Anonymous login\n");
kfree(ctx->username);
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 0fdb0de7ff86..4950ab0486ae 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1417,7 +1417,7 @@ static struct cifs_ses *find_root_ses(struct vol_info *vi,
int rc;
struct cache_entry *ce;
struct dfs_info3_param ref = {0};
- char *mdata = NULL;
+ char *mdata = NULL, *devname = NULL;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct smb3_fs_context ctx = {NULL};
@@ -1444,7 +1444,8 @@ static struct cifs_ses *find_root_ses(struct vol_info *vi,
up_read(&htable_rw_lock);
- mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref);
+ mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref,
+ &devname);
free_dfs_info_param(&ref);
if (IS_ERR(mdata)) {
@@ -1453,7 +1454,7 @@ static struct cifs_ses *find_root_ses(struct vol_info *vi,
goto out;
}
- rc = cifs_setup_volume_info(&ctx);
+ rc = cifs_setup_volume_info(&ctx, NULL, devname);
if (rc) {
ses = ERR_PTR(rc);
@@ -1472,6 +1473,7 @@ out:
smb3_cleanup_fs_context_contents(&ctx);
kfree(mdata);
kfree(rpath);
+ kfree(devname);
return ses;
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 68f4f8536e6a..a3fb81e0ba17 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -737,6 +737,7 @@ static int
cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
{
struct inode *inode;
+ int rc;
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -746,8 +747,25 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode)))
CIFS_I(inode)->time = 0; /* force reval */
- if (cifs_revalidate_dentry(direntry))
- return 0;
+ rc = cifs_revalidate_dentry(direntry);
+ if (rc) {
+ cifs_dbg(FYI, "cifs_revalidate_dentry failed with rc=%d", rc);
+ switch (rc) {
+ case -ENOENT:
+ case -ESTALE:
+ /*
+ * Those errors mean the dentry is invalid
+ * (file was deleted or recreated)
+ */
+ return 0;
+ default:
+ /*
+ * Otherwise some unexpected error happened
+ * report it as-is to VFS layer
+ */
+ return rc;
+ }
+ }
else {
/*
* If the inode wasn't known to be a dfs entry when
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 076bcadc756a..12a5da0230b5 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -148,7 +148,6 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
/* Mount options which take string value */
fsparam_string("source", Opt_source),
- fsparam_string("unc", Opt_source),
fsparam_string("user", Opt_user),
fsparam_string("username", Opt_user),
fsparam_string("pass", Opt_pass),
@@ -175,8 +174,15 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag_no("exec", Opt_ignore),
fsparam_flag_no("dev", Opt_ignore),
fsparam_flag_no("mand", Opt_ignore),
+ fsparam_flag_no("auto", Opt_ignore),
fsparam_string("cred", Opt_ignore),
fsparam_string("credentials", Opt_ignore),
+ /*
+ * UNC and prefixpath is now extracted from Opt_source
+ * in the new mount API so we can just ignore them going forward.
+ */
+ fsparam_string("unc", Opt_ignore),
+ fsparam_string("prefixpath", Opt_ignore),
{}
};
@@ -311,6 +317,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->password = NULL;
new_ctx->domainname = NULL;
new_ctx->UNC = NULL;
+ new_ctx->source = NULL;
new_ctx->iocharset = NULL;
/*
@@ -321,6 +328,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(username);
DUP_CTX_STR(password);
DUP_CTX_STR(UNC);
+ DUP_CTX_STR(source);
DUP_CTX_STR(domainname);
DUP_CTX_STR(nodename);
DUP_CTX_STR(iocharset);
@@ -399,6 +407,37 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3)
return 0;
}
+int smb3_parse_opt(const char *options, const char *key, char **val)
+{
+ int rc = -ENOENT;
+ char *opts, *orig, *p;
+
+ orig = opts = kstrdup(options, GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ while ((p = strsep(&opts, ","))) {
+ char *nval;
+
+ if (!*p)
+ continue;
+ if (strncasecmp(p, key, strlen(key)))
+ continue;
+ nval = strchr(p, '=');
+ if (nval) {
+ if (nval == p)
+ continue;
+ *nval++ = 0;
+ *val = kstrndup(nval, strlen(nval), GFP_KERNEL);
+ rc = !*val ? -ENOMEM : 0;
+ goto out;
+ }
+ }
+out:
+ kfree(orig);
+ return rc;
+}
+
/*
* Parse a devname into substrings and populate the ctx->UNC and ctx->prepath
* fields with the result. Returns 0 on success and an error otherwise
@@ -531,7 +570,7 @@ static int smb3_fs_context_validate(struct fs_context *fc)
if (ctx->rdma && ctx->vals->protocol_id < SMB30_PROT_ID) {
cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n");
- return -1;
+ return -EOPNOTSUPP;
}
#ifndef CONFIG_KEYS
@@ -554,7 +593,7 @@ static int smb3_fs_context_validate(struct fs_context *fc)
/* make sure UNC has a share name */
if (strlen(ctx->UNC) < 3 || !strchr(ctx->UNC + 3, '\\')) {
cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n");
- return -1;
+ return -ENOENT;
}
if (!ctx->got_ip) {
@@ -568,7 +607,7 @@ static int smb3_fs_context_validate(struct fs_context *fc)
if (!cifs_convert_address((struct sockaddr *)&ctx->dstaddr,
&ctx->UNC[2], len)) {
pr_err("Unable to determine destination address\n");
- return -1;
+ return -EHOSTUNREACH;
}
}
@@ -699,6 +738,7 @@ static int smb3_reconfigure(struct fs_context *fc)
* just use what we already have in cifs_sb->ctx.
*/
STEAL_STRING(cifs_sb, ctx, UNC);
+ STEAL_STRING(cifs_sb, ctx, source);
STEAL_STRING(cifs_sb, ctx, username);
STEAL_STRING(cifs_sb, ctx, password);
STEAL_STRING(cifs_sb, ctx, domainname);
@@ -941,6 +981,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
cifs_dbg(VFS, "Unknown error parsing devname\n");
goto cifs_parse_mount_err;
}
+ ctx->source = kstrdup(param->string, GFP_KERNEL);
+ if (ctx->source == NULL) {
+ cifs_dbg(VFS, "OOM when copying UNC string\n");
+ goto cifs_parse_mount_err;
+ }
fc->source = kstrdup(param->string, GFP_KERNEL);
if (fc->source == NULL) {
cifs_dbg(VFS, "OOM when copying UNC string\n");
@@ -1263,7 +1308,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
return 0;
cifs_parse_mount_err:
- return 1;
+ return -EINVAL;
}
int smb3_init_fs_context(struct fs_context *fc)
@@ -1363,6 +1408,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->password = NULL;
kfree(ctx->UNC);
ctx->UNC = NULL;
+ kfree(ctx->source);
+ ctx->source = NULL;
kfree(ctx->domainname);
ctx->domainname = NULL;
kfree(ctx->nodename);
@@ -1500,8 +1547,8 @@ void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb)
cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
CIFS_MOUNT_NO_PERM);
else
- cifs_sb->mnt_cifs_flags &= ~(CIFS_MOUNT_MULTIUSER |
- CIFS_MOUNT_NO_PERM);
+ cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MULTIUSER;
+
if (ctx->strict_io)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index 3358b33abcd0..1c44a460e2c0 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -159,6 +159,7 @@ struct smb3_fs_context {
char *username;
char *password;
char *domainname;
+ char *source;
char *UNC;
char *nodename;
char *iocharset; /* local code page for mapping to and from Unicode */
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index d85edf5d1429..a5a9e33c0d73 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -286,7 +286,7 @@ struct smb2_negotiate_req {
__le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
__le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
__le16 Reserved2;
- __le16 Dialects[1]; /* One dialect (vers=) at a time for now */
+ __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
} __packed;
/* Dialects */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index e9abb41aa89b..4a2b836eb017 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -338,7 +338,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
if (ssocket == NULL)
return -EAGAIN;
- if (signal_pending(current)) {
+ if (fatal_signal_pending(current)) {
cifs_dbg(FYI, "signal pending before send request\n");
return -ERESTARTSYS;
}
@@ -429,7 +429,7 @@ unmask:
if (signal_pending(current) && (total_len != send_length)) {
cifs_dbg(FYI, "signal is pending after attempt to send\n");
- rc = -EINTR;
+ rc = -ERESTARTSYS;
}
/* uncork it */
@@ -666,10 +666,22 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num,
if (*credits < num) {
/*
- * Return immediately if not too many requests in flight since
- * we will likely be stuck on waiting for credits.
+ * If the server is tight on resources or just gives us less
+ * credits for other reasons (e.g. requests are coming out of
+ * order and the server delays granting more credits until it
+ * processes a missing mid) and we exhausted most available
+ * credits there may be situations when we try to send
+ * a compound request but we don't have enough credits. At this
+ * point the client needs to decide if it should wait for
+ * additional credits or fail the request. If at least one
+ * request is in flight there is a high probability that the
+ * server will return enough credits to satisfy this compound
+ * request.
+ *
+ * Return immediately if no requests in flight since we will be
+ * stuck on waiting for credits.
*/
- if (server->in_flight < num - *credits) {
+ if (server->in_flight == 0) {
spin_unlock(&server->req_lock);
trace_smb3_insufficient_credits(server->CurrentMid,
server->hostname, scredits, sin_flight);
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 2c557229696a..95e72d271b95 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -50,6 +50,7 @@
* which requires asm/elf.h to define compat_elf_gregset_t et al.
*/
#define elf_prstatus compat_elf_prstatus
+#define elf_prstatus_common compat_elf_prstatus_common
#define elf_prpsinfo compat_elf_prpsinfo
#undef ns_to_kernel_old_timeval
@@ -61,7 +62,6 @@
* differ from the native ones, or omitted when they match.
*/
-#undef ELF_ARCH
#undef elf_check_arch
#define elf_check_arch compat_elf_check_arch
@@ -90,11 +90,6 @@
#define ELF_ET_DYN_BASE COMPAT_ELF_ET_DYN_BASE
#endif
-#ifdef COMPAT_ELF_EXEC_PAGESIZE
-#undef ELF_EXEC_PAGESIZE
-#define ELF_EXEC_PAGESIZE COMPAT_ELF_EXEC_PAGESIZE
-#endif
-
#ifdef COMPAT_ELF_PLAT_INIT
#undef ELF_PLAT_INIT
#define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 4b90cfd1ec36..2be65269a987 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -392,8 +392,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
/* Don't map the last page if it contains some other data */
if (pgoff + pages == max_pages && cramfs_last_page_is_shared(inode)) {
- pr_debug("mmap: %s: last page is shared\n",
- file_dentry(file)->d_name.name);
+ pr_debug("mmap: %pD: last page is shared\n", file);
pages--;
}
@@ -430,16 +429,15 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
}
if (!ret)
- pr_debug("mapped %s[%lu] at 0x%08lx (%u/%lu pages) "
- "to vma 0x%08lx, page_prot 0x%llx\n",
- file_dentry(file)->d_name.name, pgoff,
- address, pages, vma_pages(vma), vma->vm_start,
+ pr_debug("mapped %pD[%lu] at 0x%08lx (%u/%lu pages) "
+ "to vma 0x%08lx, page_prot 0x%llx\n", file,
+ pgoff, address, pages, vma_pages(vma), vma->vm_start,
(unsigned long long)pgprot_val(vma->vm_page_prot));
return ret;
bailout:
- pr_debug("%s[%lu]: direct mmap impossible: %s\n",
- file_dentry(file)->d_name.name, pgoff, bailout_reason);
+ pr_debug("%pD[%lu]: direct mmap impossible: %s\n",
+ file, pgoff, bailout_reason);
/* Didn't manage any direct map, but normal paging is still possible */
return 0;
}
@@ -469,8 +467,8 @@ static unsigned long cramfs_physmem_get_unmapped_area(struct file *file,
if (!offset || block_pages != pages)
return -ENOSYS;
addr = sbi->linear_phys_addr + offset;
- pr_debug("get_unmapped for %s ofs %#lx siz %lu at 0x%08lx\n",
- file_dentry(file)->d_name.name, pgoff*PAGE_SIZE, len, addr);
+ pr_debug("get_unmapped for %pD ofs %#lx siz %lu at 0x%08lx\n",
+ file, pgoff*PAGE_SIZE, len, addr);
return addr;
}
diff --git a/fs/dax.c b/fs/dax.c
index 26d5dcd2d69e..b3d27fdc6775 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -810,11 +810,12 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
address = pgoff_address(index, vma);
/*
- * Note because we provide range to follow_pte it will call
+ * follow_invalidate_pte() will use the range to call
* mmu_notifier_invalidate_range_start() on our behalf before
* taking any lock.
*/
- if (follow_pte(vma->vm_mm, address, &range, &ptep, &pmdp, &ptl))
+ if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
+ &pmdp, &ptl))
continue;
/*
diff --git a/fs/dcache.c b/fs/dcache.c
index 97e81a844a96..799d9e4f0bcd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -456,23 +456,6 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
list_lru_isolate_move(lru, &dentry->d_lru, list);
}
-/**
- * d_drop - drop a dentry
- * @dentry: dentry to drop
- *
- * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
- * be found through a VFS lookup any more. Note that this is different from
- * deleting the dentry - d_delete will try to mark the dentry negative if
- * possible, giving a successful _negative_ lookup, while d_drop will
- * just make the cache lookup fail.
- *
- * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
- * reason (NFS timeouts or autofs deletes).
- *
- * __d_drop requires dentry->d_lock
- * ___d_drop doesn't mark dentry as "unhashed"
- * (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
- */
static void ___d_drop(struct dentry *dentry)
{
struct hlist_bl_head *b;
@@ -501,6 +484,24 @@ void __d_drop(struct dentry *dentry)
}
EXPORT_SYMBOL(__d_drop);
+/**
+ * d_drop - drop a dentry
+ * @dentry: dentry to drop
+ *
+ * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
+ * be found through a VFS lookup any more. Note that this is different from
+ * deleting the dentry - d_delete will try to mark the dentry negative if
+ * possible, giving a successful _negative_ lookup, while d_drop will
+ * just make the cache lookup fail.
+ *
+ * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
+ * reason (NFS timeouts or autofs deletes).
+ *
+ * __d_drop requires dentry->d_lock
+ *
+ * ___d_drop doesn't mark dentry as "unhashed"
+ * (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
+ */
void d_drop(struct dentry *dentry)
{
spin_lock(&dentry->d_lock);
@@ -996,20 +997,6 @@ struct dentry *d_find_any_alias(struct inode *inode)
}
EXPORT_SYMBOL(d_find_any_alias);
-/**
- * d_find_alias - grab a hashed alias of inode
- * @inode: inode in question
- *
- * If inode has a hashed alias, or is a directory and has any alias,
- * acquire the reference to alias and return it. Otherwise return NULL.
- * Notice that if inode is a directory there can be only one alias and
- * it can be unhashed only if it has no children, or if it is the root
- * of a filesystem, or if the directory was renamed and d_revalidate
- * was the first vfs operation to notice.
- *
- * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
- * any other hashed alias over that one.
- */
static struct dentry *__d_find_alias(struct inode *inode)
{
struct dentry *alias;
@@ -1029,6 +1016,20 @@ static struct dentry *__d_find_alias(struct inode *inode)
return NULL;
}
+/**
+ * d_find_alias - grab a hashed alias of inode
+ * @inode: inode in question
+ *
+ * If inode has a hashed alias, or is a directory and has any alias,
+ * acquire the reference to alias and return it. Otherwise return NULL.
+ * Notice that if inode is a directory there can be only one alias and
+ * it can be unhashed only if it has no children, or if it is the root
+ * of a filesystem, or if the directory was renamed and d_revalidate
+ * was the first vfs operation to notice.
+ *
+ * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
+ * any other hashed alias over that one.
+ */
struct dentry *d_find_alias(struct inode *inode)
{
struct dentry *de = NULL;
@@ -1043,6 +1044,31 @@ struct dentry *d_find_alias(struct inode *inode)
EXPORT_SYMBOL(d_find_alias);
/*
+ * Caller MUST be holding rcu_read_lock() and be guaranteed
+ * that inode won't get freed until rcu_read_unlock().
+ */
+struct dentry *d_find_alias_rcu(struct inode *inode)
+{
+ struct hlist_head *l = &inode->i_dentry;
+ struct dentry *de = NULL;
+
+ spin_lock(&inode->i_lock);
+ // ->i_dentry and ->i_rcu are colocated, but the latter won't be
+ // used without having I_FREEING set, which means no aliases left
+ if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) {
+ if (S_ISDIR(inode->i_mode)) {
+ de = hlist_entry(l->first, struct dentry, d_u.d_alias);
+ } else {
+ hlist_for_each_entry(de, l, d_u.d_alias)
+ if (!d_unhashed(de))
+ break;
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ return de;
+}
+
+/*
* Try to kill dentries associated with this inode.
* WARNING: you must own a reference to inode.
*/
diff --git a/fs/dcookies.c b/fs/dcookies.c
deleted file mode 100644
index 6eeb61100a09..000000000000
--- a/fs/dcookies.c
+++ /dev/null
@@ -1,356 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * dcookies.c
- *
- * Copyright 2002 John Levon <levon@movementarian.org>
- *
- * Persistent cookie-path mappings. These are used by
- * profilers to convert a per-task EIP value into something
- * non-transitory that can be processed at a later date.
- * This is done by locking the dentry/vfsmnt pair in the
- * kernel until released by the tasks needing the persistent
- * objects. The tag is simply an unsigned long that refers
- * to the pair and can be looked up from userspace.
- */
-
-#include <linux/syscalls.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/mount.h>
-#include <linux/capability.h>
-#include <linux/dcache.h>
-#include <linux/mm.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/dcookies.h>
-#include <linux/mutex.h>
-#include <linux/path.h>
-#include <linux/compat.h>
-#include <linux/uaccess.h>
-
-/* The dcookies are allocated from a kmem_cache and
- * hashed onto a small number of lists. None of the
- * code here is particularly performance critical
- */
-struct dcookie_struct {
- struct path path;
- struct list_head hash_list;
-};
-
-static LIST_HEAD(dcookie_users);
-static DEFINE_MUTEX(dcookie_mutex);
-static struct kmem_cache *dcookie_cache __read_mostly;
-static struct list_head *dcookie_hashtable __read_mostly;
-static size_t hash_size __read_mostly;
-
-static inline int is_live(void)
-{
- return !(list_empty(&dcookie_users));
-}
-
-
-/* The dentry is locked, its address will do for the cookie */
-static inline unsigned long dcookie_value(struct dcookie_struct * dcs)
-{
- return (unsigned long)dcs->path.dentry;
-}
-
-
-static size_t dcookie_hash(unsigned long dcookie)
-{
- return (dcookie >> L1_CACHE_SHIFT) & (hash_size - 1);
-}
-
-
-static struct dcookie_struct * find_dcookie(unsigned long dcookie)
-{
- struct dcookie_struct *found = NULL;
- struct dcookie_struct * dcs;
- struct list_head * pos;
- struct list_head * list;
-
- list = dcookie_hashtable + dcookie_hash(dcookie);
-
- list_for_each(pos, list) {
- dcs = list_entry(pos, struct dcookie_struct, hash_list);
- if (dcookie_value(dcs) == dcookie) {
- found = dcs;
- break;
- }
- }
-
- return found;
-}
-
-
-static void hash_dcookie(struct dcookie_struct * dcs)
-{
- struct list_head * list = dcookie_hashtable + dcookie_hash(dcookie_value(dcs));
- list_add(&dcs->hash_list, list);
-}
-
-
-static struct dcookie_struct *alloc_dcookie(const struct path *path)
-{
- struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
- GFP_KERNEL);
- struct dentry *d;
- if (!dcs)
- return NULL;
-
- d = path->dentry;
- spin_lock(&d->d_lock);
- d->d_flags |= DCACHE_COOKIE;
- spin_unlock(&d->d_lock);
-
- dcs->path = *path;
- path_get(path);
- hash_dcookie(dcs);
- return dcs;
-}
-
-
-/* This is the main kernel-side routine that retrieves the cookie
- * value for a dentry/vfsmnt pair.
- */
-int get_dcookie(const struct path *path, unsigned long *cookie)
-{
- int err = 0;
- struct dcookie_struct * dcs;
-
- mutex_lock(&dcookie_mutex);
-
- if (!is_live()) {
- err = -EINVAL;
- goto out;
- }
-
- if (path->dentry->d_flags & DCACHE_COOKIE) {
- dcs = find_dcookie((unsigned long)path->dentry);
- } else {
- dcs = alloc_dcookie(path);
- if (!dcs) {
- err = -ENOMEM;
- goto out;
- }
- }
-
- *cookie = dcookie_value(dcs);
-
-out:
- mutex_unlock(&dcookie_mutex);
- return err;
-}
-
-
-/* And here is where the userspace process can look up the cookie value
- * to retrieve the path.
- */
-static int do_lookup_dcookie(u64 cookie64, char __user *buf, size_t len)
-{
- unsigned long cookie = (unsigned long)cookie64;
- int err = -EINVAL;
- char * kbuf;
- char * path;
- size_t pathlen;
- struct dcookie_struct * dcs;
-
- /* we could leak path information to users
- * without dir read permission without this
- */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&dcookie_mutex);
-
- if (!is_live()) {
- err = -EINVAL;
- goto out;
- }
-
- if (!(dcs = find_dcookie(cookie)))
- goto out;
-
- err = -ENOMEM;
- kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!kbuf)
- goto out;
-
- /* FIXME: (deleted) ? */
- path = d_path(&dcs->path, kbuf, PAGE_SIZE);
-
- mutex_unlock(&dcookie_mutex);
-
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out_free;
- }
-
- err = -ERANGE;
-
- pathlen = kbuf + PAGE_SIZE - path;
- if (pathlen <= len) {
- err = pathlen;
- if (copy_to_user(buf, path, pathlen))
- err = -EFAULT;
- }
-
-out_free:
- kfree(kbuf);
- return err;
-out:
- mutex_unlock(&dcookie_mutex);
- return err;
-}
-
-SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len)
-{
- return do_lookup_dcookie(cookie64, buf, len);
-}
-
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len)
-{
-#ifdef __BIG_ENDIAN
- return do_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
-#else
- return do_lookup_dcookie(((u64)w1 << 32) | w0, buf, len);
-#endif
-}
-#endif
-
-static int dcookie_init(void)
-{
- struct list_head * d;
- unsigned int i, hash_bits;
- int err = -ENOMEM;
-
- dcookie_cache = kmem_cache_create("dcookie_cache",
- sizeof(struct dcookie_struct),
- 0, 0, NULL);
-
- if (!dcookie_cache)
- goto out;
-
- dcookie_hashtable = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!dcookie_hashtable)
- goto out_kmem;
-
- err = 0;
-
- /*
- * Find the power-of-two list-heads that can fit into the allocation..
- * We don't guarantee that "sizeof(struct list_head)" is necessarily
- * a power-of-two.
- */
- hash_size = PAGE_SIZE / sizeof(struct list_head);
- hash_bits = 0;
- do {
- hash_bits++;
- } while ((hash_size >> hash_bits) != 0);
- hash_bits--;
-
- /*
- * Re-calculate the actual number of entries and the mask
- * from the number of bits we can fit.
- */
- hash_size = 1UL << hash_bits;
-
- /* And initialize the newly allocated array */
- d = dcookie_hashtable;
- i = hash_size;
- do {
- INIT_LIST_HEAD(d);
- d++;
- i--;
- } while (i);
-
-out:
- return err;
-out_kmem:
- kmem_cache_destroy(dcookie_cache);
- goto out;
-}
-
-
-static void free_dcookie(struct dcookie_struct * dcs)
-{
- struct dentry *d = dcs->path.dentry;
-
- spin_lock(&d->d_lock);
- d->d_flags &= ~DCACHE_COOKIE;
- spin_unlock(&d->d_lock);
-
- path_put(&dcs->path);
- kmem_cache_free(dcookie_cache, dcs);
-}
-
-
-static void dcookie_exit(void)
-{
- struct list_head * list;
- struct list_head * pos;
- struct list_head * pos2;
- struct dcookie_struct * dcs;
- size_t i;
-
- for (i = 0; i < hash_size; ++i) {
- list = dcookie_hashtable + i;
- list_for_each_safe(pos, pos2, list) {
- dcs = list_entry(pos, struct dcookie_struct, hash_list);
- list_del(&dcs->hash_list);
- free_dcookie(dcs);
- }
- }
-
- kfree(dcookie_hashtable);
- kmem_cache_destroy(dcookie_cache);
-}
-
-
-struct dcookie_user {
- struct list_head next;
-};
-
-struct dcookie_user * dcookie_register(void)
-{
- struct dcookie_user * user;
-
- mutex_lock(&dcookie_mutex);
-
- user = kmalloc(sizeof(struct dcookie_user), GFP_KERNEL);
- if (!user)
- goto out;
-
- if (!is_live() && dcookie_init())
- goto out_free;
-
- list_add(&user->next, &dcookie_users);
-
-out:
- mutex_unlock(&dcookie_mutex);
- return user;
-out_free:
- kfree(user);
- user = NULL;
- goto out;
-}
-
-
-void dcookie_unregister(struct dcookie_user * user)
-{
- mutex_lock(&dcookie_mutex);
-
- list_del(&user->next);
- kfree(user);
-
- if (!is_live())
- dcookie_exit();
-
- mutex_unlock(&dcookie_mutex);
-}
-
-EXPORT_SYMBOL_GPL(dcookie_register);
-EXPORT_SYMBOL_GPL(dcookie_unregister);
-EXPORT_SYMBOL_GPL(get_dcookie);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d53fa92a1ab6..aa1083ecd623 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -426,6 +426,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
unsigned long flags;
bio->bi_private = dio;
+ /* don't account direct I/O as memory stall */
+ bio_clear_flag(bio, BIO_WORKINGSET);
spin_lock_irqsave(&dio->bio_lock, flags);
dio->refcount++;
@@ -434,7 +436,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty)
bio_set_pages_dirty(bio);
- dio->bio_disk = bio->bi_disk;
+ dio->bio_disk = bio->bi_bdev->bd_disk;
if (sdio->submit_io) {
sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 55da9a91f51a..18e9285fbb4c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1037,16 +1037,19 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode,
{
int rc;
struct dentry *lower_dentry;
+ struct inode *lower_inode;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- if (!(d_inode(lower_dentry)->i_opflags & IOP_XATTR)) {
+ lower_inode = d_inode(lower_dentry);
+ if (!(lower_inode->i_opflags & IOP_XATTR)) {
rc = -EOPNOTSUPP;
goto out;
}
- rc = vfs_setxattr(&init_user_ns, lower_dentry, name, value, size,
- flags);
+ inode_lock(lower_inode);
+ rc = __vfs_setxattr_locked(&init_user_ns, lower_dentry, name, value, size, flags, NULL);
+ inode_unlock(lower_inode);
if (!rc && inode)
- fsstack_copy_attr_all(inode, d_inode(lower_dentry));
+ fsstack_copy_attr_all(inode, lower_inode);
out:
return rc;
}
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 5f8cc7346c69..3a81e1f7fc06 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -234,8 +234,8 @@ static struct dentry *erofs_lookup(struct inode *dir,
} else if (err) {
inode = ERR_PTR(err);
} else {
- erofs_dbg("%s, %s (nid %llu) found, d_type %u", __func__,
- dentry->d_name.name, nid, d_type);
+ erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__,
+ dentry, nid, d_type);
inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR);
}
return d_splice_alias(inode, dentry);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index be10b16ea66e..d5a6b9b888a5 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -158,8 +158,8 @@ static int erofs_read_superblock(struct super_block *sb)
blkszbits = dsb->blkszbits;
/* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
if (blkszbits != LOG_BLOCK_SIZE) {
- erofs_err(sb, "blksize %u isn't supported on this platform",
- 1 << blkszbits);
+ erofs_err(sb, "blkszbits %u isn't supported on this platform",
+ blkszbits);
goto out;
}
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 5bde77d70852..47314a26767a 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -48,8 +48,14 @@ static int init_inode_xattrs(struct inode *inode)
int ret = 0;
/* the most case is that xattrs of this inode are initialized. */
- if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags))
+ if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) {
+ /*
+ * paired with smp_mb() at the end of the function to ensure
+ * fields will only be observed after the bit is set.
+ */
+ smp_mb();
return 0;
+ }
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
@@ -137,6 +143,8 @@ static int init_inode_xattrs(struct inode *inode)
}
xattr_iter_end(&it, atomic_map);
+ /* paired with smp_mb() at the beginning of the function. */
+ smp_mb();
set_bit(EROFS_I_EA_INITED_BIT, &vi->flags);
out_unlock:
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index ae325541884e..14d2de35110c 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -36,8 +36,14 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
void *kaddr;
struct z_erofs_map_header *h;
- if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+ if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
+ /*
+ * paired with smp_mb() at the end of the function to ensure
+ * fields will only be observed after the bit is set.
+ */
+ smp_mb();
return 0;
+ }
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
@@ -83,6 +89,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
((h->h_clusterbits >> 5) & 7);
+ /* paired with smp_mb() at the beginning of the function */
+ smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
unmap_done:
kunmap_atomic(kaddr);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a829af074eb5..3196474cbe24 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -979,7 +979,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
return epir;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#ifdef CONFIG_KCMP
static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
{
struct rb_node *rbp;
@@ -1021,7 +1021,7 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
return file_raw;
}
-#endif /* CONFIG_CHECKPOINT_RESTORE */
+#endif /* CONFIG_KCMP */
/**
* Adds a new entry to the tail of the list in a lockless way, i.e.
diff --git a/fs/exec.c b/fs/exec.c
index 48d1e8b1610b..6f3c02066ce3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -708,7 +708,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
return -ENOMEM;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, old_start, old_end);
+ tlb_gather_mmu(&tlb, mm);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
@@ -725,7 +725,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
}
- tlb_finish_mmu(&tlb, old_start, old_end);
+ tlb_finish_mmu(&tlb);
/*
* Shrink the vma to just the new range. Always succeeds.
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index a987919686c0..761c79c3a4ba 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -166,7 +166,7 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu)
* If the value of "clu" is 0, it means cluster 2 which is the first cluster of
* the cluster heap.
*/
-void exfat_clear_bitmap(struct inode *inode, unsigned int clu)
+void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
{
int i, b;
unsigned int ent_idx;
@@ -180,7 +180,7 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu)
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
clear_bit_le(b, sbi->vol_amap[i]->b_data);
- exfat_update_bh(sbi->vol_amap[i], IS_DIRSYNC(inode));
+ exfat_update_bh(sbi->vol_amap[i], sync);
if (opts->discard) {
int ret_discard;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index d905bb9cd2ca..fa21421a14d9 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -408,7 +408,7 @@ int exfat_count_num_clusters(struct super_block *sb,
int exfat_load_bitmap(struct super_block *sb);
void exfat_free_bitmap(struct exfat_sb_info *sbi);
int exfat_set_bitmap(struct inode *inode, unsigned int clu);
-void exfat_clear_bitmap(struct inode *inode, unsigned int clu);
+void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync);
unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu);
int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count);
diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h
index 6aec6288e1f2..7f39b1c6469c 100644
--- a/fs/exfat/exfat_raw.h
+++ b/fs/exfat/exfat_raw.h
@@ -77,6 +77,10 @@
#define EXFAT_FILE_NAME_LEN 15
+#define EXFAT_MIN_SECT_SIZE_BITS 9
+#define EXFAT_MAX_SECT_SIZE_BITS 12
+#define EXFAT_MAX_SECT_PER_CLUS_BITS(x) (25 - (x)->sect_size_bits)
+
/* EXFAT: Main and Backup Boot Sector (512 bytes) */
struct boot_sector {
__u8 jmp_boot[BOOTSEC_JUMP_BOOT_LEN];
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index c3c9afee7418..7b2e8af17193 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -157,6 +157,7 @@ int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain)
unsigned int clu;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ int cur_cmap_i, next_cmap_i;
/* invalid cluster number */
if (p_chain->dir == EXFAT_FREE_CLUSTER ||
@@ -176,21 +177,51 @@ int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain)
clu = p_chain->dir;
+ cur_cmap_i = next_cmap_i =
+ BITMAP_OFFSET_SECTOR_INDEX(sb, CLUSTER_TO_BITMAP_ENT(clu));
+
if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
+ unsigned int last_cluster = p_chain->dir + p_chain->size - 1;
do {
- exfat_clear_bitmap(inode, clu);
- clu++;
+ bool sync = false;
+
+ if (clu < last_cluster)
+ next_cmap_i =
+ BITMAP_OFFSET_SECTOR_INDEX(sb, CLUSTER_TO_BITMAP_ENT(clu+1));
+ /* flush bitmap only if index would be changed or for last cluster */
+ if (clu == last_cluster || cur_cmap_i != next_cmap_i) {
+ sync = true;
+ cur_cmap_i = next_cmap_i;
+ }
+
+ exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)));
+ clu++;
num_clusters++;
} while (num_clusters < p_chain->size);
} else {
do {
- exfat_clear_bitmap(inode, clu);
-
- if (exfat_get_next_cluster(sb, &clu))
- goto dec_used_clus;
+ bool sync = false;
+ unsigned int n_clu = clu;
+ int err = exfat_get_next_cluster(sb, &n_clu);
+
+ if (err || n_clu == EXFAT_EOF_CLUSTER)
+ sync = true;
+ else
+ next_cmap_i =
+ BITMAP_OFFSET_SECTOR_INDEX(sb, CLUSTER_TO_BITMAP_ENT(n_clu));
+
+ if (cur_cmap_i != next_cmap_i) {
+ sync = true;
+ cur_cmap_i = next_cmap_i;
+ }
+ exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)));
+ clu = n_clu;
num_clusters++;
+
+ if (err)
+ goto dec_used_clus;
} while (clu != EXFAT_EOF_CLUSTER);
}
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 3aa6eb4de5e3..f783cf38dd8e 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -363,7 +363,7 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ return blkdev_issue_flush(inode->i_sb->s_bdev);
}
const struct file_operations exfat_file_operations = {
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 87be5bfc31eb..c6d8d2e53486 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -381,8 +381,7 @@ static int exfat_calibrate_blocksize(struct super_block *sb, int logical_sect)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- if (!is_power_of_2(logical_sect) ||
- logical_sect < 512 || logical_sect > 4096) {
+ if (!is_power_of_2(logical_sect)) {
exfat_err(sb, "bogus logical sector size %u", logical_sect);
return -EIO;
}
@@ -451,6 +450,25 @@ static int exfat_read_boot_sector(struct super_block *sb)
return -EINVAL;
}
+ /*
+ * sect_size_bits could be at least 9 and at most 12.
+ */
+ if (p_boot->sect_size_bits < EXFAT_MIN_SECT_SIZE_BITS ||
+ p_boot->sect_size_bits > EXFAT_MAX_SECT_SIZE_BITS) {
+ exfat_err(sb, "bogus sector size bits : %u\n",
+ p_boot->sect_size_bits);
+ return -EINVAL;
+ }
+
+ /*
+ * sect_per_clus_bits could be at least 0 and at most 25 - sect_size_bits.
+ */
+ if (p_boot->sect_per_clus_bits > EXFAT_MAX_SECT_PER_CLUS_BITS(p_boot)) {
+ exfat_err(sb, "bogus sectors bits per cluster : %u\n",
+ p_boot->sect_per_clus_bits);
+ return -EINVAL;
+ }
+
sbi->sect_per_clus = 1 << p_boot->sect_per_clus_bits;
sbi->sect_per_clus_bits = p_boot->sect_per_clus_bits;
sbi->cluster_size_bits = p_boot->sect_per_clus_bits +
@@ -477,16 +495,19 @@ static int exfat_read_boot_sector(struct super_block *sb)
sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED;
/* check consistencies */
- if (sbi->num_FAT_sectors << p_boot->sect_size_bits <
- sbi->num_clusters * 4) {
+ if ((u64)sbi->num_FAT_sectors << p_boot->sect_size_bits <
+ (u64)sbi->num_clusters * 4) {
exfat_err(sb, "bogus fat length");
return -EINVAL;
}
+
if (sbi->data_start_sector <
- sbi->FAT1_start_sector + sbi->num_FAT_sectors * p_boot->num_fats) {
+ (u64)sbi->FAT1_start_sector +
+ (u64)sbi->num_FAT_sectors * p_boot->num_fats) {
exfat_err(sb, "bogus data start sector");
return -EINVAL;
}
+
if (sbi->vol_flags & VOLUME_DIRTY)
exfat_warn(sb, "Volume was not properly unmounted. Some data may be corrupt. Please run fsck.");
if (sbi->vol_flags & MEDIA_FAILURE)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 0a14a7c87bf8..6e8208acfc62 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1076,7 +1076,7 @@ static int ext4_fc_perform_commit(journal_t *journal)
* flush before we start writing fast commit blocks.
*/
if (journal->j_fs_dev != journal->j_dev)
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+ blkdev_issue_flush(journal->j_fs_dev);
blk_start_plug(&plug);
if (sbi->s_fc_bytes == 0) {
@@ -1535,7 +1535,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
out:
iput(inode);
if (!ret)
- blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+ blkdev_issue_flush(sb->s_bdev);
return 0;
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 349b27f0dda0..194f5d00fa32 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -74,8 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
- ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -550,7 +549,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ilock_shared)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_io || extend);
+ (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
if (ret == -ENOTBLK)
ret = 0;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 113bfb023a4a..027a7d7037a0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -174,7 +174,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
if (needs_barrier) {
- err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ err = blkdev_issue_flush(inode->i_sb->s_bdev);
if (!ret)
ret = err;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index bf9028950a51..633ae7becd61 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1584,7 +1584,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
if (ret < 0)
goto err_out;
if (barrier)
- blkdev_issue_flush(sb->s_bdev, GFP_NOFS);
+ blkdev_issue_flush(sb->s_bdev);
skip_zeroout:
ext4_lock_group(sb, group);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8fbf85b3547e..650c5acd2f2d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4962,15 +4962,11 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
if (!inode)
return;
- if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_INODE)) ||
- ((inode->i_state & I_DIRTY_TIME) == 0))
+ if (!inode_is_dirtytime_only(inode))
return;
spin_lock(&inode->i_lock);
- if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- I_DIRTY_INODE)) == 0) &&
- (inode->i_state & I_DIRTY_TIME)) {
+ if (inode_is_dirtytime_only(inode)) {
struct ext4_inode_info *ei = EXT4_I(inode);
inode->i_state &= ~I_DIRTY_TIME;
@@ -5940,26 +5936,16 @@ out:
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
- *
- * If only the I_DIRTY_TIME flag is set, we can skip everything. If
- * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
- * to copy into the on-disk inode structure are the timestamp files.
*/
void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
- if (flags == I_DIRTY_TIME)
- return;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
- goto out;
-
+ return;
ext4_mark_inode_dirty(handle, inode);
-
ext4_journal_stop(handle);
-out:
- return;
}
int ext4_change_inode_journal_flag(struct inode *inode, int val)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 56ad9c4b6350..a2cf35066f46 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1313,6 +1313,12 @@ out:
return -EOPNOTSUPP;
return fsverity_ioctl_measure(filp, (void __user *)arg);
+ case FS_IOC_READ_VERITY_METADATA:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_read_metadata(filp,
+ (const void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -1395,6 +1401,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_GETFSMAP:
case FS_IOC_ENABLE_VERITY:
case FS_IOC_MEASURE_VERITY:
+ case FS_IOC_READ_VERITY_METADATA:
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a77fbb79e813..802bd26ed01c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5709,7 +5709,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
needs_barrier = true;
if (needs_barrier) {
int err;
- err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+ err = blkdev_issue_flush(sb->s_bdev);
if (!ret)
ret = err;
}
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index d13c5c6a9787..62e638a49bbf 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -76,16 +76,6 @@ config F2FS_CHECK_FS
If you want to improve the performance, say N.
-config F2FS_IO_TRACE
- bool "F2FS IO tracer"
- depends on F2FS_FS
- depends on FUNCTION_TRACER
- help
- F2FS IO trace is based on a function trace, which gathers process
- information and block IO patterns in the filesystem level.
-
- If unsure, say N.
-
config F2FS_FAULT_INJECTION
bool "F2FS fault injection facility"
depends on F2FS_FS
@@ -119,6 +109,16 @@ config F2FS_FS_LZ4
help
Support LZ4 compress algorithm, if unsure, say Y.
+config F2FS_FS_LZ4HC
+ bool "LZ4HC compression support"
+ depends on F2FS_FS_COMPRESSION
+ depends on F2FS_FS_LZ4
+ select LZ4HC_COMPRESS
+ default y
+ help
+ Support LZ4HC compress algorithm, LZ4HC has compatible on-disk
+ layout with LZ4, if unsure, say Y.
+
config F2FS_FS_ZSTD
bool "ZSTD compression support"
depends on F2FS_FS_COMPRESSION
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index ee7316b42f69..e5295746208b 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -7,6 +7,5 @@ f2fs-y += shrinker.o extent_cache.o sysfs.o
f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
-f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
f2fs-$(CONFIG_FS_VERITY) += verity.o
f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index a19e86c9adac..965037a9c205 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -200,6 +200,27 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
return __f2fs_get_acl(inode, type, NULL);
}
+static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
+ struct posix_acl **acl)
+{
+ umode_t mode = inode->i_mode;
+ int error;
+
+ if (is_inode_flag_set(inode, FI_ACL_MODE))
+ mode = F2FS_I(inode)->i_acl_mode;
+
+ error = posix_acl_equiv_mode(*acl, &mode);
+ if (error < 0)
+ return error;
+ if (error == 0)
+ *acl = NULL;
+ if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
+ !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+ mode &= ~S_ISGID;
+ *mode_p = mode;
+ return 0;
+}
+
static int __f2fs_set_acl(struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
@@ -213,8 +234,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl && !ipage) {
- error = posix_acl_update_mode(&init_user_ns, inode,
- &mode, &acl);
+ error = f2fs_acl_update_mode(inode, &mode, &acl);
if (error)
return error;
set_acl_inode(inode, mode);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 897edb7c951a..174a0819ad96 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -13,13 +13,15 @@
#include <linux/f2fs_fs.h>
#include <linux/pagevec.h>
#include <linux/swap.h>
+#include <linux/kthread.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
-#include "trace.h"
#include <trace/events/f2fs.h>
+#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
static struct kmem_cache *ino_entry_slab;
struct kmem_cache *f2fs_inode_entry_slab;
@@ -443,7 +445,6 @@ static int f2fs_set_meta_page_dirty(struct page *page)
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
f2fs_set_page_private(page, 0);
- f2fs_trace_pid(page);
return 1;
}
return 0;
@@ -1017,7 +1018,6 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
spin_unlock(&sbi->inode_lock[type]);
f2fs_set_page_private(page, 0);
- f2fs_trace_pid(page);
}
void f2fs_remove_dirty_inode(struct inode *inode)
@@ -1707,3 +1707,174 @@ void f2fs_destroy_checkpoint_caches(void)
kmem_cache_destroy(ino_entry_slab);
kmem_cache_destroy(f2fs_inode_entry_slab);
}
+
+static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
+{
+ struct cp_control cpc = { .reason = CP_SYNC, };
+ int err;
+
+ down_write(&sbi->gc_lock);
+ err = f2fs_write_checkpoint(sbi, &cpc);
+ up_write(&sbi->gc_lock);
+
+ return err;
+}
+
+static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+ struct ckpt_req *req, *next;
+ struct llist_node *dispatch_list;
+ u64 sum_diff = 0, diff, count = 0;
+ int ret;
+
+ dispatch_list = llist_del_all(&cprc->issue_list);
+ if (!dispatch_list)
+ return;
+ dispatch_list = llist_reverse_order(dispatch_list);
+
+ ret = __write_checkpoint_sync(sbi);
+ atomic_inc(&cprc->issued_ckpt);
+
+ llist_for_each_entry_safe(req, next, dispatch_list, llnode) {
+ diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time);
+ req->ret = ret;
+ complete(&req->wait);
+
+ sum_diff += diff;
+ count++;
+ }
+ atomic_sub(count, &cprc->queued_ckpt);
+ atomic_add(count, &cprc->total_ckpt);
+
+ spin_lock(&cprc->stat_lock);
+ cprc->cur_time = (unsigned int)div64_u64(sum_diff, count);
+ if (cprc->peak_time < cprc->cur_time)
+ cprc->peak_time = cprc->cur_time;
+ spin_unlock(&cprc->stat_lock);
+}
+
+static int issue_checkpoint_thread(void *data)
+{
+ struct f2fs_sb_info *sbi = data;
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+ wait_queue_head_t *q = &cprc->ckpt_wait_queue;
+repeat:
+ if (kthread_should_stop())
+ return 0;
+
+ if (!llist_empty(&cprc->issue_list))
+ __checkpoint_and_complete_reqs(sbi);
+
+ wait_event_interruptible(*q,
+ kthread_should_stop() || !llist_empty(&cprc->issue_list));
+ goto repeat;
+}
+
+static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi,
+ struct ckpt_req *wait_req)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+ if (!llist_empty(&cprc->issue_list)) {
+ __checkpoint_and_complete_reqs(sbi);
+ } else {
+ /* already dispatched by issue_checkpoint_thread */
+ if (wait_req)
+ wait_for_completion(&wait_req->wait);
+ }
+}
+
+static void init_ckpt_req(struct ckpt_req *req)
+{
+ memset(req, 0, sizeof(struct ckpt_req));
+
+ init_completion(&req->wait);
+ req->queue_time = ktime_get();
+}
+
+int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+ struct ckpt_req req;
+ struct cp_control cpc;
+
+ cpc.reason = __get_cp_reason(sbi);
+ if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
+ int ret;
+
+ down_write(&sbi->gc_lock);
+ ret = f2fs_write_checkpoint(sbi, &cpc);
+ up_write(&sbi->gc_lock);
+
+ return ret;
+ }
+
+ if (!cprc->f2fs_issue_ckpt)
+ return __write_checkpoint_sync(sbi);
+
+ init_ckpt_req(&req);
+
+ llist_add(&req.llnode, &cprc->issue_list);
+ atomic_inc(&cprc->queued_ckpt);
+
+ /* update issue_list before we wake up issue_checkpoint thread */
+ smp_mb();
+
+ if (waitqueue_active(&cprc->ckpt_wait_queue))
+ wake_up(&cprc->ckpt_wait_queue);
+
+ if (cprc->f2fs_issue_ckpt)
+ wait_for_completion(&req.wait);
+ else
+ flush_remained_ckpt_reqs(sbi, &req);
+
+ return req.ret;
+}
+
+int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
+{
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+ if (cprc->f2fs_issue_ckpt)
+ return 0;
+
+ cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
+ "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
+ if (IS_ERR(cprc->f2fs_issue_ckpt)) {
+ cprc->f2fs_issue_ckpt = NULL;
+ return -ENOMEM;
+ }
+
+ set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);
+
+ return 0;
+}
+
+void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+ if (cprc->f2fs_issue_ckpt) {
+ struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt;
+
+ cprc->f2fs_issue_ckpt = NULL;
+ kthread_stop(ckpt_task);
+
+ flush_remained_ckpt_reqs(sbi, NULL);
+ }
+}
+
+void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
+{
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+ atomic_set(&cprc->issued_ckpt, 0);
+ atomic_set(&cprc->total_ckpt, 0);
+ atomic_set(&cprc->queued_ckpt, 0);
+ cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO;
+ init_waitqueue_head(&cprc->ckpt_wait_queue);
+ init_llist_head(&cprc->issue_list);
+ spin_lock_init(&cprc->stat_lock);
+}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 4bcbacfe3325..77fa342de38f 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -252,8 +252,14 @@ static const struct f2fs_compress_ops f2fs_lzo_ops = {
#ifdef CONFIG_F2FS_FS_LZ4
static int lz4_init_compress_ctx(struct compress_ctx *cc)
{
- cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
- LZ4_MEM_COMPRESS, GFP_NOFS);
+ unsigned int size = LZ4_MEM_COMPRESS;
+
+#ifdef CONFIG_F2FS_FS_LZ4HC
+ if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET)
+ size = LZ4HC_MEM_COMPRESS;
+#endif
+
+ cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS);
if (!cc->private)
return -ENOMEM;
@@ -272,10 +278,34 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc)
cc->private = NULL;
}
+#ifdef CONFIG_F2FS_FS_LZ4HC
+static int lz4hc_compress_pages(struct compress_ctx *cc)
+{
+ unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
+ COMPRESS_LEVEL_OFFSET;
+ int len;
+
+ if (level)
+ len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen,
+ cc->clen, level, cc->private);
+ else
+ len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen,
+ cc->clen, cc->private);
+ if (!len)
+ return -EAGAIN;
+
+ cc->clen = len;
+ return 0;
+}
+#endif
+
static int lz4_compress_pages(struct compress_ctx *cc)
{
int len;
+#ifdef CONFIG_F2FS_FS_LZ4HC
+ return lz4hc_compress_pages(cc);
+#endif
len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen,
cc->clen, cc->private);
if (!len)
@@ -325,8 +355,13 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
ZSTD_CStream *stream;
void *workspace;
unsigned int workspace_size;
+ unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
+ COMPRESS_LEVEL_OFFSET;
- params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0);
+ if (!level)
+ level = F2FS_ZSTD_DEFAULT_CLEVEL;
+
+ params = ZSTD_getParams(level, cc->rlen, 0);
workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams);
workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
@@ -721,38 +756,27 @@ out:
return ret;
}
-void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
+static void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
{
- struct decompress_io_ctx *dic =
- (struct decompress_io_ctx *)page_private(page);
struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
- struct f2fs_inode_info *fi= F2FS_I(dic->inode);
+ struct f2fs_inode_info *fi = F2FS_I(dic->inode);
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
int ret;
int i;
- dec_page_count(sbi, F2FS_RD_DATA);
-
- if (bio->bi_status || PageError(page))
- dic->failed = true;
-
- if (atomic_dec_return(&dic->pending_pages))
- return;
-
trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
dic->cluster_size, fi->i_compress_algorithm);
- /* submit partial compressed pages */
if (dic->failed) {
ret = -EIO;
- goto out_free_dic;
+ goto out_end_io;
}
dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
if (!dic->tpages) {
ret = -ENOMEM;
- goto out_free_dic;
+ goto out_end_io;
}
for (i = 0; i < dic->cluster_size; i++) {
@@ -764,20 +788,20 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
dic->tpages[i] = f2fs_compress_alloc_page();
if (!dic->tpages[i]) {
ret = -ENOMEM;
- goto out_free_dic;
+ goto out_end_io;
}
}
if (cops->init_decompress_ctx) {
ret = cops->init_decompress_ctx(dic);
if (ret)
- goto out_free_dic;
+ goto out_end_io;
}
dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
if (!dic->rbuf) {
ret = -ENOMEM;
- goto destroy_decompress_ctx;
+ goto out_destroy_decompress_ctx;
}
dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
@@ -816,18 +840,34 @@ out_vunmap_cbuf:
vm_unmap_ram(dic->cbuf, dic->nr_cpages);
out_vunmap_rbuf:
vm_unmap_ram(dic->rbuf, dic->cluster_size);
-destroy_decompress_ctx:
+out_destroy_decompress_ctx:
if (cops->destroy_decompress_ctx)
cops->destroy_decompress_ctx(dic);
-out_free_dic:
- if (!verity)
- f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
- ret, false);
-
+out_end_io:
trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
dic->clen, ret);
- if (!verity)
- f2fs_free_dic(dic);
+ f2fs_decompress_end_io(dic, ret);
+}
+
+/*
+ * This is called when a page of a compressed cluster has been read from disk
+ * (or failed to be read from disk). It checks whether this page was the last
+ * page being waited on in the cluster, and if so, it decompresses the cluster
+ * (or in the case of a failure, cleans up without actually decompressing).
+ */
+void f2fs_end_read_compressed_page(struct page *page, bool failed)
+{
+ struct decompress_io_ctx *dic =
+ (struct decompress_io_ctx *)page_private(page);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
+
+ dec_page_count(sbi, F2FS_RD_DATA);
+
+ if (failed)
+ WRITE_ONCE(dic->failed, true);
+
+ if (atomic_dec_and_test(&dic->remaining_pages))
+ f2fs_decompress_cluster(dic);
}
static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
@@ -1415,7 +1455,7 @@ retry_write:
ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
NULL, NULL, wbc, io_type,
- compr_blocks);
+ compr_blocks, false);
if (ret) {
if (ret == AOP_WRITEPAGE_ACTIVATE) {
unlock_page(cc->rpages[i]);
@@ -1450,6 +1490,9 @@ retry_write:
*submitted += _submitted;
}
+
+ f2fs_balance_fs(F2FS_M_SB(mapping), true);
+
return 0;
out_err:
for (++i; i < cc->cluster_size; i++) {
@@ -1494,6 +1537,8 @@ destroy_out:
return err;
}
+static void f2fs_free_dic(struct decompress_io_ctx *dic);
+
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
{
struct decompress_io_ctx *dic;
@@ -1512,12 +1557,14 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
dic->inode = cc->inode;
- atomic_set(&dic->pending_pages, cc->nr_cpages);
+ atomic_set(&dic->remaining_pages, cc->nr_cpages);
dic->cluster_idx = cc->cluster_idx;
dic->cluster_size = cc->cluster_size;
dic->log_cluster_size = cc->log_cluster_size;
dic->nr_cpages = cc->nr_cpages;
+ refcount_set(&dic->refcnt, 1);
dic->failed = false;
+ dic->need_verity = f2fs_need_verity(cc->inode, start_idx);
for (i = 0; i < dic->cluster_size; i++)
dic->rpages[i] = cc->rpages[i];
@@ -1546,7 +1593,7 @@ out_free:
return ERR_PTR(-ENOMEM);
}
-void f2fs_free_dic(struct decompress_io_ctx *dic)
+static void f2fs_free_dic(struct decompress_io_ctx *dic)
{
int i;
@@ -1574,30 +1621,88 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
kmem_cache_free(dic_entry_slab, dic);
}
-void f2fs_decompress_end_io(struct page **rpages,
- unsigned int cluster_size, bool err, bool verity)
+static void f2fs_put_dic(struct decompress_io_ctx *dic)
+{
+ if (refcount_dec_and_test(&dic->refcnt))
+ f2fs_free_dic(dic);
+}
+
+/*
+ * Update and unlock the cluster's pagecache pages, and release the reference to
+ * the decompress_io_ctx that was being held for I/O completion.
+ */
+static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
{
int i;
- for (i = 0; i < cluster_size; i++) {
- struct page *rpage = rpages[i];
+ for (i = 0; i < dic->cluster_size; i++) {
+ struct page *rpage = dic->rpages[i];
if (!rpage)
continue;
- if (err || PageError(rpage))
- goto clear_uptodate;
-
- if (!verity || fsverity_verify_page(rpage)) {
+ /* PG_error was set if verity failed. */
+ if (failed || PageError(rpage)) {
+ ClearPageUptodate(rpage);
+ /* will re-read again later */
+ ClearPageError(rpage);
+ } else {
SetPageUptodate(rpage);
- goto unlock;
}
-clear_uptodate:
- ClearPageUptodate(rpage);
- ClearPageError(rpage);
-unlock:
unlock_page(rpage);
}
+
+ f2fs_put_dic(dic);
+}
+
+static void f2fs_verify_cluster(struct work_struct *work)
+{
+ struct decompress_io_ctx *dic =
+ container_of(work, struct decompress_io_ctx, verity_work);
+ int i;
+
+ /* Verify the cluster's decompressed pages with fs-verity. */
+ for (i = 0; i < dic->cluster_size; i++) {
+ struct page *rpage = dic->rpages[i];
+
+ if (rpage && !fsverity_verify_page(rpage))
+ SetPageError(rpage);
+ }
+
+ __f2fs_decompress_end_io(dic, false);
+}
+
+/*
+ * This is called when a compressed cluster has been decompressed
+ * (or failed to be read and/or decompressed).
+ */
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
+{
+ if (!failed && dic->need_verity) {
+ /*
+ * Note that to avoid deadlocks, the verity work can't be done
+ * on the decompression workqueue. This is because verifying
+ * the data pages can involve reading metadata pages from the
+ * file, and these metadata pages may be compressed.
+ */
+ INIT_WORK(&dic->verity_work, f2fs_verify_cluster);
+ fsverity_enqueue_verify_work(&dic->verity_work);
+ } else {
+ __f2fs_decompress_end_io(dic, failed);
+ }
+}
+
+/*
+ * Put a reference to a compressed page's decompress_io_ctx.
+ *
+ * This is called when the page is no longer needed and can be freed.
+ */
+void f2fs_put_page_dic(struct page *page)
+{
+ struct decompress_io_ctx *dic =
+ (struct decompress_io_ctx *)page_private(page);
+
+ f2fs_put_dic(dic);
}
int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index aa34d620bec9..b9721c8f116c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -25,7 +25,6 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
-#include "trace.h"
#include <trace/events/f2fs.h>
#define NUM_PREALLOC_POST_READ_CTXS 128
@@ -50,27 +49,6 @@ void f2fs_destroy_bioset(void)
bioset_exit(&f2fs_bioset);
}
-static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask,
- unsigned int nr_iovecs)
-{
- return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset);
-}
-
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio)
-{
- if (noio) {
- /* No failure on bio allocation */
- return __f2fs_bio_alloc(GFP_NOIO, npages);
- }
-
- if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
- f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
- return NULL;
- }
-
- return __f2fs_bio_alloc(GFP_KERNEL, npages);
-}
-
static bool __is_cp_guaranteed(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -115,10 +93,21 @@ static enum count_type __read_io_type(struct page *page)
/* postprocessing steps for read bios */
enum bio_post_read_step {
- STEP_DECRYPT,
- STEP_DECOMPRESS_NOWQ, /* handle normal cluster data inplace */
- STEP_DECOMPRESS, /* handle compressed cluster data in workqueue */
- STEP_VERITY,
+#ifdef CONFIG_FS_ENCRYPTION
+ STEP_DECRYPT = 1 << 0,
+#else
+ STEP_DECRYPT = 0, /* compile out the decryption-related code */
+#endif
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ STEP_DECOMPRESS = 1 << 1,
+#else
+ STEP_DECOMPRESS = 0, /* compile out the decompression-related code */
+#endif
+#ifdef CONFIG_FS_VERITY
+ STEP_VERITY = 1 << 2,
+#else
+ STEP_VERITY = 0, /* compile out the verity-related code */
+#endif
};
struct bio_post_read_ctx {
@@ -128,25 +117,26 @@ struct bio_post_read_ctx {
unsigned int enabled_steps;
};
-static void __read_end_io(struct bio *bio, bool compr, bool verity)
+static void f2fs_finish_read_bio(struct bio *bio)
{
- struct page *page;
struct bio_vec *bv;
struct bvec_iter_all iter_all;
+ /*
+ * Update and unlock the bio's pagecache pages, and put the
+ * decompression context for any compressed pages.
+ */
bio_for_each_segment_all(bv, bio, iter_all) {
- page = bv->bv_page;
+ struct page *page = bv->bv_page;
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (compr && f2fs_is_compressed_page(page)) {
- f2fs_decompress_pages(bio, page, verity);
+ if (f2fs_is_compressed_page(page)) {
+ if (bio->bi_status)
+ f2fs_end_read_compressed_page(page, true);
+ f2fs_put_page_dic(page);
continue;
}
- if (verity)
- continue;
-#endif
- /* PG_error was set if any post_read step failed */
+ /* PG_error was set if decryption or verity failed. */
if (bio->bi_status || PageError(page)) {
ClearPageUptodate(page);
/* will re-read again later */
@@ -157,181 +147,141 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
dec_page_count(F2FS_P_SB(page), __read_io_type(page));
unlock_page(page);
}
-}
-
-static void f2fs_release_read_bio(struct bio *bio);
-static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity)
-{
- if (!compr)
- __read_end_io(bio, false, verity);
- f2fs_release_read_bio(bio);
-}
-
-static void f2fs_decompress_bio(struct bio *bio, bool verity)
-{
- __read_end_io(bio, true, verity);
-}
-
-static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
-
-static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx)
-{
- fscrypt_decrypt_bio(ctx->bio);
-}
-
-static void f2fs_decompress_work(struct bio_post_read_ctx *ctx)
-{
- f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY));
-}
-
-#ifdef CONFIG_F2FS_FS_COMPRESSION
-static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size)
-{
- f2fs_decompress_end_io(rpages, cluster_size, false, true);
-}
-
-static void f2fs_verify_bio(struct bio *bio)
-{
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, bio, iter_all) {
- struct page *page = bv->bv_page;
- struct decompress_io_ctx *dic;
-
- dic = (struct decompress_io_ctx *)page_private(page);
-
- if (dic) {
- if (atomic_dec_return(&dic->verity_pages))
- continue;
- f2fs_verify_pages(dic->rpages,
- dic->cluster_size);
- f2fs_free_dic(dic);
- continue;
- }
-
- if (bio->bi_status || PageError(page))
- goto clear_uptodate;
-
- if (fsverity_verify_page(page)) {
- SetPageUptodate(page);
- goto unlock;
- }
-clear_uptodate:
- ClearPageUptodate(page);
- ClearPageError(page);
-unlock:
- dec_page_count(F2FS_P_SB(page), __read_io_type(page));
- unlock_page(page);
- }
+ if (bio->bi_private)
+ mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+ bio_put(bio);
}
-#endif
-static void f2fs_verity_work(struct work_struct *work)
+static void f2fs_verify_bio(struct work_struct *work)
{
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
struct bio *bio = ctx->bio;
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- unsigned int enabled_steps = ctx->enabled_steps;
-#endif
+ bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS);
/*
* fsverity_verify_bio() may call readpages() again, and while verity
- * will be disabled for this, decryption may still be needed, resulting
- * in another bio_post_read_ctx being allocated. So to prevent
- * deadlocks we need to release the current ctx to the mempool first.
- * This assumes that verity is the last post-read step.
+ * will be disabled for this, decryption and/or decompression may still
+ * be needed, resulting in another bio_post_read_ctx being allocated.
+ * So to prevent deadlocks we need to release the current ctx to the
+ * mempool first. This assumes that verity is the last post-read step.
*/
mempool_free(ctx, bio_post_read_ctx_pool);
bio->bi_private = NULL;
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- /* previous step is decompression */
- if (enabled_steps & (1 << STEP_DECOMPRESS)) {
- f2fs_verify_bio(bio);
- f2fs_release_read_bio(bio);
- return;
+ /*
+ * Verify the bio's pages with fs-verity. Exclude compressed pages,
+ * as those were handled separately by f2fs_end_read_compressed_page().
+ */
+ if (may_have_compressed_pages) {
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bv, bio, iter_all) {
+ struct page *page = bv->bv_page;
+
+ if (!f2fs_is_compressed_page(page) &&
+ !PageError(page) && !fsverity_verify_page(page))
+ SetPageError(page);
+ }
+ } else {
+ fsverity_verify_bio(bio);
}
-#endif
- fsverity_verify_bio(bio);
- __f2fs_read_end_io(bio, false, false);
+ f2fs_finish_read_bio(bio);
}
-static void f2fs_post_read_work(struct work_struct *work)
+/*
+ * If the bio's data needs to be verified with fs-verity, then enqueue the
+ * verity work for the bio. Otherwise finish the bio now.
+ *
+ * Note that to avoid deadlocks, the verity work can't be done on the
+ * decryption/decompression workqueue. This is because verifying the data pages
+ * can involve reading verity metadata pages from the file, and these verity
+ * metadata pages may be encrypted and/or compressed.
+ */
+static void f2fs_verify_and_finish_bio(struct bio *bio)
{
- struct bio_post_read_ctx *ctx =
- container_of(work, struct bio_post_read_ctx, work);
+ struct bio_post_read_ctx *ctx = bio->bi_private;
- if (ctx->enabled_steps & (1 << STEP_DECRYPT))
- f2fs_decrypt_work(ctx);
-
- if (ctx->enabled_steps & (1 << STEP_DECOMPRESS))
- f2fs_decompress_work(ctx);
-
- if (ctx->enabled_steps & (1 << STEP_VERITY)) {
- INIT_WORK(&ctx->work, f2fs_verity_work);
+ if (ctx && (ctx->enabled_steps & STEP_VERITY)) {
+ INIT_WORK(&ctx->work, f2fs_verify_bio);
fsverity_enqueue_verify_work(&ctx->work);
- return;
+ } else {
+ f2fs_finish_read_bio(bio);
}
-
- __f2fs_read_end_io(ctx->bio,
- ctx->enabled_steps & (1 << STEP_DECOMPRESS), false);
}
-static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi,
- struct work_struct *work)
-{
- queue_work(sbi->post_read_wq, work);
-}
-
-static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
+/*
+ * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last
+ * remaining page was read by @ctx->bio.
+ *
+ * Note that a bio may span clusters (even a mix of compressed and uncompressed
+ * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates
+ * that the bio includes at least one compressed page. The actual decompression
+ * is done on a per-cluster basis, not a per-bio basis.
+ */
+static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
{
- /*
- * We use different work queues for decryption and for verity because
- * verity may require reading metadata pages that need decryption, and
- * we shouldn't recurse to the same workqueue.
- */
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
+ bool all_compressed = true;
- if (ctx->enabled_steps & (1 << STEP_DECRYPT) ||
- ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
- INIT_WORK(&ctx->work, f2fs_post_read_work);
- f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work);
- return;
- }
+ bio_for_each_segment_all(bv, ctx->bio, iter_all) {
+ struct page *page = bv->bv_page;
- if (ctx->enabled_steps & (1 << STEP_VERITY)) {
- INIT_WORK(&ctx->work, f2fs_verity_work);
- fsverity_enqueue_verify_work(&ctx->work);
- return;
+ /* PG_error was set if decryption failed. */
+ if (f2fs_is_compressed_page(page))
+ f2fs_end_read_compressed_page(page, PageError(page));
+ else
+ all_compressed = false;
}
- __f2fs_read_end_io(ctx->bio, false, false);
+ /*
+ * Optimization: if all the bio's pages are compressed, then scheduling
+ * the per-bio verity work is unnecessary, as verity will be fully
+ * handled at the compression cluster level.
+ */
+ if (all_compressed)
+ ctx->enabled_steps &= ~STEP_VERITY;
}
-static bool f2fs_bio_post_read_required(struct bio *bio)
+static void f2fs_post_read_work(struct work_struct *work)
{
- return bio->bi_private;
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+
+ if (ctx->enabled_steps & STEP_DECRYPT)
+ fscrypt_decrypt_bio(ctx->bio);
+
+ if (ctx->enabled_steps & STEP_DECOMPRESS)
+ f2fs_handle_step_decompress(ctx);
+
+ f2fs_verify_and_finish_bio(ctx->bio);
}
static void f2fs_read_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
+ struct bio_post_read_ctx *ctx = bio->bi_private;
if (time_to_inject(sbi, FAULT_READ_IO)) {
f2fs_show_injection_info(sbi, FAULT_READ_IO);
bio->bi_status = BLK_STS_IOERR;
}
- if (f2fs_bio_post_read_required(bio)) {
- struct bio_post_read_ctx *ctx = bio->bi_private;
-
- bio_post_read_processing(ctx);
+ if (bio->bi_status) {
+ f2fs_finish_read_bio(bio);
return;
}
- __f2fs_read_end_io(bio, false, false);
+ if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) {
+ INIT_WORK(&ctx->work, f2fs_post_read_work);
+ queue_work(ctx->sbi->post_read_wq, &ctx->work);
+ } else {
+ f2fs_verify_and_finish_bio(bio);
+ }
}
static void f2fs_write_end_io(struct bio *bio)
@@ -427,22 +377,12 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
return 0;
}
-/*
- * Return true, if pre_bio's bdev is same as its target device.
- */
-static bool __same_bdev(struct f2fs_sb_info *sbi,
- block_t blk_addr, struct bio *bio)
-{
- struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL);
- return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno;
-}
-
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
{
struct f2fs_sb_info *sbi = fio->sbi;
struct bio *bio;
- bio = f2fs_bio_alloc(sbi, npages, true);
+ bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset);
f2fs_target_device(sbi, fio->new_blkaddr, bio);
if (is_read_io(fio->op)) {
@@ -499,7 +439,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
if (f2fs_lfs_mode(sbi) && current->plug)
blk_finish_plug(current->plug);
- if (F2FS_IO_ALIGNED(sbi))
+ if (!F2FS_IO_ALIGNED(sbi))
goto submit_io;
start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS;
@@ -707,7 +647,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
return -EFSCORRUPTED;
trace_f2fs_submit_page_bio(page, fio);
- f2fs_trace_ios(fio, 0);
/* Allocate a new bio */
bio = __bio_alloc(fio, 1);
@@ -741,7 +680,7 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
return false;
if (last_blkaddr + 1 != cur_blkaddr)
return false;
- return __same_bdev(sbi, cur_blkaddr, bio);
+ return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL);
}
static bool io_type_is_mergeable(struct f2fs_bio_info *io,
@@ -912,7 +851,6 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
return -EFSCORRUPTED;
trace_f2fs_submit_page_bio(page, fio);
- f2fs_trace_ios(fio, 0);
if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block,
fio->new_blkaddr))
@@ -1009,7 +947,6 @@ alloc_new:
wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE);
io->last_block_in_bio = fio->new_blkaddr;
- f2fs_trace_ios(fio, 0);
trace_f2fs_submit_page_write(fio->page, fio);
skip:
@@ -1022,24 +959,18 @@ out:
up_write(&io->io_rwsem);
}
-static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
-{
- return fsverity_active(inode) &&
- idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
-}
-
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
unsigned nr_pages, unsigned op_flag,
- pgoff_t first_idx, bool for_write,
- bool for_verity)
+ pgoff_t first_idx, bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
struct bio_post_read_ctx *ctx;
unsigned int post_read_steps = 0;
- bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES),
- for_write);
+ bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
+ min_t(int, nr_pages, BIO_MAX_PAGES),
+ &f2fs_bioset);
if (!bio)
return ERR_PTR(-ENOMEM);
@@ -1050,13 +981,19 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
bio_set_op_attrs(bio, REQ_OP_READ, op_flag);
if (fscrypt_inode_uses_fs_layer_crypto(inode))
- post_read_steps |= 1 << STEP_DECRYPT;
- if (f2fs_compressed_file(inode))
- post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ;
- if (for_verity && f2fs_need_verity(inode, first_idx))
- post_read_steps |= 1 << STEP_VERITY;
+ post_read_steps |= STEP_DECRYPT;
+
+ if (f2fs_need_verity(inode, first_idx))
+ post_read_steps |= STEP_VERITY;
+
+ /*
+ * STEP_DECOMPRESS is handled specially, since a compressed file might
+ * contain both compressed and uncompressed clusters. We'll allocate a
+ * bio_post_read_ctx if the file is compressed, but the caller is
+ * responsible for enabling STEP_DECOMPRESS if it's actually needed.
+ */
- if (post_read_steps) {
+ if (post_read_steps || f2fs_compressed_file(inode)) {
/* Due to the mempool, this never fails. */
ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
ctx->bio = bio;
@@ -1068,13 +1005,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
return bio;
}
-static void f2fs_release_read_bio(struct bio *bio)
-{
- if (bio->bi_private)
- mempool_free(bio->bi_private, bio_post_read_ctx_pool);
- bio_put(bio);
-}
-
/* This can handle encryption stuffs */
static int f2fs_submit_page_read(struct inode *inode, struct page *page,
block_t blkaddr, int op_flags, bool for_write)
@@ -1083,7 +1013,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
struct bio *bio;
bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags,
- page->index, for_write, true);
+ page->index, for_write);
if (IS_ERR(bio))
return PTR_ERR(bio);
@@ -1964,6 +1894,7 @@ next:
}
if (size) {
+ flags |= FIEMAP_EXTENT_MERGED;
if (IS_ENCRYPTED(inode))
flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
@@ -2121,7 +2052,7 @@ submit_and_realloc:
if (bio == NULL) {
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
is_readahead ? REQ_RAHEAD : 0, page->index,
- false, true);
+ false);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
bio = NULL;
@@ -2167,8 +2098,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
sector_t last_block_in_file;
const unsigned blocksize = blks_to_bytes(inode, 1);
struct decompress_io_ctx *dic = NULL;
- struct bio_post_read_ctx *ctx;
- bool for_verity = false;
int i;
int ret = 0;
@@ -2234,29 +2163,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
goto out_put_dnode;
}
- /*
- * It's possible to enable fsverity on the fly when handling a cluster,
- * which requires complicated error handling. Instead of adding more
- * complexity, let's give a rule where end_io post-processes fsverity
- * per cluster. In order to do that, we need to submit bio, if previous
- * bio sets a different post-process policy.
- */
- if (fsverity_active(cc->inode)) {
- atomic_set(&dic->verity_pages, cc->nr_cpages);
- for_verity = true;
-
- if (bio) {
- ctx = bio->bi_private;
- if (!(ctx->enabled_steps & (1 << STEP_VERITY))) {
- __submit_bio(sbi, bio, DATA);
- bio = NULL;
- }
- }
- }
-
for (i = 0; i < dic->nr_cpages; i++) {
struct page *page = dic->cpages[i];
block_t blkaddr;
+ struct bio_post_read_ctx *ctx;
blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1);
@@ -2272,31 +2182,10 @@ submit_and_realloc:
if (!bio) {
bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
is_readahead ? REQ_RAHEAD : 0,
- page->index, for_write, for_verity);
+ page->index, for_write);
if (IS_ERR(bio)) {
- unsigned int remained = dic->nr_cpages - i;
- bool release = false;
-
ret = PTR_ERR(bio);
- dic->failed = true;
-
- if (for_verity) {
- if (!atomic_sub_return(remained,
- &dic->verity_pages))
- release = true;
- } else {
- if (!atomic_sub_return(remained,
- &dic->pending_pages))
- release = true;
- }
-
- if (release) {
- f2fs_decompress_end_io(dic->rpages,
- cc->cluster_size, true,
- false);
- f2fs_free_dic(dic);
- }
-
+ f2fs_decompress_end_io(dic, ret);
f2fs_put_dnode(&dn);
*bio_ret = NULL;
return ret;
@@ -2308,10 +2197,9 @@ submit_and_realloc:
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
goto submit_and_realloc;
- /* tag STEP_DECOMPRESS to handle IO in wq */
ctx = bio->bi_private;
- if (!(ctx->enabled_steps & (1 << STEP_DECOMPRESS)))
- ctx->enabled_steps |= 1 << STEP_DECOMPRESS;
+ ctx->enabled_steps |= STEP_DECOMPRESS;
+ refcount_inc(&dic->refcnt);
inc_page_count(sbi, F2FS_RD_DATA);
f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
@@ -2328,7 +2216,13 @@ submit_and_realloc:
out_put_dnode:
f2fs_put_dnode(&dn);
out:
- f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false);
+ for (i = 0; i < cc->cluster_size; i++) {
+ if (cc->rpages[i]) {
+ ClearPageUptodate(cc->rpages[i]);
+ ClearPageError(cc->rpages[i]);
+ unlock_page(cc->rpages[i]);
+ }
+ }
*bio_ret = bio;
return ret;
}
@@ -2337,11 +2231,6 @@ out:
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
- *
- * Note that the aops->readpages() function is ONLY used for read-ahead. If
- * this function ever deviates from doing just read-ahead, it should either
- * use ->readpage() or do the necessary surgery to decouple ->readpages()
- * from read-ahead.
*/
static int f2fs_mpage_readpages(struct inode *inode,
struct readahead_control *rac, struct page *page)
@@ -2364,7 +2253,6 @@ static int f2fs_mpage_readpages(struct inode *inode,
unsigned nr_pages = rac ? readahead_count(rac) : 1;
unsigned max_nr_pages = nr_pages;
int ret = 0;
- bool drop_ra = false;
map.m_pblk = 0;
map.m_lblk = 0;
@@ -2375,26 +2263,10 @@ static int f2fs_mpage_readpages(struct inode *inode,
map.m_seg_type = NO_CHECK_TYPE;
map.m_may_create = false;
- /*
- * Two readahead threads for same address range can cause race condition
- * which fragments sequential read IOs. So let's avoid each other.
- */
- if (rac && readahead_count(rac)) {
- if (READ_ONCE(F2FS_I(inode)->ra_offset) == readahead_index(rac))
- drop_ra = true;
- else
- WRITE_ONCE(F2FS_I(inode)->ra_offset,
- readahead_index(rac));
- }
-
for (; nr_pages; nr_pages--) {
if (rac) {
page = readahead_page(rac);
prefetchw(&page->flags);
- if (drop_ra) {
- f2fs_put_page(page, 1);
- continue;
- }
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -2457,9 +2329,6 @@ next_page:
}
if (bio)
__submit_bio(F2FS_I_SB(inode), bio, DATA);
-
- if (rac && readahead_count(rac) && !drop_ra)
- WRITE_ONCE(F2FS_I(inode)->ra_offset, -1);
return ret;
}
@@ -2743,7 +2612,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
sector_t *last_block,
struct writeback_control *wbc,
enum iostat_type io_type,
- int compr_blocks)
+ int compr_blocks,
+ bool allow_balance)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -2881,7 +2751,7 @@ out:
}
unlock_page(page);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
- !F2FS_I(inode)->cp_task)
+ !F2FS_I(inode)->cp_task && allow_balance)
f2fs_balance_fs(sbi, need_balance_fs);
if (unlikely(f2fs_cp_error(sbi))) {
@@ -2928,7 +2798,7 @@ out:
#endif
return f2fs_write_single_data_page(page, NULL, NULL, NULL,
- wbc, FS_DATA_IO, 0);
+ wbc, FS_DATA_IO, 0, true);
}
/*
@@ -3096,7 +2966,8 @@ continue_unlock:
}
#endif
ret = f2fs_write_single_data_page(page, &submitted,
- &bio, &last_block, wbc, io_type, 0);
+ &bio, &last_block, wbc, io_type,
+ 0, true);
if (ret == AOP_WRITEPAGE_ACTIVATE)
unlock_page(page);
#ifdef CONFIG_F2FS_FS_COMPRESSION
@@ -3831,7 +3702,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
filemap_write_and_wait(mapping);
/* Block number less than F2FS MAX BLOCKS */
- if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks))
+ if (unlikely(block >= max_file_blocks(inode)))
goto out;
if (f2fs_compressed_file(inode)) {
@@ -4108,12 +3979,13 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
+ f2fs_precache_extents(inode);
+
ret = check_swap_activate(sis, file, span);
if (ret < 0)
return ret;
set_inode_flag(inode, FI_PIN_FILE);
- f2fs_precache_extents(inode);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return ret;
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 197c914119da..91855d5721cd 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi)
atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
}
+ si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt);
+ si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt);
+ si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt);
+ spin_lock(&sbi->cprc_info.stat_lock);
+ si->cur_ckpt_time = sbi->cprc_info.cur_time;
+ si->peak_ckpt_time = sbi->cprc_info.peak_time;
+ spin_unlock(&sbi->cprc_info.stat_lock);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@@ -417,6 +424,11 @@ static int stat_show(struct seq_file *s, void *v)
si->meta_count[META_NAT]);
seq_printf(s, " - ssa blocks : %u\n",
si->meta_count[META_SSA]);
+ seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, "
+ "Cur time: %4u(ms), Peak time: %4u(ms))\n",
+ si->nr_queued_ckpt, si->nr_issued_ckpt,
+ si->nr_total_ckpt, si->cur_ckpt_time,
+ si->peak_ckpt_time);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c9002b1933f0..e2d302ae3a46 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -43,7 +43,6 @@ enum {
FAULT_KVMALLOC,
FAULT_PAGE_ALLOC,
FAULT_PAGE_GET,
- FAULT_ALLOC_BIO,
FAULT_ALLOC_NID,
FAULT_ORPHAN,
FAULT_BLOCK,
@@ -97,6 +96,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
#define F2FS_MOUNT_NORECOVERY 0x04000000
#define F2FS_MOUNT_ATGC 0x08000000
+#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -146,6 +146,7 @@ struct f2fs_mount_info {
/* For compression */
unsigned char compress_algorithm; /* algorithm type */
unsigned char compress_log_size; /* cluster log size */
+ unsigned char compress_level; /* compress level */
bool compress_chksum; /* compressed data chksum */
unsigned char compress_ext_cnt; /* extension count */
int compress_mode; /* compression mode */
@@ -266,6 +267,26 @@ struct fsync_node_entry {
unsigned int seq_id; /* sequence id */
};
+struct ckpt_req {
+ struct completion wait; /* completion for checkpoint done */
+ struct llist_node llnode; /* llist_node to be linked in wait queue */
+ int ret; /* return code of checkpoint */
+ ktime_t queue_time; /* request queued time */
+};
+
+struct ckpt_req_control {
+ struct task_struct *f2fs_issue_ckpt; /* checkpoint task */
+ int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */
+ wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */
+ atomic_t issued_ckpt; /* # of actually issued ckpts */
+ atomic_t total_ckpt; /* # of total ckpts */
+ atomic_t queued_ckpt; /* # of queued ckpts */
+ struct llist_head issue_list; /* list for command issue */
+ spinlock_t stat_lock; /* lock for below checkpoint time stats */
+ unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */
+ unsigned int peak_time; /* peak wait time in msec until now */
+};
+
/* for the bitmap indicate blocks to be discarded */
struct discard_entry {
struct list_head list; /* list head */
@@ -717,7 +738,6 @@ struct f2fs_inode_info {
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct task_struct *inmem_task; /* store inmemory task */
struct mutex inmem_lock; /* lock for inmemory pages */
- pgoff_t ra_offset; /* ongoing readahead offset */
struct extent_tree *extent_tree; /* cached extent_tree entry */
/* avoid racing between foreground op and gc */
@@ -735,6 +755,7 @@ struct f2fs_inode_info {
atomic_t i_compr_blocks; /* # of compressed blocks */
unsigned char i_compress_algorithm; /* algorithm type */
unsigned char i_log_cluster_size; /* log of cluster size */
+ unsigned char i_compress_level; /* compress level (lz4hc,zstd) */
unsigned short i_compress_flag; /* compress flag */
unsigned int i_cluster_size; /* cluster size */
};
@@ -1310,6 +1331,8 @@ struct compress_data {
#define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000
+#define COMPRESS_LEVEL_OFFSET 8
+
/* compress context */
struct compress_ctx {
struct inode *inode; /* inode the context belong to */
@@ -1337,7 +1360,7 @@ struct compress_io_ctx {
atomic_t pending_pages; /* in-flight compressed page count */
};
-/* decompress io context for read IO path */
+/* Context for decompressing one cluster on the read IO path */
struct decompress_io_ctx {
u32 magic; /* magic number to indicate page is compressed */
struct inode *inode; /* inode the context belong to */
@@ -1353,11 +1376,37 @@ struct decompress_io_ctx {
struct compress_data *cbuf; /* virtual mapped address on cpages */
size_t rlen; /* valid data length in rbuf */
size_t clen; /* valid data length in cbuf */
- atomic_t pending_pages; /* in-flight compressed page count */
- atomic_t verity_pages; /* in-flight page count for verity */
- bool failed; /* indicate IO error during decompression */
+
+ /*
+ * The number of compressed pages remaining to be read in this cluster.
+ * This is initially nr_cpages. It is decremented by 1 each time a page
+ * has been read (or failed to be read). When it reaches 0, the cluster
+ * is decompressed (or an error is reported).
+ *
+ * If an error occurs before all the pages have been submitted for I/O,
+ * then this will never reach 0. In this case the I/O submitter is
+ * responsible for calling f2fs_decompress_end_io() instead.
+ */
+ atomic_t remaining_pages;
+
+ /*
+ * Number of references to this decompress_io_ctx.
+ *
+ * One reference is held for I/O completion. This reference is dropped
+ * after the pagecache pages are updated and unlocked -- either after
+ * decompression (and verity if enabled), or after an error.
+ *
+ * In addition, each compressed page holds a reference while it is in a
+ * bio. These references are necessary to prevent compressed pages
+ * from being freed while they are still in a bio.
+ */
+ refcount_t refcnt;
+
+ bool failed; /* IO error occurred before decompression? */
+ bool need_verity; /* need fs-verity verification after decompression? */
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
+ struct work_struct verity_work; /* work to verify the decompressed pages */
};
#define NULL_CLUSTER ((unsigned int)(~0))
@@ -1404,6 +1453,7 @@ struct f2fs_sb_info {
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
+ struct ckpt_req_control cprc_info; /* for checkpoint request control */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
@@ -1444,7 +1494,6 @@ struct f2fs_sb_info {
unsigned int total_sections; /* total section count */
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
- loff_t max_file_blocks; /* max block index of file */
int dir_level; /* directory level */
int readdir_ra; /* readahead inode in readdir */
u64 max_io_bytes; /* max io bytes to merge IOs */
@@ -1541,9 +1590,12 @@ struct f2fs_sb_info {
unsigned int node_io_flag;
/* For sysfs suppport */
- struct kobject s_kobj;
+ struct kobject s_kobj; /* /sys/fs/f2fs/<devname> */
struct completion s_kobj_unregister;
+ struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */
+ struct completion s_stat_kobj_unregister;
+
/* For shrinker support */
struct list_head s_list;
int s_ndevs; /* number of devices */
@@ -3233,6 +3285,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync);
void f2fs_inode_synced(struct inode *inode);
int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
int f2fs_quota_sync(struct super_block *sb, int type);
+loff_t max_file_blocks(struct inode *inode);
void f2fs_quota_off_umount(struct super_block *sb);
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
int f2fs_sync_fs(struct super_block *sb, int sync);
@@ -3419,13 +3472,16 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_checkpoint_caches(void);
void f2fs_destroy_checkpoint_caches(void);
+int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi);
+int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi);
+void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi);
+void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
/*
* data.c
*/
int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_bio(struct f2fs_sb_info *sbi,
@@ -3470,7 +3526,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
struct bio **bio, sector_t *last_block,
struct writeback_control *wbc,
enum iostat_type io_type,
- int compr_blocks);
+ int compr_blocks, bool allow_balance);
void f2fs_invalidate_page(struct page *page, unsigned int offset,
unsigned int length);
int f2fs_release_page(struct page *page, gfp_t wait);
@@ -3531,6 +3587,8 @@ struct f2fs_stat_info {
int nr_discarding, nr_discarded;
int nr_discard_cmd;
unsigned int undiscard_blks;
+ int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt;
+ unsigned int cur_ckpt_time, peak_ckpt_time;
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
int compr_inode;
unsigned long long compr_blocks;
@@ -3716,8 +3774,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#define stat_dec_compr_inode(inode) do { } while (0)
#define stat_add_compr_blocks(inode, blocks) do { } while (0)
#define stat_sub_compr_blocks(inode, blocks) do { } while (0)
-#define stat_inc_atomic_write(inode) do { } while (0)
-#define stat_dec_atomic_write(inode) do { } while (0)
#define stat_update_max_atomic_write(inode) do { } while (0)
#define stat_inc_volatile_write(inode) do { } while (0)
#define stat_dec_volatile_write(inode) do { } while (0)
@@ -3877,7 +3933,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
bool f2fs_is_compress_backend_ready(struct inode *inode);
int f2fs_init_compress_mempool(void);
void f2fs_destroy_compress_mempool(void);
-void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity);
+void f2fs_end_read_compressed_page(struct page *page, bool failed);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
@@ -3890,9 +3946,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
-void f2fs_free_dic(struct decompress_io_ctx *dic);
-void f2fs_decompress_end_io(struct page **rpages,
- unsigned int cluster_size, bool err, bool verity);
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
+void f2fs_put_page_dic(struct page *page);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
@@ -3916,6 +3971,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
}
static inline int f2fs_init_compress_mempool(void) { return 0; }
static inline void f2fs_destroy_compress_mempool(void) { }
+static inline void f2fs_end_read_compressed_page(struct page *page, bool failed)
+{
+ WARN_ON_ONCE(1);
+}
+static inline void f2fs_put_page_dic(struct page *page)
+{
+ WARN_ON_ONCE(1);
+}
static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { }
static inline int __init f2fs_init_compress_cache(void) { return 0; }
@@ -3935,6 +3998,11 @@ static inline void set_compress_context(struct inode *inode)
1 << COMPRESS_CHKSUM : 0;
F2FS_I(inode)->i_cluster_size =
1 << F2FS_I(inode)->i_log_cluster_size;
+ if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 &&
+ F2FS_OPTION(sbi).compress_level)
+ F2FS_I(inode)->i_compress_flag |=
+ F2FS_OPTION(sbi).compress_level <<
+ COMPRESS_LEVEL_OFFSET;
F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
set_inode_flag(inode, FI_COMPRESSED_FILE);
stat_inc_compr_inode(inode);
@@ -4115,6 +4183,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
return false;
}
+static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
+{
+ return fsverity_active(inode) &&
+ idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
#ifdef CONFIG_F2FS_FAULT_INJECTION
extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
unsigned int type);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8f1e97e7d242..d26ff2ae3f5e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -29,7 +29,6 @@
#include "xattr.h"
#include "acl.h"
#include "gc.h"
-#include "trace.h"
#include <trace/events/f2fs.h>
#include <uapi/linux/f2fs.h>
@@ -60,6 +59,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
bool need_alloc = true;
int err = 0;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return VM_FAULT_SIGBUS;
+
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
goto err;
@@ -70,6 +72,10 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
goto err;
}
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ goto err;
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
int ret = f2fs_is_compressed_cluster(inode, page->index);
@@ -366,7 +372,6 @@ flush_out:
f2fs_update_time(sbi, REQ_TIME);
out:
trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
- f2fs_trace_ios(NULL, 1);
return ret;
}
@@ -483,6 +488,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file->f_mapping->host;
loff_t maxbytes = inode->i_sb->s_maxbytes;
+ if (f2fs_compressed_file(inode))
+ maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS;
+
switch (whence) {
case SEEK_SET:
case SEEK_CUR:
@@ -502,7 +510,6 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
- int err;
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
@@ -510,11 +517,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- /* we don't need to use inline_data strictly */
- err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
-
file_accessed(file);
vma->vm_ops = &f2fs_file_vm_ops;
set_inode_flag(inode, FI_MMAP_FILE);
@@ -667,7 +669,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
free_from = (pgoff_t)F2FS_BLK_ALIGN(from);
- if (free_from >= sbi->max_file_blocks)
+ if (free_from >= max_file_blocks(inode))
goto free_partial;
if (lock)
@@ -767,6 +769,10 @@ int f2fs_truncate(struct inode *inode)
return -EIO;
}
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
/* we should check inline_data size */
if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
@@ -850,7 +856,7 @@ static void __setattr_copy(struct user_namespace *mnt_userns,
umode_t mode = attr->ia_mode;
kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (!in_group_p(kgid) && !capable(CAP_FSETID))
+ if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
set_acl_inode(inode, mode);
}
@@ -868,6 +874,14 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
+ if (unlikely(IS_APPEND(inode) &&
+ (attr->ia_valid & (ATTR_MODE | ATTR_UID |
+ ATTR_GID | ATTR_TIMES_SET))))
+ return -EPERM;
+
if ((attr->ia_valid & ATTR_SIZE) &&
!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
@@ -951,10 +965,11 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
__setattr_copy(&init_user_ns, inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(&init_user_ns, inode,
- f2fs_get_inode_mode(inode));
- if (err || is_inode_flag_set(inode, FI_ACL_MODE)) {
- inode->i_mode = F2FS_I(inode)->i_acl_mode;
+ err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode));
+
+ if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+ if (!err)
+ inode->i_mode = F2FS_I(inode)->i_acl_mode;
clear_inode_flag(inode, FI_ACL_MODE);
}
}
@@ -2734,7 +2749,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
return -EINVAL;
if (unlikely((range.start + range.len) >> PAGE_SHIFT >
- sbi->max_file_blocks))
+ max_file_blocks(inode)))
return -EINVAL;
err = mnt_want_write_file(filp);
@@ -3297,7 +3312,7 @@ int f2fs_precache_extents(struct inode *inode)
map.m_next_extent = &m_next_extent;
map.m_seg_type = NO_CHECK_TYPE;
map.m_may_create = false;
- end = F2FS_I_SB(inode)->max_file_blocks;
+ end = max_file_blocks(inode);
while (map.m_lblk < end) {
map.m_len = end - map.m_lblk;
@@ -3361,6 +3376,14 @@ static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg)
return fsverity_ioctl_measure(filp, (void __user *)arg);
}
+static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg)
+{
+ if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp))))
+ return -EOPNOTSUPP;
+
+ return fsverity_ioctl_read_metadata(filp, (const void __user *)arg);
+}
+
static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -4047,8 +4070,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
for (i = 0; i < page_len; i++, redirty_idx++) {
page = find_lock_page(mapping, redirty_idx);
- if (!page)
- ret = -ENOENT;
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
set_page_dirty(page);
f2fs_put_page(page, 1);
f2fs_put_page(page, 0);
@@ -4276,6 +4301,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_enable_verity(filp, arg);
case FS_IOC_MEASURE_VERITY:
return f2fs_ioc_measure_verity(filp, arg);
+ case FS_IOC_READ_VERITY_METADATA:
+ return f2fs_ioc_read_verity_metadata(filp, arg);
case FS_IOC_GETFSLABEL:
return f2fs_ioc_getfslabel(filp, arg);
case FS_IOC_SETFSLABEL:
@@ -4353,6 +4380,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
inode_lock(inode);
}
+ if (unlikely(IS_IMMUTABLE(inode))) {
+ ret = -EPERM;
+ goto unlock;
+ }
+
ret = generic_write_checks(iocb, from);
if (ret > 0) {
bool preallocated = false;
@@ -4417,6 +4449,7 @@ write:
if (ret > 0)
f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
}
+unlock:
inode_unlock(inode);
out:
trace_f2fs_file_write_iter(inode, iocb->ki_pos,
@@ -4527,6 +4560,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_RESIZE_FS:
case FS_IOC_ENABLE_VERITY:
case FS_IOC_MEASURE_VERITY:
+ case FS_IOC_READ_VERITY_METADATA:
case FS_IOC_GETFSLABEL:
case FS_IOC_SETFSLABEL:
case F2FS_IOC_GET_COMPRESS_BLOCKS:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 3ef84e6ded41..39330ad3c44e 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1169,8 +1169,6 @@ static int move_data_block(struct inode *inode, block_t bidx,
if (err)
goto put_out;
- set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
-
/* read page */
fio.page = page;
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
@@ -1207,6 +1205,9 @@ static int move_data_block(struct inode *inode, block_t bidx,
}
}
+ set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+
+ /* allocate block address */
f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
&sum, type, NULL);
@@ -1233,9 +1234,6 @@ static int move_data_block(struct inode *inode, block_t bidx,
set_page_writeback(fio.encrypted_page);
ClearPageError(page);
- /* allocate block address */
- f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
-
fio.op = REQ_OP_WRITE;
fio.op_flags = REQ_SYNC;
fio.new_blkaddr = newaddr;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 806ebabf5870..993caefcd2bb 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -192,6 +192,10 @@ int f2fs_convert_inline_inode(struct inode *inode)
f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb))
return 0;
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
page = f2fs_grab_cache_page(inode->i_mapping, 0, false);
if (!page)
return -ENOMEM;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index c061a67e43a3..17bd072a5d39 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -856,7 +856,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
if (whiteout) {
f2fs_i_links_write(inode, false);
+
+ spin_lock(&inode->i_lock);
inode->i_state |= I_LINKABLE;
+ spin_unlock(&inode->i_lock);
+
*whiteout = inode;
} else {
d_tmpfile(dentry, inode);
@@ -1043,7 +1047,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
err = f2fs_add_link(old_dentry, whiteout);
if (err)
goto put_out_dir;
+
+ spin_lock(&whiteout->i_lock);
whiteout->i_state &= ~I_LINKABLE;
+ spin_unlock(&whiteout->i_lock);
+
iput(whiteout);
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3a24423ac65f..a8a0fb890e8d 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -17,7 +17,6 @@
#include "node.h"
#include "segment.h"
#include "xattr.h"
-#include "trace.h"
#include <trace/events/f2fs.h>
#define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock)
@@ -2089,7 +2088,6 @@ static int f2fs_set_node_page_dirty(struct page *page)
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
f2fs_set_page_private(page, 0);
- f2fs_trace_pid(page);
return 1;
}
return 0;
@@ -2696,7 +2694,7 @@ retry:
src = F2FS_INODE(page);
dst = F2FS_INODE(ipage);
- memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
+ memcpy(dst, src, offsetof(struct f2fs_inode, i_ext));
dst->i_size = 0;
dst->i_blocks = cpu_to_le64(1);
dst->i_links = cpu_to_le32(1);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index deca74cb17df..993004f06a77 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -20,7 +20,6 @@
#include "segment.h"
#include "node.h"
#include "gc.h"
-#include "trace.h"
#include <trace/events/f2fs.h>
#define __reverse_ffz(x) __reverse_ffs(~(x))
@@ -187,8 +186,6 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
{
struct inmem_pages *new;
- f2fs_trace_pid(page);
-
f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE);
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
@@ -566,17 +563,7 @@ do_sync:
static int __submit_flush_wait(struct f2fs_sb_info *sbi,
struct block_device *bdev)
{
- struct bio *bio;
- int ret;
-
- bio = f2fs_bio_alloc(sbi, 0, false);
- if (!bio)
- return -ENOMEM;
-
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
- bio_set_dev(bio, bdev);
- ret = submit_bio_wait(bio);
- bio_put(bio);
+ int ret = blkdev_issue_flush(bdev);
trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
test_opt(sbi, FLUSH_MERGE), ret);
@@ -610,8 +597,6 @@ repeat:
if (kthread_should_stop())
return 0;
- sb_start_intwrite(sbi->sb);
-
if (!llist_empty(&fcc->issue_list)) {
struct flush_cmd *cmd, *next;
int ret;
@@ -632,8 +617,6 @@ repeat:
fcc->dispatch_list = NULL;
}
- sb_end_intwrite(sbi->sb);
-
wait_event_interruptible(*q,
kthread_should_stop() || !llist_empty(&fcc->issue_list));
goto repeat;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index e81eb0748e2a..229814b4f4a6 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -101,11 +101,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
#define BLKS_PER_SEC(sbi) \
((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
#define GET_SEC_FROM_SEG(sbi, segno) \
- ((segno) / (sbi)->segs_per_sec)
+ (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec)
#define GET_SEG_FROM_SEC(sbi, secno) \
((secno) * (sbi)->segs_per_sec)
#define GET_ZONE_FROM_SEC(sbi, secno) \
- ((secno) / (sbi)->secs_per_zone)
+ (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone)
#define GET_ZONE_FROM_SEG(sbi, segno) \
GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index b4a07fe62d1a..7069793752f1 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -25,13 +25,14 @@
#include <linux/quota.h>
#include <linux/unicode.h>
#include <linux/part_stat.h>
+#include <linux/zstd.h>
+#include <linux/lz4.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "xattr.h"
#include "gc.h"
-#include "trace.h"
#define CREATE_TRACE_POINTS
#include <trace/events/f2fs.h>
@@ -45,7 +46,6 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_KVMALLOC] = "kvmalloc",
[FAULT_PAGE_ALLOC] = "page alloc",
[FAULT_PAGE_GET] = "page get",
- [FAULT_ALLOC_BIO] = "alloc bio",
[FAULT_ALLOC_NID] = "alloc nid",
[FAULT_ORPHAN] = "orphan",
[FAULT_BLOCK] = "no more block",
@@ -143,6 +143,8 @@ enum {
Opt_checkpoint_disable_cap,
Opt_checkpoint_disable_cap_perc,
Opt_checkpoint_enable,
+ Opt_checkpoint_merge,
+ Opt_nocheckpoint_merge,
Opt_compress_algorithm,
Opt_compress_log_size,
Opt_compress_extension,
@@ -213,6 +215,8 @@ static match_table_t f2fs_tokens = {
{Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
{Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"},
{Opt_checkpoint_enable, "checkpoint=enable"},
+ {Opt_checkpoint_merge, "checkpoint_merge"},
+ {Opt_nocheckpoint_merge, "nocheckpoint_merge"},
{Opt_compress_algorithm, "compress_algorithm=%s"},
{Opt_compress_log_size, "compress_log_size=%u"},
{Opt_compress_extension, "compress_extension=%s"},
@@ -464,6 +468,74 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
return 0;
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+#ifdef CONFIG_F2FS_FS_LZ4
+static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
+{
+#ifdef CONFIG_F2FS_FS_LZ4HC
+ unsigned int level;
+#endif
+
+ if (strlen(str) == 3) {
+ F2FS_OPTION(sbi).compress_level = 0;
+ return 0;
+ }
+
+#ifdef CONFIG_F2FS_FS_LZ4HC
+ str += 3;
+
+ if (str[0] != ':') {
+ f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
+ return -EINVAL;
+ }
+ if (kstrtouint(str + 1, 10, &level))
+ return -EINVAL;
+
+ if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) {
+ f2fs_info(sbi, "invalid lz4hc compress level: %d", level);
+ return -EINVAL;
+ }
+
+ F2FS_OPTION(sbi).compress_level = level;
+ return 0;
+#else
+ f2fs_info(sbi, "kernel doesn't support lz4hc compression");
+ return -EINVAL;
+#endif
+}
+#endif
+
+#ifdef CONFIG_F2FS_FS_ZSTD
+static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
+{
+ unsigned int level;
+ int len = 4;
+
+ if (strlen(str) == len) {
+ F2FS_OPTION(sbi).compress_level = 0;
+ return 0;
+ }
+
+ str += len;
+
+ if (str[0] != ':') {
+ f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
+ return -EINVAL;
+ }
+ if (kstrtouint(str + 1, 10, &level))
+ return -EINVAL;
+
+ if (!level || level > ZSTD_maxCLevel()) {
+ f2fs_info(sbi, "invalid zstd compress level: %d", level);
+ return -EINVAL;
+ }
+
+ F2FS_OPTION(sbi).compress_level = level;
+ return 0;
+}
+#endif
+#endif
+
static int parse_options(struct super_block *sb, char *options, bool is_remount)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -872,6 +944,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_checkpoint_enable:
clear_opt(sbi, DISABLE_CHECKPOINT);
break;
+ case Opt_checkpoint_merge:
+ set_opt(sbi, MERGE_CHECKPOINT);
+ break;
+ case Opt_nocheckpoint_merge:
+ clear_opt(sbi, MERGE_CHECKPOINT);
+ break;
#ifdef CONFIG_F2FS_FS_COMPRESSION
case Opt_compress_algorithm:
if (!f2fs_sb_has_compression(sbi)) {
@@ -882,17 +960,45 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
if (!name)
return -ENOMEM;
if (!strcmp(name, "lzo")) {
+#ifdef CONFIG_F2FS_FS_LZO
+ F2FS_OPTION(sbi).compress_level = 0;
F2FS_OPTION(sbi).compress_algorithm =
COMPRESS_LZO;
- } else if (!strcmp(name, "lz4")) {
+#else
+ f2fs_info(sbi, "kernel doesn't support lzo compression");
+#endif
+ } else if (!strncmp(name, "lz4", 3)) {
+#ifdef CONFIG_F2FS_FS_LZ4
+ ret = f2fs_set_lz4hc_level(sbi, name);
+ if (ret) {
+ kfree(name);
+ return -EINVAL;
+ }
F2FS_OPTION(sbi).compress_algorithm =
COMPRESS_LZ4;
- } else if (!strcmp(name, "zstd")) {
+#else
+ f2fs_info(sbi, "kernel doesn't support lz4 compression");
+#endif
+ } else if (!strncmp(name, "zstd", 4)) {
+#ifdef CONFIG_F2FS_FS_ZSTD
+ ret = f2fs_set_zstd_level(sbi, name);
+ if (ret) {
+ kfree(name);
+ return -EINVAL;
+ }
F2FS_OPTION(sbi).compress_algorithm =
COMPRESS_ZSTD;
+#else
+ f2fs_info(sbi, "kernel doesn't support zstd compression");
+#endif
} else if (!strcmp(name, "lzo-rle")) {
+#ifdef CONFIG_F2FS_FS_LZORLE
+ F2FS_OPTION(sbi).compress_level = 0;
F2FS_OPTION(sbi).compress_algorithm =
COMPRESS_LZORLE;
+#else
+ f2fs_info(sbi, "kernel doesn't support lzorle compression");
+#endif
} else {
kfree(name);
return -EINVAL;
@@ -1076,8 +1182,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
- fi->ra_offset = -1;
-
return &fi->vfs_inode;
}
@@ -1196,9 +1300,6 @@ static void f2fs_dirty_inode(struct inode *inode, int flags)
inode->i_ino == F2FS_META_INO(sbi))
return;
- if (flags == I_DIRTY_TIME)
- return;
-
if (is_inode_flag_set(inode, FI_AUTO_RECOVER))
clear_inode_flag(inode, FI_AUTO_RECOVER);
@@ -1246,6 +1347,12 @@ static void f2fs_put_super(struct super_block *sb)
mutex_lock(&sbi->umount_mutex);
/*
+ * flush all issued checkpoints and stop checkpoint issue thread.
+ * after then, all checkpoints should be done by each process context.
+ */
+ f2fs_stop_ckpt_thread(sbi);
+
+ /*
* We don't need to do checkpoint when superblock is clean.
* But, the previous checkpoint was not done by umount, it needs to do
* clean checkpoint again.
@@ -1343,16 +1450,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return -EAGAIN;
- if (sync) {
- struct cp_control cpc;
-
- cpc.reason = __get_cp_reason(sbi);
-
- down_write(&sbi->gc_lock);
- err = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
- }
- f2fs_trace_ios(NULL, 1);
+ if (sync)
+ err = f2fs_issue_checkpoint(sbi);
return err;
}
@@ -1369,6 +1468,10 @@ static int f2fs_freeze(struct super_block *sb)
/* must be clean, since sync_filesystem() was already called */
if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY))
return -EINVAL;
+
+ /* ensure no checkpoint required */
+ if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list))
+ return -EINVAL;
return 0;
}
@@ -1539,6 +1642,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
}
seq_printf(seq, ",compress_algorithm=%s", algtype);
+ if (F2FS_OPTION(sbi).compress_level)
+ seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level);
+
seq_printf(seq, ",compress_log_size=%u",
F2FS_OPTION(sbi).compress_log_size);
@@ -1674,6 +1780,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, DISABLE_CHECKPOINT))
seq_printf(seq, ",checkpoint=disable:%u",
F2FS_OPTION(sbi).unusable_cap);
+ if (test_opt(sbi, MERGE_CHECKPOINT))
+ seq_puts(seq, ",checkpoint_merge");
+ else
+ seq_puts(seq, ",nocheckpoint_merge");
if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX)
seq_printf(seq, ",fsync_mode=%s", "posix");
else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT)
@@ -1796,6 +1906,9 @@ restore_flag:
static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
{
+ /* we should flush all the data to keep data consistency */
+ sync_inodes_sb(sbi->sb);
+
down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
@@ -1954,6 +2067,19 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
}
+ if (!test_opt(sbi, DISABLE_CHECKPOINT) &&
+ test_opt(sbi, MERGE_CHECKPOINT)) {
+ err = f2fs_start_ckpt_thread(sbi);
+ if (err) {
+ f2fs_err(sbi,
+ "Failed to start F2FS issue_checkpoint_thread (%d)",
+ err);
+ goto restore_gc;
+ }
+ } else {
+ f2fs_stop_ckpt_thread(sbi);
+ }
+
/*
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
@@ -2638,10 +2764,10 @@ static const struct export_operations f2fs_export_ops = {
.get_parent = f2fs_get_parent,
};
-static loff_t max_file_blocks(void)
+loff_t max_file_blocks(struct inode *inode)
{
loff_t result = 0;
- loff_t leaf_count = DEF_ADDRS_PER_BLOCK;
+ loff_t leaf_count;
/*
* note: previously, result is equal to (DEF_ADDRS_PER_INODE -
@@ -2650,6 +2776,11 @@ static loff_t max_file_blocks(void)
* result as zero.
*/
+ if (inode && f2fs_compressed_file(inode))
+ leaf_count = ADDRS_PER_BLOCK(inode);
+ else
+ leaf_count = DEF_ADDRS_PER_BLOCK;
+
/* two direct node blocks */
result += (leaf_count * 2);
@@ -3533,8 +3664,7 @@ try_onemore:
if (err)
goto free_options;
- sbi->max_file_blocks = max_file_blocks();
- sb->s_maxbytes = sbi->max_file_blocks <<
+ sb->s_maxbytes = max_file_blocks(NULL) <<
le32_to_cpu(raw_super->log_blocksize);
sb->s_max_links = F2FS_LINK_MAX;
@@ -3701,6 +3831,19 @@ try_onemore:
f2fs_init_fsync_node_info(sbi);
+ /* setup checkpoint request control and start checkpoint issue thread */
+ f2fs_init_ckpt_req_control(sbi);
+ if (!test_opt(sbi, DISABLE_CHECKPOINT) &&
+ test_opt(sbi, MERGE_CHECKPOINT)) {
+ err = f2fs_start_ckpt_thread(sbi);
+ if (err) {
+ f2fs_err(sbi,
+ "Failed to start F2FS issue_checkpoint_thread (%d)",
+ err);
+ goto stop_ckpt_thread;
+ }
+ }
+
/* setup f2fs internal modules */
err = f2fs_build_segment_manager(sbi);
if (err) {
@@ -3786,12 +3929,10 @@ try_onemore:
* previous checkpoint was not done by clean system shutdown.
*/
if (f2fs_hw_is_readonly(sbi)) {
- if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
- err = -EROFS;
+ if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))
f2fs_err(sbi, "Need to recover fsync data, but write access unavailable");
- goto free_meta;
- }
- f2fs_info(sbi, "write access unavailable, skipping recovery");
+ else
+ f2fs_info(sbi, "write access unavailable, skipping recovery");
goto reset_checkpoint;
}
@@ -3910,6 +4051,8 @@ free_nm:
free_sm:
f2fs_destroy_segment_manager(sbi);
f2fs_destroy_post_read_wq(sbi);
+stop_ckpt_thread:
+ f2fs_stop_ckpt_thread(sbi);
free_devices:
destroy_device_list(sbi);
kvfree(sbi->ckpt);
@@ -4024,8 +4167,6 @@ static int __init init_f2fs_fs(void)
return -EINVAL;
}
- f2fs_build_trace_ios();
-
err = init_inodecache();
if (err)
goto fail;
@@ -4118,7 +4259,6 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_segment_manager_caches();
f2fs_destroy_node_manager_caches();
destroy_inodecache();
- f2fs_destroy_trace_ios();
}
module_init(init_f2fs_fs)
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 30bae57428d1..e38a7f6921dd 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -11,6 +11,7 @@
#include <linux/f2fs_fs.h>
#include <linux/seq_file.h>
#include <linux/unicode.h>
+#include <linux/ioprio.h>
#include "f2fs.h"
#include "segment.h"
@@ -34,6 +35,7 @@ enum {
FAULT_INFO_TYPE, /* struct f2fs_fault_info */
#endif
RESERVED_BLOCKS, /* struct f2fs_sb_info */
+ CPRC_INFO, /* struct ckpt_req_control */
};
struct f2fs_attr {
@@ -70,6 +72,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
else if (struct_type == STAT_INFO)
return (unsigned char *)F2FS_STAT(sbi);
#endif
+ else if (struct_type == CPRC_INFO)
+ return (unsigned char *)&sbi->cprc_info;
return NULL;
}
@@ -96,6 +100,12 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
sbi->sectors_written_start) >> 1)));
}
+static ssize_t sb_status_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sprintf(buf, "%lx\n", sbi->s_flag);
+}
+
static ssize_t features_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -255,6 +265,23 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return len;
}
+ if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) {
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+ int len = 0;
+ int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio);
+ int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio);
+
+ if (class == IOPRIO_CLASS_RT)
+ len += scnprintf(buf + len, PAGE_SIZE - len, "rt,");
+ else if (class == IOPRIO_CLASS_BE)
+ len += scnprintf(buf + len, PAGE_SIZE - len, "be,");
+ else
+ return -EINVAL;
+
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data);
+ return len;
+ }
+
ui = (unsigned int *)(ptr + a->offset);
return sprintf(buf, "%u\n", *ui);
@@ -308,6 +335,38 @@ out:
return ret ? ret : count;
}
+ if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) {
+ const char *name = strim((char *)buf);
+ struct ckpt_req_control *cprc = &sbi->cprc_info;
+ int class;
+ long data;
+ int ret;
+
+ if (!strncmp(name, "rt,", 3))
+ class = IOPRIO_CLASS_RT;
+ else if (!strncmp(name, "be,", 3))
+ class = IOPRIO_CLASS_BE;
+ else
+ return -EINVAL;
+
+ name += 3;
+ ret = kstrtol(name, 10, &data);
+ if (ret)
+ return ret;
+ if (data >= IOPRIO_BE_NR || data < 0)
+ return -EINVAL;
+
+ cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data);
+ if (test_opt(sbi, MERGE_CHECKPOINT)) {
+ ret = set_task_ioprio(cprc->f2fs_issue_ckpt,
+ cprc->ckpt_thread_ioprio);
+ if (ret)
+ return ret;
+ }
+
+ return count;
+ }
+
ui = (unsigned int *)(ptr + a->offset);
ret = kstrtoul(skip_spaces(buf), 0, &t);
@@ -567,6 +626,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
#endif
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
+F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio);
F2FS_GENERAL_RO_ATTR(dirty_segments);
F2FS_GENERAL_RO_ATTR(free_segments);
F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
@@ -652,6 +712,7 @@ static struct attribute *f2fs_attrs[] = {
#endif
ATTR_LIST(data_io_flag),
ATTR_LIST(node_io_flag),
+ ATTR_LIST(ckpt_thread_ioprio),
ATTR_LIST(dirty_segments),
ATTR_LIST(free_segments),
ATTR_LIST(unusable),
@@ -702,6 +763,13 @@ static struct attribute *f2fs_feat_attrs[] = {
};
ATTRIBUTE_GROUPS(f2fs_feat);
+F2FS_GENERAL_RO_ATTR(sb_status);
+static struct attribute *f2fs_stat_attrs[] = {
+ ATTR_LIST(sb_status),
+ NULL,
+};
+ATTRIBUTE_GROUPS(f2fs_stat);
+
static const struct sysfs_ops f2fs_attr_ops = {
.show = f2fs_attr_show,
.store = f2fs_attr_store,
@@ -730,6 +798,44 @@ static struct kobject f2fs_feat = {
.kset = &f2fs_kset,
};
+static ssize_t f2fs_stat_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_stat_kobj);
+ struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+ return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_stat_kobj);
+ struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+ return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void f2fs_stat_kobj_release(struct kobject *kobj)
+{
+ struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+ s_stat_kobj);
+ complete(&sbi->s_stat_kobj_unregister);
+}
+
+static const struct sysfs_ops f2fs_stat_attr_ops = {
+ .show = f2fs_stat_attr_show,
+ .store = f2fs_stat_attr_store,
+};
+
+static struct kobj_type f2fs_stat_ktype = {
+ .default_groups = f2fs_stat_groups,
+ .sysfs_ops = &f2fs_stat_attr_ops,
+ .release = f2fs_stat_kobj_release,
+};
+
static int __maybe_unused segment_info_seq_show(struct seq_file *seq,
void *offset)
{
@@ -936,11 +1042,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
init_completion(&sbi->s_kobj_unregister);
err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL,
"%s", sb->s_id);
- if (err) {
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
- return err;
- }
+ if (err)
+ goto put_sb_kobj;
+
+ sbi->s_stat_kobj.kset = &f2fs_kset;
+ init_completion(&sbi->s_stat_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype,
+ &sbi->s_kobj, "stat");
+ if (err)
+ goto put_stat_kobj;
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -956,6 +1066,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
victim_bits_seq_show, sb);
}
return 0;
+put_stat_kobj:
+ kobject_put(&sbi->s_stat_kobj);
+ wait_for_completion(&sbi->s_stat_kobj_unregister);
+put_sb_kobj:
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ return err;
}
void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
@@ -967,6 +1084,11 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
remove_proc_entry("victim_bits", sbi->s_proc);
remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
}
+
+ kobject_del(&sbi->s_stat_kobj);
+ kobject_put(&sbi->s_stat_kobj);
+ wait_for_completion(&sbi->s_stat_kobj_unregister);
+
kobject_del(&sbi->s_kobj);
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
deleted file mode 100644
index d0ab533a9ce8..000000000000
--- a/fs/f2fs/trace.c
+++ /dev/null
@@ -1,165 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * f2fs IO tracer
- *
- * Copyright (c) 2014 Motorola Mobility
- * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
- */
-#include <linux/fs.h>
-#include <linux/f2fs_fs.h>
-#include <linux/sched.h>
-#include <linux/radix-tree.h>
-
-#include "f2fs.h"
-#include "trace.h"
-
-static RADIX_TREE(pids, GFP_ATOMIC);
-static spinlock_t pids_lock;
-static struct last_io_info last_io;
-
-static inline void __print_last_io(void)
-{
- if (!last_io.len)
- return;
-
- trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n",
- last_io.major, last_io.minor,
- last_io.pid, "----------------",
- last_io.type,
- last_io.fio.op, last_io.fio.op_flags,
- last_io.fio.new_blkaddr,
- last_io.len);
- memset(&last_io, 0, sizeof(last_io));
-}
-
-static int __file_type(struct inode *inode, pid_t pid)
-{
- if (f2fs_is_atomic_file(inode))
- return __ATOMIC_FILE;
- else if (f2fs_is_volatile_file(inode))
- return __VOLATILE_FILE;
- else if (S_ISDIR(inode->i_mode))
- return __DIR_FILE;
- else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
- return __NODE_FILE;
- else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
- return __META_FILE;
- else if (pid)
- return __NORMAL_FILE;
- else
- return __MISC_FILE;
-}
-
-void f2fs_trace_pid(struct page *page)
-{
- struct inode *inode = page->mapping->host;
- pid_t pid = task_pid_nr(current);
- void *p;
-
- set_page_private(page, (unsigned long)pid);
-
-retry:
- if (radix_tree_preload(GFP_NOFS))
- return;
-
- spin_lock(&pids_lock);
- p = radix_tree_lookup(&pids, pid);
- if (p == current)
- goto out;
- if (p)
- radix_tree_delete(&pids, pid);
-
- if (radix_tree_insert(&pids, pid, current)) {
- spin_unlock(&pids_lock);
- radix_tree_preload_end();
- cond_resched();
- goto retry;
- }
-
- trace_printk("%3x:%3x %4x %-16s\n",
- MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
- pid, current->comm);
-out:
- spin_unlock(&pids_lock);
- radix_tree_preload_end();
-}
-
-void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
-{
- struct inode *inode;
- pid_t pid;
- int major, minor;
-
- if (flush) {
- __print_last_io();
- return;
- }
-
- inode = fio->page->mapping->host;
- pid = page_private(fio->page);
-
- major = MAJOR(inode->i_sb->s_dev);
- minor = MINOR(inode->i_sb->s_dev);
-
- if (last_io.major == major && last_io.minor == minor &&
- last_io.pid == pid &&
- last_io.type == __file_type(inode, pid) &&
- last_io.fio.op == fio->op &&
- last_io.fio.op_flags == fio->op_flags &&
- last_io.fio.new_blkaddr + last_io.len ==
- fio->new_blkaddr) {
- last_io.len++;
- return;
- }
-
- __print_last_io();
-
- last_io.major = major;
- last_io.minor = minor;
- last_io.pid = pid;
- last_io.type = __file_type(inode, pid);
- last_io.fio = *fio;
- last_io.len = 1;
- return;
-}
-
-void f2fs_build_trace_ios(void)
-{
- spin_lock_init(&pids_lock);
-}
-
-#define PIDVEC_SIZE 128
-static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
- unsigned int max_items)
-{
- struct radix_tree_iter iter;
- void **slot;
- unsigned int ret = 0;
-
- if (unlikely(!max_items))
- return 0;
-
- radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
- results[ret] = iter.index;
- if (++ret == max_items)
- break;
- }
- return ret;
-}
-
-void f2fs_destroy_trace_ios(void)
-{
- pid_t pid[PIDVEC_SIZE];
- pid_t next_pid = 0;
- unsigned int found;
-
- spin_lock(&pids_lock);
- while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
- unsigned idx;
-
- next_pid = pid[found - 1] + 1;
- for (idx = 0; idx < found; idx++)
- radix_tree_delete(&pids, pid[idx]);
- }
- spin_unlock(&pids_lock);
-}
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
deleted file mode 100644
index 789f6aa727fc..000000000000
--- a/fs/f2fs/trace.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * f2fs IO tracer
- *
- * Copyright (c) 2014 Motorola Mobility
- * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
- */
-#ifndef __F2FS_TRACE_H__
-#define __F2FS_TRACE_H__
-
-#ifdef CONFIG_F2FS_IO_TRACE
-#include <trace/events/f2fs.h>
-
-enum file_type {
- __NORMAL_FILE,
- __DIR_FILE,
- __NODE_FILE,
- __META_FILE,
- __ATOMIC_FILE,
- __VOLATILE_FILE,
- __MISC_FILE,
-};
-
-struct last_io_info {
- int major, minor;
- pid_t pid;
- enum file_type type;
- struct f2fs_io_info fio;
- block_t len;
-};
-
-extern void f2fs_trace_pid(struct page *);
-extern void f2fs_trace_ios(struct f2fs_io_info *, int);
-extern void f2fs_build_trace_ios(void);
-extern void f2fs_destroy_trace_ios(void);
-#else
-#define f2fs_trace_pid(p)
-#define f2fs_trace_ios(i, n)
-#define f2fs_build_trace_ios()
-#define f2fs_destroy_trace_ios()
-
-#endif
-#endif /* __F2FS_TRACE_H__ */
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 10081bf74324..490f843ec3bf 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -329,7 +329,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
void *last_addr = NULL;
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
unsigned int inline_size = inline_xattr_size(inode);
- int err = 0;
+ int err;
if (!xnid && !inline_size)
return -ENODATA;
@@ -517,7 +517,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
void *buffer, size_t buffer_size, struct page *ipage)
{
struct f2fs_xattr_entry *entry = NULL;
- int error = 0;
+ int error;
unsigned int size, len;
void *base_addr = NULL;
int base_size;
@@ -564,7 +564,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
struct inode *inode = d_inode(dentry);
struct f2fs_xattr_entry *entry;
void *base_addr, *last_base_addr;
- int error = 0;
+ int error;
size_t rest = buffer_size;
down_read(&F2FS_I(inode)->i_xattr_sem);
@@ -634,7 +634,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
int found, newsize;
size_t len;
__u32 new_hsize;
- int error = 0;
+ int error;
if (name == NULL)
return -EINVAL;
@@ -675,7 +675,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
}
if (value && f2fs_xattr_value_same(here, value, size))
- goto exit;
+ goto same;
} else if ((flags & XATTR_REPLACE)) {
error = -ENODATA;
goto exit;
@@ -740,17 +740,20 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (error)
goto exit;
- if (is_inode_flag_set(inode, FI_ACL_MODE)) {
- inode->i_mode = F2FS_I(inode)->i_acl_mode;
- inode->i_ctime = current_time(inode);
- clear_inode_flag(inode, FI_ACL_MODE);
- }
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
f2fs_mark_inode_dirty_sync(inode, true);
if (!error && S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
+
+same:
+ if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+ inode->i_mode = F2FS_I(inode)->i_acl_mode;
+ inode->i_ctime = current_time(inode);
+ clear_inode_flag(inode, FI_ACL_MODE);
+ }
+
exit:
kfree(base_addr);
return error;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index da7c56234189..13855ba49cd9 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -195,7 +195,7 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ return blkdev_issue_flush(inode->i_sb->s_bdev);
}
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index f1b2a1fc2a6a..18a50a46b57f 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -329,22 +329,23 @@ EXPORT_SYMBOL_GPL(fat_truncate_time);
int fat_update_time(struct inode *inode, struct timespec64 *now, int flags)
{
- int iflags = I_DIRTY_TIME;
- bool dirty = false;
+ int dirty_flags = 0;
if (inode->i_ino == MSDOS_ROOT_INO)
return 0;
- fat_truncate_time(inode, now, flags);
- if (flags & S_VERSION)
- dirty = inode_maybe_inc_iversion(inode, false);
- if ((flags & (S_ATIME | S_CTIME | S_MTIME)) &&
- !(inode->i_sb->s_flags & SB_LAZYTIME))
- dirty = true;
+ if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
+ fat_truncate_time(inode, now, flags);
+ if (inode->i_sb->s_flags & SB_LAZYTIME)
+ dirty_flags |= I_DIRTY_TIME;
+ else
+ dirty_flags |= I_DIRTY_SYNC;
+ }
+
+ if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false))
+ dirty_flags |= I_DIRTY_SYNC;
- if (dirty)
- iflags |= I_DIRTY_SYNC;
- __mark_inode_dirty(inode, iflags);
+ __mark_inode_dirty(inode, dirty_flags);
return 0;
}
EXPORT_SYMBOL_GPL(fat_update_time);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f6ac5285060d..dfc72f15be7f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -149,11 +149,15 @@ void f_delown(struct file *filp)
pid_t f_getown(struct file *filp)
{
- pid_t pid;
+ pid_t pid = 0;
read_lock(&filp->f_owner.lock);
- pid = pid_vnr(filp->f_owner.pid);
- if (filp->f_owner.pid_type == PIDTYPE_PGID)
- pid = -pid;
+ rcu_read_lock();
+ if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
+ pid = pid_vnr(filp->f_owner.pid);
+ if (filp->f_owner.pid_type == PIDTYPE_PGID)
+ pid = -pid;
+ }
+ rcu_read_unlock();
read_unlock(&filp->f_owner.lock);
return pid;
}
@@ -201,11 +205,14 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
static int f_getown_ex(struct file *filp, unsigned long arg)
{
struct f_owner_ex __user *owner_p = (void __user *)arg;
- struct f_owner_ex owner;
+ struct f_owner_ex owner = {};
int ret = 0;
read_lock(&filp->f_owner.lock);
- owner.pid = pid_vnr(filp->f_owner.pid);
+ rcu_read_lock();
+ if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
+ owner.pid = pid_vnr(filp->f_owner.pid);
+ rcu_read_unlock();
switch (filp->f_owner.pid_type) {
case PIDTYPE_PID:
owner.type = F_OWNER_TID;
diff --git a/fs/file.c b/fs/file.c
index dab120b71e44..f3a4bac2cbe9 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -22,6 +22,8 @@
#include <linux/close_range.h>
#include <net/sock.h>
+#include "internal.h"
+
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
@@ -732,36 +734,48 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
}
/*
- * variant of close_fd that gets a ref on the file for later fput.
- * The caller must ensure that filp_close() called on the file, and then
- * an fput().
+ * See close_fd_get_file() below, this variant assumes current->files->file_lock
+ * is held.
*/
-int close_fd_get_file(unsigned int fd, struct file **res)
+int __close_fd_get_file(unsigned int fd, struct file **res)
{
struct files_struct *files = current->files;
struct file *file;
struct fdtable *fdt;
- spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (fd >= fdt->max_fds)
- goto out_unlock;
+ goto out_err;
file = fdt->fd[fd];
if (!file)
- goto out_unlock;
+ goto out_err;
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
- spin_unlock(&files->file_lock);
get_file(file);
*res = file;
return 0;
-
-out_unlock:
- spin_unlock(&files->file_lock);
+out_err:
*res = NULL;
return -ENOENT;
}
+/*
+ * variant of close_fd that gets a ref on the file for later fput.
+ * The caller must ensure that filp_close() called on the file, and then
+ * an fput().
+ */
+int close_fd_get_file(unsigned int fd, struct file **res)
+{
+ struct files_struct *files = current->files;
+ int ret;
+
+ spin_lock(&files->file_lock);
+ ret = __close_fd_get_file(fd, res);
+ spin_unlock(&files->file_lock);
+
+ return ret;
+}
+
void do_close_on_exec(struct files_struct *files)
{
unsigned i;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index acfb55834af2..e91980f49388 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1442,9 +1442,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
}
/*
- * Write out an inode and its dirty pages. Do not update the writeback list
- * linkage. That is left to the caller. The caller is also responsible for
- * setting I_SYNC flag and calling inode_sync_complete() to clear it.
+ * Write out an inode and its dirty pages (or some of its dirty pages, depending
+ * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state.
+ *
+ * This doesn't remove the inode from the writeback list it is on, except
+ * potentially to move it from b_dirty_time to b_dirty due to timestamp
+ * expiration. The caller is otherwise responsible for writeback list handling.
+ *
+ * The caller is also responsible for setting the I_SYNC flag beforehand and
+ * calling inode_sync_complete() to clear it afterwards.
*/
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -1474,21 +1480,26 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
- * Some filesystems may redirty the inode during the writeback
- * due to delalloc, clear dirty metadata flags right before
- * write_inode()
+ * If the inode has dirty timestamps and we need to write them, call
+ * mark_inode_dirty_sync() to notify the filesystem about it and to
+ * change I_DIRTY_TIME into I_DIRTY_SYNC.
*/
- spin_lock(&inode->i_lock);
-
- dirty = inode->i_state & I_DIRTY;
if ((inode->i_state & I_DIRTY_TIME) &&
- ((dirty & I_DIRTY_INODE) ||
- wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync ||
+ (wbc->sync_mode == WB_SYNC_ALL ||
time_after(jiffies, inode->dirtied_time_when +
dirtytime_expire_interval * HZ))) {
- dirty |= I_DIRTY_TIME;
trace_writeback_lazytime(inode);
+ mark_inode_dirty_sync(inode);
}
+
+ /*
+ * Get and clear the dirty flags from i_state. This needs to be done
+ * after calling writepages because some filesystems may redirty the
+ * inode during writepages due to delalloc. It also needs to be done
+ * after handling timestamp expiration, as that may dirty the inode too.
+ */
+ spin_lock(&inode->i_lock);
+ dirty = inode->i_state & I_DIRTY;
inode->i_state &= ~dirty;
/*
@@ -1509,8 +1520,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_unlock(&inode->i_lock);
- if (dirty & I_DIRTY_TIME)
- mark_inode_dirty_sync(inode);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & ~I_DIRTY_PAGES) {
int err = write_inode(inode, wbc);
@@ -1522,12 +1531,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
- * Write out an inode's dirty pages. Either the caller has an active reference
- * on the inode or the inode has I_WILL_FREE set.
+ * Write out an inode's dirty data and metadata on-demand, i.e. separately from
+ * the regular batched writeback done by the flusher threads in
+ * writeback_sb_inodes(). @wbc controls various aspects of the write, such as
+ * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE).
*
- * This function is designed to be called for writing back one inode which
- * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
- * and does more profound writeback list handling in writeback_sb_inodes().
+ * To prevent the inode from going away, either the caller must have a reference
+ * to the inode, or the inode must have I_WILL_FREE or I_FREEING set.
*/
static int writeback_single_inode(struct inode *inode,
struct writeback_control *wbc)
@@ -1542,23 +1552,23 @@ static int writeback_single_inode(struct inode *inode,
WARN_ON(inode->i_state & I_WILL_FREE);
if (inode->i_state & I_SYNC) {
- if (wbc->sync_mode != WB_SYNC_ALL)
- goto out;
/*
- * It's a data-integrity sync. We must wait. Since callers hold
- * inode reference or inode has I_WILL_FREE set, it cannot go
- * away under us.
+ * Writeback is already running on the inode. For WB_SYNC_NONE,
+ * that's enough and we can just return. For WB_SYNC_ALL, we
+ * must wait for the existing writeback to complete, then do
+ * writeback again if there's anything left.
*/
+ if (wbc->sync_mode != WB_SYNC_ALL)
+ goto out;
__inode_wait_for_writeback(inode);
}
WARN_ON(inode->i_state & I_SYNC);
/*
- * Skip inode if it is clean and we have no outstanding writeback in
- * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
- * function since flusher thread may be doing for example sync in
- * parallel and if we move the inode, it could get skipped. So here we
- * make sure inode is on some writeback list and leave it there unless
- * we have completely cleaned the inode.
+ * If the inode is already fully clean, then there's nothing to do.
+ *
+ * For data-integrity syncs we also need to check whether any pages are
+ * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If
+ * there are any such pages, we'll need to wait for them.
*/
if (!(inode->i_state & I_DIRTY_ALL) &&
(wbc->sync_mode != WB_SYNC_ALL ||
@@ -1574,8 +1584,9 @@ static int writeback_single_inode(struct inode *inode,
wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
/*
- * If inode is clean, remove it from writeback lists. Otherwise don't
- * touch it. See comment above for explanation.
+ * If the inode is now fully clean, then it can be safely removed from
+ * its writeback list (if any). Otherwise the flusher threads are
+ * responsible for the writeback lists.
*/
if (!(inode->i_state & I_DIRTY_ALL))
inode_io_list_del_locked(inode, wb);
@@ -2217,23 +2228,24 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
}
/**
- * __mark_inode_dirty - internal function
+ * __mark_inode_dirty - internal function to mark an inode dirty
*
* @inode: inode to mark
- * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ * @flags: what kind of dirty, e.g. I_DIRTY_SYNC. This can be a combination of
+ * multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined
+ * with I_DIRTY_PAGES.
*
- * Mark an inode as dirty. Callers should use mark_inode_dirty or
- * mark_inode_dirty_sync.
+ * Mark an inode as dirty. We notify the filesystem, then update the inode's
+ * dirty flags. Then, if needed we add the inode to the appropriate dirty list.
*
- * Put the inode on the super block's dirty list.
+ * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync()
+ * instead of calling this directly.
*
- * CAREFUL! We mark it dirty unconditionally, but move it onto the
- * dirty list only if it is hashed or if it refers to a blockdev.
- * If it was not hashed, it will never be added to the dirty list
- * even if it is later hashed, as it will have been marked dirty already.
+ * CAREFUL! We only add the inode to the dirty list if it is hashed or if it
+ * refers to a blockdev. Unhashed inodes will never be added to the dirty list
+ * even if they are later hashed, as they will have been marked dirty already.
*
- * In short, make sure you hash any inodes _before_ you start marking
- * them dirty.
+ * In short, ensure you hash any inodes _before_ you start marking them dirty.
*
* Note that for blockdevs, inode->dirtied_when represents the dirtying time of
* the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
@@ -2245,25 +2257,34 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
- int dirtytime;
+ int dirtytime = 0;
trace_writeback_mark_inode_dirty(inode, flags);
- /*
- * Don't do this for I_DIRTY_PAGES - that doesn't actually
- * dirty the inode itself
- */
- if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) {
+ if (flags & I_DIRTY_INODE) {
+ /*
+ * Notify the filesystem about the inode being dirtied, so that
+ * (if needed) it can update on-disk fields and journal the
+ * inode. This is only needed when the inode itself is being
+ * dirtied now. I.e. it's only needed for I_DIRTY_INODE, not
+ * for just I_DIRTY_PAGES or I_DIRTY_TIME.
+ */
trace_writeback_dirty_inode_start(inode, flags);
-
if (sb->s_op->dirty_inode)
- sb->s_op->dirty_inode(inode, flags);
-
+ sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE);
trace_writeback_dirty_inode(inode, flags);
- }
- if (flags & I_DIRTY_INODE)
+
+ /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
flags &= ~I_DIRTY_TIME;
- dirtytime = flags & I_DIRTY_TIME;
+ } else {
+ /*
+ * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing.
+ * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME
+ * in one call to __mark_inode_dirty().)
+ */
+ dirtytime = flags & I_DIRTY_TIME;
+ WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
+ }
/*
* Paired with smp_mb() in __writeback_single_inode() for the
@@ -2286,6 +2307,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
inode_attach_wb(inode, NULL);
+ /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
if (flags & I_DIRTY_INODE)
inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 8f5523822788..95bbdd4bca78 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -749,7 +749,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- int sync_state = inode->i_state & I_DIRTY_ALL;
+ int sync_state = inode->i_state & I_DIRTY;
struct gfs2_inode *ip = GFS2_I(inode);
int ret = 0, ret1 = 0;
@@ -762,7 +762,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
if (!gfs2_is_jdata(ip))
sync_state &= ~I_DIRTY_PAGES;
if (datasync)
- sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
+ sync_state &= ~I_DIRTY_SYNC;
if (sync_state) {
ret = sync_inode_metadata(inode, 1);
@@ -797,9 +797,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
if (ret)
goto out_uninit;
- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
- is_sync_kiocb(iocb));
-
+ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
@@ -833,8 +831,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
if (offset + len > i_size_read(&ip->i_inode))
goto out;
- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
out:
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2f56acc41c04..042b94288ff1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -562,8 +562,6 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
int need_endtrans = 0;
int ret;
- if (!(flags & I_DIRTY_INODE))
- return;
if (unlikely(gfs2_withdrawn(sdp)))
return;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 7a937de9b2ad..078c5c8a5156 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -342,7 +342,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
}
if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ blkdev_issue_flush(inode->i_sb->s_bdev);
inode_unlock(inode);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 807119ae5adf..b9e3db3f855f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -239,7 +239,7 @@ out:
mutex_unlock(&sbi->vh_mutex);
if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
- blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+ blkdev_issue_flush(sb->s_bdev);
return error;
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 7c918cd816a3..29e407762626 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -34,6 +34,8 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
+static struct kmem_cache *hostfs_inode_cache;
+
/* Changed in hostfs_args before the kernel starts running */
static char *root_ino = "";
static int append = 0;
@@ -221,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;
- hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
+ hi = kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;
@@ -243,7 +245,7 @@ static void hostfs_evict_inode(struct inode *inode)
static void hostfs_free_inode(struct inode *inode)
{
- kfree(HOSTFS_I(inode));
+ kmem_cache_free(hostfs_inode_cache, HOSTFS_I(inode));
}
static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
@@ -991,12 +993,16 @@ MODULE_ALIAS_FS("hostfs");
static int __init init_hostfs(void)
{
+ hostfs_inode_cache = KMEM_CACHE(hostfs_inode_info, 0);
+ if (!hostfs_inode_cache)
+ return -ENOMEM;
return register_filesystem(&hostfs_type);
}
static void __exit exit_hostfs(void)
{
unregister_filesystem(&hostfs_type);
+ kmem_cache_destroy(hostfs_inode_cache);
}
module_init(init_hostfs)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c5c32eb59498..b7a72f577aab 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -735,9 +735,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ set_page_huge_active(page);
/*
* unlock_page because locked by add_to_page_cache()
- * page_put due to reference from alloc_huge_page()
+ * put_page() due to reference from alloc_huge_page()
*/
unlock_page(page);
put_page(page);
diff --git a/fs/inode.c b/fs/inode.c
index 08151968c9ef..6dba963d3f6d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1493,7 +1493,7 @@ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
EXPORT_SYMBOL(find_inode_rcu);
/**
- * find_inode_by_rcu - Find an inode in the inode cache
+ * find_inode_by_ino_rcu - Find an inode in the inode cache
* @sb: Super block of file system to search
* @ino: The inode number to match
*
@@ -1743,24 +1743,26 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
- int iflags = I_DIRTY_TIME;
- bool dirty = false;
-
- if (flags & S_ATIME)
- inode->i_atime = *time;
- if (flags & S_VERSION)
- dirty = inode_maybe_inc_iversion(inode, false);
- if (flags & S_CTIME)
- inode->i_ctime = *time;
- if (flags & S_MTIME)
- inode->i_mtime = *time;
- if ((flags & (S_ATIME | S_CTIME | S_MTIME)) &&
- !(inode->i_sb->s_flags & SB_LAZYTIME))
- dirty = true;
-
- if (dirty)
- iflags |= I_DIRTY_SYNC;
- __mark_inode_dirty(inode, iflags);
+ int dirty_flags = 0;
+
+ if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
+ if (flags & S_ATIME)
+ inode->i_atime = *time;
+ if (flags & S_CTIME)
+ inode->i_ctime = *time;
+ if (flags & S_MTIME)
+ inode->i_mtime = *time;
+
+ if (inode->i_sb->s_flags & SB_LAZYTIME)
+ dirty_flags |= I_DIRTY_TIME;
+ else
+ dirty_flags |= I_DIRTY_SYNC;
+ }
+
+ if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false))
+ dirty_flags |= I_DIRTY_SYNC;
+
+ __mark_inode_dirty(inode, dirty_flags);
return 0;
}
EXPORT_SYMBOL(generic_update_time);
@@ -1777,7 +1779,7 @@ static int update_time(struct inode *inode, struct timespec64 *time, int flags)
}
/**
- * touch_atime - update the access time
+ * atime_needs_update - update the access time
* @path: the &struct path to update
* @inode: inode to update
*
diff --git a/fs/internal.h b/fs/internal.h
index 6c8a4eddc7e6..6aeae7ef3380 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -15,6 +15,7 @@ struct mount;
struct shrink_control;
struct fs_context;
struct user_namespace;
+struct pipe_inode_info;
/*
* block_dev.c
@@ -132,6 +133,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
+extern int __close_fd_get_file(unsigned int fd, struct file **res);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
@@ -193,3 +195,11 @@ int sb_init_dio_done_wq(struct super_block *sb);
*/
int do_statx(int dfd, const char __user *filename, unsigned flags,
unsigned int mask, struct statx __user *buffer);
+
+/*
+ * fs/splice.c:
+ */
+long splice_file_to_pipe(struct file *in,
+ struct pipe_inode_info *opipe,
+ loff_t *offset,
+ size_t len, unsigned int flags);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index a564f36e260c..c36bbcd823ce 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -64,9 +64,7 @@ struct io_worker {
#endif
const struct cred *cur_creds;
const struct cred *saved_creds;
- struct files_struct *restore_files;
struct nsproxy *restore_nsproxy;
- struct fs_struct *restore_fs;
};
#if BITS_PER_LONG == 64
@@ -156,19 +154,19 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
worker->cur_creds = worker->saved_creds = NULL;
}
- if (current->files != worker->restore_files) {
+ if (current->files) {
__acquire(&wqe->lock);
raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true;
task_lock(current);
- current->files = worker->restore_files;
+ current->files = NULL;
current->nsproxy = worker->restore_nsproxy;
task_unlock(current);
}
- if (current->fs != worker->restore_fs)
- current->fs = worker->restore_fs;
+ if (current->fs)
+ current->fs = NULL;
/*
* If we have an active mm, we need to drop the wq lock before unusing
@@ -329,11 +327,11 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
allow_kernel_signal(SIGINT);
current->flags |= PF_IO_WORKER;
+ current->fs = NULL;
+ current->files = NULL;
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
- worker->restore_files = current->files;
worker->restore_nsproxy = current->nsproxy;
- worker->restore_fs = current->fs;
io_wqe_inc_running(wqe, worker);
}
@@ -555,23 +553,21 @@ get_next:
/* handle a whole dependent link */
do {
- struct io_wq_work *old_work, *next_hashed, *linked;
+ struct io_wq_work *next_hashed, *linked;
unsigned int hash = io_get_work_hash(work);
next_hashed = wq_next_work(work);
io_impersonate_work(worker, work);
+ wq->do_work(work);
+ io_assign_current_work(worker, NULL);
- old_work = work;
- linked = wq->do_work(work);
-
+ linked = wq->free_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
linked = NULL;
}
io_assign_current_work(worker, work);
- wq->free_work(old_work);
-
if (linked)
io_wqe_enqueue(wqe, linked);
@@ -850,11 +846,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
struct io_wq *wq = wqe->wq;
do {
- struct io_wq_work *old_work = work;
-
work->flags |= IO_WQ_WORK_CANCEL;
- work = wq->do_work(work);
- wq->free_work(old_work);
+ wq->do_work(work);
+ work = wq->free_work(work);
} while (work);
}
@@ -944,7 +938,6 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
*/
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
- !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
match->nr_running++;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index b158f8addcf3..096f1021018e 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -9,7 +9,6 @@ enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HASHED = 2,
IO_WQ_WORK_UNBOUND = 4,
- IO_WQ_WORK_NO_CANCEL = 8,
IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_WORK_FILES = 32,
@@ -28,15 +27,6 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
-struct io_wq_work_node {
- struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
- struct io_wq_work_node *first;
- struct io_wq_work_node *last;
-};
-
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
@@ -107,8 +97,8 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
return container_of(work->list.next, struct io_wq_work, list);
}
-typedef void (free_work_fn)(struct io_wq_work *);
-typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
+typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
+typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 985a9e3f976d..14ce789927e4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -187,6 +187,11 @@ struct io_rings {
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
+enum io_uring_cmd_flags {
+ IO_URING_F_NONBLOCK = 1,
+ IO_URING_F_COMPLETE_DEFER = 2,
+};
+
struct io_mapped_ubuf {
u64 ubuf;
size_t len;
@@ -195,28 +200,38 @@ struct io_mapped_ubuf {
unsigned long acct_pages;
};
-struct fixed_file_table {
+struct io_ring_ctx;
+
+struct io_rsrc_put {
+ struct list_head list;
+ union {
+ void *rsrc;
+ struct file *file;
+ };
+};
+
+struct fixed_rsrc_table {
struct file **files;
};
-struct fixed_file_ref_node {
+struct fixed_rsrc_ref_node {
struct percpu_ref refs;
struct list_head node;
- struct list_head file_list;
- struct fixed_file_data *file_data;
+ struct list_head rsrc_list;
+ struct fixed_rsrc_data *rsrc_data;
+ void (*rsrc_put)(struct io_ring_ctx *ctx,
+ struct io_rsrc_put *prsrc);
struct llist_node llist;
bool done;
};
-struct fixed_file_data {
- struct fixed_file_table *table;
+struct fixed_rsrc_data {
+ struct fixed_rsrc_table *table;
struct io_ring_ctx *ctx;
- struct fixed_file_ref_node *node;
+ struct fixed_rsrc_ref_node *node;
struct percpu_ref refs;
struct completion done;
- struct list_head ref_list;
- spinlock_t lock;
};
struct io_buffer {
@@ -249,6 +264,46 @@ struct io_sq_data {
unsigned sq_thread_idle;
};
+#define IO_IOPOLL_BATCH 8
+#define IO_COMPL_BATCH 32
+#define IO_REQ_CACHE_SIZE 32
+#define IO_REQ_ALLOC_BATCH 8
+
+struct io_comp_state {
+ struct io_kiocb *reqs[IO_COMPL_BATCH];
+ unsigned int nr;
+ unsigned int locked_free_nr;
+ /* inline/task_work completion list, under ->uring_lock */
+ struct list_head free_list;
+ /* IRQ completion list, under ->completion_lock */
+ struct list_head locked_free_list;
+};
+
+struct io_submit_state {
+ struct blk_plug plug;
+
+ /*
+ * io_kiocb alloc cache
+ */
+ void *reqs[IO_REQ_CACHE_SIZE];
+ unsigned int free_reqs;
+
+ bool plug_started;
+
+ /*
+ * Batch completion logic
+ */
+ struct io_comp_state comp;
+
+ /*
+ * File reference cache
+ */
+ struct file *file;
+ unsigned int fd;
+ unsigned int file_refs;
+ unsigned int ios_left;
+};
+
struct io_ring_ctx {
struct {
struct percpu_ref refs;
@@ -291,6 +346,13 @@ struct io_ring_ctx {
struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
+ struct {
+ struct mutex uring_lock;
+ wait_queue_head_t wait;
+ } ____cacheline_aligned_in_smp;
+
+ struct io_submit_state submit_state;
+
struct io_rings *rings;
/* IO offload */
@@ -319,7 +381,7 @@ struct io_ring_ctx {
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
- struct fixed_file_data *file_data;
+ struct fixed_rsrc_data *file_data;
unsigned nr_user_files;
/* if used, fixed mapped user buffers */
@@ -338,9 +400,6 @@ struct io_ring_ctx {
struct completion ref_comp;
struct completion sq_thread_comp;
- /* if all else fails... */
- struct io_kiocb *fallback_req;
-
#if defined(CONFIG_UNIX)
struct socket *ring_sock;
#endif
@@ -362,11 +421,6 @@ struct io_ring_ctx {
} ____cacheline_aligned_in_smp;
struct {
- struct mutex uring_lock;
- wait_queue_head_t wait;
- } ____cacheline_aligned_in_smp;
-
- struct {
spinlock_t completion_lock;
/*
@@ -384,11 +438,15 @@ struct io_ring_ctx {
struct list_head inflight_list;
} ____cacheline_aligned_in_smp;
- struct delayed_work file_put_work;
- struct llist_head file_put_llist;
+ struct delayed_work rsrc_put_work;
+ struct llist_head rsrc_put_llist;
+ struct list_head rsrc_ref_list;
+ spinlock_t rsrc_ref_lock;
- struct work_struct exit_work;
struct io_restriction restrictions;
+
+ /* Keep this last, we don't need it for the fast path */
+ struct work_struct exit_work;
};
/*
@@ -411,7 +469,6 @@ struct io_poll_remove {
struct io_close {
struct file *file;
- struct file *put_file;
int fd;
};
@@ -489,13 +546,12 @@ struct io_sr_msg {
struct io_open {
struct file *file;
int dfd;
- bool ignore_nonblock;
struct filename *filename;
struct open_how how;
unsigned long nofile;
};
-struct io_files_update {
+struct io_rsrc_update {
struct file *file;
u64 arg;
u32 nr_args;
@@ -584,7 +640,8 @@ struct io_async_connect {
struct io_async_msghdr {
struct iovec fast_iov[UIO_FASTIOV];
- struct iovec *iov;
+ /* points to an allocated iov, if NULL we use fast_iov instead */
+ struct iovec *free_iov;
struct sockaddr __user *uaddr;
struct msghdr msg;
struct sockaddr_storage addr;
@@ -618,6 +675,7 @@ enum {
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_WORK_INITIALIZED_BIT,
REQ_F_LTIMEOUT_ACTIVE_BIT,
+ REQ_F_COMPLETE_INLINE_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -661,6 +719,8 @@ enum {
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
/* linked timeout is active, i.e. prepared by link's head */
REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
+ /* completion is deferred through io_comp_state */
+ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
};
struct async_poll {
@@ -668,6 +728,11 @@ struct async_poll {
struct io_poll_iocb *double_poll;
};
+struct io_task_work {
+ struct io_wq_work_node node;
+ task_work_func_t func;
+};
+
/*
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
@@ -689,7 +754,7 @@ struct io_kiocb {
struct io_sr_msg sr_msg;
struct io_open open;
struct io_close close;
- struct io_files_update files_update;
+ struct io_rsrc_update rsrc_update;
struct io_fadvise fadvise;
struct io_madvise madvise;
struct io_epoll epoll;
@@ -719,14 +784,17 @@ struct io_kiocb {
u64 user_data;
struct io_kiocb *link;
- struct percpu_ref *fixed_file_refs;
+ struct percpu_ref *fixed_rsrc_refs;
/*
* 1. used with ctx->iopoll_list with reads/writes
* 2. to track reqs with ->files (see io_op_def::file_table)
*/
struct list_head inflight_entry;
- struct callback_head task_work;
+ union {
+ struct io_task_work io_task_work;
+ struct callback_head task_work;
+ };
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
struct async_poll *apoll;
@@ -739,44 +807,9 @@ struct io_defer_entry {
u32 seq;
};
-#define IO_IOPOLL_BATCH 8
-
-struct io_comp_state {
- unsigned int nr;
- struct list_head list;
- struct io_ring_ctx *ctx;
-};
-
-struct io_submit_state {
- struct blk_plug plug;
-
- /*
- * io_kiocb alloc cache
- */
- void *reqs[IO_IOPOLL_BATCH];
- unsigned int free_reqs;
-
- bool plug_started;
-
- /*
- * Batch completion logic
- */
- struct io_comp_state comp;
-
- /*
- * File reference cache
- */
- struct file *file;
- unsigned int fd;
- unsigned int file_refs;
- unsigned int ios_left;
-};
-
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
- /* don't fail if file grab fails */
- unsigned needs_file_no_error : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
@@ -857,7 +890,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FS,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
@@ -866,7 +900,8 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FS,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
@@ -906,8 +941,6 @@ static const struct io_op_def io_op_defs[] = {
IO_WQ_WORK_FS | IO_WQ_WORK_MM,
},
[IORING_OP_CLOSE] = {
- .needs_file = 1,
- .needs_file_no_error = 1,
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
},
[IORING_OP_FILES_UPDATE] = {
@@ -989,42 +1022,43 @@ static const struct io_op_def io_op_defs[] = {
},
};
-enum io_mem_account {
- ACCT_LOCKED,
- ACCT_PINNED,
-};
-
-static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
- struct task_struct *task);
-
-static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node);
-static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ struct files_struct *files);
+static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
+static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
struct io_ring_ctx *ctx);
+static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
+ struct fixed_rsrc_ref_node *ref_node);
-static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
- struct io_comp_state *cs);
+static bool io_rw_reissue(struct io_kiocb *req);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_double_put_req(struct io_kiocb *req);
+static void io_dismantle_req(struct io_kiocb *req);
+static void io_put_task(struct task_struct *task, int nr);
+static void io_queue_next(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
- struct io_uring_files_update *ip,
+ struct io_uring_rsrc_update *ip,
unsigned nr_args);
static void __io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
struct io_kiocb *req, int fd, bool fixed);
-static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
-static void io_file_put_work(struct work_struct *work);
+static void __io_queue_sqe(struct io_kiocb *req);
+static void io_rsrc_put_work(struct work_struct *work);
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter,
- bool needs_lock);
+static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
+ struct iov_iter *iter, bool needs_lock);
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
const struct iovec *fast_iov,
struct iov_iter *iter, bool force);
+static void io_req_task_queue(struct io_kiocb *req);
+static void io_submit_flush_completions(struct io_comp_state *cs,
+ struct io_ring_ctx *ctx);
static struct kmem_cache *req_cachep;
@@ -1048,8 +1082,7 @@ EXPORT_SYMBOL(io_uring_get_socket);
static inline void io_clean_op(struct io_kiocb *req)
{
- if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
- REQ_F_INFLIGHT))
+ if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
__io_clean_op(req);
}
@@ -1057,9 +1090,9 @@ static inline void io_set_resource_node(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- if (!req->fixed_file_refs) {
- req->fixed_file_refs = &ctx->file_data->node->refs;
- percpu_ref_get(req->fixed_file_refs);
+ if (!req->fixed_rsrc_refs) {
+ req->fixed_rsrc_refs = &ctx->file_data->node->refs;
+ percpu_ref_get(req->fixed_rsrc_refs);
}
}
@@ -1069,14 +1102,21 @@ static bool io_match_task(struct io_kiocb *head,
{
struct io_kiocb *req;
- if (task && head->task != task)
+ if (task && head->task != task) {
+ /* in terms of cancelation, always match if req task is dead */
+ if (head->task->flags & PF_EXITING)
+ return true;
return false;
+ }
if (!files)
return true;
io_for_each_link(req, head) {
- if ((req->flags & REQ_F_WORK_INITIALIZED) &&
- (req->work.flags & IO_WQ_WORK_FILES) &&
+ if (!(req->flags & REQ_F_WORK_INITIALIZED))
+ continue;
+ if (req->file && req->file->f_op == &io_uring_fops)
+ return true;
+ if ((req->work.flags & IO_WQ_WORK_FILES) &&
req->work.identity->files == files)
return true;
}
@@ -1107,9 +1147,6 @@ static void io_sq_thread_drop_mm_files(void)
static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
{
- if (current->flags & PF_EXITING)
- return -EFAULT;
-
if (!current->files) {
struct files_struct *files;
struct nsproxy *nsproxy;
@@ -1137,15 +1174,9 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
{
struct mm_struct *mm;
- if (current->flags & PF_EXITING)
- return -EFAULT;
if (current->mm)
return 0;
- /* Should never happen */
- if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL)))
- return -EFAULT;
-
task_lock(ctx->sqo_task);
mm = ctx->sqo_task->mm;
if (unlikely(!mm || !mmget_not_zero(mm)))
@@ -1160,8 +1191,8 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
return -EFAULT;
}
-static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
int ret;
@@ -1181,6 +1212,14 @@ static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
return 0;
}
+static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ if (!(ctx->flags & IORING_SETUP_SQPOLL))
+ return 0;
+ return __io_sq_thread_acquire_mm_files(ctx, req);
+}
+
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
struct cgroup_subsys_state **cur_css)
@@ -1257,11 +1296,6 @@ static inline void io_req_init_async(struct io_kiocb *req)
refcount_inc(&req->work.identity->count);
}
-static inline bool io_async_submit(struct io_ring_ctx *ctx)
-{
- return ctx->flags & IORING_SETUP_SQPOLL;
-}
-
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1283,10 +1317,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
if (!ctx)
return NULL;
- ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
- if (!ctx->fallback_req)
- goto err;
-
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread.
@@ -1323,12 +1353,14 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->timeout_list);
spin_lock_init(&ctx->inflight_lock);
INIT_LIST_HEAD(&ctx->inflight_list);
- INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
- init_llist_head(&ctx->file_put_llist);
+ spin_lock_init(&ctx->rsrc_ref_lock);
+ INIT_LIST_HEAD(&ctx->rsrc_ref_list);
+ INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
+ init_llist_head(&ctx->rsrc_put_llist);
+ INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
+ INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
return ctx;
err:
- if (ctx->fallback_req)
- kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx->cancel_hash);
kfree(ctx);
return NULL;
@@ -1346,14 +1378,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
return false;
}
-static void __io_commit_cqring(struct io_ring_ctx *ctx)
-{
- struct io_rings *rings = ctx->rings;
-
- /* order cqe stores with ring update */
- smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
-}
-
static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
{
if (req->work.identity == &tctx->__identity)
@@ -1367,22 +1391,14 @@ static void io_req_clean_work(struct io_kiocb *req)
if (!(req->flags & REQ_F_WORK_INITIALIZED))
return;
- req->flags &= ~REQ_F_WORK_INITIALIZED;
-
- if (req->work.flags & IO_WQ_WORK_MM) {
+ if (req->work.flags & IO_WQ_WORK_MM)
mmdrop(req->work.identity->mm);
- req->work.flags &= ~IO_WQ_WORK_MM;
- }
#ifdef CONFIG_BLK_CGROUP
- if (req->work.flags & IO_WQ_WORK_BLKCG) {
+ if (req->work.flags & IO_WQ_WORK_BLKCG)
css_put(req->work.identity->blkcg_css);
- req->work.flags &= ~IO_WQ_WORK_BLKCG;
- }
#endif
- if (req->work.flags & IO_WQ_WORK_CREDS) {
+ if (req->work.flags & IO_WQ_WORK_CREDS)
put_cred(req->work.identity->creds);
- req->work.flags &= ~IO_WQ_WORK_CREDS;
- }
if (req->work.flags & IO_WQ_WORK_FS) {
struct fs_struct *fs = req->work.identity->fs;
@@ -1392,9 +1408,27 @@ static void io_req_clean_work(struct io_kiocb *req)
spin_unlock(&req->work.identity->fs->lock);
if (fs)
free_fs_struct(fs);
- req->work.flags &= ~IO_WQ_WORK_FS;
+ }
+ if (req->work.flags & IO_WQ_WORK_FILES) {
+ put_files_struct(req->work.identity->files);
+ put_nsproxy(req->work.identity->nsproxy);
+ }
+ if (req->flags & REQ_F_INFLIGHT) {
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_task *tctx = req->task->io_uring;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->inflight_lock, flags);
+ list_del(&req->inflight_entry);
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ req->flags &= ~REQ_F_INFLIGHT;
+ if (atomic_read(&tctx->in_idle))
+ wake_up(&tctx->wait);
}
+ req->flags &= ~REQ_F_WORK_INITIALIZED;
+ req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS |
+ IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES);
io_put_identity(req->task->io_uring, req);
}
@@ -1443,11 +1477,24 @@ static bool io_identity_cow(struct io_kiocb *req)
return true;
}
+static void io_req_track_inflight(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(req->flags & REQ_F_INFLIGHT)) {
+ io_req_init_async(req);
+ req->flags |= REQ_F_INFLIGHT;
+
+ spin_lock_irq(&ctx->inflight_lock);
+ list_add(&req->inflight_entry, &ctx->inflight_list);
+ spin_unlock_irq(&ctx->inflight_lock);
+ }
+}
+
static bool io_grab_identity(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_identity *id = req->work.identity;
- struct io_ring_ctx *ctx = req->ctx;
if (def->work_flags & IO_WQ_WORK_FSIZE) {
if (id->fsize != rlimit(RLIMIT_FSIZE))
@@ -1503,12 +1550,8 @@ static bool io_grab_identity(struct io_kiocb *req)
return false;
atomic_inc(&id->files->count);
get_nsproxy(id->nsproxy);
- req->flags |= REQ_F_INFLIGHT;
-
- spin_lock_irq(&ctx->inflight_lock);
- list_add(&req->inflight_entry, &ctx->inflight_list);
- spin_unlock_irq(&ctx->inflight_lock);
req->work.flags |= IO_WQ_WORK_FILES;
+ io_req_track_inflight(req);
}
if (!(req->work.flags & IO_WQ_WORK_MM) &&
(def->work_flags & IO_WQ_WORK_MM)) {
@@ -1622,18 +1665,11 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
do {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
- struct io_kiocb *link;
if (req_need_defer(de->req, de->seq))
break;
list_del_init(&de->list);
- /* punt-init is done before queueing for defer */
- link = __io_queue_async_work(de->req);
- if (link) {
- __io_queue_linked_timeout(link);
- /* drop submission reference */
- io_put_req_deferred(link, 1);
- }
+ io_req_task_queue(de->req);
kfree(de);
} while (!list_empty(&ctx->defer_list));
}
@@ -1677,7 +1713,9 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
static void io_commit_cqring(struct io_ring_ctx *ctx)
{
io_flush_timeouts(ctx);
- __io_commit_cqring(ctx);
+
+ /* order cqe stores with ring update */
+ smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
if (unlikely(!list_empty(&ctx->defer_list)))
__io_queue_deferred(ctx);
@@ -1690,21 +1728,25 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
}
+static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
+{
+ return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
+}
+
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
unsigned tail;
- tail = ctx->cached_cq_tail;
/*
* writes to the cq entry need to come after reading head; the
* control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry
*/
- if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
+ if (__io_cqring_events(ctx) == rings->cq_ring_entries)
return NULL;
- ctx->cached_cq_tail++;
+ tail = ctx->cached_cq_tail++;
return &rings->cqes[tail & ctx->cq_mask];
}
@@ -1719,11 +1761,6 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
return io_wq_current_is_worker();
}
-static inline unsigned __io_cqring_events(struct io_ring_ctx *ctx)
-{
- return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
-}
-
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
/* see waitqueue_active() comment */
@@ -1767,12 +1804,13 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct io_kiocb *req, *tmp;
struct io_uring_cqe *cqe;
unsigned long flags;
- bool all_flushed;
+ bool all_flushed, posted;
LIST_HEAD(list);
if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
return false;
+ posted = false;
spin_lock_irqsave(&ctx->completion_lock, flags);
list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
if (!io_match_task(req, tsk, files))
@@ -1792,6 +1830,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
WRITE_ONCE(ctx->rings->cq_overflow,
ctx->cached_cq_overflow);
}
+ posted = true;
}
all_flushed = list_empty(&ctx->cq_overflow_list);
@@ -1801,9 +1840,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
- io_commit_cqring(ctx);
+ if (posted)
+ io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- io_cqring_ev_posted(ctx);
+ if (posted)
+ io_cqring_ev_posted(ctx);
while (!list_empty(&list)) {
req = list_first_entry(&list, struct io_kiocb, compl.list);
@@ -1873,7 +1914,8 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
__io_cqring_fill_event(req, res, 0);
}
-static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
+static inline void io_req_complete_post(struct io_kiocb *req, long res,
+ unsigned int cflags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
@@ -1881,92 +1923,96 @@ static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
spin_lock_irqsave(&ctx->completion_lock, flags);
__io_cqring_fill_event(req, res, cflags);
io_commit_cqring(ctx);
+ /*
+ * If we're the last reference to this request, add to our locked
+ * free_list cache.
+ */
+ if (refcount_dec_and_test(&req->refs)) {
+ struct io_comp_state *cs = &ctx->submit_state.comp;
+
+ io_dismantle_req(req);
+ io_put_task(req->task, 1);
+ list_add(&req->compl.list, &cs->locked_free_list);
+ cs->locked_free_nr++;
+ } else
+ req = NULL;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
-}
-
-static void io_submit_flush_completions(struct io_comp_state *cs)
-{
- struct io_ring_ctx *ctx = cs->ctx;
-
- spin_lock_irq(&ctx->completion_lock);
- while (!list_empty(&cs->list)) {
- struct io_kiocb *req;
-
- req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
- list_del(&req->compl.list);
- __io_cqring_fill_event(req, req->result, req->compl.cflags);
-
- /*
- * io_free_req() doesn't care about completion_lock unless one
- * of these flags is set. REQ_F_WORK_INITIALIZED is in the list
- * because of a potential deadlock with req->work.fs->lock
- */
- if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT
- |REQ_F_WORK_INITIALIZED)) {
- spin_unlock_irq(&ctx->completion_lock);
- io_put_req(req);
- spin_lock_irq(&ctx->completion_lock);
- } else {
- io_put_req(req);
- }
+ if (req) {
+ io_queue_next(req);
+ percpu_ref_put(&ctx->refs);
}
- io_commit_cqring(ctx);
- spin_unlock_irq(&ctx->completion_lock);
-
- io_cqring_ev_posted(ctx);
- cs->nr = 0;
}
-static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
- struct io_comp_state *cs)
+static void io_req_complete_state(struct io_kiocb *req, long res,
+ unsigned int cflags)
{
- if (!cs) {
- io_cqring_add_event(req, res, cflags);
- io_put_req(req);
- } else {
- io_clean_op(req);
- req->result = res;
- req->compl.cflags = cflags;
- list_add_tail(&req->compl.list, &cs->list);
- if (++cs->nr >= 32)
- io_submit_flush_completions(cs);
- }
+ io_clean_op(req);
+ req->result = res;
+ req->compl.cflags = cflags;
+ req->flags |= REQ_F_COMPLETE_INLINE;
}
-static void io_req_complete(struct io_kiocb *req, long res)
+static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
+ long res, unsigned cflags)
{
- __io_req_complete(req, res, 0, NULL);
+ if (issue_flags & IO_URING_F_COMPLETE_DEFER)
+ io_req_complete_state(req, res, cflags);
+ else
+ io_req_complete_post(req, res, cflags);
}
-static inline bool io_is_fallback_req(struct io_kiocb *req)
+static inline void io_req_complete(struct io_kiocb *req, long res)
{
- return req == (struct io_kiocb *)
- ((unsigned long) req->ctx->fallback_req & ~1UL);
+ __io_req_complete(req, 0, res, 0);
}
-static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
+static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
- struct io_kiocb *req;
+ struct io_submit_state *state = &ctx->submit_state;
+ struct io_comp_state *cs = &state->comp;
+ struct io_kiocb *req = NULL;
- req = ctx->fallback_req;
- if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
- return req;
+ /*
+ * If we have more than a batch's worth of requests in our IRQ side
+ * locked cache, grab the lock and move them over to our submission
+ * side cache.
+ */
+ if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
+ spin_lock_irq(&ctx->completion_lock);
+ list_splice_init(&cs->locked_free_list, &cs->free_list);
+ cs->locked_free_nr = 0;
+ spin_unlock_irq(&ctx->completion_lock);
+ }
- return NULL;
+ while (!list_empty(&cs->free_list)) {
+ req = list_first_entry(&cs->free_list, struct io_kiocb,
+ compl.list);
+ list_del(&req->compl.list);
+ state->reqs[state->free_reqs++] = req;
+ if (state->free_reqs == ARRAY_SIZE(state->reqs))
+ break;
+ }
+
+ return req != NULL;
}
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
- struct io_submit_state *state)
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
{
+ struct io_submit_state *state = &ctx->submit_state;
+
+ BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
+
if (!state->free_reqs) {
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
- size_t sz;
int ret;
- sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
- ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+ if (io_flush_cached_reqs(ctx))
+ goto got_req;
+
+ ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
+ state->reqs);
/*
* Bulk alloc is all-or-nothing. If we fail to get a batch,
@@ -1975,16 +2021,14 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
if (unlikely(ret <= 0)) {
state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
if (!state->reqs[0])
- goto fallback;
+ return NULL;
ret = 1;
}
state->free_reqs = ret;
}
-
+got_req:
state->free_reqs--;
return state->reqs[state->free_reqs];
-fallback:
- return io_get_fallback_req(ctx);
}
static inline void io_put_file(struct io_kiocb *req, struct file *file,
@@ -2002,27 +2046,29 @@ static void io_dismantle_req(struct io_kiocb *req)
kfree(req->async_data);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
- if (req->fixed_file_refs)
- percpu_ref_put(req->fixed_file_refs);
+ if (req->fixed_rsrc_refs)
+ percpu_ref_put(req->fixed_rsrc_refs);
io_req_clean_work(req);
}
+static inline void io_put_task(struct task_struct *task, int nr)
+{
+ struct io_uring_task *tctx = task->io_uring;
+
+ percpu_counter_sub(&tctx->inflight, nr);
+ if (unlikely(atomic_read(&tctx->in_idle)))
+ wake_up(&tctx->wait);
+ put_task_struct_many(task, nr);
+}
+
static void __io_free_req(struct io_kiocb *req)
{
- struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
io_dismantle_req(req);
+ io_put_task(req->task, 1);
- percpu_counter_dec(&tctx->inflight);
- if (atomic_read(&tctx->in_idle))
- wake_up(&tctx->wait);
- put_task_struct(req->task);
-
- if (likely(!io_is_fallback_req(req)))
- kmem_cache_free(req_cachep, req);
- else
- clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
+ kmem_cache_free(req_cachep, req);
percpu_ref_put(&ctx->refs);
}
@@ -2128,11 +2174,105 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
- if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT)))
+ if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
return NULL;
return __io_req_find_next(req);
}
+static bool __tctx_task_work(struct io_uring_task *tctx)
+{
+ struct io_ring_ctx *ctx = NULL;
+ struct io_wq_work_list list;
+ struct io_wq_work_node *node;
+
+ if (wq_list_empty(&tctx->task_list))
+ return false;
+
+ spin_lock_irq(&tctx->task_lock);
+ list = tctx->task_list;
+ INIT_WQ_LIST(&tctx->task_list);
+ spin_unlock_irq(&tctx->task_lock);
+
+ node = list.first;
+ while (node) {
+ struct io_wq_work_node *next = node->next;
+ struct io_ring_ctx *this_ctx;
+ struct io_kiocb *req;
+
+ req = container_of(node, struct io_kiocb, io_task_work.node);
+ this_ctx = req->ctx;
+ req->task_work.func(&req->task_work);
+ node = next;
+
+ if (!ctx) {
+ ctx = this_ctx;
+ } else if (ctx != this_ctx) {
+ mutex_lock(&ctx->uring_lock);
+ io_submit_flush_completions(&ctx->submit_state.comp, ctx);
+ mutex_unlock(&ctx->uring_lock);
+ ctx = this_ctx;
+ }
+ }
+
+ if (ctx && ctx->submit_state.comp.nr) {
+ mutex_lock(&ctx->uring_lock);
+ io_submit_flush_completions(&ctx->submit_state.comp, ctx);
+ mutex_unlock(&ctx->uring_lock);
+ }
+
+ return list.first != NULL;
+}
+
+static void tctx_task_work(struct callback_head *cb)
+{
+ struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
+
+ while (__tctx_task_work(tctx))
+ cond_resched();
+
+ clear_bit(0, &tctx->task_state);
+}
+
+static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
+ enum task_work_notify_mode notify)
+{
+ struct io_uring_task *tctx = tsk->io_uring;
+ struct io_wq_work_node *node, *prev;
+ unsigned long flags;
+ int ret;
+
+ WARN_ON_ONCE(!tctx);
+
+ spin_lock_irqsave(&tctx->task_lock, flags);
+ wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+ spin_unlock_irqrestore(&tctx->task_lock, flags);
+
+ /* task_work already pending, we're done */
+ if (test_bit(0, &tctx->task_state) ||
+ test_and_set_bit(0, &tctx->task_state))
+ return 0;
+
+ if (!task_work_add(tsk, &tctx->task_work, notify))
+ return 0;
+
+ /*
+ * Slow path - we failed, find and delete work. if the work is not
+ * in the list, it got run and we're fine.
+ */
+ ret = 0;
+ spin_lock_irqsave(&tctx->task_lock, flags);
+ wq_list_for_each(node, prev, &tctx->task_list) {
+ if (&req->io_task_work.node == node) {
+ wq_list_del(&tctx->task_list, node, prev);
+ ret = 1;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&tctx->task_lock, flags);
+ clear_bit(0, &tctx->task_state);
+ return ret;
+}
+
static int io_req_task_work_add(struct io_kiocb *req)
{
struct task_struct *tsk = req->task;
@@ -2153,13 +2293,23 @@ static int io_req_task_work_add(struct io_kiocb *req)
if (!(ctx->flags & IORING_SETUP_SQPOLL))
notify = TWA_SIGNAL;
- ret = task_work_add(tsk, &req->task_work, notify);
+ ret = io_task_work_add(tsk, req, notify);
if (!ret)
wake_up_process(tsk);
return ret;
}
+static void io_req_task_work_add_fallback(struct io_kiocb *req,
+ task_work_func_t cb)
+{
+ struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
+
+ init_task_work(&req->task_work, cb);
+ task_work_add(tsk, &req->task_work, TWA_NONE);
+ wake_up_process(tsk);
+}
+
static void __io_req_task_cancel(struct io_kiocb *req, int error)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2187,40 +2337,35 @@ static void __io_req_task_submit(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
mutex_lock(&ctx->uring_lock);
- if (!ctx->sqo_dead &&
- !__io_sq_thread_acquire_mm(ctx) &&
- !__io_sq_thread_acquire_files(ctx))
- __io_queue_sqe(req, NULL);
+ if (!ctx->sqo_dead && !(current->flags & PF_EXITING) &&
+ !io_sq_thread_acquire_mm_files(ctx, req))
+ __io_queue_sqe(req);
else
__io_req_task_cancel(req, -EFAULT);
mutex_unlock(&ctx->uring_lock);
+
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ io_sq_thread_drop_mm_files();
}
static void io_req_task_submit(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- struct io_ring_ctx *ctx = req->ctx;
__io_req_task_submit(req);
- percpu_ref_put(&ctx->refs);
}
static void io_req_task_queue(struct io_kiocb *req)
{
int ret;
- init_task_work(&req->task_work, io_req_task_submit);
- percpu_ref_get(&req->ctx->refs);
-
+ req->task_work.func = io_req_task_submit;
ret = io_req_task_work_add(req);
if (unlikely(ret)) {
- struct task_struct *tsk;
-
- init_task_work(&req->task_work, io_req_task_cancel);
- tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, TWA_NONE);
- wake_up_process(tsk);
+ percpu_ref_get(&req->ctx->refs);
+ io_req_task_work_add_fallback(req, io_req_task_cancel);
}
}
@@ -2239,66 +2384,75 @@ static void io_free_req(struct io_kiocb *req)
}
struct req_batch {
- void *reqs[IO_IOPOLL_BATCH];
- int to_free;
-
struct task_struct *task;
int task_refs;
+ int ctx_refs;
};
static inline void io_init_req_batch(struct req_batch *rb)
{
- rb->to_free = 0;
rb->task_refs = 0;
+ rb->ctx_refs = 0;
rb->task = NULL;
}
-static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
- struct req_batch *rb)
-{
- kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
- percpu_ref_put_many(&ctx->refs, rb->to_free);
- rb->to_free = 0;
-}
-
static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
struct req_batch *rb)
{
- if (rb->to_free)
- __io_req_free_batch_flush(ctx, rb);
- if (rb->task) {
- struct io_uring_task *tctx = rb->task->io_uring;
-
- percpu_counter_sub(&tctx->inflight, rb->task_refs);
- put_task_struct_many(rb->task, rb->task_refs);
- rb->task = NULL;
- }
+ if (rb->task)
+ io_put_task(rb->task, rb->task_refs);
+ if (rb->ctx_refs)
+ percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
}
-static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
+static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
+ struct io_submit_state *state)
{
- if (unlikely(io_is_fallback_req(req))) {
- io_free_req(req);
- return;
- }
io_queue_next(req);
if (req->task != rb->task) {
- if (rb->task) {
- struct io_uring_task *tctx = rb->task->io_uring;
-
- percpu_counter_sub(&tctx->inflight, rb->task_refs);
- put_task_struct_many(rb->task, rb->task_refs);
- }
+ if (rb->task)
+ io_put_task(rb->task, rb->task_refs);
rb->task = req->task;
rb->task_refs = 0;
}
rb->task_refs++;
+ rb->ctx_refs++;
io_dismantle_req(req);
- rb->reqs[rb->to_free++] = req;
- if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
- __io_req_free_batch_flush(req->ctx, rb);
+ if (state->free_reqs != ARRAY_SIZE(state->reqs))
+ state->reqs[state->free_reqs++] = req;
+ else
+ list_add(&req->compl.list, &state->comp.free_list);
+}
+
+static void io_submit_flush_completions(struct io_comp_state *cs,
+ struct io_ring_ctx *ctx)
+{
+ int i, nr = cs->nr;
+ struct io_kiocb *req;
+ struct req_batch rb;
+
+ io_init_req_batch(&rb);
+ spin_lock_irq(&ctx->completion_lock);
+ for (i = 0; i < nr; i++) {
+ req = cs->reqs[i];
+ __io_cqring_fill_event(req, req->result, req->compl.cflags);
+ }
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
+ for (i = 0; i < nr; i++) {
+ req = cs->reqs[i];
+
+ /* submission and completion refs */
+ if (refcount_sub_and_test(2, &req->refs))
+ io_req_free_batch(&rb, req, &ctx->submit_state);
+ }
+
+ io_req_free_batch_finish(ctx, &rb);
+ cs->nr = 0;
}
/*
@@ -2333,15 +2487,10 @@ static void io_free_req_deferred(struct io_kiocb *req)
{
int ret;
- init_task_work(&req->task_work, io_put_req_deferred_cb);
+ req->task_work.func = io_put_req_deferred_cb;
ret = io_req_task_work_add(req);
- if (unlikely(ret)) {
- struct task_struct *tsk;
-
- tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, TWA_NONE);
- wake_up_process(tsk);
- }
+ if (unlikely(ret))
+ io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
}
static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
@@ -2350,22 +2499,6 @@ static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
io_free_req_deferred(req);
}
-static struct io_wq_work *io_steal_work(struct io_kiocb *req)
-{
- struct io_kiocb *nxt;
-
- /*
- * A ref is owned by io-wq in which context we're. So, if that's the
- * last one, it's safe to steal next work. False negatives are Ok,
- * it just will be re-punted async in io_put_work()
- */
- if (refcount_read(&req->refs) != 1)
- return NULL;
-
- nxt = io_req_find_next(req);
- return nxt ? &nxt->work : NULL;
-}
-
static void io_double_put_req(struct io_kiocb *req)
{
/* drop both submit and complete references */
@@ -2424,17 +2557,6 @@ static inline bool io_run_task_work(void)
return false;
}
-static void io_iopoll_queue(struct list_head *again)
-{
- struct io_kiocb *req;
-
- do {
- req = list_first_entry(again, struct io_kiocb, inflight_entry);
- list_del(&req->inflight_entry);
- __io_complete_rw(req, -EAGAIN, 0, NULL);
- } while (!list_empty(again));
-}
-
/*
* Find and free completed poll iocbs
*/
@@ -2443,7 +2565,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
{
struct req_batch rb;
struct io_kiocb *req;
- LIST_HEAD(again);
/* order with ->result store in io_complete_rw_iopoll() */
smp_rmb();
@@ -2453,13 +2574,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
int cflags = 0;
req = list_first_entry(done, struct io_kiocb, inflight_entry);
+ list_del(&req->inflight_entry);
+
if (READ_ONCE(req->result) == -EAGAIN) {
- req->result = 0;
req->iopoll_completed = 0;
- list_move_tail(&req->inflight_entry, &again);
- continue;
+ if (io_rw_reissue(req))
+ continue;
}
- list_del(&req->inflight_entry);
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_rw_kbuf(req);
@@ -2468,15 +2589,12 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
(*nr_events)++;
if (refcount_dec_and_test(&req->refs))
- io_req_free_batch(&rb, req);
+ io_req_free_batch(&rb, req, &ctx->submit_state);
}
io_commit_cqring(ctx);
io_cqring_ev_posted_iopoll(ctx);
io_req_free_batch_finish(ctx, &rb);
-
- if (!list_empty(&again))
- io_iopoll_queue(&again);
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
@@ -2643,34 +2761,16 @@ static void kiocb_end_write(struct io_kiocb *req)
file_end_write(req->file);
}
-static void io_complete_rw_common(struct kiocb *kiocb, long res,
- struct io_comp_state *cs)
-{
- struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
- int cflags = 0;
-
- if (kiocb->ki_flags & IOCB_WRITE)
- kiocb_end_write(req);
-
- if (res != req->result)
- req_set_fail_links(req);
- if (req->flags & REQ_F_BUFFER_SELECTED)
- cflags = io_put_rw_kbuf(req);
- __io_req_complete(req, res, cflags, cs);
-}
-
#ifdef CONFIG_BLOCK
-static bool io_resubmit_prep(struct io_kiocb *req, int error)
+static bool io_resubmit_prep(struct io_kiocb *req)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
- ssize_t ret = -ECANCELED;
+ int rw, ret;
struct iov_iter iter;
- int rw;
- if (error) {
- ret = error;
- goto end_req;
- }
+ /* already prepared */
+ if (req->async_data)
+ return true;
switch (req->opcode) {
case IORING_OP_READV:
@@ -2686,27 +2786,17 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
default:
printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
req->opcode);
- goto end_req;
+ return false;
}
- if (!req->async_data) {
- ret = io_import_iovec(rw, req, &iovec, &iter, false);
- if (ret < 0)
- goto end_req;
- ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
- if (!ret)
- return true;
- kfree(iovec);
- } else {
- return true;
- }
-end_req:
- req_set_fail_links(req);
- return false;
+ ret = io_import_iovec(rw, req, &iovec, &iter, false);
+ if (ret < 0)
+ return false;
+ return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
}
#endif
-static bool io_rw_reissue(struct io_kiocb *req, long res)
+static bool io_rw_reissue(struct io_kiocb *req)
{
#ifdef CONFIG_BLOCK
umode_t mode = file_inode(req->file)->i_mode;
@@ -2714,35 +2804,45 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
if (!S_ISBLK(mode) && !S_ISREG(mode))
return false;
- if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
+ if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
return false;
lockdep_assert_held(&req->ctx->uring_lock);
ret = io_sq_thread_acquire_mm_files(req->ctx, req);
- if (io_resubmit_prep(req, ret)) {
+ if (!ret && io_resubmit_prep(req)) {
refcount_inc(&req->refs);
io_queue_async_work(req);
return true;
}
-
+ req_set_fail_links(req);
#endif
return false;
}
static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
- struct io_comp_state *cs)
+ unsigned int issue_flags)
{
- if (!io_rw_reissue(req, res))
- io_complete_rw_common(&req->rw.kiocb, res, cs);
+ int cflags = 0;
+
+ if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
+ return;
+ if (res != req->result)
+ req_set_fail_links(req);
+
+ if (req->rw.kiocb.ki_flags & IOCB_WRITE)
+ kiocb_end_write(req);
+ if (req->flags & REQ_F_BUFFER_SELECTED)
+ cflags = io_put_rw_kbuf(req);
+ __io_req_complete(req, issue_flags, res, cflags);
}
static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
- __io_complete_rw(req, res, res2, NULL);
+ __io_complete_rw(req, res, res2, 0);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -2806,16 +2906,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
wake_up(&ctx->sq_data->wait);
}
-static inline void __io_state_file_put(struct io_submit_state *state)
-{
- fput_many(state->file, state->file_refs);
- state->file_refs = 0;
-}
-
static inline void io_state_file_put(struct io_submit_state *state)
{
- if (state->file_refs)
- __io_state_file_put(state);
+ if (state->file_refs) {
+ fput_many(state->file, state->file_refs);
+ state->file_refs = 0;
+ }
}
/*
@@ -2833,7 +2929,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
state->file_refs--;
return state->file;
}
- __io_state_file_put(state);
+ io_state_file_put(state);
}
state->file = fget_many(fd, state->ios_left);
if (unlikely(!state->file))
@@ -2891,16 +2987,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw.kiocb;
+ struct file *file = req->file;
unsigned ioprio;
int ret;
- if (S_ISREG(file_inode(req->file)->i_mode))
+ if (S_ISREG(file_inode(file)->i_mode))
req->flags |= REQ_F_ISREG;
kiocb->ki_pos = READ_ONCE(sqe->off);
- if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
+ if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = req->file->f_pos;
+ kiocb->ki_pos = file->f_pos;
}
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
@@ -2908,6 +3005,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(ret))
return ret;
+ /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
+ if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
+ req->flags |= REQ_F_NOWAIT;
+
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
ret = ioprio_check_cap(ioprio);
@@ -2918,10 +3019,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
} else
kiocb->ki_ioprio = get_current_ioprio();
- /* don't allow async punt if RWF_NOWAIT was requested */
- if (kiocb->ki_flags & IOCB_NOWAIT)
- req->flags |= REQ_F_NOWAIT;
-
if (ctx->flags & IORING_SETUP_IOPOLL) {
if (!(kiocb->ki_flags & IOCB_DIRECT) ||
!kiocb->ki_filp->f_op->iopoll)
@@ -2964,7 +3061,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
- struct io_comp_state *cs)
+ unsigned int issue_flags)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
struct io_async_rw *io = req->async_data;
@@ -2980,13 +3077,12 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
- __io_complete_rw(req, ret, 0, cs);
+ __io_complete_rw(req, ret, 0, issue_flags);
else
io_rw_done(kiocb, ret);
}
-static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
- struct iov_iter *iter)
+static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
{
struct io_ring_ctx *ctx = req->ctx;
size_t len = req->rw.len;
@@ -3050,7 +3146,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
}
}
- return len;
+ return 0;
}
static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
@@ -3191,16 +3287,14 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, needs_lock);
}
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
- struct iovec **iovec, struct iov_iter *iter,
- bool needs_lock)
+static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
+ struct iov_iter *iter, bool needs_lock)
{
void __user *buf = u64_to_user_ptr(req->rw.addr);
size_t sqe_len = req->rw.len;
+ u8 opcode = req->opcode;
ssize_t ret;
- u8 opcode;
- opcode = req->opcode;
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
*iovec = NULL;
return io_import_fixed(req, rw, iter);
@@ -3225,10 +3319,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
if (req->flags & REQ_F_BUFFER_SELECT) {
ret = io_iov_buffer_select(req, *iovec, needs_lock);
- if (!ret) {
- ret = (*iovec)->iov_len;
- iov_iter_init(iter, rw, *iovec, 1, ret);
- }
+ if (!ret)
+ iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
*iovec = NULL;
return ret;
}
@@ -3346,8 +3438,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
if (!force && !io_op_defs[req->opcode].needs_async_data)
return 0;
if (!req->async_data) {
- if (__io_alloc_async_data(req))
+ if (__io_alloc_async_data(req)) {
+ kfree(iovec);
return -ENOMEM;
+ }
io_req_map_rw(req, iovec, fast_iov, iter);
}
@@ -3358,7 +3452,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
struct io_async_rw *iorw = req->async_data;
struct iovec *iov = iorw->fast_iov;
- ssize_t ret;
+ int ret;
ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
if (unlikely(ret < 0))
@@ -3404,7 +3498,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
struct wait_page_queue *wpq;
struct io_kiocb *req = wait->private;
struct wait_page_key *key = arg;
- int ret;
wpq = container_of(wait, struct wait_page_queue, wait);
@@ -3414,21 +3507,9 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
list_del_init(&wait->entry);
- init_task_work(&req->task_work, io_req_task_submit);
- percpu_ref_get(&req->ctx->refs);
-
/* submit ref gets dropped, acquire a new one */
refcount_inc(&req->refs);
- ret = io_req_task_work_add(req);
- if (unlikely(ret)) {
- struct task_struct *tsk;
-
- /* queue just for cancelation */
- init_task_work(&req->task_work, io_req_task_cancel);
- tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, TWA_NONE);
- wake_up_process(tsk);
- }
+ io_req_task_queue(req);
return 1;
}
@@ -3485,15 +3566,14 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
return -EINVAL;
}
-static int io_read(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
- bool no_async;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
if (rw) {
iter = &rw->iter;
@@ -3505,7 +3585,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
}
io_size = iov_iter_count(iter);
req->result = io_size;
- ret = 0;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
@@ -3513,88 +3592,72 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
else
kiocb->ki_flags |= IOCB_NOWAIT;
-
/* If the file doesn't support async, just async punt */
- no_async = force_nonblock && !io_file_supports_async(req->file, READ);
- if (no_async)
- goto copy_iov;
+ if (force_nonblock && !io_file_supports_async(req->file, READ)) {
+ ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+ return ret ?: -EAGAIN;
+ }
ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
- if (unlikely(ret))
- goto out_free;
+ if (unlikely(ret)) {
+ kfree(iovec);
+ return ret;
+ }
ret = io_iter_do_read(req, iter);
- if (!ret) {
- goto done;
- } else if (ret == -EIOCBQUEUED) {
- ret = 0;
- goto out_free;
+ if (ret == -EIOCBQUEUED) {
+ /* it's faster to check here than to delegate to kfree */
+ if (iovec)
+ kfree(iovec);
+ return 0;
} else if (ret == -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
goto done;
- /* no retry on NONBLOCK marked file */
- if (req->file->f_flags & O_NONBLOCK)
+ /* no retry on NONBLOCK nor RWF_NOWAIT */
+ if (req->flags & REQ_F_NOWAIT)
goto done;
/* some cases will consume bytes even on error returns */
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
- goto copy_iov;
- } else if (ret < 0) {
- /* make sure -ERESTARTSYS -> -EINTR is done */
+ } else if (ret <= 0 || ret == io_size || !force_nonblock ||
+ (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
+ /* read all, failed, already did sync or don't want to retry */
goto done;
}
- /* read it all, or we did blocking attempt. no retry. */
- if (!iov_iter_count(iter) || !force_nonblock ||
- (req->file->f_flags & O_NONBLOCK))
- goto done;
-
- io_size -= ret;
-copy_iov:
ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
- if (ret2) {
- ret = ret2;
- goto out_free;
- }
- if (no_async)
- return -EAGAIN;
+ if (ret2)
+ return ret2;
+
rw = req->async_data;
- /* it's copied and will be cleaned with ->io */
- iovec = NULL;
/* now use our persistent iterator, if we aren't already */
iter = &rw->iter;
-retry:
- rw->bytes_done += ret;
- /* if we can retry, do so with the callbacks armed */
- if (!io_rw_should_retry(req)) {
- kiocb->ki_flags &= ~IOCB_WAITQ;
- return -EAGAIN;
- }
- /*
- * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
- * get -EIOCBQUEUED, then we'll get a notification when the desired
- * page gets unlocked. We can also get a partial read here, and if we
- * do, then just retry at the new offset.
- */
- ret = io_iter_do_read(req, iter);
- if (ret == -EIOCBQUEUED) {
- ret = 0;
- goto out_free;
- } else if (ret > 0 && ret < io_size) {
+ do {
+ io_size -= ret;
+ rw->bytes_done += ret;
+ /* if we can retry, do so with the callbacks armed */
+ if (!io_rw_should_retry(req)) {
+ kiocb->ki_flags &= ~IOCB_WAITQ;
+ return -EAGAIN;
+ }
+
+ /*
+ * Now retry read with the IOCB_WAITQ parts set in the iocb. If
+ * we get -EIOCBQUEUED, then we'll get a notification when the
+ * desired page gets unlocked. We can also get a partial read
+ * here, and if we do, then just retry at the new offset.
+ */
+ ret = io_iter_do_read(req, iter);
+ if (ret == -EIOCBQUEUED)
+ return 0;
/* we got some bytes, but not all. retry. */
- goto retry;
- }
+ } while (ret > 0 && ret < io_size);
done:
- kiocb_done(kiocb, ret, cs);
- ret = 0;
-out_free:
- /* it's reportedly faster than delegating the null check to kfree() */
- if (iovec)
- kfree(iovec);
- return ret;
+ kiocb_done(kiocb, ret, issue_flags);
+ return 0;
}
static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -3614,14 +3677,14 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_rw_prep_async(req, WRITE);
}
-static int io_write(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
ssize_t ret, ret2, io_size;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
if (rw) {
iter = &rw->iter;
@@ -3680,22 +3743,21 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
*/
if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
ret2 = -EAGAIN;
- /* no retry on NONBLOCK marked file */
- if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK))
+ /* no retry on NONBLOCK nor RWF_NOWAIT */
+ if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
goto copy_iov;
done:
- kiocb_done(kiocb, ret2, cs);
+ kiocb_done(kiocb, ret2, issue_flags);
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
- if (!ret)
- return -EAGAIN;
+ return ret ?: -EAGAIN;
}
out_free:
/* it's reportedly faster than delegating the null check to kfree() */
@@ -3733,12 +3795,12 @@ static int io_renameat_prep(struct io_kiocb *req,
return 0;
}
-static int io_renameat(struct io_kiocb *req, bool force_nonblock)
+static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rename *ren = &req->rename;
int ret;
- if (force_nonblock)
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
@@ -3775,12 +3837,12 @@ static int io_unlinkat_prep(struct io_kiocb *req,
return 0;
}
-static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
+static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_unlink *un = &req->unlink;
int ret;
- if (force_nonblock)
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
if (un->flags & AT_REMOVEDIR)
@@ -3812,13 +3874,13 @@ static int io_shutdown_prep(struct io_kiocb *req,
#endif
}
-static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
+static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_NET)
struct socket *sock;
int ret;
- if (force_nonblock)
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
sock = sock_from_file(req->file);
@@ -3877,7 +3939,7 @@ static int io_tee_prep(struct io_kiocb *req,
return __io_splice_prep(req, sqe);
}
-static int io_tee(struct io_kiocb *req, bool force_nonblock)
+static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = &req->splice;
struct file *in = sp->file_in;
@@ -3885,7 +3947,7 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock)
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
long ret = 0;
- if (force_nonblock)
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
if (sp->len)
ret = do_tee(in, out, sp->len, flags);
@@ -3908,7 +3970,7 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_splice_prep(req, sqe);
}
-static int io_splice(struct io_kiocb *req, bool force_nonblock)
+static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = &req->splice;
struct file *in = sp->file_in;
@@ -3917,7 +3979,7 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
loff_t *poff_in, *poff_out;
long ret = 0;
- if (force_nonblock)
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
@@ -3938,14 +4000,14 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
-static int io_nop(struct io_kiocb *req, struct io_comp_state *cs)
+static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- __io_req_complete(req, 0, 0, cs);
+ __io_req_complete(req, issue_flags, 0, 0);
return 0;
}
@@ -3970,13 +4032,13 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_fsync(struct io_kiocb *req, bool force_nonblock)
+static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
{
loff_t end = req->sync.off + req->sync.len;
int ret;
/* fsync always requires a blocking context */
- if (force_nonblock)
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = vfs_fsync_range(req->file, req->sync.off,
@@ -4002,12 +4064,12 @@ static int io_fallocate_prep(struct io_kiocb *req,
return 0;
}
-static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
+static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
{
int ret;
/* fallocate always requiring blocking context */
- if (force_nonblock)
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
@@ -4040,7 +4102,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return ret;
}
req->open.nofile = rlimit(RLIMIT_NOFILE);
- req->open.ignore_nonblock = false;
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -4078,43 +4139,53 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_openat_prep(req, sqe);
}
-static int io_openat2(struct io_kiocb *req, bool force_nonblock)
+static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
struct open_flags op;
struct file *file;
+ bool nonblock_set;
+ bool resolve_nonblock;
int ret;
- if (force_nonblock && !req->open.ignore_nonblock)
- return -EAGAIN;
-
ret = build_open_flags(&req->open.how, &op);
if (ret)
goto err;
+ nonblock_set = op.open_flag & O_NONBLOCK;
+ resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
+ if (issue_flags & IO_URING_F_NONBLOCK) {
+ /*
+ * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
+ * it'll always -EAGAIN
+ */
+ if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
+ return -EAGAIN;
+ op.lookup_flags |= LOOKUP_CACHED;
+ op.open_flag |= O_NONBLOCK;
+ }
ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
if (ret < 0)
goto err;
file = do_filp_open(req->open.dfd, req->open.filename, &op);
+ /* only retry if RESOLVE_CACHED wasn't already set by application */
+ if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
+ file == ERR_PTR(-EAGAIN)) {
+ /*
+ * We could hang on to this 'fd', but seems like marginal
+ * gain for something that is now known to be a slower path.
+ * So just put it, and we'll get a new one when we retry.
+ */
+ put_unused_fd(ret);
+ return -EAGAIN;
+ }
+
if (IS_ERR(file)) {
put_unused_fd(ret);
ret = PTR_ERR(file);
- /*
- * A work-around to ensure that /proc/self works that way
- * that it should - if we get -EOPNOTSUPP back, then assume
- * that proc_self_get_link() failed us because we're in async
- * context. We should be safe to retry this from the task
- * itself with force_nonblock == false set, as it should not
- * block on lookup. Would be nice to know this upfront and
- * avoid the async dance, but doesn't seem feasible.
- */
- if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
- req->open.ignore_nonblock = true;
- refcount_inc(&req->refs);
- io_req_task_queue(req);
- return 0;
- }
} else {
+ if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
+ file->f_flags &= ~O_NONBLOCK;
fsnotify_open(file);
fd_install(ret, file);
}
@@ -4127,9 +4198,9 @@ err:
return 0;
}
-static int io_openat(struct io_kiocb *req, bool force_nonblock)
+static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
- return io_openat2(req, force_nonblock);
+ return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK);
}
static int io_remove_buffers_prep(struct io_kiocb *req,
@@ -4177,13 +4248,13 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
return i;
}
-static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head;
int ret = 0;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
io_ring_submit_lock(ctx, !force_nonblock);
@@ -4198,11 +4269,11 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
/* need to hold the lock to complete IOPOLL requests */
if (ctx->flags & IORING_SETUP_IOPOLL) {
- __io_req_complete(req, ret, 0, cs);
+ __io_req_complete(req, issue_flags, ret, 0);
io_ring_submit_unlock(ctx, !force_nonblock);
} else {
io_ring_submit_unlock(ctx, !force_nonblock);
- __io_req_complete(req, ret, 0, cs);
+ __io_req_complete(req, issue_flags, ret, 0);
}
return 0;
}
@@ -4261,13 +4332,13 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
return i ? i : -ENOMEM;
}
-static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer *head, *list;
int ret = 0;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
io_ring_submit_lock(ctx, !force_nonblock);
@@ -4293,11 +4364,11 @@ out:
/* need to hold the lock to complete IOPOLL requests */
if (ctx->flags & IORING_SETUP_IOPOLL) {
- __io_req_complete(req, ret, 0, cs);
+ __io_req_complete(req, issue_flags, ret, 0);
io_ring_submit_unlock(ctx, !force_nonblock);
} else {
io_ring_submit_unlock(ctx, !force_nonblock);
- __io_req_complete(req, ret, 0, cs);
+ __io_req_complete(req, issue_flags, ret, 0);
}
return 0;
}
@@ -4329,12 +4400,12 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#endif
}
-static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_EPOLL)
struct io_epoll *ie = &req->epoll;
int ret;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
if (force_nonblock && ret == -EAGAIN)
@@ -4342,7 +4413,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
#else
return -EOPNOTSUPP;
@@ -4366,13 +4437,13 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#endif
}
-static int io_madvise(struct io_kiocb *req, bool force_nonblock)
+static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = &req->madvise;
int ret;
- if (force_nonblock)
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
@@ -4398,12 +4469,12 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
+static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_fadvise *fa = &req->fadvise;
int ret;
- if (force_nonblock) {
+ if (issue_flags & IO_URING_F_NONBLOCK) {
switch (fa->advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
@@ -4439,12 +4510,12 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_statx(struct io_kiocb *req, bool force_nonblock)
+static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_statx *ctx = &req->statx;
int ret;
- if (force_nonblock) {
+ if (issue_flags & IO_URING_F_NONBLOCK) {
/* only need file table for an actual valid fd */
if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
req->flags |= REQ_F_NO_FILE_TABLE;
@@ -4462,14 +4533,6 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- /*
- * If we queue this for async, it must not be cancellable. That would
- * leave the 'file' in an undeterminate state, and here need to modify
- * io_wq_work.flags, so initialize io_wq_work firstly.
- */
- io_req_init_async(req);
- req->work.flags |= IO_WQ_WORK_NO_CANCEL;
-
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
@@ -4479,42 +4542,59 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
- if ((req->file && req->file->f_op == &io_uring_fops))
- return -EBADF;
-
- req->close.put_file = NULL;
return 0;
}
-static int io_close(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_close(struct io_kiocb *req, unsigned int issue_flags)
{
+ struct files_struct *files = current->files;
struct io_close *close = &req->close;
+ struct fdtable *fdt;
+ struct file *file;
int ret;
- /* might be already done during nonblock submission */
- if (!close->put_file) {
- ret = close_fd_get_file(close->fd, &close->put_file);
- if (ret < 0)
- return (ret == -ENOENT) ? -EBADF : ret;
+ file = NULL;
+ ret = -EBADF;
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ if (close->fd >= fdt->max_fds) {
+ spin_unlock(&files->file_lock);
+ goto err;
+ }
+ file = fdt->fd[close->fd];
+ if (!file) {
+ spin_unlock(&files->file_lock);
+ goto err;
+ }
+
+ if (file->f_op == &io_uring_fops) {
+ spin_unlock(&files->file_lock);
+ file = NULL;
+ goto err;
}
/* if the file has a flush method, be safe and punt to async */
- if (close->put_file->f_op->flush && force_nonblock) {
- /* was never set, but play safe */
- req->flags &= ~REQ_F_NOWAIT;
- /* avoid grabbing files - we don't need the files */
- req->flags |= REQ_F_NO_FILE_TABLE;
+ if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
+ spin_unlock(&files->file_lock);
return -EAGAIN;
}
+ ret = __close_fd_get_file(close->fd, &file);
+ spin_unlock(&files->file_lock);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ ret = -EBADF;
+ goto err;
+ }
+
/* No ->flush() or already async, safely close from here */
- ret = filp_close(close->put_file, req->work.identity->files);
+ ret = filp_close(file, current->files);
+err:
if (ret < 0)
req_set_fail_links(req);
- fput(close->put_file);
- close->put_file = NULL;
- __io_req_complete(req, ret, 0, cs);
+ if (file)
+ fput(file);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4536,12 +4616,12 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
+static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
{
int ret;
/* sync_file_range always requires a blocking context */
- if (force_nonblock)
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
@@ -4561,23 +4641,27 @@ static int io_setup_async_msg(struct io_kiocb *req,
if (async_msg)
return -EAGAIN;
if (io_alloc_async_data(req)) {
- if (kmsg->iov != kmsg->fast_iov)
- kfree(kmsg->iov);
+ kfree(kmsg->free_iov);
return -ENOMEM;
}
async_msg = req->async_data;
req->flags |= REQ_F_NEED_CLEANUP;
memcpy(async_msg, kmsg, sizeof(*kmsg));
+ async_msg->msg.msg_name = &async_msg->addr;
+ /* if we're using fast_iov, set it to the new one */
+ if (!async_msg->free_iov)
+ async_msg->msg.msg_iter.iov = async_msg->fast_iov;
+
return -EAGAIN;
}
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
struct io_async_msghdr *iomsg)
{
- iomsg->iov = iomsg->fast_iov;
iomsg->msg.msg_name = &iomsg->addr;
+ iomsg->free_iov = iomsg->fast_iov;
return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
- req->sr_msg.msg_flags, &iomsg->iov);
+ req->sr_msg.msg_flags, &iomsg->free_iov);
}
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4606,8 +4690,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ret;
}
-static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
struct socket *sock;
@@ -4618,14 +4701,8 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
if (unlikely(!sock))
return -ENOTSOCK;
- if (req->async_data) {
- kmsg = req->async_data;
- kmsg->msg.msg_name = &kmsg->addr;
- /* if iov is set, it's allocated already */
- if (!kmsg->iov)
- kmsg->iov = kmsg->fast_iov;
- kmsg->msg.msg_iter.iov = kmsg->iov;
- } else {
+ kmsg = req->async_data;
+ if (!kmsg) {
ret = io_sendmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4635,26 +4712,26 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
+ else if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
- if (force_nonblock && ret == -EAGAIN)
+ if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
- if (kmsg->iov != kmsg->fast_iov)
- kfree(kmsg->iov);
+ /* fast path, check for non-NULL to avoid function call */
+ if (kmsg->free_iov)
+ kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
}
-static int io_send(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = &req->sr_msg;
struct msghdr msg;
@@ -4679,19 +4756,19 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
+ else if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
msg.msg_flags = flags;
ret = sock_sendmsg(sock, &msg);
- if (force_nonblock && ret == -EAGAIN)
+ if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4711,15 +4788,14 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
if (req->flags & REQ_F_BUFFER_SELECT) {
if (iov_len > 1)
return -EINVAL;
- if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov)))
+ if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
return -EFAULT;
- sr->len = iomsg->iov[0].iov_len;
- iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1,
- sr->len);
- iomsg->iov = NULL;
+ sr->len = iomsg->fast_iov[0].iov_len;
+ iomsg->free_iov = NULL;
} else {
+ iomsg->free_iov = iomsg->fast_iov;
ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
- &iomsg->iov, &iomsg->msg.msg_iter,
+ &iomsg->free_iov, &iomsg->msg.msg_iter,
false);
if (ret > 0)
ret = 0;
@@ -4758,11 +4834,11 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
if (clen < 0)
return -EINVAL;
sr->len = clen;
- iomsg->iov[0].iov_len = clen;
- iomsg->iov = NULL;
+ iomsg->free_iov = NULL;
} else {
+ iomsg->free_iov = iomsg->fast_iov;
ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
- UIO_FASTIOV, &iomsg->iov,
+ UIO_FASTIOV, &iomsg->free_iov,
&iomsg->msg.msg_iter, true);
if (ret < 0)
return ret;
@@ -4776,7 +4852,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
struct io_async_msghdr *iomsg)
{
iomsg->msg.msg_name = &iomsg->addr;
- iomsg->iov = iomsg->fast_iov;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
@@ -4834,27 +4909,21 @@ static int io_recvmsg_prep(struct io_kiocb *req,
return ret;
}
-static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
struct socket *sock;
struct io_buffer *kbuf;
unsigned flags;
int ret, cflags = 0;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
- if (req->async_data) {
- kmsg = req->async_data;
- kmsg->msg.msg_name = &kmsg->addr;
- /* if iov is set, it's allocated already */
- if (!kmsg->iov)
- kmsg->iov = kmsg->fast_iov;
- kmsg->msg.msg_iter.iov = kmsg->iov;
- } else {
+ kmsg = req->async_data;
+ if (!kmsg) {
ret = io_recvmsg_copy_hdr(req, &iomsg);
if (ret)
return ret;
@@ -4866,7 +4935,8 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
+ kmsg->fast_iov[0].iov_len = req->sr_msg.len;
+ iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
1, req->sr_msg.len);
}
@@ -4885,17 +4955,17 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_recv_kbuf(req);
- if (kmsg->iov != kmsg->fast_iov)
- kfree(kmsg->iov);
+ /* fast path, check for non-NULL to avoid function call */
+ if (kmsg->free_iov)
+ kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, cflags, cs);
+ __io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
-static int io_recv(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_buffer *kbuf;
struct io_sr_msg *sr = &req->sr_msg;
@@ -4905,6 +4975,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
struct iovec iov;
unsigned flags;
int ret, cflags = 0;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
sock = sock_from_file(req->file);
if (unlikely(!sock))
@@ -4944,7 +5015,7 @@ out_free:
cflags = io_put_recv_kbuf(req);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, cflags, cs);
+ __io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
@@ -4964,10 +5035,10 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-static int io_accept(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_accept *accept = &req->accept;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
int ret;
@@ -4984,7 +5055,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock,
ret = -EINTR;
req_set_fail_links(req);
}
- __io_req_complete(req, ret, 0, cs);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -5008,12 +5079,12 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
&io->address);
}
-static int io_connect(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_connect __io, *io;
unsigned file_flags;
int ret;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
if (req->async_data) {
io = req->async_data;
@@ -5046,7 +5117,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock,
out:
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
}
#else /* !CONFIG_NET */
@@ -5055,14 +5126,12 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EOPNOTSUPP;
}
-static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
return -EOPNOTSUPP;
}
-static int io_send(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
return -EOPNOTSUPP;
}
@@ -5073,14 +5142,12 @@ static int io_recvmsg_prep(struct io_kiocb *req,
return -EOPNOTSUPP;
}
-static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
return -EOPNOTSUPP;
}
-static int io_recv(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
return -EOPNOTSUPP;
}
@@ -5090,8 +5157,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EOPNOTSUPP;
}
-static int io_accept(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
return -EOPNOTSUPP;
}
@@ -5101,8 +5167,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EOPNOTSUPP;
}
-static int io_connect(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
return -EOPNOTSUPP;
}
@@ -5128,7 +5193,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
list_del_init(&poll->wait.entry);
req->result = mask;
- init_task_work(&req->task_work, func);
+ req->task_work.func = func;
percpu_ref_get(&req->ctx->refs);
/*
@@ -5139,12 +5204,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
*/
ret = io_req_task_work_add(req);
if (unlikely(ret)) {
- struct task_struct *tsk;
-
WRITE_ONCE(poll->canceled, true);
- tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, TWA_NONE);
- wake_up_process(tsk);
+ io_req_task_work_add_fallback(req, func);
}
return 1;
}
@@ -5587,7 +5648,7 @@ static int io_poll_remove_prep(struct io_kiocb *req,
* Find a running poll command that matches one specified in sqe->addr,
* and remove it if found.
*/
-static int io_poll_remove(struct io_kiocb *req)
+static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -5638,7 +5699,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return 0;
}
-static int io_poll_add(struct io_kiocb *req)
+static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_poll_iocb *poll = &req->poll;
struct io_ring_ctx *ctx = req->ctx;
@@ -5769,24 +5830,27 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
return 0;
}
+static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
+{
+ return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
+ : HRTIMER_MODE_REL;
+}
+
/*
* Remove or update an existing timeout command
*/
-static int io_timeout_remove(struct io_kiocb *req)
+static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_timeout_rem *tr = &req->timeout_rem;
struct io_ring_ctx *ctx = req->ctx;
int ret;
spin_lock_irq(&ctx->completion_lock);
- if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) {
- enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS)
- ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
-
- ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
- } else {
+ if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
ret = io_timeout_cancel(ctx, tr->addr);
- }
+ else
+ ret = io_timeout_update(ctx, tr->addr, &tr->ts,
+ io_translate_timeout_mode(tr->flags));
io_cqring_fill_event(req, ret);
io_commit_cqring(ctx);
@@ -5826,16 +5890,12 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
- if (flags & IORING_TIMEOUT_ABS)
- data->mode = HRTIMER_MODE_ABS;
- else
- data->mode = HRTIMER_MODE_REL;
-
+ data->mode = io_translate_timeout_mode(flags);
hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
return 0;
}
-static int io_timeout(struct io_kiocb *req)
+static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_timeout_data *data = req->async_data;
@@ -5958,7 +6018,7 @@ static int io_async_cancel_prep(struct io_kiocb *req,
return 0;
}
-static int io_async_cancel(struct io_kiocb *req)
+static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -5966,7 +6026,7 @@ static int io_async_cancel(struct io_kiocb *req)
return 0;
}
-static int io_files_update_prep(struct io_kiocb *req,
+static int io_rsrc_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
@@ -5976,34 +6036,33 @@ static int io_files_update_prep(struct io_kiocb *req,
if (sqe->ioprio || sqe->rw_flags)
return -EINVAL;
- req->files_update.offset = READ_ONCE(sqe->off);
- req->files_update.nr_args = READ_ONCE(sqe->len);
- if (!req->files_update.nr_args)
+ req->rsrc_update.offset = READ_ONCE(sqe->off);
+ req->rsrc_update.nr_args = READ_ONCE(sqe->len);
+ if (!req->rsrc_update.nr_args)
return -EINVAL;
- req->files_update.arg = READ_ONCE(sqe->addr);
+ req->rsrc_update.arg = READ_ONCE(sqe->addr);
return 0;
}
-static int io_files_update(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_uring_files_update up;
+ struct io_uring_rsrc_update up;
int ret;
- if (force_nonblock)
+ if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
- up.offset = req->files_update.offset;
- up.fds = req->files_update.arg;
+ up.offset = req->rsrc_update.offset;
+ up.data = req->rsrc_update.arg;
mutex_lock(&ctx->uring_lock);
- ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
+ ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
mutex_unlock(&ctx->uring_lock);
if (ret < 0)
req_set_fail_links(req);
- __io_req_complete(req, ret, 0, cs);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -6053,7 +6112,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
case IORING_OP_CLOSE:
return io_close_prep(req, sqe);
case IORING_OP_FILES_UPDATE:
- return io_files_update_prep(req, sqe);
+ return io_rsrc_update_prep(req, sqe);
case IORING_OP_STATX:
return io_statx_prep(req, sqe);
case IORING_OP_FADVISE:
@@ -6151,23 +6210,6 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EIOCBQUEUED;
}
-static void io_req_drop_files(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_uring_task *tctx = req->task->io_uring;
- unsigned long flags;
-
- put_files_struct(req->work.identity->files);
- put_nsproxy(req->work.identity->nsproxy);
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- list_del(&req->inflight_entry);
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
- req->flags &= ~REQ_F_INFLIGHT;
- req->work.flags &= ~IO_WQ_WORK_FILES;
- if (atomic_read(&tctx->in_idle))
- wake_up(&tctx->wait);
-}
-
static void __io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
@@ -6201,8 +6243,8 @@ static void __io_clean_op(struct io_kiocb *req)
case IORING_OP_RECVMSG:
case IORING_OP_SENDMSG: {
struct io_async_msghdr *io = req->async_data;
- if (io->iov != io->fast_iov)
- kfree(io->iov);
+
+ kfree(io->free_iov);
break;
}
case IORING_OP_SPLICE:
@@ -6225,117 +6267,113 @@ static void __io_clean_op(struct io_kiocb *req)
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
-
- if (req->flags & REQ_F_INFLIGHT)
- io_req_drop_files(req);
}
-static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
- struct io_comp_state *cs)
+static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
switch (req->opcode) {
case IORING_OP_NOP:
- ret = io_nop(req, cs);
+ ret = io_nop(req, issue_flags);
break;
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
- ret = io_read(req, force_nonblock, cs);
+ ret = io_read(req, issue_flags);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
- ret = io_write(req, force_nonblock, cs);
+ ret = io_write(req, issue_flags);
break;
case IORING_OP_FSYNC:
- ret = io_fsync(req, force_nonblock);
+ ret = io_fsync(req, issue_flags);
break;
case IORING_OP_POLL_ADD:
- ret = io_poll_add(req);
+ ret = io_poll_add(req, issue_flags);
break;
case IORING_OP_POLL_REMOVE:
- ret = io_poll_remove(req);
+ ret = io_poll_remove(req, issue_flags);
break;
case IORING_OP_SYNC_FILE_RANGE:
- ret = io_sync_file_range(req, force_nonblock);
+ ret = io_sync_file_range(req, issue_flags);
break;
case IORING_OP_SENDMSG:
- ret = io_sendmsg(req, force_nonblock, cs);
+ ret = io_sendmsg(req, issue_flags);
break;
case IORING_OP_SEND:
- ret = io_send(req, force_nonblock, cs);
+ ret = io_send(req, issue_flags);
break;
case IORING_OP_RECVMSG:
- ret = io_recvmsg(req, force_nonblock, cs);
+ ret = io_recvmsg(req, issue_flags);
break;
case IORING_OP_RECV:
- ret = io_recv(req, force_nonblock, cs);
+ ret = io_recv(req, issue_flags);
break;
case IORING_OP_TIMEOUT:
- ret = io_timeout(req);
+ ret = io_timeout(req, issue_flags);
break;
case IORING_OP_TIMEOUT_REMOVE:
- ret = io_timeout_remove(req);
+ ret = io_timeout_remove(req, issue_flags);
break;
case IORING_OP_ACCEPT:
- ret = io_accept(req, force_nonblock, cs);
+ ret = io_accept(req, issue_flags);
break;
case IORING_OP_CONNECT:
- ret = io_connect(req, force_nonblock, cs);
+ ret = io_connect(req, issue_flags);
break;
case IORING_OP_ASYNC_CANCEL:
- ret = io_async_cancel(req);
+ ret = io_async_cancel(req, issue_flags);
break;
case IORING_OP_FALLOCATE:
- ret = io_fallocate(req, force_nonblock);
+ ret = io_fallocate(req, issue_flags);
break;
case IORING_OP_OPENAT:
- ret = io_openat(req, force_nonblock);
+ ret = io_openat(req, issue_flags);
break;
case IORING_OP_CLOSE:
- ret = io_close(req, force_nonblock, cs);
+ ret = io_close(req, issue_flags);
break;
case IORING_OP_FILES_UPDATE:
- ret = io_files_update(req, force_nonblock, cs);
+ ret = io_files_update(req, issue_flags);
break;
case IORING_OP_STATX:
- ret = io_statx(req, force_nonblock);
+ ret = io_statx(req, issue_flags);
break;
case IORING_OP_FADVISE:
- ret = io_fadvise(req, force_nonblock);
+ ret = io_fadvise(req, issue_flags);
break;
case IORING_OP_MADVISE:
- ret = io_madvise(req, force_nonblock);
+ ret = io_madvise(req, issue_flags);
break;
case IORING_OP_OPENAT2:
- ret = io_openat2(req, force_nonblock);
+ ret = io_openat2(req, issue_flags);
break;
case IORING_OP_EPOLL_CTL:
- ret = io_epoll_ctl(req, force_nonblock, cs);
+ ret = io_epoll_ctl(req, issue_flags);
break;
case IORING_OP_SPLICE:
- ret = io_splice(req, force_nonblock);
+ ret = io_splice(req, issue_flags);
break;
case IORING_OP_PROVIDE_BUFFERS:
- ret = io_provide_buffers(req, force_nonblock, cs);
+ ret = io_provide_buffers(req, issue_flags);
break;
case IORING_OP_REMOVE_BUFFERS:
- ret = io_remove_buffers(req, force_nonblock, cs);
+ ret = io_remove_buffers(req, issue_flags);
break;
case IORING_OP_TEE:
- ret = io_tee(req, force_nonblock);
+ ret = io_tee(req, issue_flags);
break;
case IORING_OP_SHUTDOWN:
- ret = io_shutdown(req, force_nonblock);
+ ret = io_shutdown(req, issue_flags);
break;
case IORING_OP_RENAMEAT:
- ret = io_renameat(req, force_nonblock);
+ ret = io_renameat(req, issue_flags);
break;
case IORING_OP_UNLINKAT:
- ret = io_unlinkat(req, force_nonblock);
+ ret = io_unlinkat(req, issue_flags);
break;
default:
ret = -EINVAL;
@@ -6362,7 +6400,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
return 0;
}
-static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
+static void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_kiocb *timeout;
@@ -6372,15 +6410,12 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
if (timeout)
io_queue_linked_timeout(timeout);
- /* if NO_CANCEL is set, we must still run the work */
- if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
- IO_WQ_WORK_CANCEL) {
+ if (work->flags & IO_WQ_WORK_CANCEL)
ret = -ECANCELED;
- }
if (!ret) {
do {
- ret = io_issue_sqe(req, false, NULL);
+ ret = io_issue_sqe(req, 0);
/*
* We can get EAGAIN for polled IO even though we're
* forcing a sync submission from here, since we can't
@@ -6416,14 +6451,12 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
if (lock_ctx)
mutex_unlock(&lock_ctx->uring_lock);
}
-
- return io_steal_work(req);
}
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
int index)
{
- struct fixed_file_table *table;
+ struct fixed_rsrc_table *table;
table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
return table->files[index & IORING_FILE_TABLE_MASK];
@@ -6446,6 +6479,8 @@ static struct file *io_file_get(struct io_submit_state *state,
file = __io_file_get(state, fd);
}
+ if (file && unlikely(file->f_op == &io_uring_fops))
+ io_req_track_inflight(req);
return file;
}
@@ -6474,9 +6509,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
if (prev) {
req_set_fail_links(prev);
io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
- io_put_req(prev);
+ io_put_req_deferred(prev, 1);
} else {
- io_req_complete(req, -ETIME);
+ io_req_complete_post(req, -ETIME, 0);
+ io_put_req_deferred(req, 1);
}
return HRTIMER_NORESTART;
}
@@ -6522,27 +6558,21 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
return nxt;
}
-static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
+static void __io_queue_sqe(struct io_kiocb *req)
{
- struct io_kiocb *linked_timeout;
+ struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
const struct cred *old_creds = NULL;
int ret;
-again:
- linked_timeout = io_prep_linked_timeout(req);
-
if ((req->flags & REQ_F_WORK_INITIALIZED) &&
(req->work.flags & IO_WQ_WORK_CREDS) &&
- req->work.identity->creds != current_cred()) {
- if (old_creds)
- revert_creds(old_creds);
- if (old_creds == req->work.identity->creds)
- old_creds = NULL; /* restored original creds */
- else
- old_creds = override_creds(req->work.identity->creds);
- }
+ req->work.identity->creds != current_cred())
+ old_creds = override_creds(req->work.identity->creds);
- ret = io_issue_sqe(req, true, cs);
+ ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+
+ if (old_creds)
+ revert_creds(old_creds);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -6556,34 +6586,28 @@ again:
*/
io_queue_async_work(req);
}
-
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
} else if (likely(!ret)) {
/* drop submission reference */
- req = io_put_req_find_next(req);
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
+ if (req->flags & REQ_F_COMPLETE_INLINE) {
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_comp_state *cs = &ctx->submit_state.comp;
- if (req) {
- if (!(req->flags & REQ_F_FORCE_ASYNC))
- goto again;
- io_queue_async_work(req);
+ cs->reqs[cs->nr++] = req;
+ if (cs->nr == ARRAY_SIZE(cs->reqs))
+ io_submit_flush_completions(cs, ctx);
+ } else {
+ io_put_req(req);
}
} else {
- /* un-prep timeout, so it'll be killed as any other linked */
- req->flags &= ~REQ_F_LINK_TIMEOUT;
req_set_fail_links(req);
io_put_req(req);
io_req_complete(req, ret);
}
-
- if (old_creds)
- revert_creds(old_creds);
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
}
-static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_comp_state *cs)
+static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
int ret;
@@ -6608,18 +6632,17 @@ fail_req:
if (unlikely(ret))
goto fail_req;
}
- __io_queue_sqe(req, cs);
+ __io_queue_sqe(req);
}
}
-static inline void io_queue_link_head(struct io_kiocb *req,
- struct io_comp_state *cs)
+static inline void io_queue_link_head(struct io_kiocb *req)
{
if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
io_put_req(req);
io_req_complete(req, -ECANCELED);
} else
- io_queue_sqe(req, NULL, cs);
+ io_queue_sqe(req, NULL);
}
struct io_submit_link {
@@ -6628,7 +6651,7 @@ struct io_submit_link {
};
static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_submit_link *link, struct io_comp_state *cs)
+ struct io_submit_link *link)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
@@ -6666,7 +6689,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- io_queue_link_head(head, cs);
+ io_queue_link_head(head);
link->head = NULL;
}
} else {
@@ -6681,7 +6704,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
link->head = req;
link->last = req;
} else {
- io_queue_sqe(req, sqe, cs);
+ io_queue_sqe(req, sqe);
}
}
@@ -6691,29 +6714,23 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
/*
* Batched submission is done, ensure local IO is flushed out.
*/
-static void io_submit_state_end(struct io_submit_state *state)
+static void io_submit_state_end(struct io_submit_state *state,
+ struct io_ring_ctx *ctx)
{
- if (!list_empty(&state->comp.list))
- io_submit_flush_completions(&state->comp);
+ if (state->comp.nr)
+ io_submit_flush_completions(&state->comp, ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
io_state_file_put(state);
- if (state->free_reqs)
- kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
}
/*
* Start submission side cache.
*/
static void io_submit_state_start(struct io_submit_state *state,
- struct io_ring_ctx *ctx, unsigned int max_ios)
+ unsigned int max_ios)
{
state->plug_started = false;
- state->comp.nr = 0;
- INIT_LIST_HEAD(&state->comp.list);
- state->comp.ctx = ctx;
- state->free_reqs = 0;
- state->file_refs = 0;
state->ios_left = max_ios;
}
@@ -6750,7 +6767,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
* 2) allows the kernel side to track the head on its own, even
* though the application is the one updating it.
*/
- head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
+ head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
if (likely(head < ctx->sq_entries))
return &ctx->sq_sqes[head];
@@ -6760,11 +6777,6 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
return NULL;
}
-static inline void io_consume_sqe(struct io_ring_ctx *ctx)
-{
- ctx->cached_sq_head++;
-}
-
/*
* Check SQE restrictions (opcode and flags).
*
@@ -6796,36 +6808,36 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
IOSQE_BUFFER_SELECT)
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe,
- struct io_submit_state *state)
+ const struct io_uring_sqe *sqe)
{
+ struct io_submit_state *state;
unsigned int sqe_flags;
- int id, ret;
+ int id, ret = 0;
req->opcode = READ_ONCE(sqe->opcode);
+ /* same numerical values with corresponding REQ_F_*, safe to copy */
+ req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
req->async_data = NULL;
req->file = NULL;
req->ctx = ctx;
- req->flags = 0;
req->link = NULL;
- req->fixed_file_refs = NULL;
+ req->fixed_rsrc_refs = NULL;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
req->task = current;
req->result = 0;
+ /* enforce forwards compatibility on users */
+ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+ return -EINVAL;
+
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
return -EFAULT;
- sqe_flags = READ_ONCE(sqe->flags);
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
- return -EINVAL;
-
if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
return -EACCES;
@@ -6848,8 +6860,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->work.flags |= IO_WQ_WORK_CREDS;
}
- /* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags |= sqe_flags;
+ state = &ctx->submit_state;
/*
* Plug now if we have more than 1 IO left after this, and the target
@@ -6861,13 +6872,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
state->plug_started = true;
}
- ret = 0;
if (io_op_defs[req->opcode].needs_file) {
bool fixed = req->flags & REQ_F_FIXED_FILE;
req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
- if (unlikely(!req->file &&
- !io_op_defs[req->opcode].needs_file_no_error))
+ if (unlikely(!req->file))
ret = -EBADF;
}
@@ -6877,7 +6886,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
- struct io_submit_state state;
struct io_submit_link link;
int i, submitted = 0;
@@ -6896,7 +6904,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
percpu_counter_add(&current->io_uring->inflight, nr);
refcount_add(nr, &current->usage);
- io_submit_state_start(&state, ctx, nr);
+ io_submit_state_start(&ctx->submit_state, nr);
link.head = NULL;
for (i = 0; i < nr; i++) {
@@ -6904,22 +6912,21 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
struct io_kiocb *req;
int err;
- sqe = io_get_sqe(ctx);
- if (unlikely(!sqe)) {
- io_consume_sqe(ctx);
- break;
- }
- req = io_alloc_req(ctx, &state);
+ req = io_alloc_req(ctx);
if (unlikely(!req)) {
if (!submitted)
submitted = -EAGAIN;
break;
}
- io_consume_sqe(ctx);
+ sqe = io_get_sqe(ctx);
+ if (unlikely(!sqe)) {
+ kmem_cache_free(req_cachep, req);
+ break;
+ }
/* will complete beyond this point, count as submitted */
submitted++;
- err = io_init_req(ctx, req, sqe, &state);
+ err = io_init_req(ctx, req, sqe);
if (unlikely(err)) {
fail_req:
io_put_req(req);
@@ -6928,8 +6935,8 @@ fail_req:
}
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
- true, io_async_submit(ctx));
- err = io_submit_sqe(req, sqe, &link, &state.comp);
+ true, ctx->flags & IORING_SETUP_SQPOLL);
+ err = io_submit_sqe(req, sqe, &link);
if (err)
goto fail_req;
}
@@ -6944,8 +6951,8 @@ fail_req:
put_task_struct_many(current, unused);
}
if (link.head)
- io_queue_link_head(link.head, &state.comp);
- io_submit_state_end(&state);
+ io_queue_link_head(link.head);
+ io_submit_state_end(&ctx->submit_state, ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
@@ -7094,9 +7101,6 @@ static int io_sq_thread(void *data)
continue;
}
- if (kthread_should_park())
- continue;
-
needs_sched = true;
prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
@@ -7111,7 +7115,7 @@ static int io_sq_thread(void *data)
}
}
- if (needs_sched) {
+ if (needs_sched && !kthread_should_park()) {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
@@ -7188,6 +7192,25 @@ static int io_run_task_work_sig(void)
return -EINTR;
}
+/* when returns >0, the caller should retry */
+static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
+ struct io_wait_queue *iowq,
+ signed long *timeout)
+{
+ int ret;
+
+ /* make sure we run task_work before checking for signals */
+ ret = io_run_task_work_sig();
+ if (ret || io_should_wake(iowq))
+ return ret;
+ /* let the caller flush overflows, retry */
+ if (test_bit(0, &ctx->cq_check_overflow))
+ return 1;
+
+ *timeout = schedule_timeout(*timeout);
+ return !*timeout ? -ETIME : 1;
+}
+
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
@@ -7206,9 +7229,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
.to_wait = min_events,
};
struct io_rings *rings = ctx->rings;
- struct timespec64 ts;
- signed long timeout = 0;
- int ret = 0;
+ signed long timeout = MAX_SCHEDULE_TIMEOUT;
+ int ret;
do {
io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -7232,6 +7254,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
}
if (uts) {
+ struct timespec64 ts;
+
if (get_timespec64(&ts, uts))
return -EFAULT;
timeout = timespec64_to_jiffies(&ts);
@@ -7243,27 +7267,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
io_cqring_overflow_flush(ctx, false, NULL, NULL);
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- /* make sure we run task_work before checking for signals */
- ret = io_run_task_work_sig();
- if (ret > 0)
- continue;
- else if (ret < 0)
- break;
- if (io_should_wake(&iowq))
- break;
- if (test_bit(0, &ctx->cq_check_overflow))
- continue;
- if (uts) {
- timeout = schedule_timeout(timeout);
- if (timeout == 0) {
- ret = -ETIME;
- break;
- }
- } else {
- schedule();
- }
- } while (1);
- finish_wait(&ctx->wait, &iowq.wq);
+ ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+ finish_wait(&ctx->wait, &iowq.wq);
+ } while (ret > 0);
restore_saved_sigmask_unless(ret == -EINTR);
@@ -7293,47 +7299,52 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
#endif
}
-static void io_file_ref_kill(struct percpu_ref *ref)
+static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
{
- struct fixed_file_data *data;
+ struct fixed_rsrc_data *data;
- data = container_of(ref, struct fixed_file_data, refs);
+ data = container_of(ref, struct fixed_rsrc_data, refs);
complete(&data->done);
}
-static void io_sqe_files_set_node(struct fixed_file_data *file_data,
- struct fixed_file_ref_node *ref_node)
+static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
{
- spin_lock_bh(&file_data->lock);
- file_data->node = ref_node;
- list_add_tail(&ref_node->node, &file_data->ref_list);
- spin_unlock_bh(&file_data->lock);
- percpu_ref_get(&file_data->refs);
+ spin_lock_bh(&ctx->rsrc_ref_lock);
}
-static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
{
- struct fixed_file_data *data = ctx->file_data;
- struct fixed_file_ref_node *backup_node, *ref_node = NULL;
- unsigned nr_tables, i;
- int ret;
+ spin_unlock_bh(&ctx->rsrc_ref_lock);
+}
- if (!data)
- return -ENXIO;
- backup_node = alloc_fixed_file_ref_node(ctx);
- if (!backup_node)
- return -ENOMEM;
+static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
+ struct fixed_rsrc_data *rsrc_data,
+ struct fixed_rsrc_ref_node *ref_node)
+{
+ io_rsrc_ref_lock(ctx);
+ rsrc_data->node = ref_node;
+ list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
+ io_rsrc_ref_unlock(ctx);
+ percpu_ref_get(&rsrc_data->refs);
+}
- spin_lock_bh(&data->lock);
+static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
+ struct io_ring_ctx *ctx,
+ struct fixed_rsrc_ref_node *backup_node)
+{
+ struct fixed_rsrc_ref_node *ref_node;
+ int ret;
+
+ io_rsrc_ref_lock(ctx);
ref_node = data->node;
- spin_unlock_bh(&data->lock);
+ io_rsrc_ref_unlock(ctx);
if (ref_node)
percpu_ref_kill(&ref_node->refs);
percpu_ref_kill(&data->refs);
/* wait for all refs nodes to complete */
- flush_delayed_work(&ctx->file_put_work);
+ flush_delayed_work(&ctx->rsrc_put_work);
do {
ret = wait_for_completion_interruptible(&data->done);
if (!ret)
@@ -7342,21 +7353,65 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
if (ret < 0) {
percpu_ref_resurrect(&data->refs);
reinit_completion(&data->done);
- io_sqe_files_set_node(data, backup_node);
+ io_sqe_rsrc_set_node(ctx, data, backup_node);
return ret;
}
} while (1);
+ destroy_fixed_rsrc_ref_node(backup_node);
+ return 0;
+}
+
+static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
+{
+ struct fixed_rsrc_data *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return NULL;
+
+ if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ kfree(data);
+ return NULL;
+ }
+ data->ctx = ctx;
+ init_completion(&data->done);
+ return data;
+}
+
+static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
+{
+ percpu_ref_exit(&data->refs);
+ kfree(data->table);
+ kfree(data);
+}
+
+static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+ struct fixed_rsrc_data *data = ctx->file_data;
+ struct fixed_rsrc_ref_node *backup_node;
+ unsigned nr_tables, i;
+ int ret;
+
+ if (!data)
+ return -ENXIO;
+ backup_node = alloc_fixed_rsrc_ref_node(ctx);
+ if (!backup_node)
+ return -ENOMEM;
+ init_fixed_file_ref_node(ctx, backup_node);
+
+ ret = io_rsrc_ref_quiesce(data, ctx, backup_node);
+ if (ret)
+ return ret;
+
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
kfree(data->table[i].files);
- kfree(data->table);
- percpu_ref_exit(&data->refs);
- kfree(data);
+ free_fixed_rsrc_data(data);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
- destroy_fixed_file_ref_node(backup_node);
return 0;
}
@@ -7579,13 +7634,13 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
-static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
+static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
unsigned nr_tables, unsigned nr_files)
{
int i;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &file_data->table[i];
+ struct fixed_rsrc_table *table = &file_data->table[i];
unsigned this_files;
this_files = min(nr_files, IORING_MAX_FILES_TABLE);
@@ -7600,14 +7655,15 @@ static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
return 0;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &file_data->table[i];
+ struct fixed_rsrc_table *table = &file_data->table[i];
kfree(table->files);
}
return 1;
}
-static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
+static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
+ struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
@@ -7668,108 +7724,119 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
#endif
}
-struct io_file_put {
- struct list_head list;
- struct file *file;
-};
-
-static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
+static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
{
- struct fixed_file_data *file_data = ref_node->file_data;
- struct io_ring_ctx *ctx = file_data->ctx;
- struct io_file_put *pfile, *tmp;
+ struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
+ struct io_ring_ctx *ctx = rsrc_data->ctx;
+ struct io_rsrc_put *prsrc, *tmp;
- list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
- list_del(&pfile->list);
- io_ring_file_put(ctx, pfile->file);
- kfree(pfile);
+ list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
+ list_del(&prsrc->list);
+ ref_node->rsrc_put(ctx, prsrc);
+ kfree(prsrc);
}
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
- percpu_ref_put(&file_data->refs);
+ percpu_ref_put(&rsrc_data->refs);
}
-static void io_file_put_work(struct work_struct *work)
+static void io_rsrc_put_work(struct work_struct *work)
{
struct io_ring_ctx *ctx;
struct llist_node *node;
- ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
- node = llist_del_all(&ctx->file_put_llist);
+ ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
+ node = llist_del_all(&ctx->rsrc_put_llist);
while (node) {
- struct fixed_file_ref_node *ref_node;
+ struct fixed_rsrc_ref_node *ref_node;
struct llist_node *next = node->next;
- ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
- __io_file_put_work(ref_node);
+ ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
+ __io_rsrc_put_work(ref_node);
node = next;
}
}
-static void io_file_data_ref_zero(struct percpu_ref *ref)
+static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
+ unsigned i)
+{
+ struct fixed_rsrc_table *table;
+
+ table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
+ return &table->files[i & IORING_FILE_TABLE_MASK];
+}
+
+static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
- struct fixed_file_ref_node *ref_node;
- struct fixed_file_data *data;
+ struct fixed_rsrc_ref_node *ref_node;
+ struct fixed_rsrc_data *data;
struct io_ring_ctx *ctx;
bool first_add = false;
int delay = HZ;
- ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- data = ref_node->file_data;
+ ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
+ data = ref_node->rsrc_data;
ctx = data->ctx;
- spin_lock_bh(&data->lock);
+ io_rsrc_ref_lock(ctx);
ref_node->done = true;
- while (!list_empty(&data->ref_list)) {
- ref_node = list_first_entry(&data->ref_list,
- struct fixed_file_ref_node, node);
+ while (!list_empty(&ctx->rsrc_ref_list)) {
+ ref_node = list_first_entry(&ctx->rsrc_ref_list,
+ struct fixed_rsrc_ref_node, node);
/* recycle ref nodes in order */
if (!ref_node->done)
break;
list_del(&ref_node->node);
- first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
+ first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
}
- spin_unlock_bh(&data->lock);
+ io_rsrc_ref_unlock(ctx);
if (percpu_ref_is_dying(&data->refs))
delay = 0;
if (!delay)
- mod_delayed_work(system_wq, &ctx->file_put_work, 0);
+ mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
else if (first_add)
- queue_delayed_work(system_wq, &ctx->file_put_work, delay);
+ queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
}
-static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
struct io_ring_ctx *ctx)
{
- struct fixed_file_ref_node *ref_node;
+ struct fixed_rsrc_ref_node *ref_node;
ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
if (!ref_node)
return NULL;
- if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
+ if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
0, GFP_KERNEL)) {
kfree(ref_node);
return NULL;
}
INIT_LIST_HEAD(&ref_node->node);
- INIT_LIST_HEAD(&ref_node->file_list);
- ref_node->file_data = ctx->file_data;
+ INIT_LIST_HEAD(&ref_node->rsrc_list);
ref_node->done = false;
return ref_node;
}
-static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
+static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
+ struct fixed_rsrc_ref_node *ref_node)
+{
+ ref_node->rsrc_data = ctx->file_data;
+ ref_node->rsrc_put = io_ring_file_put;
+}
+
+static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
{
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
}
+
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
@@ -7777,8 +7844,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_tables, i;
struct file *file;
int fd, ret = -ENOMEM;
- struct fixed_file_ref_node *ref_node;
- struct fixed_file_data *file_data;
+ struct fixed_rsrc_ref_node *ref_node;
+ struct fixed_rsrc_data *file_data;
if (ctx->file_data)
return -EBUSY;
@@ -7787,13 +7854,10 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
- file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
+ file_data = alloc_fixed_rsrc_data(ctx);
if (!file_data)
return -ENOMEM;
- file_data->ctx = ctx;
- init_completion(&file_data->done);
- INIT_LIST_HEAD(&file_data->ref_list);
- spin_lock_init(&file_data->lock);
+ ctx->file_data = file_data;
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
@@ -7801,18 +7865,10 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
if (!file_data->table)
goto out_free;
- if (percpu_ref_init(&file_data->refs, io_file_ref_kill,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
- goto out_free;
-
if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
- goto out_ref;
- ctx->file_data = file_data;
+ goto out_free;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- struct fixed_file_table *table;
- unsigned index;
-
if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
ret = -EFAULT;
goto out_fput;
@@ -7837,9 +7893,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
fput(file);
goto out_fput;
}
- table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
- index = i & IORING_FILE_TABLE_MASK;
- table->files[index] = file;
+ *io_fixed_file_slot(file_data, i) = file;
}
ret = io_sqe_files_scm(ctx);
@@ -7848,13 +7902,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
- ref_node = alloc_fixed_file_ref_node(ctx);
+ ref_node = alloc_fixed_rsrc_ref_node(ctx);
if (!ref_node) {
io_sqe_files_unregister(ctx);
return -ENOMEM;
}
+ init_fixed_file_ref_node(ctx, ref_node);
- io_sqe_files_set_node(file_data, ref_node);
+ io_sqe_rsrc_set_node(ctx, file_data, ref_node);
return ret;
out_fput:
for (i = 0; i < ctx->nr_user_files; i++) {
@@ -7865,11 +7920,8 @@ out_fput:
for (i = 0; i < nr_tables; i++)
kfree(file_data->table[i].files);
ctx->nr_user_files = 0;
-out_ref:
- percpu_ref_exit(&file_data->refs);
out_free:
- kfree(file_data->table);
- kfree(file_data);
+ free_fixed_rsrc_data(ctx->file_data);
ctx->file_data = NULL;
return ret;
}
@@ -7917,29 +7969,34 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
-static int io_queue_file_removal(struct fixed_file_data *data,
- struct file *file)
+static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
{
- struct io_file_put *pfile;
- struct fixed_file_ref_node *ref_node = data->node;
+ struct io_rsrc_put *prsrc;
+ struct fixed_rsrc_ref_node *ref_node = data->node;
- pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
- if (!pfile)
+ prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
+ if (!prsrc)
return -ENOMEM;
- pfile->file = file;
- list_add(&pfile->list, &ref_node->file_list);
+ prsrc->rsrc = rsrc;
+ list_add(&prsrc->list, &ref_node->rsrc_list);
return 0;
}
+static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
+ struct file *file)
+{
+ return io_queue_rsrc_removal(data, (void *)file);
+}
+
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
- struct io_uring_files_update *up,
+ struct io_uring_rsrc_update *up,
unsigned nr_args)
{
- struct fixed_file_data *data = ctx->file_data;
- struct fixed_file_ref_node *ref_node;
- struct file *file;
+ struct fixed_rsrc_data *data = ctx->file_data;
+ struct fixed_rsrc_ref_node *ref_node;
+ struct file *file, **file_slot;
__s32 __user *fds;
int fd, i, err;
__u32 done;
@@ -7950,30 +8007,29 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (done > ctx->nr_user_files)
return -EINVAL;
- ref_node = alloc_fixed_file_ref_node(ctx);
+ ref_node = alloc_fixed_rsrc_ref_node(ctx);
if (!ref_node)
return -ENOMEM;
+ init_fixed_file_ref_node(ctx, ref_node);
- done = 0;
- fds = u64_to_user_ptr(up->fds);
- while (nr_args) {
- struct fixed_file_table *table;
- unsigned index;
-
+ fds = u64_to_user_ptr(up->data);
+ for (done = 0; done < nr_args; done++) {
err = 0;
if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
err = -EFAULT;
break;
}
- i = array_index_nospec(up->offset, ctx->nr_user_files);
- table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
- index = i & IORING_FILE_TABLE_MASK;
- if (table->files[index]) {
- file = table->files[index];
- err = io_queue_file_removal(data, file);
+ if (fd == IORING_REGISTER_FILES_SKIP)
+ continue;
+
+ i = array_index_nospec(up->offset + done, ctx->nr_user_files);
+ file_slot = io_fixed_file_slot(ctx->file_data, i);
+
+ if (*file_slot) {
+ err = io_queue_file_removal(data, *file_slot);
if (err)
break;
- table->files[index] = NULL;
+ *file_slot = NULL;
needs_switch = true;
}
if (fd != -1) {
@@ -7995,24 +8051,21 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- table->files[index] = file;
+ *file_slot = file;
err = io_sqe_file_register(ctx, file, i);
if (err) {
- table->files[index] = NULL;
+ *file_slot = NULL;
fput(file);
break;
}
}
- nr_args--;
- done++;
- up->offset++;
}
if (needs_switch) {
percpu_ref_kill(&data->node->refs);
- io_sqe_files_set_node(data, ref_node);
+ io_sqe_rsrc_set_node(ctx, data, ref_node);
} else
- destroy_fixed_file_ref_node(ref_node);
+ destroy_fixed_rsrc_ref_node(ref_node);
return done ? done : err;
}
@@ -8020,7 +8073,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
- struct io_uring_files_update up;
+ struct io_uring_rsrc_update up;
if (!ctx->file_data)
return -ENXIO;
@@ -8034,12 +8087,12 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
return __io_sqe_files_update(ctx, &up, nr_args);
}
-static void io_free_work(struct io_wq_work *work)
+static struct io_wq_work *io_free_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- /* Consider that io_steal_work() relies on this ref */
- io_put_req(req);
+ req = io_put_req_find_next(req);
+ return req ? &req->work : NULL;
}
static int io_init_wq_offload(struct io_ring_ctx *ctx,
@@ -8112,6 +8165,10 @@ static int io_uring_alloc_task_context(struct task_struct *task)
io_init_identity(&tctx->__identity);
tctx->identity = &tctx->__identity;
task->io_uring = tctx;
+ spin_lock_init(&tctx->task_lock);
+ INIT_WQ_LIST(&tctx->task_list);
+ tctx->task_state = 0;
+ init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
@@ -8233,25 +8290,16 @@ static inline int __io_account_mem(struct user_struct *user,
return 0;
}
-static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
- enum io_mem_account acct)
+static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
if (ctx->limit_mem)
__io_unaccount_mem(ctx->user, nr_pages);
- if (ctx->mm_account) {
- if (acct == ACCT_LOCKED) {
- mmap_write_lock(ctx->mm_account);
- ctx->mm_account->locked_vm -= nr_pages;
- mmap_write_unlock(ctx->mm_account);
- }else if (acct == ACCT_PINNED) {
- atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
- }
- }
+ if (ctx->mm_account)
+ atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}
-static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
- enum io_mem_account acct)
+static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
int ret;
@@ -8261,15 +8309,8 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
return ret;
}
- if (ctx->mm_account) {
- if (acct == ACCT_LOCKED) {
- mmap_write_lock(ctx->mm_account);
- ctx->mm_account->locked_vm += nr_pages;
- mmap_write_unlock(ctx->mm_account);
- } else if (acct == ACCT_PINNED) {
- atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
- }
- }
+ if (ctx->mm_account)
+ atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
return 0;
}
@@ -8289,7 +8330,7 @@ static void io_mem_free(void *ptr)
static void *io_mem_alloc(size_t size)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
- __GFP_NORETRY;
+ __GFP_NORETRY | __GFP_ACCOUNT;
return (void *) __get_free_pages(gfp_flags, get_order(size));
}
@@ -8323,19 +8364,7 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
return off;
}
-static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
-{
- size_t pages;
-
- pages = (size_t)1 << get_order(
- rings_size(sq_entries, cq_entries, NULL));
- pages += (size_t)1 << get_order(
- array_size(sizeof(struct io_uring_sqe), sq_entries));
-
- return pages;
-}
-
-static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
int i, j;
@@ -8349,7 +8378,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
unpin_user_page(imu->bvec[j].bv_page);
if (imu->acct_pages)
- io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
+ io_unaccount_mem(ctx, imu->acct_pages);
kvfree(imu->bvec);
imu->nr_bvecs = 0;
}
@@ -8447,21 +8476,105 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
if (!imu->acct_pages)
return 0;
- ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED);
+ ret = io_account_mem(ctx, imu->acct_pages);
if (ret)
imu->acct_pages = 0;
return ret;
}
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
+ struct io_mapped_ubuf *imu,
+ struct page **last_hpage)
{
struct vm_area_struct **vmas = NULL;
struct page **pages = NULL;
- struct page *last_hpage = NULL;
- int i, j, got_pages = 0;
- int ret = -EINVAL;
+ unsigned long off, start, end, ubuf;
+ size_t size;
+ int ret, pret, nr_pages, i;
+ ubuf = (unsigned long) iov->iov_base;
+ end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = ubuf >> PAGE_SHIFT;
+ nr_pages = end - start;
+
+ ret = -ENOMEM;
+
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ goto done;
+
+ vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas)
+ goto done;
+
+ imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ if (!imu->bvec)
+ goto done;
+
+ ret = 0;
+ mmap_read_lock(current->mm);
+ pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ pages, vmas);
+ if (pret == nr_pages) {
+ /* don't support file backed memory */
+ for (i = 0; i < nr_pages; i++) {
+ struct vm_area_struct *vma = vmas[i];
+
+ if (vma->vm_file &&
+ !is_file_hugepages(vma->vm_file)) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ }
+ } else {
+ ret = pret < 0 ? pret : -EFAULT;
+ }
+ mmap_read_unlock(current->mm);
+ if (ret) {
+ /*
+ * if we did partial map, or found file backed vmas,
+ * release any pages we did get
+ */
+ if (pret > 0)
+ unpin_user_pages(pages, pret);
+ kvfree(imu->bvec);
+ goto done;
+ }
+
+ ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
+ if (ret) {
+ unpin_user_pages(pages, pret);
+ kvfree(imu->bvec);
+ goto done;
+ }
+
+ off = ubuf & ~PAGE_MASK;
+ size = iov->iov_len;
+ for (i = 0; i < nr_pages; i++) {
+ size_t vec_len;
+
+ vec_len = min_t(size_t, size, PAGE_SIZE - off);
+ imu->bvec[i].bv_page = pages[i];
+ imu->bvec[i].bv_len = vec_len;
+ imu->bvec[i].bv_offset = off;
+ off = 0;
+ size -= vec_len;
+ }
+ /* store original address for later verification */
+ imu->ubuf = ubuf;
+ imu->len = iov->iov_len;
+ imu->nr_bvecs = nr_pages;
+ ret = 0;
+done:
+ kvfree(pages);
+ kvfree(vmas);
+ return ret;
+}
+
+static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
+{
if (ctx->user_bufs)
return -EBUSY;
if (!nr_args || nr_args > UIO_MAXIOV)
@@ -8472,121 +8585,58 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
if (!ctx->user_bufs)
return -ENOMEM;
- for (i = 0; i < nr_args; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
- unsigned long off, start, end, ubuf;
- int pret, nr_pages;
- struct iovec iov;
- size_t size;
+ return 0;
+}
- ret = io_copy_iov(ctx, &iov, arg, i);
- if (ret)
- goto err;
+static int io_buffer_validate(struct iovec *iov)
+{
+ /*
+ * Don't impose further limits on the size and buffer
+ * constraints here, we'll -EINVAL later when IO is
+ * submitted if they are wrong.
+ */
+ if (!iov->iov_base || !iov->iov_len)
+ return -EFAULT;
- /*
- * Don't impose further limits on the size and buffer
- * constraints here, we'll -EINVAL later when IO is
- * submitted if they are wrong.
- */
- ret = -EFAULT;
- if (!iov.iov_base || !iov.iov_len)
- goto err;
+ /* arbitrary limit, but we need something */
+ if (iov->iov_len > SZ_1G)
+ return -EFAULT;
- /* arbitrary limit, but we need something */
- if (iov.iov_len > SZ_1G)
- goto err;
+ return 0;
+}
- ubuf = (unsigned long) iov.iov_base;
- end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ubuf >> PAGE_SHIFT;
- nr_pages = end - start;
+static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int nr_args)
+{
+ int i, ret;
+ struct iovec iov;
+ struct page *last_hpage = NULL;
- ret = 0;
- if (!pages || nr_pages > got_pages) {
- kvfree(vmas);
- kvfree(pages);
- pages = kvmalloc_array(nr_pages, sizeof(struct page *),
- GFP_KERNEL);
- vmas = kvmalloc_array(nr_pages,
- sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!pages || !vmas) {
- ret = -ENOMEM;
- goto err;
- }
- got_pages = nr_pages;
- }
+ ret = io_buffers_map_alloc(ctx, nr_args);
+ if (ret)
+ return ret;
- imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
- GFP_KERNEL);
- ret = -ENOMEM;
- if (!imu->bvec)
- goto err;
+ for (i = 0; i < nr_args; i++) {
+ struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
- ret = 0;
- mmap_read_lock(current->mm);
- pret = pin_user_pages(ubuf, nr_pages,
- FOLL_WRITE | FOLL_LONGTERM,
- pages, vmas);
- if (pret == nr_pages) {
- /* don't support file backed memory */
- for (j = 0; j < nr_pages; j++) {
- struct vm_area_struct *vma = vmas[j];
-
- if (vma->vm_file &&
- !is_file_hugepages(vma->vm_file)) {
- ret = -EOPNOTSUPP;
- break;
- }
- }
- } else {
- ret = pret < 0 ? pret : -EFAULT;
- }
- mmap_read_unlock(current->mm);
- if (ret) {
- /*
- * if we did partial map, or found file backed vmas,
- * release any pages we did get
- */
- if (pret > 0)
- unpin_user_pages(pages, pret);
- kvfree(imu->bvec);
- goto err;
- }
+ ret = io_copy_iov(ctx, &iov, arg, i);
+ if (ret)
+ break;
- ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
- if (ret) {
- unpin_user_pages(pages, pret);
- kvfree(imu->bvec);
- goto err;
- }
+ ret = io_buffer_validate(&iov);
+ if (ret)
+ break;
- off = ubuf & ~PAGE_MASK;
- size = iov.iov_len;
- for (j = 0; j < nr_pages; j++) {
- size_t vec_len;
-
- vec_len = min_t(size_t, size, PAGE_SIZE - off);
- imu->bvec[j].bv_page = pages[j];
- imu->bvec[j].bv_len = vec_len;
- imu->bvec[j].bv_offset = off;
- off = 0;
- size -= vec_len;
- }
- /* store original address for later verification */
- imu->ubuf = ubuf;
- imu->len = iov.iov_len;
- imu->nr_bvecs = nr_pages;
+ ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
+ if (ret)
+ break;
ctx->nr_user_bufs++;
}
- kvfree(pages);
- kvfree(vmas);
- return 0;
-err:
- kvfree(pages);
- kvfree(vmas);
- io_sqe_buffer_unregister(ctx);
+
+ if (ret)
+ io_sqe_buffers_unregister(ctx);
+
return ret;
}
@@ -8637,10 +8687,49 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
idr_destroy(&ctx->io_buffer_idr);
}
+static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
+{
+ struct io_kiocb *req, *nxt;
+
+ list_for_each_entry_safe(req, nxt, list, compl.list) {
+ if (tsk && req->task != tsk)
+ continue;
+ list_del(&req->compl.list);
+ kmem_cache_free(req_cachep, req);
+ }
+}
+
+static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
+{
+ struct io_submit_state *submit_state = &ctx->submit_state;
+
+ mutex_lock(&ctx->uring_lock);
+
+ if (submit_state->free_reqs)
+ kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
+ submit_state->reqs);
+
+ io_req_cache_free(&submit_state->comp.free_list, NULL);
+
+ spin_lock_irq(&ctx->completion_lock);
+ io_req_cache_free(&submit_state->comp.locked_free_list, NULL);
+ spin_unlock_irq(&ctx->completion_lock);
+
+ mutex_unlock(&ctx->uring_lock);
+}
+
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
+ /*
+ * Some may use context even when all refs and requests have been put,
+ * and they are free to do so while still holding uring_lock, see
+ * __io_req_task_submit(). Wait for them to finish.
+ */
+ mutex_lock(&ctx->uring_lock);
+ mutex_unlock(&ctx->uring_lock);
+
io_finish_async(ctx);
- io_sqe_buffer_unregister(ctx);
+ io_sqe_buffers_unregister(ctx);
if (ctx->sqo_task) {
put_task_struct(ctx->sqo_task);
@@ -8672,8 +8761,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
put_cred(ctx->creds);
+ io_req_caches_free(ctx, NULL);
kfree(ctx->cancel_hash);
- kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx);
}
@@ -8690,8 +8779,21 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
smp_rmb();
if (!io_sqring_full(ctx))
mask |= EPOLLOUT | EPOLLWRNORM;
- io_cqring_overflow_flush(ctx, false, NULL, NULL);
- if (io_cqring_events(ctx))
+
+ /*
+ * Don't flush cqring overflow list here, just do a simple check.
+ * Otherwise there could possible be ABBA deadlock:
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&ctx->uring_lock);
+ * lock(&ep->mtx);
+ * lock(&ctx->uring_lock);
+ * lock(&ep->mtx);
+ *
+ * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
+ * pushs them to do the flush.
+ */
+ if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
@@ -8704,9 +8806,8 @@ static int io_uring_fasync(int fd, struct file *file, int on)
return fasync_helper(fd, file, on, &ctx->cq_fasync);
}
-static int io_remove_personalities(int id, void *p, void *data)
+static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
- struct io_ring_ctx *ctx = data;
struct io_identity *iod;
iod = idr_remove(&ctx->personality_idr, id);
@@ -8714,7 +8815,17 @@ static int io_remove_personalities(int id, void *p, void *data)
put_cred(iod->creds);
if (refcount_dec_and_test(&iod->count))
kfree(iod);
+ return 0;
}
+
+ return -EINVAL;
+}
+
+static int io_remove_personalities(int id, void *p, void *data)
+{
+ struct io_ring_ctx *ctx = data;
+
+ io_unregister_personality(ctx, id);
return 0;
}
@@ -8730,7 +8841,7 @@ static void io_ring_exit_work(struct work_struct *work)
* as nobody else will be looking for them.
*/
do {
- __io_uring_cancel_task_requests(ctx, NULL);
+ io_uring_try_cancel_requests(ctx, NULL, NULL);
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
io_ring_ctx_free(ctx);
}
@@ -8754,6 +8865,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
ctx->cq_overflow_flushed = 1;
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true, NULL, NULL);
+ idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
mutex_unlock(&ctx->uring_lock);
io_kill_timeouts(ctx, NULL, NULL);
@@ -8764,15 +8876,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
/* if we failed setting up the ctx, we might not have any rings */
io_iopoll_try_reap_events(ctx);
- idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
-
- /*
- * Do this upfront, so we won't have a grace period where the ring
- * is closed but resources aren't reaped yet. This can cause
- * spurious failure in setting up a new ring.
- */
- io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
- ACCT_LOCKED);
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
/*
@@ -8844,48 +8947,13 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
}
}
-static void io_uring_cancel_files(struct io_ring_ctx *ctx,
- struct task_struct *task,
- struct files_struct *files)
+static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ struct files_struct *files)
{
- while (!list_empty_careful(&ctx->inflight_list)) {
- struct io_task_cancel cancel = { .task = task, .files = files };
- struct io_kiocb *req;
- DEFINE_WAIT(wait);
- bool found = false;
+ struct io_task_cancel cancel = { .task = task, .files = files, };
- spin_lock_irq(&ctx->inflight_lock);
- list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
- if (req->task != task ||
- req->work.identity->files != files)
- continue;
- found = true;
- break;
- }
- if (found)
- prepare_to_wait(&task->io_uring->wait, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&ctx->inflight_lock);
-
- /* We need to keep going until we don't find a matching req */
- if (!found)
- break;
-
- io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
- io_poll_remove_all(ctx, task, files);
- io_kill_timeouts(ctx, task, files);
- /* cancellations _may_ trigger task work */
- io_run_task_work();
- schedule();
- finish_wait(&task->io_uring->wait, &wait);
- }
-}
-
-static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
- struct task_struct *task)
-{
while (1) {
- struct io_task_cancel cancel = { .task = task, .files = NULL, };
enum io_wq_cancel cret;
bool ret = false;
@@ -8896,26 +8964,65 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
}
/* SQPOLL thread does its own polling */
- if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+ if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
while (!list_empty_careful(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
}
}
- ret |= io_poll_remove_all(ctx, task, NULL);
- ret |= io_kill_timeouts(ctx, task, NULL);
+ ret |= io_poll_remove_all(ctx, task, files);
+ ret |= io_kill_timeouts(ctx, task, files);
ret |= io_run_task_work();
+ io_cqring_overflow_flush(ctx, true, task, files);
if (!ret)
break;
cond_resched();
}
}
-static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
+static int io_uring_count_inflight(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ struct files_struct *files)
{
- WARN_ON_ONCE(ctx->sqo_task != current);
+ struct io_kiocb *req;
+ int cnt = 0;
+
+ spin_lock_irq(&ctx->inflight_lock);
+ list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
+ cnt += io_match_task(req, task, files);
+ spin_unlock_irq(&ctx->inflight_lock);
+ return cnt;
+}
+static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+ struct task_struct *task,
+ struct files_struct *files)
+{
+ while (!list_empty_careful(&ctx->inflight_list)) {
+ DEFINE_WAIT(wait);
+ int inflight;
+
+ inflight = io_uring_count_inflight(ctx, task, files);
+ if (!inflight)
+ break;
+
+ io_uring_try_cancel_requests(ctx, task, files);
+
+ if (ctx->sq_data)
+ io_sq_thread_unpark(ctx->sq_data);
+ prepare_to_wait(&task->io_uring->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (inflight == io_uring_count_inflight(ctx, task, files))
+ schedule();
+ finish_wait(&task->io_uring->wait, &wait);
+ if (ctx->sq_data)
+ io_sq_thread_park(ctx->sq_data);
+ }
+}
+
+static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
+{
mutex_lock(&ctx->uring_lock);
ctx->sqo_dead = 1;
mutex_unlock(&ctx->uring_lock);
@@ -8936,7 +9043,6 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
struct task_struct *task = current;
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
- /* for SQPOLL only sqo_task has task notes */
io_disable_sqo_submit(ctx);
task = ctx->sq_data->thread;
atomic_inc(&task->io_uring->in_idle);
@@ -8944,21 +9050,13 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
}
io_cancel_defer_files(ctx, task, files);
- io_cqring_overflow_flush(ctx, true, task, files);
+ io_uring_cancel_files(ctx, task, files);
if (!files)
- __io_uring_cancel_task_requests(ctx, task);
- else
- io_uring_cancel_files(ctx, task, files);
+ io_uring_try_cancel_requests(ctx, task, NULL);
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
atomic_dec(&task->io_uring->in_idle);
- /*
- * If the files that are going away are the ones in the thread
- * identity, clear them out.
- */
- if (task->io_uring->identity->files == files)
- task->io_uring->identity->files = NULL;
io_sq_thread_unpark(ctx->sq_data);
}
}
@@ -8988,6 +9086,10 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
fput(file);
return ret;
}
+
+ /* one and only SQPOLL file note, held by sqo_task */
+ WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) &&
+ current != ctx->sqo_task);
}
tctx->last = file;
}
@@ -9044,29 +9146,39 @@ void __io_uring_files_cancel(struct files_struct *files)
static s64 tctx_inflight(struct io_uring_task *tctx)
{
- unsigned long index;
- struct file *file;
- s64 inflight;
-
- inflight = percpu_counter_sum(&tctx->inflight);
- if (!tctx->sqpoll)
- return inflight;
+ return percpu_counter_sum(&tctx->inflight);
+}
- /*
- * If we have SQPOLL rings, then we need to iterate and find them, and
- * add the pending count for those.
- */
- xa_for_each(&tctx->xa, index, file) {
- struct io_ring_ctx *ctx = file->private_data;
+static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
+{
+ struct io_uring_task *tctx;
+ s64 inflight;
+ DEFINE_WAIT(wait);
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- struct io_uring_task *__tctx = ctx->sqo_task->io_uring;
+ if (!ctx->sq_data)
+ return;
+ tctx = ctx->sq_data->thread->io_uring;
+ io_disable_sqo_submit(ctx);
- inflight += percpu_counter_sum(&__tctx->inflight);
- }
- }
+ atomic_inc(&tctx->in_idle);
+ do {
+ /* read completions before cancelations */
+ inflight = tctx_inflight(tctx);
+ if (!inflight)
+ break;
+ io_uring_cancel_task_requests(ctx, NULL);
- return inflight;
+ prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
+ /*
+ * If we've seen completions, retry without waiting. This
+ * avoids a race where a completion comes in before we did
+ * prepare_to_wait().
+ */
+ if (inflight == tctx_inflight(tctx))
+ schedule();
+ finish_wait(&tctx->wait, &wait);
+ } while (1);
+ atomic_dec(&tctx->in_idle);
}
/*
@@ -9082,6 +9194,15 @@ void __io_uring_task_cancel(void)
/* make sure overflow events are dropped */
atomic_inc(&tctx->in_idle);
+ /* trigger io_disable_sqo_submit() */
+ if (tctx->sqpoll) {
+ struct file *file;
+ unsigned long index;
+
+ xa_for_each(&tctx->xa, index, file)
+ io_uring_cancel_sqpoll(file->private_data);
+ }
+
do {
/* read completions before cancelations */
inflight = tctx_inflight(tctx);
@@ -9092,16 +9213,15 @@ void __io_uring_task_cancel(void)
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
/*
- * If we've seen completions, retry. This avoids a race where
- * a completion comes in before we did prepare_to_wait().
+ * If we've seen completions, retry without waiting. This
+ * avoids a race where a completion comes in before we did
+ * prepare_to_wait().
*/
- if (inflight != tctx_inflight(tctx))
- continue;
- schedule();
+ if (inflight == tctx_inflight(tctx))
+ schedule();
finish_wait(&tctx->wait, &wait);
} while (1);
- finish_wait(&tctx->wait, &wait);
atomic_dec(&tctx->in_idle);
io_uring_remove_task_files(tctx);
@@ -9112,6 +9232,11 @@ static int io_uring_flush(struct file *file, void *data)
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx = file->private_data;
+ if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
+ io_uring_cancel_task_requests(ctx, NULL);
+ io_req_caches_free(ctx, current);
+ }
+
if (!tctx)
return 0;
@@ -9128,7 +9253,10 @@ static int io_uring_flush(struct file *file, void *data)
if (ctx->flags & IORING_SETUP_SQPOLL) {
/* there is only one file note, which is owned by sqo_task */
- WARN_ON_ONCE((ctx->sqo_task == current) ==
+ WARN_ON_ONCE(ctx->sqo_task != current &&
+ xa_load(&tctx->xa, (unsigned long)file));
+ /* sqo_dead check is for when this happens after cancellation */
+ WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead &&
!xa_load(&tctx->xa, (unsigned long)file));
io_disable_sqo_submit(ctx);
@@ -9416,11 +9544,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
- struct fixed_file_table *table;
- struct file *f;
+ struct file *f = *io_fixed_file_slot(ctx->file_data, i);
- table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
- f = table->files[i & IORING_FILE_TABLE_MASK];
if (f)
seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
else
@@ -9576,7 +9701,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
struct user_struct *user = NULL;
struct io_ring_ctx *ctx;
struct file *file;
- bool limit_mem;
int ret;
if (!entries)
@@ -9617,26 +9741,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
}
user = get_uid(current_user());
- limit_mem = !capable(CAP_IPC_LOCK);
-
- if (limit_mem) {
- ret = __io_account_mem(user,
- ring_pages(p->sq_entries, p->cq_entries));
- if (ret) {
- free_uid(user);
- return ret;
- }
- }
ctx = io_ring_ctx_alloc(p);
if (!ctx) {
- if (limit_mem)
- __io_unaccount_mem(user, ring_pages(p->sq_entries,
- p->cq_entries));
free_uid(user);
return -ENOMEM;
}
ctx->compat = in_compat_syscall();
+ ctx->limit_mem = !capable(CAP_IPC_LOCK);
ctx->user = user;
ctx->creds = get_current_cred();
#ifdef CONFIG_AUDIT
@@ -9672,17 +9784,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
goto err;
}
#endif
-
- /*
- * Account memory _before_ installing the file descriptor. Once
- * the descriptor is installed, it can get closed at any time. Also
- * do this before hitting the general error path, as ring freeing
- * will un-account as well.
- */
- io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
- ACCT_LOCKED);
- ctx->limit_mem = limit_mem;
-
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;
@@ -9840,21 +9941,6 @@ static int io_register_personality(struct io_ring_ctx *ctx)
return ret;
}
-static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
-{
- struct io_identity *iod;
-
- iod = idr_remove(&ctx->personality_idr, id);
- if (iod) {
- put_cred(iod->creds);
- if (refcount_dec_and_test(&iod->count))
- kfree(iod);
- return 0;
- }
-
- return -EINVAL;
-}
-
static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args)
{
@@ -10012,13 +10098,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
switch (opcode) {
case IORING_REGISTER_BUFFERS:
- ret = io_sqe_buffer_register(ctx, arg, nr_args);
+ ret = io_sqe_buffers_register(ctx, arg, nr_args);
break;
case IORING_UNREGISTER_BUFFERS:
ret = -EINVAL;
if (arg || nr_args)
break;
- ret = io_sqe_buffer_unregister(ctx);
+ ret = io_sqe_buffers_unregister(ctx);
break;
case IORING_REGISTER_FILES:
ret = io_sqe_files_register(ctx, arg, nr_args);
@@ -10161,7 +10247,8 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
- req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+ req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT);
return 0;
};
__initcall(io_uring_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 933f234d5bec..e2c4991833b8 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -201,6 +201,34 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
iomap_dio_submit_bio(dio, iomap, bio, pos);
}
+/*
+ * Figure out the bio's operation flags from the dio request, the
+ * mapping, and whether or not we want FUA. Note that we can end up
+ * clearing the WRITE_FUA flag in the dio request.
+ *
+ * @dio: direct I/O request; dio->flags selects read vs. write, and
+ * IOMAP_DIO_WRITE_FUA is cleared in it when @use_fua is false
+ * on a write.
+ * @iomap: mapping being submitted; IOMAP_F_ZONE_APPEND in iomap->flags
+ * selects REQ_OP_ZONE_APPEND instead of REQ_OP_WRITE.
+ * @use_fua: whether a write should carry REQ_FUA.
+ *
+ * Reads return plain REQ_OP_READ (without REQ_SYNC | REQ_IDLE) and warn
+ * once if the mapping asks for zone append; dio->flags is left untouched
+ * on that path. Writes return the write or zone-append op combined with
+ * REQ_SYNC | REQ_IDLE, plus REQ_FUA when requested.
+ */
+static inline unsigned int
+iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua)
+{
+ unsigned int opflags = REQ_SYNC | REQ_IDLE;
+
+ if (!(dio->flags & IOMAP_DIO_WRITE)) {
+ WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND);
+ return REQ_OP_READ;
+ }
+
+ if (iomap->flags & IOMAP_F_ZONE_APPEND)
+ opflags |= REQ_OP_ZONE_APPEND;
+ else
+ opflags |= REQ_OP_WRITE;
+
+ if (use_fua)
+ opflags |= REQ_FUA;
+ else
+ dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+
+ return opflags;
+}
+
static loff_t
iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
struct iomap_dio *dio, struct iomap *iomap)
@@ -208,6 +236,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
unsigned int fs_block_size = i_blocksize(inode), pad;
unsigned int align = iov_iter_alignment(dio->submit.iter);
+ unsigned int bio_opf;
struct bio *bio;
bool need_zeroout = false;
bool use_fua = false;
@@ -250,11 +279,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
orig_count = iov_iter_count(dio->submit.iter);
iov_iter_truncate(dio->submit.iter, length);
- nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
- if (nr_pages <= 0) {
- ret = nr_pages;
+ if (!iov_iter_count(dio->submit.iter))
goto out;
- }
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
@@ -263,6 +289,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
iomap_dio_zero(dio, iomap, pos - pad, pad);
}
+ /*
+ * Set the operation flags early so that bio_iov_iter_get_pages
+ * can set up the page vector appropriately for a ZONE_APPEND
+ * operation.
+ */
+ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+
+ nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES);
do {
size_t n;
if (dio->error) {
@@ -278,6 +312,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
+ bio->bi_opf = bio_opf;
ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
if (unlikely(ret)) {
@@ -293,14 +328,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
n = bio->bi_iter.bi_size;
if (dio->flags & IOMAP_DIO_WRITE) {
- bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
- if (use_fua)
- bio->bi_opf |= REQ_FUA;
- else
- dio->flags &= ~IOMAP_DIO_WRITE_FUA;
task_io_account_write(n);
} else {
- bio->bi_opf = REQ_OP_READ;
if (dio->flags & IOMAP_DIO_DIRTY)
bio_set_pages_dirty(bio);
}
@@ -308,7 +337,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
dio->size += n;
copied += n;
- nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
+ nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
+ BIO_MAX_PAGES);
iomap_dio_submit_bio(dio, iomap, bio, pos);
pos += n;
} while (nr_pages);
@@ -420,23 +450,22 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- bool wait_for_completion)
+ unsigned int dio_flags)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos;
loff_t end = iocb->ki_pos + count - 1, ret = 0;
- unsigned int flags = IOMAP_DIRECT;
+ bool wait_for_completion =
+ is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
+ unsigned int iomap_flags = IOMAP_DIRECT;
struct blk_plug plug;
struct iomap_dio *dio;
if (!count)
return NULL;
- if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
- return ERR_PTR(-EIO);
-
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
return ERR_PTR(-ENOMEM);
@@ -461,7 +490,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iter_is_iovec(iter))
dio->flags |= IOMAP_DIO_DIRTY;
} else {
- flags |= IOMAP_WRITE;
+ iomap_flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
/* for data sync or sync, we need sync completion processing */
@@ -483,7 +512,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
ret = -EAGAIN;
goto out_free_dio;
}
- flags |= IOMAP_NOWAIT;
+ iomap_flags |= IOMAP_NOWAIT;
+ }
+
+ if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
+ ret = -EAGAIN;
+ if (pos >= dio->i_size || pos + count > dio->i_size)
+ goto out_free_dio;
+ iomap_flags |= IOMAP_OVERWRITE_ONLY;
}
ret = filemap_write_and_wait_range(mapping, pos, end);
@@ -514,7 +550,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
do {
- ret = iomap_apply(inode, pos, count, flags, ops, dio,
+ ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio,
iomap_dio_actor);
if (ret <= 0) {
/* magic error code to fall back to buffered I/O */
@@ -598,11 +634,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
- bool wait_for_completion)
+ unsigned int dio_flags)
{
struct iomap_dio *dio;
- dio = __iomap_dio_rw(iocb, iter, ops, dops, wait_for_completion);
+ dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
if (IS_ERR_OR_NULL(dio))
return PTR_ERR_OR_ZERO(dio);
return iomap_dio_complete(dio);
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index f0fe641893a5..b9e6a7ec78be 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -152,6 +152,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *file,
printk(KERN_NOTICE "iso9660: Corrupted directory entry"
" in block %lu of inode %lu\n", block,
inode->i_ino);
+ brelse(bh);
return -EIO;
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index ec90773527ee..21edc423b79f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -339,6 +339,7 @@ static int parse_options(char *options, struct iso9660_options *popt)
{
char *p;
int option;
+ unsigned int uv;
popt->map = 'n';
popt->rock = 1;
@@ -434,17 +435,17 @@ static int parse_options(char *options, struct iso9660_options *popt)
case Opt_ignore:
break;
case Opt_uid:
- if (match_int(&args[0], &option))
+ if (match_uint(&args[0], &uv))
return 0;
- popt->uid = make_kuid(current_user_ns(), option);
+ popt->uid = make_kuid(current_user_ns(), uv);
if (!uid_valid(popt->uid))
return 0;
popt->uid_set = 1;
break;
case Opt_gid:
- if (match_int(&args[0], &option))
+ if (match_uint(&args[0], &uv))
return 0;
- popt->gid = make_kgid(current_user_ns(), option);
+ popt->gid = make_kgid(current_user_ns(), uv);
if (!gid_valid(popt->gid))
return 0;
popt->gid_set = 1;
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 402769881c32..58f80e1b3ac0 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -102,6 +102,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
printk(KERN_NOTICE "iso9660: Corrupted directory entry"
" in block %lu of inode %lu\n", block,
dir->i_ino);
+ brelse(bh);
return 0;
}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 472932b9e6bc..63b526d44886 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -416,7 +416,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
* jbd2_cleanup_journal_tail() doesn't get called all that often.
*/
if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+ blkdev_issue_flush(journal->j_fs_dev);
return __jbd2_update_log_tail(journal, first_tid, blocknr);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b121d7d434c6..3cc4ab2ba7f4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -825,7 +825,7 @@ start_journal_io:
if (commit_transaction->t_need_data_flush &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+ blkdev_issue_flush(journal->j_fs_dev);
/* Done it all: now write the commit record asynchronously. */
if (jbd2_has_feature_async_commit(journal)) {
@@ -932,7 +932,7 @@ start_journal_io:
stats.run.rs_blocks_logged++;
if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
- blkdev_issue_flush(journal->j_dev, GFP_NOFS);
+ blkdev_issue_flush(journal->j_dev);
}
if (err)
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index dc0694fcfcd1..69f18fe20923 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -326,7 +326,7 @@ int jbd2_journal_recover(journal_t *journal)
err = err2;
/* Make sure all replayed data is on permanent storage */
if (journal->j_flags & JBD2_BARRIER) {
- err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL);
+ err2 = blkdev_issue_flush(journal->j_fs_dev);
if (!err)
err = err2;
}
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 406d9cc84ba8..79e771ab624f 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -37,6 +37,9 @@ static int jffs2_rtime_compress(unsigned char *data_in,
int outpos = 0;
int pos=0;
+ if (*dstlen <= 3)
+ return -1;
+
memset(positions,0,sizeof(positions));
while (pos < (*sourcelen) && outpos <= (*dstlen)-2) {
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index be7c8a6a5748..4fe64519870f 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -783,6 +783,8 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
dbg_summary("Writing unknown RWCOMPAT_COPY node type %x\n",
je16_to_cpu(temp->u.nodetype));
jffs2_sum_disable_collecting(c->summary);
+ /* The above call removes the list, nothing more to do */
+ goto bail_rwcompat;
} else {
BUG(); /* unknown node in summary information */
}
@@ -794,6 +796,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
c->summary->sum_num--;
}
+ bail_rwcompat:
jffs2_sum_reset_collected(c->summary);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 94b7c1cb5ceb..7aee15608619 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1656,7 +1656,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
} else if (rc == -ENOSPC) {
/* search for next smaller log2 block */
l2nb = BLKSTOL2(nblocks) - 1;
- nblocks = 1 << l2nb;
+ nblocks = 1LL << l2nb;
} else {
/* Trim any already allocated blocks */
jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 1e899298f7f0..b5d702df7111 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -268,5 +268,6 @@
* fsck() must be run to repair
*/
#define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */
+#define FM_STATE_MAX 0x0000000f /* max value of s_state */
#endif /* _H_JFS_FILSYS */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 2935d4c776ec..5d7d7170c03c 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -37,6 +37,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
+#include <linux/log2.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
@@ -366,6 +367,15 @@ static int chkSuper(struct super_block *sb)
sbi->bsize = bsize;
sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize);
+ /* check some fields for possible corruption */
+ if (sbi->l2bsize != ilog2((u32)bsize) ||
+ j_sb->pad != 0 ||
+ le32_to_cpu(j_sb->s_state) > FM_STATE_MAX) {
+ rc = -EINVAL;
+ jfs_err("jfs_mount: Mount Failure: superblock is corrupt!");
+ goto out;
+ }
+
/*
* For now, ignore s_pbsize, l2bfactor. All I/O going through buffer
* cache.
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index dca8edd2378c..053295cd7bc6 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -148,10 +148,10 @@ static struct {
/*
* forward references
*/
-static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
- struct tlock * tlck, struct commit * cd);
-static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
- struct tlock * tlck);
+static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd,
+ struct tlock *tlck, struct commit *cd);
+static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd,
+ struct tlock *tlck);
static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
struct tlock * tlck);
static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
@@ -159,8 +159,8 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
static void txAllocPMap(struct inode *ip, struct maplock * maplock,
struct tblock * tblk);
static void txForce(struct tblock * tblk);
-static int txLog(struct jfs_log * log, struct tblock * tblk,
- struct commit * cd);
+static void txLog(struct jfs_log *log, struct tblock *tblk,
+ struct commit *cd);
static void txUpdateMap(struct tblock * tblk);
static void txRelease(struct tblock * tblk);
static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
@@ -1256,8 +1256,7 @@ int txCommit(tid_t tid, /* transaction identifier */
*
* txUpdateMap() resets XAD_NEW in XAD.
*/
- if ((rc = txLog(log, tblk, &cd)))
- goto TheEnd;
+ txLog(log, tblk, &cd);
/*
* Ensure that inode isn't reused before
@@ -1365,9 +1364,8 @@ int txCommit(tid_t tid, /* transaction identifier */
*
* RETURN :
*/
-static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
+static void txLog(struct jfs_log *log, struct tblock *tblk, struct commit *cd)
{
- int rc = 0;
struct inode *ip;
lid_t lid;
struct tlock *tlck;
@@ -1414,7 +1412,7 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
}
}
- return rc;
+ return;
}
/*
@@ -1422,10 +1420,9 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
*
* function: log inode tlock and format maplock to update bmap;
*/
-static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
- struct tlock * tlck, struct commit * cd)
+static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd,
+ struct tlock *tlck, struct commit *cd)
{
- int rc = 0;
struct metapage *mp;
pxd_t *pxd;
struct pxd_lock *pxdlock;
@@ -1527,7 +1524,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
}
#endif /* _JFS_WIP */
- return rc;
+ return;
}
/*
@@ -1535,8 +1532,8 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
*
* function: log data tlock
*/
-static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
- struct tlock * tlck)
+static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd,
+ struct tlock *tlck)
{
struct metapage *mp;
pxd_t *pxd;
@@ -1562,7 +1559,7 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
metapage_homeok(mp);
discard_metapage(mp);
tlck->mp = NULL;
- return 0;
+ return;
}
PXDaddress(pxd, mp->index);
@@ -1573,7 +1570,7 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
/* mark page as homeward bound */
tlck->flag |= tlckWRITEPAGE;
- return 0;
+ return;
}
/*
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f277d023ebcd..c75719312147 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
+#include <linux/uio.h>
#include "kernfs-internal.h"
@@ -180,11 +181,10 @@ static const struct seq_operations kernfs_seq_ops = {
* it difficult to use seq_file. Implement simplistic custom buffering for
* bin files.
*/
-static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
- char __user *user_buf, size_t count,
- loff_t *ppos)
+static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- ssize_t len = min_t(size_t, count, PAGE_SIZE);
+ struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
+ ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
const struct kernfs_ops *ops;
char *buf;
@@ -210,7 +210,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
of->event = atomic_read(&of->kn->attr.open->event);
ops = kernfs_ops(of->kn);
if (ops->read)
- len = ops->read(of, buf, len, *ppos);
+ len = ops->read(of, buf, len, iocb->ki_pos);
else
len = -EINVAL;
@@ -220,12 +220,12 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
if (len < 0)
goto out_free;
- if (copy_to_user(user_buf, buf, len)) {
+ if (copy_to_iter(buf, len, iter) != len) {
len = -EFAULT;
goto out_free;
}
- *ppos += len;
+ iocb->ki_pos += len;
out_free:
if (buf == of->prealloc_buf)
@@ -235,31 +235,14 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
return len;
}
-/**
- * kernfs_fop_read - kernfs vfs read callback
- * @file: file pointer
- * @user_buf: data to write
- * @count: number of bytes
- * @ppos: starting offset
- */
-static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
+static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct kernfs_open_file *of = kernfs_of(file);
-
- if (of->kn->flags & KERNFS_HAS_SEQ_SHOW)
- return seq_read(file, user_buf, count, ppos);
- else
- return kernfs_file_direct_read(of, user_buf, count, ppos);
+ if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW)
+ return seq_read_iter(iocb, iter);
+ return kernfs_file_read_iter(iocb, iter);
}
-/**
- * kernfs_fop_write - kernfs vfs write callback
- * @file: file pointer
- * @user_buf: data to write
- * @count: number of bytes
- * @ppos: starting offset
- *
+/*
* Copy data in from userland and pass it to the matching kernfs write
* operation.
*
@@ -269,20 +252,18 @@ static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf,
* modify only the value you're changing, then write entire buffer
* back.
*/
-static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
+static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct kernfs_open_file *of = kernfs_of(file);
+ struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
+ ssize_t len = iov_iter_count(iter);
const struct kernfs_ops *ops;
- ssize_t len;
char *buf;
if (of->atomic_write_len) {
- len = count;
if (len > of->atomic_write_len)
return -E2BIG;
} else {
- len = min_t(size_t, count, PAGE_SIZE);
+ len = min_t(size_t, len, PAGE_SIZE);
}
buf = of->prealloc_buf;
@@ -293,7 +274,7 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
if (!buf)
return -ENOMEM;
- if (copy_from_user(buf, user_buf, len)) {
+ if (copy_from_iter(buf, len, iter) != len) {
len = -EFAULT;
goto out_free;
}
@@ -312,7 +293,7 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
ops = kernfs_ops(of->kn);
if (ops->write)
- len = ops->write(of, buf, len, *ppos);
+ len = ops->write(of, buf, len, iocb->ki_pos);
else
len = -EINVAL;
@@ -320,7 +301,7 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
mutex_unlock(&of->mutex);
if (len > 0)
- *ppos += len;
+ iocb->ki_pos += len;
out_free:
if (buf == of->prealloc_buf)
@@ -673,7 +654,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
/*
* Write path needs to access atomic_write_len outside active reference.
- * Cache it in open_file. See kernfs_fop_write() for details.
+ * Cache it in open_file. See kernfs_fop_write_iter() for details.
*/
of->atomic_write_len = ops->atomic_write_len;
@@ -960,14 +941,16 @@ void kernfs_notify(struct kernfs_node *kn)
EXPORT_SYMBOL_GPL(kernfs_notify);
const struct file_operations kernfs_file_fops = {
- .read = kernfs_fop_read,
- .write = kernfs_fop_write,
+ .read_iter = kernfs_fop_read_iter,
+ .write_iter = kernfs_fop_write_iter,
.llseek = generic_file_llseek,
.mmap = kernfs_fop_mmap,
.open = kernfs_fop_open,
.release = kernfs_fop_release,
.poll = kernfs_fop_poll,
.fsync = noop_fsync,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
};
/**
diff --git a/fs/libfs.c b/fs/libfs.c
index 967aefda6ee3..e2de5401abca 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1119,7 +1119,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end,
err = __generic_file_fsync(file, start, end, datasync);
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_file_fsync);
@@ -1216,11 +1216,6 @@ static int anon_set_page_dirty(struct page *page)
return 0;
};
-/*
- * A single inode exists for all anon_inode files. Contrary to pipes,
- * anon_inode inodes have no associated per-instance data, so we need
- * only allocate one of them.
- */
struct inode *alloc_anon_inode(struct super_block *s)
{
static const struct address_space_operations anon_aops = {
@@ -1392,8 +1387,8 @@ static bool needs_casefold(const struct inode *dir)
*
* Return: 0 if names match, 1 if mismatch, or -ERRNO
*/
-int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name)
+static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name)
{
const struct dentry *parent = READ_ONCE(dentry->d_parent);
const struct inode *dir = READ_ONCE(parent->d_inode);
@@ -1430,7 +1425,6 @@ fallback:
return 1;
return !!memcmp(str, name->name, len);
}
-EXPORT_SYMBOL(generic_ci_d_compare);
/**
* generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
@@ -1439,7 +1433,7 @@ EXPORT_SYMBOL(generic_ci_d_compare);
*
* Return: 0 if hash was successful or unchanged, and -EINVAL on error
*/
-int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct inode *dir = READ_ONCE(dentry->d_inode);
struct super_block *sb = dentry->d_sb;
@@ -1454,7 +1448,6 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
return -EINVAL;
return 0;
}
-EXPORT_SYMBOL(generic_ci_d_hash);
static const struct dentry_operations generic_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index fa41dda39925..4c10fb5138f1 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -512,6 +512,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "NULL",
},
[NLMPROC_TEST] = {
.pc_func = nlm4svc_proc_test,
@@ -520,6 +521,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+2+No+Rg,
+ .pc_name = "TEST",
},
[NLMPROC_LOCK] = {
.pc_func = nlm4svc_proc_lock,
@@ -528,6 +530,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
+ .pc_name = "LOCK",
},
[NLMPROC_CANCEL] = {
.pc_func = nlm4svc_proc_cancel,
@@ -536,6 +539,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
+ .pc_name = "CANCEL",
},
[NLMPROC_UNLOCK] = {
.pc_func = nlm4svc_proc_unlock,
@@ -544,6 +548,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
+ .pc_name = "UNLOCK",
},
[NLMPROC_GRANTED] = {
.pc_func = nlm4svc_proc_granted,
@@ -552,6 +557,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
+ .pc_name = "GRANTED",
},
[NLMPROC_TEST_MSG] = {
.pc_func = nlm4svc_proc_test_msg,
@@ -560,6 +566,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "TEST_MSG",
},
[NLMPROC_LOCK_MSG] = {
.pc_func = nlm4svc_proc_lock_msg,
@@ -568,6 +575,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "LOCK_MSG",
},
[NLMPROC_CANCEL_MSG] = {
.pc_func = nlm4svc_proc_cancel_msg,
@@ -576,6 +584,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "CANCEL_MSG",
},
[NLMPROC_UNLOCK_MSG] = {
.pc_func = nlm4svc_proc_unlock_msg,
@@ -584,6 +593,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "UNLOCK_MSG",
},
[NLMPROC_GRANTED_MSG] = {
.pc_func = nlm4svc_proc_granted_msg,
@@ -592,6 +602,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "GRANTED_MSG",
},
[NLMPROC_TEST_RES] = {
.pc_func = nlm4svc_proc_null,
@@ -600,6 +611,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "TEST_RES",
},
[NLMPROC_LOCK_RES] = {
.pc_func = nlm4svc_proc_null,
@@ -608,6 +620,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "LOCK_RES",
},
[NLMPROC_CANCEL_RES] = {
.pc_func = nlm4svc_proc_null,
@@ -616,6 +629,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "CANCEL_RES",
},
[NLMPROC_UNLOCK_RES] = {
.pc_func = nlm4svc_proc_null,
@@ -624,6 +638,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "UNLOCK_RES",
},
[NLMPROC_GRANTED_RES] = {
.pc_func = nlm4svc_proc_granted_res,
@@ -632,6 +647,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "GRANTED_RES",
},
[NLMPROC_NSM_NOTIFY] = {
.pc_func = nlm4svc_proc_sm_notify,
@@ -640,6 +656,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_reboot),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "SM_NOTIFY",
},
[17] = {
.pc_func = nlm4svc_proc_unused,
@@ -648,6 +665,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
+ .pc_name = "UNUSED",
},
[18] = {
.pc_func = nlm4svc_proc_unused,
@@ -656,6 +674,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
+ .pc_name = "UNUSED",
},
[19] = {
.pc_func = nlm4svc_proc_unused,
@@ -664,6 +683,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
+ .pc_name = "UNUSED",
},
[NLMPROC_SHARE] = {
.pc_func = nlm4svc_proc_share,
@@ -672,6 +692,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
+ .pc_name = "SHARE",
},
[NLMPROC_UNSHARE] = {
.pc_func = nlm4svc_proc_unshare,
@@ -680,6 +701,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
+ .pc_name = "UNSHARE",
},
[NLMPROC_NM_LOCK] = {
.pc_func = nlm4svc_proc_nm_lock,
@@ -688,6 +710,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
+ .pc_name = "NM_LOCK",
},
[NLMPROC_FREE_ALL] = {
.pc_func = nlm4svc_proc_free_all,
@@ -696,5 +719,6 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "FREE_ALL",
},
};
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 50855f2c1f4b..4ae4b63b5392 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -554,6 +554,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "NULL",
},
[NLMPROC_TEST] = {
.pc_func = nlmsvc_proc_test,
@@ -562,6 +563,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+2+No+Rg,
+ .pc_name = "TEST",
},
[NLMPROC_LOCK] = {
.pc_func = nlmsvc_proc_lock,
@@ -570,6 +572,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
+ .pc_name = "LOCK",
},
[NLMPROC_CANCEL] = {
.pc_func = nlmsvc_proc_cancel,
@@ -578,6 +581,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
+ .pc_name = "CANCEL",
},
[NLMPROC_UNLOCK] = {
.pc_func = nlmsvc_proc_unlock,
@@ -586,6 +590,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
+ .pc_name = "UNLOCK",
},
[NLMPROC_GRANTED] = {
.pc_func = nlmsvc_proc_granted,
@@ -594,6 +599,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
+ .pc_name = "GRANTED",
},
[NLMPROC_TEST_MSG] = {
.pc_func = nlmsvc_proc_test_msg,
@@ -602,6 +608,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "TEST_MSG",
},
[NLMPROC_LOCK_MSG] = {
.pc_func = nlmsvc_proc_lock_msg,
@@ -610,6 +617,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "LOCK_MSG",
},
[NLMPROC_CANCEL_MSG] = {
.pc_func = nlmsvc_proc_cancel_msg,
@@ -618,6 +626,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "CANCEL_MSG",
},
[NLMPROC_UNLOCK_MSG] = {
.pc_func = nlmsvc_proc_unlock_msg,
@@ -626,6 +635,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "UNLOCK_MSG",
},
[NLMPROC_GRANTED_MSG] = {
.pc_func = nlmsvc_proc_granted_msg,
@@ -634,6 +644,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "GRANTED_MSG",
},
[NLMPROC_TEST_RES] = {
.pc_func = nlmsvc_proc_null,
@@ -642,6 +653,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "TEST_RES",
},
[NLMPROC_LOCK_RES] = {
.pc_func = nlmsvc_proc_null,
@@ -650,6 +662,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "LOCK_RES",
},
[NLMPROC_CANCEL_RES] = {
.pc_func = nlmsvc_proc_null,
@@ -658,6 +671,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "CANCEL_RES",
},
[NLMPROC_UNLOCK_RES] = {
.pc_func = nlmsvc_proc_null,
@@ -666,6 +680,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "UNLOCK_RES",
},
[NLMPROC_GRANTED_RES] = {
.pc_func = nlmsvc_proc_granted_res,
@@ -674,6 +689,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_res),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "GRANTED_RES",
},
[NLMPROC_NSM_NOTIFY] = {
.pc_func = nlmsvc_proc_sm_notify,
@@ -682,6 +698,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_reboot),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "SM_NOTIFY",
},
[17] = {
.pc_func = nlmsvc_proc_unused,
@@ -690,6 +707,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "UNUSED",
},
[18] = {
.pc_func = nlmsvc_proc_unused,
@@ -698,6 +716,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "UNUSED",
},
[19] = {
.pc_func = nlmsvc_proc_unused,
@@ -706,6 +725,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_void),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = St,
+ .pc_name = "UNUSED",
},
[NLMPROC_SHARE] = {
.pc_func = nlmsvc_proc_share,
@@ -714,6 +734,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
+ .pc_name = "SHARE",
},
[NLMPROC_UNSHARE] = {
.pc_func = nlmsvc_proc_unshare,
@@ -722,6 +743,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St+1,
+ .pc_name = "UNSHARE",
},
[NLMPROC_NM_LOCK] = {
.pc_func = nlmsvc_proc_nm_lock,
@@ -730,6 +752,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_res),
.pc_xdrressize = Ck+St,
+ .pc_name = "NM_LOCK",
},
[NLMPROC_FREE_ALL] = {
.pc_func = nlmsvc_proc_free_all,
@@ -738,5 +761,6 @@ const struct svc_procedure nlmsvc_procedures[24] = {
.pc_argsize = sizeof(struct nlm_args),
.pc_ressize = sizeof(struct nlm_void),
.pc_xdrressize = 0,
+ .pc_name = "FREE_ALL",
},
};
diff --git a/fs/namei.c b/fs/namei.c
index dbf53b325ac9..216f16e74351 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -680,6 +680,11 @@ static inline bool legitimize_path(struct nameidata *nd,
static bool legitimize_links(struct nameidata *nd)
{
int i;
+ if (unlikely(nd->flags & LOOKUP_CACHED)) {
+ drop_links(nd);
+ nd->depth = 0;
+ return false;
+ }
for (i = 0; i < nd->depth; i++) {
struct saved *last = nd->stack + i;
if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
@@ -719,17 +724,17 @@ static bool legitimize_root(struct nameidata *nd)
*/
/**
- * unlazy_walk - try to switch to ref-walk mode.
+ * try_to_unlazy - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
- * Returns: 0 on success, -ECHILD on failure
+ * Returns: true on success, false on failure
*
- * unlazy_walk attempts to legitimize the current nd->path and nd->root
+ * try_to_unlazy attempts to legitimize the current nd->path and nd->root
* for ref-walk mode.
* Must be called from rcu-walk context.
- * Nothing should touch nameidata between unlazy_walk() failure and
+ * Nothing should touch nameidata between try_to_unlazy() failure and
* terminate_walk().
*/
-static int unlazy_walk(struct nameidata *nd)
+static bool try_to_unlazy(struct nameidata *nd)
{
struct dentry *parent = nd->path.dentry;
@@ -744,30 +749,30 @@ static int unlazy_walk(struct nameidata *nd)
goto out;
rcu_read_unlock();
BUG_ON(nd->inode != parent->d_inode);
- return 0;
+ return true;
out1:
nd->path.mnt = NULL;
nd->path.dentry = NULL;
out:
rcu_read_unlock();
- return -ECHILD;
+ return false;
}
/**
- * unlazy_child - try to switch to ref-walk mode.
+ * try_to_unlazy_next - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
- * @dentry: child of nd->path.dentry
- * @seq: seq number to check dentry against
- * Returns: 0 on success, -ECHILD on failure
+ * @dentry: next dentry to step into
+ * @seq: seq number to check @dentry against
+ * Returns: true on success, false on failure
*
- * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
- * for ref-walk mode. @dentry must be a path found by a do_lookup call on
- * @nd. Must be called from rcu-walk context.
- * Nothing should touch nameidata between unlazy_child() failure and
+ * Similar to try_to_unlazy(), but here we have the next dentry already
+ * picked by rcu-walk and want to legitimize that in addition to the current
+ * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context.
+ * Nothing should touch nameidata between try_to_unlazy_next() failure and
* terminate_walk().
*/
-static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
+static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
BUG_ON(!(nd->flags & LOOKUP_RCU));
@@ -797,7 +802,7 @@ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned se
if (unlikely(!legitimize_root(nd)))
goto out_dput;
rcu_read_unlock();
- return 0;
+ return true;
out2:
nd->path.mnt = NULL;
@@ -805,11 +810,11 @@ out1:
nd->path.dentry = NULL;
out:
rcu_read_unlock();
- return -ECHILD;
+ return false;
out_dput:
rcu_read_unlock();
dput(dentry);
- return -ECHILD;
+ return false;
}
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
@@ -842,7 +847,8 @@ static int complete_walk(struct nameidata *nd)
*/
if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
nd->root.mnt = NULL;
- if (unlikely(unlazy_walk(nd)))
+ nd->flags &= ~LOOKUP_CACHED;
+ if (!try_to_unlazy(nd))
return -ECHILD;
}
@@ -1448,7 +1454,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
return -ENOENT;
if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
return 0;
- if (unlazy_child(nd, dentry, seq))
+ if (!try_to_unlazy_next(nd, dentry, seq))
return -ECHILD;
// *path might've been clobbered by __follow_mount_rcu()
path->mnt = nd->path.mnt;
@@ -1542,7 +1548,7 @@ static struct dentry *lookup_fast(struct nameidata *nd,
unsigned seq;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
if (unlikely(!dentry)) {
- if (unlazy_walk(nd))
+ if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
return NULL;
}
@@ -1569,9 +1575,9 @@ static struct dentry *lookup_fast(struct nameidata *nd,
status = d_revalidate(dentry, nd->flags);
if (likely(status > 0))
return dentry;
- if (unlazy_child(nd, dentry, seq))
+ if (!try_to_unlazy_next(nd, dentry, seq))
return ERR_PTR(-ECHILD);
- if (unlikely(status == -ECHILD))
+ if (status == -ECHILD)
/* we'd been told to redo it in non-rcu mode */
status = d_revalidate(dentry, nd->flags);
} else {
@@ -1643,12 +1649,9 @@ static inline int may_lookup(struct user_namespace *mnt_userns,
struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
- int err = inode_permission(mnt_userns, nd->inode,
- MAY_EXEC | MAY_NOT_BLOCK);
- if (err != -ECHILD)
+ int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+ if (err != -ECHILD || !try_to_unlazy(nd))
return err;
- if (unlazy_walk(nd))
- return -ECHILD;
}
return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
}
@@ -1670,7 +1673,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
// unlazy even if we fail to grab the link - cleanup needs it
bool grabbed_link = legitimize_path(nd, link, seq);
- if (unlazy_walk(nd) != 0 || !grabbed_link)
+ if (!try_to_unlazy(nd) || !grabbed_link)
return -ECHILD;
if (nd_alloc_stack(nd))
@@ -1712,7 +1715,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
touch_atime(&last->link);
cond_resched();
} else if (atime_needs_update(&last->link, inode)) {
- if (unlikely(unlazy_walk(nd)))
+ if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
touch_atime(&last->link);
}
@@ -1729,11 +1732,8 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
get = inode->i_op->get_link;
if (nd->flags & LOOKUP_RCU) {
res = get(NULL, inode, &last->done);
- if (res == ERR_PTR(-ECHILD)) {
- if (unlikely(unlazy_walk(nd)))
- return ERR_PTR(-ECHILD);
+ if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
res = get(link->dentry, inode, &last->done);
- }
} else {
res = get(link->dentry, inode, &last->done);
}
@@ -2275,7 +2275,7 @@ OK:
}
if (unlikely(!d_can_lookup(nd->path.dentry))) {
if (nd->flags & LOOKUP_RCU) {
- if (unlazy_walk(nd))
+ if (!try_to_unlazy(nd))
return -ECHILD;
}
return -ENOTDIR;
@@ -2289,6 +2289,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
int error;
const char *s = nd->name->name;
+ /* LOOKUP_CACHED requires RCU, ask caller to retry */
+ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
+ return ERR_PTR(-EAGAIN);
+
if (!*s)
flags &= ~LOOKUP_RCU;
if (flags & LOOKUP_RCU)
@@ -3238,7 +3242,6 @@ static const char *open_last_lookups(struct nameidata *nd,
struct inode *inode;
struct dentry *dentry;
const char *res;
- int error;
nd->flags |= op->intent;
@@ -3262,9 +3265,8 @@ static const char *open_last_lookups(struct nameidata *nd,
} else {
/* create side of things */
if (nd->flags & LOOKUP_RCU) {
- error = unlazy_walk(nd);
- if (unlikely(error))
- return ERR_PTR(error);
+ if (!try_to_unlazy(nd))
+ return ERR_PTR(-ECHILD);
}
audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
/* trailing slashes? */
@@ -3273,9 +3275,7 @@ static const char *open_last_lookups(struct nameidata *nd,
}
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
- error = mnt_want_write(nd->path.mnt);
- if (!error)
- got_write = true;
+ got_write = !mnt_want_write(nd->path.mnt);
/*
* do _not_ fail yet - we might not need that or fail with
* a different error; let lookup_open() decide; we'll be
@@ -3454,10 +3454,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
audit_inode(nd->name, child, 0);
/* Don't check for other permissions, the inode was just created */
error = may_open(mnt_userns, &path, 0, op->open_flag);
- if (error)
- goto out2;
- file->f_path.mnt = path.mnt;
- error = finish_open(file, child, NULL);
+ if (!error)
+ error = vfs_open(&path, file);
out2:
mnt_drop_write(path.mnt);
out:
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 3be6836074ae..1a96ce28efb0 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -123,11 +123,6 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
npg = min(npg, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOIO, npg);
- if (!bio && (current->flags & PF_MEMALLOC)) {
- while (!bio && (npg /= 2))
- bio = bio_alloc(GFP_NOIO, npg);
- }
-
if (bio) {
bio->bi_iter.bi_sector = disk_sector;
bio_set_dev(bio, bdev);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 79ff172eb1c8..c5348ba81129 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -1060,6 +1060,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = {
.pc_decode = nfs4_decode_void,
.pc_encode = nfs4_encode_void,
.pc_xdrressize = 1,
+ .pc_name = "NULL",
},
[CB_COMPOUND] = {
.pc_func = nfs4_callback_compound,
@@ -1067,6 +1068,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = {
.pc_argsize = 256,
.pc_ressize = 256,
.pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
+ .pc_name = "COMPOUND",
}
};
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index 7412bb164fa7..f2b34cfe286c 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -167,10 +167,28 @@ out:
return parent;
}
+static u64 nfs_fetch_iversion(struct inode *inode)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+
+ /* Is this the right call?: */
+ nfs_revalidate_inode(server, inode);
+ /*
+ * Also, note we're ignoring any returned error. That seems to be
+ * the practice for cache consistency information elsewhere in
+ * the server, but I'm not sure why.
+ */
+ if (server->nfs_client->rpc_ops->version >= 4)
+ return inode_peek_iversion_raw(inode);
+ else
+ return time_to_chattr(&inode->i_ctime);
+}
+
const struct export_operations nfs_export_ops = {
.encode_fh = nfs_encode_fh,
.fh_to_dentry = nfs_fh_to_dentry,
.get_parent = nfs_get_parent,
+ .fetch_iversion = nfs_fetch_iversion,
.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
EXPORT_OP_NOATOMIC_ATTR,
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 57b3821d975a..441a2fa073c8 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -420,7 +420,9 @@ static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = {
*/
void nfs42_ssc_register_ops(void)
{
+#ifdef CONFIG_NFSD_V4
nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl);
+#endif
}
/**
@@ -431,7 +433,9 @@ void nfs42_ssc_register_ops(void)
*/
void nfs42_ssc_unregister_ops(void)
{
+#ifdef CONFIG_NFSD_V4
nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl);
+#endif
}
#endif /* CONFIG_NFS_V4_2 */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4f274f21c4ab..af64b4e6fd1f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -324,6 +324,21 @@ pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
return NULL;
}
+/*
+ * Compare 2 layout stateid sequence ids, to see which is newer,
+ * taking into account wraparound issues.
+ */
+static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
+{
+ return (s32)(s1 - s2) > 0;
+}
+
+static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq)
+{
+ if (pnfs_seqid_is_newer(newseq, lo->plh_barrier))
+ lo->plh_barrier = newseq;
+}
+
static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
u32 seq)
@@ -335,6 +350,7 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
if (seq != 0) {
WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
lo->plh_return_seq = seq;
+ pnfs_barrier_update(lo, seq);
}
}
@@ -639,15 +655,6 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
return rv;
}
-/*
- * Compare 2 layout stateid sequence ids, to see which is newer,
- * taking into account wraparound issues.
- */
-static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
-{
- return (s32)(s1 - s2) > 0;
-}
-
static bool
pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
const struct pnfs_layout_range *recall_range)
@@ -984,8 +991,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
new_barrier = be32_to_cpu(new->seqid);
else if (new_barrier == 0)
return;
- if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
- lo->plh_barrier = new_barrier;
+ pnfs_barrier_update(lo, new_barrier);
}
static bool
@@ -994,7 +1000,7 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
{
u32 seqid = be32_to_cpu(stateid->seqid);
- return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
+ return !pnfs_seqid_is_newer(seqid, lo->plh_barrier) && lo->plh_barrier;
}
/* lget is set to 1 if called from inside send_layoutget call chain */
@@ -1183,20 +1189,17 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
return false;
set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
pnfs_get_layout_hdr(lo);
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ *cred = get_cred(lo->plh_lc_cred);
if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
- nfs4_stateid_copy(stateid, &lo->plh_stateid);
- *cred = get_cred(lo->plh_lc_cred);
if (lo->plh_return_seq != 0)
stateid->seqid = cpu_to_be32(lo->plh_return_seq);
if (iomode != NULL)
*iomode = lo->plh_return_iomode;
pnfs_clear_layoutreturn_info(lo);
- return true;
- }
- nfs4_stateid_copy(stateid, &lo->plh_stateid);
- *cred = get_cred(lo->plh_lc_cred);
- if (iomode != NULL)
+ } else if (iomode != NULL)
*iomode = IOMODE_ANY;
+ pnfs_barrier_update(lo, be32_to_cpu(stateid->seqid));
return true;
}
@@ -1909,6 +1912,11 @@ static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
wake_up_var(&lo->plh_outstanding);
}
+static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
+{
+ return test_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags);
+}
+
static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
{
unsigned long *bitlock = &lo->plh_flags;
@@ -2383,23 +2391,34 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget;
}
- if (!pnfs_layout_is_valid(lo)) {
- /* We have a completely new layout */
- pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
- } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+ if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+ if (!pnfs_layout_is_valid(lo) &&
+ pnfs_is_first_layoutget(lo))
+ lo->plh_barrier = 0;
dprintk("%s forget reply due to sequence\n", __func__);
goto out_forget;
}
pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
- } else {
+ } else if (pnfs_layout_is_valid(lo)) {
/*
* We got an entirely new state ID. Mark all segments for the
* inode invalid, and retry the layoutget
*/
- pnfs_mark_layout_stateid_invalid(lo, &free_me);
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .length = NFS4_MAX_UINT64,
+ };
+ pnfs_set_plh_return_info(lo, IOMODE_ANY, 0);
+ pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ &range, 0);
goto out_forget;
+ } else {
+ /* We have a completely new layout */
+ if (!pnfs_is_first_layoutget(lo))
+ goto out_forget;
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
}
pnfs_get_lseg(lseg);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 4034102010f0..c7a924580eec 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -86,9 +86,11 @@ const struct super_operations nfs_sops = {
};
EXPORT_SYMBOL_GPL(nfs_sops);
+#ifdef CONFIG_NFS_V4_2
static const struct nfs_ssc_client_ops nfs_ssc_clnt_ops_tbl = {
.sco_sb_deactive = nfs_sb_deactive,
};
+#endif
#if IS_ENABLED(CONFIG_NFS_V4)
static int __init register_nfs4_fs(void)
@@ -111,15 +113,21 @@ static void unregister_nfs4_fs(void)
}
#endif
+#ifdef CONFIG_NFS_V4_2
static void nfs_ssc_register_ops(void)
{
+#ifdef CONFIG_NFSD_V4
nfs_ssc_register(&nfs_ssc_clnt_ops_tbl);
+#endif
}
static void nfs_ssc_unregister_ops(void)
{
+#ifdef CONFIG_NFSD_V4
nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl);
+#endif
}
+#endif /* CONFIG_NFS_V4_2 */
static struct shrinker acl_shrinker = {
.count_objects = nfs_access_cache_count,
@@ -148,7 +156,9 @@ int __init register_nfs_fs(void)
ret = register_shrinker(&acl_shrinker);
if (ret < 0)
goto error_3;
+#ifdef CONFIG_NFS_V4_2
nfs_ssc_register_ops();
+#endif
return 0;
error_3:
nfs_unregister_sysctl();
@@ -168,7 +178,9 @@ void __exit unregister_nfs_fs(void)
unregister_shrinker(&acl_shrinker);
nfs_unregister_sysctl();
unregister_nfs4_fs();
+#ifdef CONFIG_NFS_V4_2
nfs_ssc_unregister_ops();
+#endif
unregister_filesystem(&nfs_fs_type);
}
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index fa82f5aaa6d9..119c75ab9fd0 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
nfs_acl-objs := nfsacl.o
obj-$(CONFIG_GRACE_PERIOD) += grace.o
-obj-$(CONFIG_GRACE_PERIOD) += nfs_ssc.o
+obj-$(CONFIG_NFS_V4_2_SSC_HELPER) += nfs_ssc.o
diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c
index f43bbb373913..7c1509e968c8 100644
--- a/fs/nfs_common/nfs_ssc.c
+++ b/fs/nfs_common/nfs_ssc.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * fs/nfs_common/nfs_ssc_comm.c
- *
* Helper for knfsd's SSC to access ops in NFS client modules
*
* Author: Dai Ngo <dai.ngo@oracle.com>
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index d056ad2fdefd..79c563c1a5e8 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -295,3 +295,55 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
nfsacl_desc.desc.array_len;
}
EXPORT_SYMBOL_GPL(nfsacl_decode);
+
+/**
+ * nfs_stream_decode_acl - Decode an NFSv3 ACL
+ *
+ * @xdr: an xdr_stream positioned at an encoded ACL
+ * @aclcnt: OUT: count of ACEs in decoded posix_acl
+ * @pacl: OUT: a dynamically-allocated buffer containing the decoded posix_acl
+ *
+ * Return values:
+ * %false: The encoded ACL is not valid
+ * %true: @pacl contains a decoded ACL, and @xdr is advanced
+ *
+ * On a successful return, caller must release *pacl using posix_acl_release().
+ */
+bool nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt,
+ struct posix_acl **pacl)
+{
+ const size_t elem_size = XDR_UNIT * 3;
+ struct nfsacl_decode_desc nfsacl_desc = {
+ .desc = {
+ .elem_size = elem_size,
+ .xcode = pacl ? xdr_nfsace_decode : NULL,
+ },
+ };
+ unsigned int base;
+ u32 entries;
+
+ if (xdr_stream_decode_u32(xdr, &entries) < 0)
+ return false;
+ if (entries > NFS_ACL_MAX_ENTRIES)
+ return false;
+
+ base = xdr_stream_pos(xdr);
+ if (!xdr_inline_decode(xdr, XDR_UNIT + elem_size * entries))
+ return false;
+ nfsacl_desc.desc.array_maxlen = entries;
+ if (xdr_decode_array2(xdr->buf, base, &nfsacl_desc.desc))
+ return false;
+
+ if (pacl) {
+ if (entries != nfsacl_desc.desc.array_len ||
+ posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) {
+ posix_acl_release(nfsacl_desc.acl);
+ return false;
+ }
+ *pacl = nfsacl_desc.acl;
+ }
+ if (aclcnt)
+ *aclcnt = entries;
+ return true;
+}
+EXPORT_SYMBOL_GPL(nfs_stream_decode_acl);
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index dbbc583d6273..821e5913faee 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -76,6 +76,7 @@ config NFSD_V4
select CRYPTO_MD5
select CRYPTO_SHA256
select GRACE_PERIOD
+ select NFS_V4_2_SSC_HELPER if NFS_V4_2
help
This option enables support in your system's NFS server for
version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index a07c39c94bbd..1058659a8d31 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -254,7 +254,7 @@ again:
req->cmd[4] = bufflen & 0xff;
req->cmd_len = COMMAND_SIZE(INQUIRY);
- blk_execute_rq(rq->q, NULL, rq, 1);
+ blk_execute_rq(NULL, rq, 1);
if (req->result) {
pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
req->result);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index e456421f68b4..9421dae22737 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -331,12 +331,29 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
fsloc->locations = NULL;
}
+static int export_stats_init(struct export_stats *stats)
+{
+ stats->start_time = ktime_get_seconds();
+ return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM);
+}
+
+static void export_stats_reset(struct export_stats *stats)
+{
+ nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM);
+}
+
+static void export_stats_destroy(struct export_stats *stats)
+{
+ nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM);
+}
+
static void svc_export_put(struct kref *ref)
{
struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
path_put(&exp->ex_path);
auth_domain_put(exp->ex_client);
nfsd4_fslocs_free(&exp->ex_fslocs);
+ export_stats_destroy(&exp->ex_stats);
kfree(exp->ex_uuid);
kfree_rcu(exp, ex_rcu);
}
@@ -698,22 +715,47 @@ static void exp_flags(struct seq_file *m, int flag, int fsid,
kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs);
static void show_secinfo(struct seq_file *m, struct svc_export *exp);
+static int is_export_stats_file(struct seq_file *m)
+{
+ /*
+ * The export_stats file uses the same ops as the exports file.
+ * We use the file's name to determine the reported info per export.
+ * There is no rename in nfsdfs, so d_name.name is stable.
+ */
+ return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats");
+}
+
static int svc_export_show(struct seq_file *m,
struct cache_detail *cd,
struct cache_head *h)
{
- struct svc_export *exp ;
+ struct svc_export *exp;
+ bool export_stats = is_export_stats_file(m);
- if (h ==NULL) {
- seq_puts(m, "#path domain(flags)\n");
+ if (h == NULL) {
+ if (export_stats)
+ seq_puts(m, "#path domain start-time\n#\tstats\n");
+ else
+ seq_puts(m, "#path domain(flags)\n");
return 0;
}
exp = container_of(h, struct svc_export, h);
seq_path(m, &exp->ex_path, " \t\n\\");
seq_putc(m, '\t');
seq_escape(m, exp->ex_client->name, " \t\n\\");
+ if (export_stats) {
+ seq_printf(m, "\t%lld\n", exp->ex_stats.start_time);
+ seq_printf(m, "\tfh_stale: %lld\n",
+ percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE]));
+ seq_printf(m, "\tio_read: %lld\n",
+ percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ]));
+ seq_printf(m, "\tio_write: %lld\n",
+ percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE]));
+ seq_putc(m, '\n');
+ return 0;
+ }
seq_putc(m, '(');
- if (test_bit(CACHE_VALID, &h->flags) &&
+ if (test_bit(CACHE_VALID, &h->flags) &&
!test_bit(CACHE_NEGATIVE, &h->flags)) {
exp_flags(m, exp->ex_flags, exp->ex_fsid,
exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs);
@@ -754,6 +796,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_layout_types = 0;
new->ex_uuid = NULL;
new->cd = item->cd;
+ export_stats_reset(&new->ex_stats);
}
static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@ -786,10 +829,15 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
static struct cache_head *svc_export_alloc(void)
{
struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL);
- if (i)
- return &i->h;
- else
+ if (!i)
+ return NULL;
+
+ if (export_stats_init(&i->ex_stats)) {
+ kfree(i);
return NULL;
+ }
+
+ return &i->h;
}
static const struct cache_detail svc_export_cache_template = {
@@ -1251,10 +1299,14 @@ static int e_show(struct seq_file *m, void *p)
struct cache_head *cp = p;
struct svc_export *exp = container_of(cp, struct svc_export, h);
struct cache_detail *cd = m->private;
+ bool export_stats = is_export_stats_file(m);
if (p == SEQ_START_TOKEN) {
seq_puts(m, "# Version 1.1\n");
- seq_puts(m, "# Path Client(Flags) # IPs\n");
+ if (export_stats)
+ seq_puts(m, "# Path Client Start-time\n#\tStats\n");
+ else
+ seq_puts(m, "# Path Client(Flags) # IPs\n");
return 0;
}
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index e7daa1f246f0..ee0e3aba4a6e 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -6,6 +6,7 @@
#define NFSD_EXPORT_H
#include <linux/sunrpc/cache.h>
+#include <linux/percpu_counter.h>
#include <uapi/linux/nfsd/export.h>
#include <linux/nfs4.h>
@@ -46,6 +47,19 @@ struct exp_flavor_info {
u32 flags;
};
+/* Per-export stats */
+enum {
+ EXP_STATS_FH_STALE,
+ EXP_STATS_IO_READ,
+ EXP_STATS_IO_WRITE,
+ EXP_STATS_COUNTERS_NUM
+};
+
+struct export_stats {
+ time64_t start_time;
+ struct percpu_counter counter[EXP_STATS_COUNTERS_NUM];
+};
+
struct svc_export {
struct cache_head h;
struct auth_domain * ex_client;
@@ -62,6 +76,7 @@ struct svc_export {
struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
struct rcu_head ex_rcu;
+ struct export_stats ex_stats;
};
/* an "export key" (expkey) maps a filehandlefragement to an
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 7346acda9d76..c330f5bd0cf3 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -10,6 +10,7 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <linux/percpu_counter.h>
/* Hash tables for nfs4_clientid state */
#define CLIENT_HASH_BITS 4
@@ -21,6 +22,14 @@
struct cld_net;
struct nfsd4_client_tracking_ops;
+enum {
+ /* cache misses due only to checksum comparison failures */
+ NFSD_NET_PAYLOAD_MISSES,
+ /* amount of memory (in bytes) currently consumed by the DRC */
+ NFSD_NET_DRC_MEM_USAGE,
+ NFSD_NET_COUNTERS_NUM
+};
+
/*
* Represents a nfsd "container". With respect to nfsv4 state tracking, the
* fields of interest are the *_id_hashtbls and the *_name_tree. These track
@@ -149,20 +158,16 @@ struct nfsd_net {
/*
* Stats and other tracking of on the duplicate reply cache.
- * These fields and the "rc" fields in nfsdstats are modified
- * with only the per-bucket cache lock, which isn't really safe
- * and should be fixed if we want the statistics to be
- * completely accurate.
+ * The longest_chain* fields are modified with only the per-bucket
+ * cache lock, which isn't really safe and should be fixed if we want
+ * these statistics to be completely accurate.
*/
/* total number of entries */
atomic_t num_drc_entries;
- /* cache misses due only to checksum comparison failures */
- unsigned int payload_misses;
-
- /* amount of memory (in bytes) currently consumed by the DRC */
- unsigned int drc_mem_usage;
+ /* Per-netns stats counters */
+ struct percpu_counter counter[NFSD_NET_COUNTERS_NUM];
/* longest hash chain seen */
unsigned int longest_chain;
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b83f222558e3..855e17772eba 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -190,63 +190,49 @@ out:
static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_getaclargs *argp = rqstp->rq_argp;
- p = nfs2svc_decode_fh(p, &argp->fh);
- if (!p)
+ if (!svcxdr_decode_fhandle(xdr, &argp->fh))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
return 0;
- argp->mask = ntohl(*p); p++;
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
-
static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_setaclargs *argp = rqstp->rq_argp;
- struct kvec *head = rqstp->rq_arg.head;
- unsigned int base;
- int n;
- p = nfs2svc_decode_fh(p, &argp->fh);
- if (!p)
+ if (!svcxdr_decode_fhandle(xdr, &argp->fh))
return 0;
- argp->mask = ntohl(*p++);
- if (argp->mask & ~NFS_ACL_MASK ||
- !xdr_argsize_check(rqstp, p))
+ if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
return 0;
-
- base = (char *)p - (char *)head->iov_base;
- n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
- (argp->mask & NFS_ACL) ?
- &argp->acl_access : NULL);
- if (n > 0)
- n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
- (argp->mask & NFS_DFACL) ?
- &argp->acl_default : NULL);
- return (n > 0);
-}
-
-static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
-{
- struct nfsd_fhandle *argp = rqstp->rq_argp;
-
- p = nfs2svc_decode_fh(p, &argp->fh);
- if (!p)
+ if (argp->mask & ~NFS_ACL_MASK)
+ return 0;
+ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
+ &argp->acl_access : NULL))
return 0;
- return xdr_argsize_check(rqstp, p);
+ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
+ &argp->acl_default : NULL))
+ return 0;
+
+ return 1;
}
static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
{
- struct nfsd3_accessargs *argp = rqstp->rq_argp;
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ struct nfsd3_accessargs *args = rqstp->rq_argp;
- p = nfs2svc_decode_fh(p, &argp->fh);
- if (!p)
+ if (!svcxdr_decode_fhandle(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->access) < 0)
return 0;
- argp->access = ntohl(*p++);
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
/*
@@ -373,6 +359,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
+ .pc_name = "NULL",
},
[ACLPROC2_GETACL] = {
.pc_func = nfsacld_proc_getacl,
@@ -383,6 +370,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_ressize = sizeof(struct nfsd3_getaclres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+2*(1+ACL),
+ .pc_name = "GETACL",
},
[ACLPROC2_SETACL] = {
.pc_func = nfsacld_proc_setacl,
@@ -393,16 +381,18 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
+ .pc_name = "SETACL",
},
[ACLPROC2_GETATTR] = {
.pc_func = nfsacld_proc_getattr,
- .pc_decode = nfsaclsvc_decode_fhandleargs,
+ .pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfsaclsvc_encode_attrstatres,
.pc_release = nfsaclsvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
+ .pc_name = "GETATTR",
},
[ACLPROC2_ACCESS] = {
.pc_func = nfsacld_proc_access,
@@ -413,6 +403,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
.pc_ressize = sizeof(struct nfsd3_accessres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT+1,
+ .pc_name = "ACCESS",
},
};
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index f18ec7e8094d..9a6f18d74d14 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -126,43 +126,39 @@ out:
/*
* XDR decode functions
*/
+
static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_getaclargs *args = rqstp->rq_argp;
- p = nfs3svc_decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->mask) < 0)
return 0;
- args->mask = ntohl(*p); p++;
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
-
static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
{
- struct nfsd3_setaclargs *args = rqstp->rq_argp;
- struct kvec *head = rqstp->rq_arg.head;
- unsigned int base;
- int n;
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ struct nfsd3_setaclargs *argp = rqstp->rq_argp;
- p = nfs3svc_decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_nfs_fh3(xdr, &argp->fh))
return 0;
- args->mask = ntohl(*p++);
- if (args->mask & ~NFS_ACL_MASK ||
- !xdr_argsize_check(rqstp, p))
+ if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
+ return 0;
+ if (argp->mask & ~NFS_ACL_MASK)
+ return 0;
+ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
+ &argp->acl_access : NULL))
+ return 0;
+ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
+ &argp->acl_default : NULL))
return 0;
- base = (char *)p - (char *)head->iov_base;
- n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
- (args->mask & NFS_ACL) ?
- &args->acl_access : NULL);
- if (n > 0)
- n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
- (args->mask & NFS_DFACL) ?
- &args->acl_default : NULL);
- return (n > 0);
+ return 1;
}
/*
@@ -253,6 +249,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
+ .pc_name = "NULL",
},
[ACLPROC3_GETACL] = {
.pc_func = nfsd3_proc_getacl,
@@ -263,6 +260,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
.pc_ressize = sizeof(struct nfsd3_getaclres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+2*(1+ACL),
+ .pc_name = "GETACL",
},
[ACLPROC3_SETACL] = {
.pc_func = nfsd3_proc_setacl,
@@ -273,6 +271,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
.pc_ressize = sizeof(struct nfsd3_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT,
+ .pc_name = "SETACL",
},
};
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 76931f4f57c3..8675851199f8 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -124,15 +124,16 @@ nfsd3_proc_access(struct svc_rqst *rqstp)
static __be32
nfsd3_proc_readlink(struct svc_rqst *rqstp)
{
- struct nfsd3_readlinkargs *argp = rqstp->rq_argp;
+ struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd3_readlinkres *resp = rqstp->rq_resp;
+ char *buffer = page_address(*(rqstp->rq_next_page++));
dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh));
/* Read the symlink. */
fh_copy(&resp->fh, &argp->fh);
resp->len = NFS3_MAXPATHLEN;
- resp->status = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len);
+ resp->status = nfsd_readlink(rqstp, &resp->fh, buffer, &resp->len);
return rpc_success;
}
@@ -144,25 +145,38 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
{
struct nfsd3_readargs *argp = rqstp->rq_argp;
struct nfsd3_readres *resp = rqstp->rq_resp;
- u32 max_blocksize = svc_max_payload(rqstp);
- unsigned long cnt = min(argp->count, max_blocksize);
+ u32 max_blocksize = svc_max_payload(rqstp);
+ unsigned int len;
+ int v;
+
+ argp->count = min_t(u32, argp->count, max_blocksize);
dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
SVCFH_fmt(&argp->fh),
(unsigned long) argp->count,
(unsigned long long) argp->offset);
+ v = 0;
+ len = argp->count;
+ while (len > 0) {
+ struct page *page = *(rqstp->rq_next_page++);
+
+ rqstp->rq_vec[v].iov_base = page_address(page);
+ rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
+ len -= rqstp->rq_vec[v].iov_len;
+ v++;
+ }
+
/* Obtain buffer pointer for payload.
* 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
* + 1 (xdr opaque byte count) = 26
*/
- resp->count = cnt;
+ resp->count = argp->count;
svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
- rqstp->rq_vec, argp->vlen, &resp->count,
- &resp->eof);
+ rqstp->rq_vec, v, &resp->count, &resp->eof);
return rpc_success;
}
@@ -421,6 +435,23 @@ nfsd3_proc_link(struct svc_rqst *rqstp)
return rpc_success;
}
+/*
+ * Prepare @resp's reply buffer for a directory listing.
+ *
+ * @count is a size in bytes. It is capped at svc_max_payload(@rqstp);
+ * resp->buflen is then set to the capped size in 32-bit words minus
+ * two, and resp->buffer is pointed at the page currently referenced
+ * by rqstp->rq_next_page. rq_next_page is advanced by one page for
+ * each PAGE_SIZE, or part thereof, of the capped count.
+ */
+static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
+ struct nfsd3_readdirres *resp,
+ int count)
+{
+ /* Never use more than svc_max_payload() reports for this request */
+ count = min_t(u32, count, svc_max_payload(rqstp));
+
+ /* Convert byte count to number of words (i.e. >> 2),
+ * and reserve room for the NULL ptr & eof flag (-2 words) */
+ resp->buflen = (count >> 2) - 2;
+
+ resp->buffer = page_address(*rqstp->rq_next_page);
+ /* Step rq_next_page past every page the capped count covers */
+ while (count > 0) {
+ rqstp->rq_next_page++;
+ count -= PAGE_SIZE;
+ }
+}
+
/*
* Read a portion of a directory.
*/
@@ -430,6 +461,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
struct nfsd3_readdirargs *argp = rqstp->rq_argp;
struct nfsd3_readdirres *resp = rqstp->rq_resp;
int count = 0;
+ loff_t offset;
struct page **p;
caddr_t page_addr = NULL;
@@ -437,18 +469,16 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
SVCFH_fmt(&argp->fh),
argp->count, (u32) argp->cookie);
- /* Make sure we've room for the NULL ptr & eof flag, and shrink to
- * client read size */
- count = (argp->count >> 2) - 2;
+ nfsd3_init_dirlist_pages(rqstp, resp, argp->count);
/* Read directory and encode entries on the fly */
fh_copy(&resp->fh, &argp->fh);
- resp->buflen = count;
resp->common.err = nfs_ok;
- resp->buffer = argp->buffer;
resp->rqstp = rqstp;
- resp->status = nfsd_readdir(rqstp, &resp->fh, (loff_t *)&argp->cookie,
+ offset = argp->cookie;
+
+ resp->status = nfsd_readdir(rqstp, &resp->fh, &offset,
&resp->common, nfs3svc_encode_entry);
memcpy(resp->verf, argp->verf, 8);
count = 0;
@@ -464,8 +494,6 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
}
resp->count = count >> 2;
if (resp->offset) {
- loff_t offset = argp->cookie;
-
if (unlikely(resp->offset1)) {
/* we ended up with offset on a page boundary */
*resp->offset = htonl(offset >> 32);
@@ -498,16 +526,12 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
SVCFH_fmt(&argp->fh),
argp->count, (u32) argp->cookie);
- /* Convert byte count to number of words (i.e. >> 2),
- * and reserve room for the NULL ptr & eof flag (-2 words) */
- resp->count = (argp->count >> 2) - 2;
+ nfsd3_init_dirlist_pages(rqstp, resp, argp->count);
/* Read directory and encode entries on the fly */
fh_copy(&resp->fh, &argp->fh);
resp->common.err = nfs_ok;
- resp->buffer = argp->buffer;
- resp->buflen = resp->count;
resp->rqstp = rqstp;
offset = argp->cookie;
@@ -683,7 +707,6 @@ out:
* NFSv3 Server procedures.
* Only the results of non-idempotent operations are cached.
*/
-#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle
#define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat
#define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat
#define nfsd3_mkdirargs nfsd3_createargs
@@ -708,16 +731,18 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST,
+ .pc_name = "NULL",
},
[NFS3PROC_GETATTR] = {
.pc_func = nfsd3_proc_getattr,
.pc_decode = nfs3svc_decode_fhandleargs,
.pc_encode = nfs3svc_encode_attrstatres,
.pc_release = nfs3svc_release_fhandle,
- .pc_argsize = sizeof(struct nfsd3_fhandleargs),
+ .pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd3_attrstatres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
+ .pc_name = "GETATTR",
},
[NFS3PROC_SETATTR] = {
.pc_func = nfsd3_proc_setattr,
@@ -728,6 +753,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_wccstatres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC,
+ .pc_name = "SETATTR",
},
[NFS3PROC_LOOKUP] = {
.pc_func = nfsd3_proc_lookup,
@@ -738,6 +764,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_diropres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+FH+pAT+pAT,
+ .pc_name = "LOOKUP",
},
[NFS3PROC_ACCESS] = {
.pc_func = nfsd3_proc_access,
@@ -748,16 +775,18 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_accessres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+1,
+ .pc_name = "ACCESS",
},
[NFS3PROC_READLINK] = {
.pc_func = nfsd3_proc_readlink,
- .pc_decode = nfs3svc_decode_readlinkargs,
+ .pc_decode = nfs3svc_decode_fhandleargs,
.pc_encode = nfs3svc_encode_readlinkres,
.pc_release = nfs3svc_release_fhandle,
- .pc_argsize = sizeof(struct nfsd3_readlinkargs),
+ .pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd3_readlinkres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
+ .pc_name = "READLINK",
},
[NFS3PROC_READ] = {
.pc_func = nfsd3_proc_read,
@@ -768,6 +797,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_readres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
+ .pc_name = "READ",
},
[NFS3PROC_WRITE] = {
.pc_func = nfsd3_proc_write,
@@ -778,6 +808,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_writeres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC+4,
+ .pc_name = "WRITE",
},
[NFS3PROC_CREATE] = {
.pc_func = nfsd3_proc_create,
@@ -788,6 +819,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
+ .pc_name = "CREATE",
},
[NFS3PROC_MKDIR] = {
.pc_func = nfsd3_proc_mkdir,
@@ -798,6 +830,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
+ .pc_name = "MKDIR",
},
[NFS3PROC_SYMLINK] = {
.pc_func = nfsd3_proc_symlink,
@@ -808,6 +841,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
+ .pc_name = "SYMLINK",
},
[NFS3PROC_MKNOD] = {
.pc_func = nfsd3_proc_mknod,
@@ -818,6 +852,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_createres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+(1+FH+pAT)+WC,
+ .pc_name = "MKNOD",
},
[NFS3PROC_REMOVE] = {
.pc_func = nfsd3_proc_remove,
@@ -828,6 +863,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_wccstatres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC,
+ .pc_name = "REMOVE",
},
[NFS3PROC_RMDIR] = {
.pc_func = nfsd3_proc_rmdir,
@@ -838,6 +874,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_wccstatres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC,
+ .pc_name = "RMDIR",
},
[NFS3PROC_RENAME] = {
.pc_func = nfsd3_proc_rename,
@@ -848,6 +885,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_renameres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+WC+WC,
+ .pc_name = "RENAME",
},
[NFS3PROC_LINK] = {
.pc_func = nfsd3_proc_link,
@@ -858,6 +896,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_linkres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+pAT+WC,
+ .pc_name = "LINK",
},
[NFS3PROC_READDIR] = {
.pc_func = nfsd3_proc_readdir,
@@ -867,6 +906,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_argsize = sizeof(struct nfsd3_readdirargs),
.pc_ressize = sizeof(struct nfsd3_readdirres),
.pc_cachetype = RC_NOCACHE,
+ .pc_name = "READDIR",
},
[NFS3PROC_READDIRPLUS] = {
.pc_func = nfsd3_proc_readdirplus,
@@ -876,6 +916,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_argsize = sizeof(struct nfsd3_readdirplusargs),
.pc_ressize = sizeof(struct nfsd3_readdirres),
.pc_cachetype = RC_NOCACHE,
+ .pc_name = "READDIRPLUS",
},
[NFS3PROC_FSSTAT] = {
.pc_func = nfsd3_proc_fsstat,
@@ -885,6 +926,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_fsstatres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+2*6+1,
+ .pc_name = "FSSTAT",
},
[NFS3PROC_FSINFO] = {
.pc_func = nfsd3_proc_fsinfo,
@@ -894,6 +936,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_fsinfores),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+12,
+ .pc_name = "FSINFO",
},
[NFS3PROC_PATHCONF] = {
.pc_func = nfsd3_proc_pathconf,
@@ -903,6 +946,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_pathconfres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+pAT+6,
+ .pc_name = "PATHCONF",
},
[NFS3PROC_COMMIT] = {
.pc_func = nfsd3_proc_commit,
@@ -913,6 +957,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
.pc_ressize = sizeof(struct nfsd3_commitres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+WC+2,
+ .pc_name = "COMMIT",
},
};
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 821db21ba072..9d9a01ce0b27 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -29,8 +29,9 @@ static u32 nfs3_ftypes[] = {
/*
- * XDR functions for basic NFS types
+ * Basic NFSv3 data types (RFC 1813 Sections 2.5 and 2.6)
*/
+
static __be32 *
encode_time3(__be32 *p, struct timespec64 *time)
{
@@ -38,32 +39,47 @@ encode_time3(__be32 *p, struct timespec64 *time)
return p;
}
-static __be32 *
-decode_time3(__be32 *p, struct timespec64 *time)
+static bool
+svcxdr_decode_nfstime3(struct xdr_stream *xdr, struct timespec64 *timep)
{
- time->tv_sec = ntohl(*p++);
- time->tv_nsec = ntohl(*p++);
- return p;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, XDR_UNIT * 2);
+ if (!p)
+ return false;
+ timep->tv_sec = be32_to_cpup(p++);
+ timep->tv_nsec = be32_to_cpup(p);
+
+ return true;
}
-static __be32 *
-decode_fh(__be32 *p, struct svc_fh *fhp)
+/**
+ * svcxdr_decode_nfs_fh3 - Decode an NFSv3 file handle
+ * @xdr: XDR stream positioned at an undecoded NFSv3 FH
+ * @fhp: OUT: filled-in server file handle
+ *
+ * Return values:
+ * %false: The encoded file handle was not valid
+ * %true: @fhp has been initialized
+ */
+bool
+svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp)
{
- unsigned int size;
+ __be32 *p;
+ u32 size;
+
+ if (xdr_stream_decode_u32(xdr, &size) < 0)
+ return false;
+ if (size == 0 || size > NFS3_FHSIZE)
+ return false;
+ p = xdr_inline_decode(xdr, size);
+ if (!p)
+ return false;
fh_init(fhp, NFS3_FHSIZE);
- size = ntohl(*p++);
- if (size > NFS3_FHSIZE)
- return NULL;
-
- memcpy(&fhp->fh_handle.fh_base, p, size);
fhp->fh_handle.fh_size = size;
- return p + XDR_QUADLEN(size);
-}
+ memcpy(&fhp->fh_handle.fh_base, p, size);
-/* Helper function for NFSv3 ACL code */
-__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp)
-{
- return decode_fh(p, fhp);
+ return true;
}
static __be32 *
@@ -76,69 +92,165 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
return p + XDR_QUADLEN(size);
}
-/*
- * Decode a file name and make sure that the path contains
- * no slashes or null bytes.
- */
-static __be32 *
-decode_filename(__be32 *p, char **namp, unsigned int *lenp)
+static bool
+svcxdr_decode_filename3(struct xdr_stream *xdr, char **name, unsigned int *len)
{
- char *name;
- unsigned int i;
+ u32 size, i;
+ __be32 *p;
+ char *c;
+
+ if (xdr_stream_decode_u32(xdr, &size) < 0)
+ return false;
+ if (size == 0 || size > NFS3_MAXNAMLEN)
+ return false;
+ p = xdr_inline_decode(xdr, size);
+ if (!p)
+ return false;
- if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) {
- for (i = 0, name = *namp; i < *lenp; i++, name++) {
- if (*name == '\0' || *name == '/')
- return NULL;
- }
+ *len = size;
+ *name = (char *)p;
+ for (i = 0, c = *name; i < size; i++, c++) {
+ if (*c == '\0' || *c == '/')
+ return false;
}
- return p;
+ return true;
}
-static __be32 *
-decode_sattr3(__be32 *p, struct iattr *iap, struct user_namespace *userns)
+/*
+ * Decode an NFSv3 file handle followed by a file name.
+ *
+ * Returns true only if both svcxdr_decode_nfs_fh3() and
+ * svcxdr_decode_filename3() succeed. The name is not decoded when
+ * the file handle fails to decode.
+ */
+static bool
+svcxdr_decode_diropargs3(struct xdr_stream *xdr, struct svc_fh *fhp,
+ char **name, unsigned int *len)
+{
+ return svcxdr_decode_nfs_fh3(xdr, fhp) &&
+ svcxdr_decode_filename3(xdr, name, len);
+}
+
+static bool
+svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ struct iattr *iap)
{
- u32 tmp;
+ u32 set_it;
iap->ia_valid = 0;
- if (*p++) {
+ if (xdr_stream_decode_bool(xdr, &set_it) < 0)
+ return false;
+ if (set_it) {
+ u32 mode;
+
+ if (xdr_stream_decode_u32(xdr, &mode) < 0)
+ return false;
iap->ia_valid |= ATTR_MODE;
- iap->ia_mode = ntohl(*p++);
+ iap->ia_mode = mode;
}
- if (*p++) {
- iap->ia_uid = make_kuid(userns, ntohl(*p++));
+ if (xdr_stream_decode_bool(xdr, &set_it) < 0)
+ return false;
+ if (set_it) {
+ u32 uid;
+
+ if (xdr_stream_decode_u32(xdr, &uid) < 0)
+ return false;
+ iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), uid);
if (uid_valid(iap->ia_uid))
iap->ia_valid |= ATTR_UID;
}
- if (*p++) {
- iap->ia_gid = make_kgid(userns, ntohl(*p++));
+ if (xdr_stream_decode_bool(xdr, &set_it) < 0)
+ return false;
+ if (set_it) {
+ u32 gid;
+
+ if (xdr_stream_decode_u32(xdr, &gid) < 0)
+ return false;
+ iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), gid);
if (gid_valid(iap->ia_gid))
iap->ia_valid |= ATTR_GID;
}
- if (*p++) {
- u64 newsize;
+ if (xdr_stream_decode_bool(xdr, &set_it) < 0)
+ return false;
+ if (set_it) {
+ u64 newsize;
+ if (xdr_stream_decode_u64(xdr, &newsize) < 0)
+ return false;
iap->ia_valid |= ATTR_SIZE;
- p = xdr_decode_hyper(p, &newsize);
iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX);
}
- if ((tmp = ntohl(*p++)) == 1) { /* set to server time */
+ if (xdr_stream_decode_u32(xdr, &set_it) < 0)
+ return false;
+ switch (set_it) {
+ case DONT_CHANGE:
+ break;
+ case SET_TO_SERVER_TIME:
iap->ia_valid |= ATTR_ATIME;
- } else if (tmp == 2) { /* set to client time */
+ break;
+ case SET_TO_CLIENT_TIME:
+ if (!svcxdr_decode_nfstime3(xdr, &iap->ia_atime))
+ return false;
iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET;
- iap->ia_atime.tv_sec = ntohl(*p++);
- iap->ia_atime.tv_nsec = ntohl(*p++);
+ break;
+ default:
+ return false;
}
- if ((tmp = ntohl(*p++)) == 1) { /* set to server time */
+ if (xdr_stream_decode_u32(xdr, &set_it) < 0)
+ return false;
+ switch (set_it) {
+ case DONT_CHANGE:
+ break;
+ case SET_TO_SERVER_TIME:
iap->ia_valid |= ATTR_MTIME;
- } else if (tmp == 2) { /* set to client time */
+ break;
+ case SET_TO_CLIENT_TIME:
+ if (!svcxdr_decode_nfstime3(xdr, &iap->ia_mtime))
+ return false;
iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET;
- iap->ia_mtime.tv_sec = ntohl(*p++);
- iap->ia_mtime.tv_nsec = ntohl(*p++);
+ break;
+ default:
+ return false;
}
- return p;
+
+ return true;
+}
+
+/*
+ * Decode the optional guard of a SETATTR request into @args.
+ *
+ * A boolean discriminator is read first. When it is set, two XDR
+ * units are consumed, args->check_guard is set to 1 and the first
+ * unit is stored in args->guardtime; the second unit is consumed but
+ * not stored. When it is clear, args->check_guard is set to 0 and
+ * args->guardtime is left untouched.
+ *
+ * Returns false if either decode step fails, true otherwise.
+ */
+static bool
+svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args)
+{
+ __be32 *p;
+ u32 check;
+
+ if (xdr_stream_decode_bool(xdr, &check) < 0)
+ return false;
+ if (check) {
+ /* Two words are present on the wire; only the first is kept */
+ p = xdr_inline_decode(xdr, XDR_UNIT * 2);
+ if (!p)
+ return false;
+ args->check_guard = 1;
+ args->guardtime = be32_to_cpup(p);
+ } else
+ args->check_guard = 0;
+
+ return true;
+}
+
+/*
+ * Decode two consecutive 32-bit big-endian words into args->major
+ * and args->minor, in that order.
+ *
+ * Returns false if xdr_inline_decode() returns NULL, true otherwise.
+ */
+static bool
+svcxdr_decode_specdata3(struct xdr_stream *xdr, struct nfsd3_mknodargs *args)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, XDR_UNIT * 2);
+ if (!p)
+ return false;
+ args->major = be32_to_cpup(p++);
+ args->minor = be32_to_cpup(p);
+
+ return true;
+}
+
+/*
+ * Decode the attributes and then the major/minor pair for @args.
+ * Decoding stops at the first step that fails, and false is returned
+ * in that case. Used by nfs3svc_decode_mknodargs() for the NF3CHR
+ * and NF3BLK file types.
+ */
+static bool
+svcxdr_decode_devicedata3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ struct nfsd3_mknodargs *args)
+{
+ return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) &&
+ svcxdr_decode_specdata3(xdr, args);
+}
static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp)
@@ -252,6 +364,11 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
return encode_post_op_attr(rqstp, p, fhp);
}
+/*
+ * Report whether @sb has SB_I_VERSION set in s_flags or its export
+ * operations supply a fetch_iversion method.
+ */
+static bool fs_supports_change_attribute(struct super_block *sb)
+{
+ return sb->s_flags & SB_I_VERSION || sb->s_export_op->fetch_iversion;
+}
+
/*
* Fill in the pre_op attr for the wcc data
*/
@@ -260,24 +377,26 @@ void fill_pre_wcc(struct svc_fh *fhp)
struct inode *inode;
struct kstat stat;
bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
- __be32 err;
if (fhp->fh_no_wcc || fhp->fh_pre_saved)
return;
inode = d_inode(fhp->fh_dentry);
- err = fh_getattr(fhp, &stat);
- if (err) {
- /* Grab the times from inode anyway */
- stat.mtime = inode->i_mtime;
- stat.ctime = inode->i_ctime;
- stat.size = inode->i_size;
+ if (fs_supports_change_attribute(inode->i_sb) || !v4) {
+ __be32 err = fh_getattr(fhp, &stat);
+
+ if (err) {
+ /* Grab the times from inode anyway */
+ stat.mtime = inode->i_mtime;
+ stat.ctime = inode->i_ctime;
+ stat.size = inode->i_size;
+ }
+ fhp->fh_pre_mtime = stat.mtime;
+ fhp->fh_pre_ctime = stat.ctime;
+ fhp->fh_pre_size = stat.size;
}
if (v4)
fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
- fhp->fh_pre_mtime = stat.mtime;
- fhp->fh_pre_ctime = stat.ctime;
- fhp->fh_pre_size = stat.size;
fhp->fh_pre_saved = true;
}
@@ -288,7 +407,6 @@ void fill_post_wcc(struct svc_fh *fhp)
{
bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
struct inode *inode = d_inode(fhp->fh_dentry);
- __be32 err;
if (fhp->fh_no_wcc)
return;
@@ -296,12 +414,16 @@ void fill_post_wcc(struct svc_fh *fhp)
if (fhp->fh_post_saved)
printk("nfsd: inode locked twice during operation.\n");
- err = fh_getattr(fhp, &fhp->fh_post_attr);
- if (err) {
- fhp->fh_post_saved = false;
- fhp->fh_post_attr.ctime = inode->i_ctime;
- } else
- fhp->fh_post_saved = true;
+ fhp->fh_post_saved = true;
+
+ if (fs_supports_change_attribute(inode->i_sb) || !v4) {
+ __be32 err = fh_getattr(fhp, &fhp->fh_post_attr);
+
+ if (err) {
+ fhp->fh_post_saved = false;
+ fhp->fh_post_attr.ctime = inode->i_ctime;
+ }
+ }
if (v4)
fhp->fh_post_change =
nfsd4_change_attribute(&fhp->fh_post_attr, inode);
@@ -312,331 +434,277 @@ void fill_post_wcc(struct svc_fh *fhp)
*/
int
-nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p)
+nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_fhandle *args = rqstp->rq_argp;
- p = decode_fh(p, &args->fh);
- if (!p)
- return 0;
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_nfs_fh3(xdr, &args->fh);
}
int
nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_sattrargs *args = rqstp->rq_argp;
- p = decode_fh(p, &args->fh);
- if (!p)
- return 0;
- p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
-
- if ((args->check_guard = ntohl(*p++)) != 0) {
- struct timespec64 time;
- p = decode_time3(p, &time);
- args->guardtime = time.tv_sec;
- }
-
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_nfs_fh3(xdr, &args->fh) &&
+ svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) &&
+ svcxdr_decode_sattrguard3(xdr, args);
}
int
nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_diropargs *args = rqstp->rq_argp;
- if (!(p = decode_fh(p, &args->fh))
- || !(p = decode_filename(p, &args->name, &args->len)))
- return 0;
-
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len);
}
int
nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_accessargs *args = rqstp->rq_argp;
- p = decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->access) < 0)
return 0;
- args->access = ntohl(*p++);
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_readargs *args = rqstp->rq_argp;
- unsigned int len;
- int v;
- u32 max_blocksize = svc_max_payload(rqstp);
- p = decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
return 0;
- p = xdr_decode_hyper(p, &args->offset);
-
- args->count = ntohl(*p++);
- len = min(args->count, max_blocksize);
-
- /* set up the kvec */
- v=0;
- while (len > 0) {
- struct page *p = *(rqstp->rq_next_page++);
- rqstp->rq_vec[v].iov_base = page_address(p);
- rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
- len -= rqstp->rq_vec[v].iov_len;
- v++;
- }
- args->vlen = v;
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_writeargs *args = rqstp->rq_argp;
- unsigned int len, hdr, dlen;
u32 max_blocksize = svc_max_payload(rqstp);
struct kvec *head = rqstp->rq_arg.head;
struct kvec *tail = rqstp->rq_arg.tail;
+ size_t remaining;
- p = decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
return 0;
- p = xdr_decode_hyper(p, &args->offset);
-
- args->count = ntohl(*p++);
- args->stable = ntohl(*p++);
- len = args->len = ntohl(*p++);
- if ((void *)p > head->iov_base + head->iov_len)
+ if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
return 0;
- /*
- * The count must equal the amount of data passed.
- */
- if (args->count != args->len)
+ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->stable) < 0)
return 0;
- /*
- * Check to make sure that we got the right number of
- * bytes.
- */
- hdr = (void*)p - head->iov_base;
- dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr;
- /*
- * Round the length of the data which was specified up to
- * the next multiple of XDR units and then compare that
- * against the length which was actually received.
- * Note that when RPCSEC/GSS (for example) is used, the
- * data buffer can be padded so dlen might be larger
- * than required. It must never be smaller.
- */
- if (dlen < XDR_QUADLEN(len)*4)
+ /* opaque data */
+ if (xdr_stream_decode_u32(xdr, &args->len) < 0)
return 0;
+ /* request sanity */
+ if (args->count != args->len)
+ return 0;
+ remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
+ remaining -= xdr_stream_pos(xdr);
+ if (remaining < xdr_align_size(args->len))
+ return 0;
if (args->count > max_blocksize) {
args->count = max_blocksize;
- len = args->len = max_blocksize;
+ args->len = max_blocksize;
}
- args->first.iov_base = (void *)p;
- args->first.iov_len = head->iov_len - hdr;
+ args->first.iov_base = xdr->p;
+ args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
+
return 1;
}
int
nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_createargs *args = rqstp->rq_argp;
- if (!(p = decode_fh(p, &args->fh))
- || !(p = decode_filename(p, &args->name, &args->len)))
+ if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len))
return 0;
-
- switch (args->createmode = ntohl(*p++)) {
+ if (xdr_stream_decode_u32(xdr, &args->createmode) < 0)
+ return 0;
+ switch (args->createmode) {
case NFS3_CREATE_UNCHECKED:
case NFS3_CREATE_GUARDED:
- p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
- break;
+ return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs);
case NFS3_CREATE_EXCLUSIVE:
- args->verf = p;
- p += 2;
+ args->verf = xdr_inline_decode(xdr, NFS3_CREATEVERFSIZE);
+ if (!args->verf)
+ return 0;
break;
default:
return 0;
}
-
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_createargs *args = rqstp->rq_argp;
- if (!(p = decode_fh(p, &args->fh)) ||
- !(p = decode_filename(p, &args->name, &args->len)))
- return 0;
- p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
-
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_diropargs3(xdr, &args->fh,
+ &args->name, &args->len) &&
+ svcxdr_decode_sattr3(rqstp, xdr, &args->attrs);
}
int
nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_symlinkargs *args = rqstp->rq_argp;
- char *base = (char *)p;
- size_t dlen;
+ struct kvec *head = rqstp->rq_arg.head;
+ struct kvec *tail = rqstp->rq_arg.tail;
+ size_t remaining;
- if (!(p = decode_fh(p, &args->ffh)) ||
- !(p = decode_filename(p, &args->fname, &args->flen)))
+ if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen))
+ return 0;
+ if (!svcxdr_decode_sattr3(rqstp, xdr, &args->attrs))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
return 0;
- p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
- args->tlen = ntohl(*p++);
+ /* request sanity */
+ remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
+ remaining -= xdr_stream_pos(xdr);
+ if (remaining < xdr_align_size(args->tlen))
+ return 0;
- args->first.iov_base = p;
- args->first.iov_len = rqstp->rq_arg.head[0].iov_len;
- args->first.iov_len -= (char *)p - base;
+ args->first.iov_base = xdr->p;
+ args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
- dlen = args->first.iov_len + rqstp->rq_arg.page_len +
- rqstp->rq_arg.tail[0].iov_len;
- if (dlen < XDR_QUADLEN(args->tlen) << 2)
- return 0;
return 1;
}
int
nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_mknodargs *args = rqstp->rq_argp;
- if (!(p = decode_fh(p, &args->fh))
- || !(p = decode_filename(p, &args->name, &args->len)))
+ if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->ftype) < 0)
+ return 0;
+ switch (args->ftype) {
+ case NF3CHR:
+ case NF3BLK:
+ return svcxdr_decode_devicedata3(rqstp, xdr, args);
+ case NF3SOCK:
+ case NF3FIFO:
+ return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs);
+ case NF3REG:
+ case NF3DIR:
+ case NF3LNK:
+ /* Valid XDR but illegal file types */
+ break;
+ default:
return 0;
-
- args->ftype = ntohl(*p++);
-
- if (args->ftype == NF3BLK || args->ftype == NF3CHR
- || args->ftype == NF3SOCK || args->ftype == NF3FIFO)
- p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
-
- if (args->ftype == NF3BLK || args->ftype == NF3CHR) {
- args->major = ntohl(*p++);
- args->minor = ntohl(*p++);
}
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_renameargs *args = rqstp->rq_argp;
- if (!(p = decode_fh(p, &args->ffh))
- || !(p = decode_filename(p, &args->fname, &args->flen))
- || !(p = decode_fh(p, &args->tfh))
- || !(p = decode_filename(p, &args->tname, &args->tlen)))
- return 0;
-
- return xdr_argsize_check(rqstp, p);
-}
-
-int
-nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p)
-{
- struct nfsd3_readlinkargs *args = rqstp->rq_argp;
-
- p = decode_fh(p, &args->fh);
- if (!p)
- return 0;
- args->buffer = page_address(*(rqstp->rq_next_page++));
-
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_diropargs3(xdr, &args->ffh,
+ &args->fname, &args->flen) &&
+ svcxdr_decode_diropargs3(xdr, &args->tfh,
+ &args->tname, &args->tlen);
}
int
nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_linkargs *args = rqstp->rq_argp;
- if (!(p = decode_fh(p, &args->ffh))
- || !(p = decode_fh(p, &args->tfh))
- || !(p = decode_filename(p, &args->tname, &args->tlen)))
- return 0;
-
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_nfs_fh3(xdr, &args->ffh) &&
+ svcxdr_decode_diropargs3(xdr, &args->tfh,
+ &args->tname, &args->tlen);
}
int
nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_readdirargs *args = rqstp->rq_argp;
- int len;
- u32 max_blocksize = svc_max_payload(rqstp);
- p = decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u64(xdr, &args->cookie) < 0)
+ return 0;
+ args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
+ if (!args->verf)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
return 0;
- p = xdr_decode_hyper(p, &args->cookie);
- args->verf = p; p += 2;
- args->dircount = ~0;
- args->count = ntohl(*p++);
- len = args->count = min_t(u32, args->count, max_blocksize);
-
- while (len > 0) {
- struct page *p = *(rqstp->rq_next_page++);
- if (!args->buffer)
- args->buffer = page_address(p);
- len -= PAGE_SIZE;
- }
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_readdirargs *args = rqstp->rq_argp;
- int len;
- u32 max_blocksize = svc_max_payload(rqstp);
+ u32 dircount;
- p = decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u64(xdr, &args->cookie) < 0)
+ return 0;
+ args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
+ if (!args->verf)
+ return 0;
+ /* dircount is ignored */
+ if (xdr_stream_decode_u32(xdr, &dircount) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
return 0;
- p = xdr_decode_hyper(p, &args->cookie);
- args->verf = p; p += 2;
- args->dircount = ntohl(*p++);
- args->count = ntohl(*p++);
-
- len = args->count = min(args->count, max_blocksize);
- while (len > 0) {
- struct page *p = *(rqstp->rq_next_page++);
- if (!args->buffer)
- args->buffer = page_address(p);
- len -= PAGE_SIZE;
- }
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd3_commitargs *args = rqstp->rq_argp;
- p = decode_fh(p, &args->fh);
- if (!p)
+
+ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
return 0;
- p = xdr_decode_hyper(p, &args->offset);
- args->count = ntohl(*p++);
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
/*
@@ -865,9 +933,14 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
if (isdotent(name, namlen)) {
if (namlen == 2) {
dchild = dget_parent(dparent);
- /* filesystem root - cannot return filehandle for ".." */
+ /*
+ * Don't return filehandle for ".." if we're at
+ * the filesystem or export root:
+ */
if (dchild == dparent)
goto out;
+ if (dparent == exp->ex_path.dentry)
+ goto out;
} else
dchild = dget(dparent);
} else
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8d6d2678abad..acdb3cd806a1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -378,8 +378,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* Before RECLAIM_COMPLETE done, server should deny new lock
*/
if (nfsd4_has_session(cstate) &&
- !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
- &cstate->session->se_client->cl_flags) &&
+ !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags) &&
open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
return nfserr_grace;
@@ -428,8 +427,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
- status = nfs4_check_open_reclaim(&open->op_clientid,
- cstate, nn);
+ status = nfs4_check_open_reclaim(cstate->clp);
if (status)
goto out;
open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
@@ -1888,7 +1886,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
nfserr = nfs_ok;
if (gdp->gd_maxcount != 0) {
nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
- rqstp, cstate->session->se_client, gdp);
+ rqstp, cstate->clp, gdp);
}
gdp->gd_notify_types &= ops->notify_types;
@@ -2174,7 +2172,7 @@ nfsd4_proc_null(struct svc_rqst *rqstp)
static inline void nfsd4_increment_op_stats(u32 opnum)
{
if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP)
- nfsdstats.nfs4_opcount[opnum]++;
+ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]);
}
static const struct nfsd4_operation nfsd4_ops[];
@@ -3305,6 +3303,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 1,
+ .pc_name = "NULL",
},
[NFSPROC4_COMPOUND] = {
.pc_func = nfsd4_proc_compound,
@@ -3315,6 +3314,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
.pc_release = nfsd4_release_compoundargs,
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = NFSD_BUFSIZE/4,
+ .pc_name = "COMPOUND",
},
};
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1d2cd6a88f61..423fd6683f3a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3891,6 +3891,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate, union nfsd4_op_u *u)
{
struct nfsd4_reclaim_complete *rc = &u->reclaim_complete;
+ struct nfs4_client *clp = cstate->clp;
__be32 status = 0;
if (rc->rca_one_fs) {
@@ -3904,12 +3905,11 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
}
status = nfserr_complete_already;
- if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
- &cstate->session->se_client->cl_flags))
+ if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
goto out;
status = nfserr_stale_clientid;
- if (is_client_expired(cstate->session->se_client))
+ if (is_client_expired(clp))
/*
* The following error isn't really legal.
* But we only get here if the client just explicitly
@@ -3920,8 +3920,8 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
goto out;
status = nfs_ok;
- nfsd4_client_record_create(cstate->session->se_client);
- inc_reclaim_complete(cstate->session->se_client);
+ nfsd4_client_record_create(clp);
+ inc_reclaim_complete(clp);
out:
return status;
}
@@ -4633,40 +4633,37 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4
return nfserr_bad_seqid;
}
-static __be32 lookup_clientid(clientid_t *clid,
- struct nfsd4_compound_state *cstate,
- struct nfsd_net *nn,
- bool sessions)
+static struct nfs4_client *lookup_clientid(clientid_t *clid, bool sessions,
+ struct nfsd_net *nn)
{
struct nfs4_client *found;
+ spin_lock(&nn->client_lock);
+ found = find_confirmed_client(clid, sessions, nn);
+ if (found)
+ atomic_inc(&found->cl_rpc_users);
+ spin_unlock(&nn->client_lock);
+ return found;
+}
+
+static __be32 set_client(clientid_t *clid,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd_net *nn)
+{
if (cstate->clp) {
- found = cstate->clp;
- if (!same_clid(&found->cl_clientid, clid))
+ if (!same_clid(&cstate->clp->cl_clientid, clid))
return nfserr_stale_clientid;
return nfs_ok;
}
-
if (STALE_CLIENTID(clid, nn))
return nfserr_stale_clientid;
-
/*
- * For v4.1+ we get the client in the SEQUENCE op. If we don't have one
- * cached already then we know this is for is for v4.0 and "sessions"
- * will be false.
+ * We're in the 4.0 case (otherwise the SEQUENCE op would have
+ * set cstate->clp), so session = false:
*/
- WARN_ON_ONCE(cstate->session);
- spin_lock(&nn->client_lock);
- found = find_confirmed_client(clid, sessions, nn);
- if (!found) {
- spin_unlock(&nn->client_lock);
+ cstate->clp = lookup_clientid(clid, false, nn);
+ if (!cstate->clp)
return nfserr_expired;
- }
- atomic_inc(&found->cl_rpc_users);
- spin_unlock(&nn->client_lock);
-
- /* Cache the nfs4_client in cstate! */
- cstate->clp = found;
return nfs_ok;
}
@@ -4680,8 +4677,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
struct nfs4_openowner *oo = NULL;
__be32 status;
- if (STALE_CLIENTID(&open->op_clientid, nn))
- return nfserr_stale_clientid;
/*
* In case we need it later, after we've already created the
* file and don't want to risk a further failure:
@@ -4690,7 +4685,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
if (open->op_file == NULL)
return nfserr_jukebox;
- status = lookup_clientid(clientid, cstate, nn, false);
+ status = set_client(clientid, cstate, nn);
if (status)
return status;
clp = cstate->clp;
@@ -5300,17 +5295,14 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
trace_nfsd_clid_renew(clid);
- status = lookup_clientid(clid, cstate, nn, false);
+ status = set_client(clid, cstate, nn);
if (status)
- goto out;
+ return status;
clp = cstate->clp;
- status = nfserr_cb_path_down;
if (!list_empty(&clp->cl_delegations)
&& clp->cl_cb_state != NFSD4_CB_UP)
- goto out;
- status = nfs_ok;
-out:
- return status;
+ return nfserr_cb_path_down;
+ return nfs_ok;
}
void
@@ -5686,8 +5678,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
CLOSE_STATEID(stateid))
return nfserr_bad_stateid;
- status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn,
- false);
+ status = set_client(&stateid->si_opaque.so_clid, cstate, nn);
if (status == nfserr_stale_clientid) {
if (cstate->session)
return nfserr_bad_stateid;
@@ -5818,21 +5809,27 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st,
{
__be32 status;
struct nfs4_cpntf_state *cps = NULL;
- struct nfsd4_compound_state cstate;
+ struct nfs4_client *found;
status = manage_cpntf_state(nn, st, NULL, &cps);
if (status)
return status;
cps->cpntf_time = ktime_get_boottime_seconds();
- memset(&cstate, 0, sizeof(cstate));
- status = lookup_clientid(&cps->cp_p_clid, &cstate, nn, true);
- if (status)
+
+ status = nfserr_expired;
+ found = lookup_clientid(&cps->cp_p_clid, true, nn);
+ if (!found)
goto out;
- status = nfsd4_lookup_stateid(&cstate, &cps->cp_p_stateid,
- NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
- stid, nn);
- put_client_renew(cstate.clp);
+
+ *stid = find_stateid_by_type(found, &cps->cp_p_stateid,
+ NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID);
+ if (*stid)
+ status = nfs_ok;
+ else
+ status = nfserr_bad_stateid;
+
+ put_client_renew(found);
out:
nfs4_put_cpntf_state(nn, cps);
return status;
@@ -5921,7 +5918,7 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
struct nfsd4_test_stateid_id *stateid;
- struct nfs4_client *cl = cstate->session->se_client;
+ struct nfs4_client *cl = cstate->clp;
list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list)
stateid->ts_id_status =
@@ -5967,7 +5964,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
stateid_t *stateid = &free_stateid->fr_stateid;
struct nfs4_stid *s;
struct nfs4_delegation *dp;
- struct nfs4_client *cl = cstate->session->se_client;
+ struct nfs4_client *cl = cstate->clp;
__be32 ret = nfserr_bad_stateid;
spin_lock(&cl->cl_lock);
@@ -6696,13 +6693,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (nfsd4_has_session(cstate))
/* See rfc 5661 18.10.3: given clientid is ignored: */
memcpy(&lock->lk_new_clientid,
- &cstate->session->se_client->cl_clientid,
+ &cstate->clp->cl_clientid,
sizeof(clientid_t));
- status = nfserr_stale_clientid;
- if (STALE_CLIENTID(&lock->lk_new_clientid, nn))
- goto out;
-
/* validate and update open stateid and open seqid */
status = nfs4_preprocess_confirmed_seqid_op(cstate,
lock->lk_new_open_seqid,
@@ -6909,8 +6902,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_inval;
if (!nfsd4_has_session(cstate)) {
- status = lookup_clientid(&lockt->lt_clientid, cstate, nn,
- false);
+ status = set_client(&lockt->lt_clientid, cstate, nn);
if (status)
goto out;
}
@@ -7094,7 +7086,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
clid->cl_boot, clid->cl_id);
- status = lookup_clientid(clid, cstate, nn, false);
+ status = set_client(clid, cstate, nn);
if (status)
return status;
@@ -7230,25 +7222,13 @@ nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn)
return NULL;
}
-/*
-* Called from OPEN. Look for clientid in reclaim list.
-*/
__be32
-nfs4_check_open_reclaim(clientid_t *clid,
- struct nfsd4_compound_state *cstate,
- struct nfsd_net *nn)
+nfs4_check_open_reclaim(struct nfs4_client *clp)
{
- __be32 status;
-
- /* find clientid in conf_id_hashtbl */
- status = lookup_clientid(clid, cstate, nn, false);
- if (status)
- return nfserr_reclaim_bad;
-
- if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
+ if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
return nfserr_no_grace;
- if (nfsd4_client_record_check(cstate->clp))
+ if (nfsd4_client_record_check(clp))
return nfserr_reclaim_bad;
return nfs_ok;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 80c90fc231a5..96cdf77925f3 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -121,14 +121,14 @@ nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
struct nfsd_net *nn)
{
if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
- nn->drc_mem_usage -= rp->c_replvec.iov_len;
+ nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len);
kfree(rp->c_replvec.iov_base);
}
if (rp->c_state != RC_UNUSED) {
rb_erase(&rp->c_node, &b->rb_head);
list_del(&rp->c_lru);
atomic_dec(&nn->num_drc_entries);
- nn->drc_mem_usage -= sizeof(*rp);
+ nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp));
}
kmem_cache_free(drc_slab, rp);
}
@@ -154,6 +154,16 @@ void nfsd_drc_slab_free(void)
kmem_cache_destroy(drc_slab);
}
+static int nfsd_reply_cache_stats_init(struct nfsd_net *nn)
+{
+ return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM);
+}
+
+static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn)
+{
+ nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
+}
+
int nfsd_reply_cache_init(struct nfsd_net *nn)
{
unsigned int hashsize;
@@ -165,12 +175,16 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
hashsize = nfsd_hashsize(nn->max_drc_entries);
nn->maskbits = ilog2(hashsize);
+ status = nfsd_reply_cache_stats_init(nn);
+ if (status)
+ goto out_nomem;
+
nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
nn->nfsd_reply_cache_shrinker.seeks = 1;
status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
if (status)
- goto out_nomem;
+ goto out_stats_destroy;
nn->drc_hashtbl = kvzalloc(array_size(hashsize,
sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
@@ -186,6 +200,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
return 0;
out_shrinker:
unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
+out_stats_destroy:
+ nfsd_reply_cache_stats_destroy(nn);
out_nomem:
printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
return -ENOMEM;
@@ -196,6 +212,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
struct svc_cacherep *rp;
unsigned int i;
+ nfsd_reply_cache_stats_destroy(nn);
unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
for (i = 0; i < nn->drc_hashsize; i++) {
@@ -324,7 +341,7 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key,
{
if (key->c_key.k_xid == rp->c_key.k_xid &&
key->c_key.k_csum != rp->c_key.k_csum) {
- ++nn->payload_misses;
+ nfsd_stats_payload_misses_inc(nn);
trace_nfsd_drc_mismatch(nn, key, rp);
}
@@ -407,7 +424,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
rqstp->rq_cacherep = NULL;
if (type == RC_NOCACHE) {
- nfsdstats.rcnocache++;
+ nfsd_stats_rc_nocache_inc();
goto out;
}
@@ -429,12 +446,12 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
goto found_entry;
}
- nfsdstats.rcmisses++;
+ nfsd_stats_rc_misses_inc();
rqstp->rq_cacherep = rp;
rp->c_state = RC_INPROG;
atomic_inc(&nn->num_drc_entries);
- nn->drc_mem_usage += sizeof(*rp);
+ nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp));
/* go ahead and prune the cache */
prune_bucket(b, nn);
@@ -446,7 +463,7 @@ out:
found_entry:
/* We found a matching entry which is either in progress or done. */
- nfsdstats.rchits++;
+ nfsd_stats_rc_hits_inc();
rtn = RC_DROPIT;
/* Request being processed */
@@ -548,7 +565,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
return;
}
spin_lock(&b->cache_lock);
- nn->drc_mem_usage += bufsize;
+ nfsd_stats_drc_mem_usage_add(nn, bufsize);
lru_put_end(b, rp);
rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags);
rp->c_type = cachetype;
@@ -588,13 +605,18 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
seq_printf(m, "max entries: %u\n", nn->max_drc_entries);
seq_printf(m, "num entries: %u\n",
- atomic_read(&nn->num_drc_entries));
+ atomic_read(&nn->num_drc_entries));
seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits);
- seq_printf(m, "mem usage: %u\n", nn->drc_mem_usage);
- seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
- seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses);
- seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache);
- seq_printf(m, "payload misses: %u\n", nn->payload_misses);
+ seq_printf(m, "mem usage: %lld\n",
+ percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE]));
+ seq_printf(m, "cache hits: %lld\n",
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]));
+ seq_printf(m, "cache misses: %lld\n",
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]));
+ seq_printf(m, "not cached: %lld\n",
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]));
+ seq_printf(m, "payload misses: %lld\n",
+ percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES]));
seq_printf(m, "longest chain len: %u\n", nn->longest_chain);
seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize);
return 0;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index f6d5d783f4a4..ef86ed23af82 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -32,6 +32,7 @@
enum {
NFSD_Root = 1,
NFSD_List,
+ NFSD_Export_Stats,
NFSD_Export_features,
NFSD_Fh,
NFSD_FO_UnlockIP,
@@ -1348,6 +1349,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
static const struct tree_descr nfsd_files[] = {
[NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
+ /* Per-export io stats use same ops as exports file */
+ [NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO},
[NFSD_Export_features] = {"export_features",
&export_features_operations, S_IRUGO},
[NFSD_FO_UnlockIP] = {"unlock_ip",
@@ -1522,19 +1525,18 @@ static int __init init_nfsd(void)
int retval;
printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
- retval = register_pernet_subsys(&nfsd_net_ops);
- if (retval < 0)
- return retval;
retval = register_cld_notifier();
if (retval)
- goto out_unregister_pernet;
+ return retval;
retval = nfsd4_init_slabs();
if (retval)
goto out_unregister_notifier;
retval = nfsd4_init_pnfs();
if (retval)
goto out_free_slabs;
- nfsd_stat_init(); /* Statistics */
+ retval = nfsd_stat_init(); /* Statistics */
+ if (retval)
+ goto out_free_pnfs;
retval = nfsd_drc_slab_create();
if (retval)
goto out_free_stat;
@@ -1544,9 +1546,14 @@ static int __init init_nfsd(void)
goto out_free_lockd;
retval = register_filesystem(&nfsd_fs_type);
if (retval)
+ goto out_free_exports;
+ retval = register_pernet_subsys(&nfsd_net_ops);
+ if (retval < 0)
goto out_free_all;
return 0;
out_free_all:
+ unregister_pernet_subsys(&nfsd_net_ops);
+out_free_exports:
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
out_free_lockd:
@@ -1554,18 +1561,18 @@ out_free_lockd:
nfsd_drc_slab_free();
out_free_stat:
nfsd_stat_shutdown();
+out_free_pnfs:
nfsd4_exit_pnfs();
out_free_slabs:
nfsd4_free_slabs();
out_unregister_notifier:
unregister_cld_notifier();
-out_unregister_pernet:
- unregister_pernet_subsys(&nfsd_net_ops);
return retval;
}
static void __exit exit_nfsd(void)
{
+ unregister_pernet_subsys(&nfsd_net_ops);
nfsd_drc_slab_free();
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
@@ -1575,7 +1582,6 @@ static void __exit exit_nfsd(void)
nfsd4_exit_pnfs();
unregister_filesystem(&nfsd_fs_type);
unregister_cld_notifier();
- unregister_pernet_subsys(&nfsd_net_ops);
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index d63cf8196fed..8bdc37aa2c2e 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,8 +24,8 @@
#include <uapi/linux/nfsd/debug.h>
#include "netns.h"
-#include "stats.h"
#include "export.h"
+#include "stats.h"
#undef ifdebug
#ifdef CONFIG_SUNRPC_DEBUG
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 8d90796e236a..10b44421eace 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -350,7 +350,7 @@ out:
__be32
fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
{
- struct svc_export *exp;
+ struct svc_export *exp = NULL;
struct dentry *dentry;
__be32 error;
@@ -423,7 +423,7 @@ skip_pseudoflavor_check:
}
out:
if (error == nfserr_stale)
- nfsdstats.fh_stale++;
+ nfsd_stats_fh_stale_inc(exp);
return error;
}
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cb20c2cd3469..f58933519f38 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -12,6 +12,7 @@
#include <linux/sunrpc/svc.h>
#include <uapi/linux/nfsd/nfsfh.h>
#include <linux/iversion.h>
+#include <linux/exportfs.h>
static inline __u32 ino_t_to_u32(ino_t ino)
{
@@ -264,7 +265,9 @@ fh_clear_wcc(struct svc_fh *fhp)
static inline u64 nfsd4_change_attribute(struct kstat *stat,
struct inode *inode)
{
- if (IS_I_VERSION(inode)) {
+ if (inode->i_sb->s_export_op->fetch_iversion)
+ return inode->i_sb->s_export_op->fetch_iversion(inode);
+ else if (IS_I_VERSION(inode)) {
u64 chattr;
chattr = stat->ctime.tv_sec;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0ea0554d20d1..a8d5449dd0e9 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -149,14 +149,15 @@ out:
static __be32
nfsd_proc_readlink(struct svc_rqst *rqstp)
{
- struct nfsd_readlinkargs *argp = rqstp->rq_argp;
+ struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd_readlinkres *resp = rqstp->rq_resp;
+ char *buffer = page_address(*(rqstp->rq_next_page++));
dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
/* Read the symlink. */
resp->len = NFS_MAXPATHLEN;
- resp->status = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len);
+ resp->status = nfsd_readlink(rqstp, &argp->fh, buffer, &resp->len);
fh_put(&argp->fh);
return rpc_success;
@@ -171,32 +172,36 @@ nfsd_proc_read(struct svc_rqst *rqstp)
{
struct nfsd_readargs *argp = rqstp->rq_argp;
struct nfsd_readres *resp = rqstp->rq_resp;
+ unsigned int len;
u32 eof;
+ int v;
dprintk("nfsd: READ %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->count, argp->offset);
+ argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
+
+ v = 0;
+ len = argp->count;
+ while (len > 0) {
+ struct page *page = *(rqstp->rq_next_page++);
+
+ rqstp->rq_vec[v].iov_base = page_address(page);
+ rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
+ len -= rqstp->rq_vec[v].iov_len;
+ v++;
+ }
+
/* Obtain buffer pointer for payload. 19 is 1 word for
* status, 17 words for fattr, and 1 word for the byte count.
*/
-
- if (NFSSVC_MAXBLKSIZE_V2 < argp->count) {
- char buf[RPC_MAX_ADDRBUFLEN];
- printk(KERN_NOTICE
- "oversized read request from %s (%d bytes)\n",
- svc_print_addr(rqstp, buf, sizeof(buf)),
- argp->count);
- argp->count = NFSSVC_MAXBLKSIZE_V2;
- }
svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
resp->count = argp->count;
- resp->status = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
- argp->offset,
- rqstp->rq_vec, argp->vlen,
- &resp->count,
- &eof);
+ fh_copy(&resp->fh, &argp->fh);
+ resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
+ rqstp->rq_vec, v, &resp->count, &eof);
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
@@ -548,6 +553,20 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp)
return rpc_success;
}
+static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
+ struct nfsd_readdirres *resp,
+ int count)
+{
+ count = min_t(u32, count, PAGE_SIZE);
+
+ /* Convert byte count to number of words (i.e. >> 2),
+ * and reserve room for the NULL ptr & eof flag (-2 words) */
+ resp->buflen = (count >> 2) - 2;
+
+ resp->buffer = page_address(*rqstp->rq_next_page);
+ rqstp->rq_next_page++;
+}
+
/*
* Read a portion of a directory.
*/
@@ -556,31 +575,24 @@ nfsd_proc_readdir(struct svc_rqst *rqstp)
{
struct nfsd_readdirargs *argp = rqstp->rq_argp;
struct nfsd_readdirres *resp = rqstp->rq_resp;
- int count;
loff_t offset;
+ __be32 *buffer;
dprintk("nfsd: READDIR %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->count, argp->cookie);
- /* Shrink to the client read size */
- count = (argp->count >> 2) - 2;
-
- /* Make sure we've room for the NULL ptr & eof flag */
- count -= 2;
- if (count < 0)
- count = 0;
+ nfsd_init_dirlist_pages(rqstp, resp, argp->count);
+ buffer = resp->buffer;
- resp->buffer = argp->buffer;
resp->offset = NULL;
- resp->buflen = count;
resp->common.err = nfs_ok;
/* Read directory and encode entries on the fly */
offset = argp->cookie;
resp->status = nfsd_readdir(rqstp, &argp->fh, &offset,
&resp->common, nfssvc_encode_entry);
- resp->count = resp->buffer - argp->buffer;
+ resp->count = resp->buffer - buffer;
if (resp->offset)
*resp->offset = htonl(offset);
@@ -623,16 +635,18 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
+ .pc_name = "NULL",
},
[NFSPROC_GETATTR] = {
.pc_func = nfsd_proc_getattr,
- .pc_decode = nfssvc_decode_fhandle,
+ .pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfssvc_encode_attrstat,
.pc_release = nfssvc_release_attrstat,
.pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT,
+ .pc_name = "GETATTR",
},
[NFSPROC_SETATTR] = {
.pc_func = nfsd_proc_setattr,
@@ -643,6 +657,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+AT,
+ .pc_name = "SETATTR",
},
[NFSPROC_ROOT] = {
.pc_func = nfsd_proc_root,
@@ -652,6 +667,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
+ .pc_name = "ROOT",
},
[NFSPROC_LOOKUP] = {
.pc_func = nfsd_proc_lookup,
@@ -662,15 +678,17 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+FH+AT,
+ .pc_name = "LOOKUP",
},
[NFSPROC_READLINK] = {
.pc_func = nfsd_proc_readlink,
- .pc_decode = nfssvc_decode_readlinkargs,
+ .pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfssvc_encode_readlinkres,
- .pc_argsize = sizeof(struct nfsd_readlinkargs),
+ .pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_readlinkres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
+ .pc_name = "READLINK",
},
[NFSPROC_READ] = {
.pc_func = nfsd_proc_read,
@@ -681,6 +699,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_readres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
+ .pc_name = "READ",
},
[NFSPROC_WRITECACHE] = {
.pc_func = nfsd_proc_writecache,
@@ -690,6 +709,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_voidres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = 0,
+ .pc_name = "WRITECACHE",
},
[NFSPROC_WRITE] = {
.pc_func = nfsd_proc_write,
@@ -700,6 +720,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_attrstat),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+AT,
+ .pc_name = "WRITE",
},
[NFSPROC_CREATE] = {
.pc_func = nfsd_proc_create,
@@ -710,6 +731,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+FH+AT,
+ .pc_name = "CREATE",
},
[NFSPROC_REMOVE] = {
.pc_func = nfsd_proc_remove,
@@ -719,6 +741,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
+ .pc_name = "REMOVE",
},
[NFSPROC_RENAME] = {
.pc_func = nfsd_proc_rename,
@@ -728,6 +751,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
+ .pc_name = "RENAME",
},
[NFSPROC_LINK] = {
.pc_func = nfsd_proc_link,
@@ -737,6 +761,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
+ .pc_name = "LINK",
},
[NFSPROC_SYMLINK] = {
.pc_func = nfsd_proc_symlink,
@@ -746,6 +771,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
+ .pc_name = "SYMLINK",
},
[NFSPROC_MKDIR] = {
.pc_func = nfsd_proc_mkdir,
@@ -756,6 +782,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_diropres),
.pc_cachetype = RC_REPLBUFF,
.pc_xdrressize = ST+FH+AT,
+ .pc_name = "MKDIR",
},
[NFSPROC_RMDIR] = {
.pc_func = nfsd_proc_rmdir,
@@ -765,6 +792,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_ressize = sizeof(struct nfsd_stat),
.pc_cachetype = RC_REPLSTAT,
.pc_xdrressize = ST,
+ .pc_name = "RMDIR",
},
[NFSPROC_READDIR] = {
.pc_func = nfsd_proc_readdir,
@@ -773,15 +801,17 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_argsize = sizeof(struct nfsd_readdirargs),
.pc_ressize = sizeof(struct nfsd_readdirres),
.pc_cachetype = RC_NOCACHE,
+ .pc_name = "READDIR",
},
[NFSPROC_STATFS] = {
.pc_func = nfsd_proc_statfs,
- .pc_decode = nfssvc_decode_fhandle,
+ .pc_decode = nfssvc_decode_fhandleargs,
.pc_encode = nfssvc_encode_statfsres,
.pc_argsize = sizeof(struct nfsd_fhandle),
.pc_ressize = sizeof(struct nfsd_statfsres),
.pc_cachetype = RC_NOCACHE,
.pc_xdrressize = ST+5,
+ .pc_name = "STATFS",
},
};
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index f9c9f4c63cc7..6de406322106 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -955,37 +955,6 @@ out:
return 0;
}
-/*
- * A write procedure can have a large argument, and a read procedure can
- * have a large reply, but no NFSv2 or NFSv3 procedure has argument and
- * reply that can both be larger than a page. The xdr code has taken
- * advantage of this assumption to be a sloppy about bounds checking in
- * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that
- * problem, we enforce these assumptions here:
- */
-static bool nfs_request_too_big(struct svc_rqst *rqstp,
- const struct svc_procedure *proc)
-{
- /*
- * The ACL code has more careful bounds-checking and is not
- * susceptible to this problem:
- */
- if (rqstp->rq_prog != NFS_PROGRAM)
- return false;
- /*
- * Ditto NFSv4 (which can in theory have argument and reply both
- * more than a page):
- */
- if (rqstp->rq_vers >= 4)
- return false;
- /* The reply will be small, we're OK: */
- if (proc->pc_xdrressize > 0 &&
- proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE))
- return false;
-
- return rqstp->rq_arg.len > PAGE_SIZE;
-}
-
/**
* nfsd_dispatch - Process an NFS or NFSACL Request
* @rqstp: incoming request
@@ -1004,9 +973,6 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
struct kvec *resv = &rqstp->rq_res.head[0];
__be32 *p;
- if (nfs_request_too_big(rqstp, proc))
- goto out_decode_err;
-
/*
* Give the xdr decoder a chance to change this if it wants
* (necessary in the NFSv4.0 compound case)
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 7aa6e8aca2c1..5d79ef6a0c7f 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -23,24 +23,31 @@ static u32 nfs_ftypes[] = {
/*
- * XDR functions for basic NFS types
+ * Basic NFSv2 data types (RFC 1094 Section 2.3)
*/
-static __be32 *
-decode_fh(__be32 *p, struct svc_fh *fhp)
+
+/**
+ * svcxdr_decode_fhandle - Decode an NFSv2 file handle
+ * @xdr: XDR stream positioned at an encoded NFSv2 FH
+ * @fhp: OUT: filled-in server file handle
+ *
+ * Return values:
+ * %false: The encoded file handle was not valid
+ * %true: @fhp has been initialized
+ */
+bool
+svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp)
{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS_FHSIZE);
+ if (!p)
+ return false;
fh_init(fhp, NFS_FHSIZE);
memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
fhp->fh_handle.fh_size = NFS_FHSIZE;
- /* FIXME: Look up export pointer here and verify
- * Sun Secure RPC if requested */
- return p + (NFS_FHSIZE >> 2);
-}
-
-/* Helper function for NFSv2 ACL code */
-__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp)
-{
- return decode_fh(p, fhp);
+ return true;
}
static __be32 *
@@ -50,66 +57,95 @@ encode_fh(__be32 *p, struct svc_fh *fhp)
return p + (NFS_FHSIZE>> 2);
}
-/*
- * Decode a file name and make sure that the path contains
- * no slashes or null bytes.
- */
-static __be32 *
-decode_filename(__be32 *p, char **namp, unsigned int *lenp)
+static bool
+svcxdr_decode_filename(struct xdr_stream *xdr, char **name, unsigned int *len)
{
- char *name;
- unsigned int i;
-
- if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) {
- for (i = 0, name = *namp; i < *lenp; i++, name++) {
- if (*name == '\0' || *name == '/')
- return NULL;
- }
- }
+ u32 size, i;
+ __be32 *p;
+ char *c;
+
+ if (xdr_stream_decode_u32(xdr, &size) < 0)
+ return false;
+ if (size == 0 || size > NFS_MAXNAMLEN)
+ return false;
+ p = xdr_inline_decode(xdr, size);
+ if (!p)
+ return false;
- return p;
+ *len = size;
+ *name = (char *)p;
+ for (i = 0, c = *name; i < size; i++, c++)
+ if (*c == '\0' || *c == '/')
+ return false;
+
+ return true;
}
-static __be32 *
-decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns)
+static bool
+svcxdr_decode_diropargs(struct xdr_stream *xdr, struct svc_fh *fhp,
+ char **name, unsigned int *len)
{
- u32 tmp, tmp1;
+ return svcxdr_decode_fhandle(xdr, fhp) &&
+ svcxdr_decode_filename(xdr, name, len);
+}
+
+static bool
+svcxdr_decode_sattr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
+ struct iattr *iap)
+{
+ u32 tmp1, tmp2;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, XDR_UNIT * 8);
+ if (!p)
+ return false;
iap->ia_valid = 0;
- /* Sun client bug compatibility check: some sun clients seem to
- * put 0xffff in the mode field when they mean 0xffffffff.
- * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah.
+ /*
+ * Some Sun clients put 0xffff in the mode field when they
+ * mean 0xffffffff.
*/
- if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) {
+ tmp1 = be32_to_cpup(p++);
+ if (tmp1 != (u32)-1 && tmp1 != 0xffff) {
iap->ia_valid |= ATTR_MODE;
- iap->ia_mode = tmp;
+ iap->ia_mode = tmp1;
}
- if ((tmp = ntohl(*p++)) != (u32)-1) {
- iap->ia_uid = make_kuid(userns, tmp);
+
+ tmp1 = be32_to_cpup(p++);
+ if (tmp1 != (u32)-1) {
+ iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), tmp1);
if (uid_valid(iap->ia_uid))
iap->ia_valid |= ATTR_UID;
}
- if ((tmp = ntohl(*p++)) != (u32)-1) {
- iap->ia_gid = make_kgid(userns, tmp);
+
+ tmp1 = be32_to_cpup(p++);
+ if (tmp1 != (u32)-1) {
+ iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), tmp1);
if (gid_valid(iap->ia_gid))
iap->ia_valid |= ATTR_GID;
}
- if ((tmp = ntohl(*p++)) != (u32)-1) {
+
+ tmp1 = be32_to_cpup(p++);
+ if (tmp1 != (u32)-1) {
iap->ia_valid |= ATTR_SIZE;
- iap->ia_size = tmp;
+ iap->ia_size = tmp1;
}
- tmp = ntohl(*p++); tmp1 = ntohl(*p++);
- if (tmp != (u32)-1 && tmp1 != (u32)-1) {
+
+ tmp1 = be32_to_cpup(p++);
+ tmp2 = be32_to_cpup(p++);
+ if (tmp1 != (u32)-1 && tmp2 != (u32)-1) {
iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET;
- iap->ia_atime.tv_sec = tmp;
- iap->ia_atime.tv_nsec = tmp1 * 1000;
+ iap->ia_atime.tv_sec = tmp1;
+ iap->ia_atime.tv_nsec = tmp2 * NSEC_PER_USEC;
}
- tmp = ntohl(*p++); tmp1 = ntohl(*p++);
- if (tmp != (u32)-1 && tmp1 != (u32)-1) {
+
+ tmp1 = be32_to_cpup(p++);
+ tmp2 = be32_to_cpup(p++);
+ if (tmp1 != (u32)-1 && tmp2 != (u32)-1) {
iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET;
- iap->ia_mtime.tv_sec = tmp;
- iap->ia_mtime.tv_nsec = tmp1 * 1000;
+ iap->ia_mtime.tv_sec = tmp1;
+ iap->ia_mtime.tv_nsec = tmp2 * NSEC_PER_USEC;
/*
* Passing the invalid value useconds=1000000 for mtime
* is a Sun convention for "set both mtime and atime to
@@ -119,10 +155,11 @@ decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns)
* sattr in section 6.1 of "NFS Illustrated" by
* Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
*/
- if (tmp1 == 1000000)
+ if (tmp2 == 1000000)
iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET);
}
- return p;
+
+ return true;
}
static __be32 *
@@ -194,225 +231,158 @@ __be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *f
*/
int
-nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p)
+nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_fhandle *args = rqstp->rq_argp;
- p = decode_fh(p, &args->fh);
- if (!p)
- return 0;
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_fhandle(xdr, &args->fh);
}
int
nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_sattrargs *args = rqstp->rq_argp;
- p = decode_fh(p, &args->fh);
- if (!p)
- return 0;
- p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
-
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_fhandle(xdr, &args->fh) &&
+ svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
}
int
nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_diropargs *args = rqstp->rq_argp;
- if (!(p = decode_fh(p, &args->fh))
- || !(p = decode_filename(p, &args->name, &args->len)))
- return 0;
-
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_diropargs(xdr, &args->fh, &args->name, &args->len);
}
int
nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_readargs *args = rqstp->rq_argp;
- unsigned int len;
- int v;
- p = decode_fh(p, &args->fh);
- if (!p)
- return 0;
-
- args->offset = ntohl(*p++);
- len = args->count = ntohl(*p++);
- p++; /* totalcount - unused */
+ u32 totalcount;
- len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2);
+ if (!svcxdr_decode_fhandle(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->offset) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
+ return 0;
+ /* totalcount is ignored */
+ if (xdr_stream_decode_u32(xdr, &totalcount) < 0)
+ return 0;
- /* set up somewhere to store response.
- * We take pages, put them on reslist and include in iovec
- */
- v=0;
- while (len > 0) {
- struct page *p = *(rqstp->rq_next_page++);
-
- rqstp->rq_vec[v].iov_base = page_address(p);
- rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
- len -= rqstp->rq_vec[v].iov_len;
- v++;
- }
- args->vlen = v;
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
int
nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_writeargs *args = rqstp->rq_argp;
- unsigned int len, hdr, dlen;
struct kvec *head = rqstp->rq_arg.head;
+ struct kvec *tail = rqstp->rq_arg.tail;
+ u32 beginoffset, totalcount;
+ size_t remaining;
- p = decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_fhandle(xdr, &args->fh))
return 0;
-
- p++; /* beginoffset */
- args->offset = ntohl(*p++); /* offset */
- p++; /* totalcount */
- len = args->len = ntohl(*p++);
- /*
- * The protocol specifies a maximum of 8192 bytes.
- */
- if (len > NFSSVC_MAXBLKSIZE_V2)
+ /* beginoffset is ignored */
+ if (xdr_stream_decode_u32(xdr, &beginoffset) < 0)
return 0;
-
- /*
- * Check to make sure that we got the right number of
- * bytes.
- */
- hdr = (void*)p - head->iov_base;
- if (hdr > head->iov_len)
+ if (xdr_stream_decode_u32(xdr, &args->offset) < 0)
+ return 0;
+ /* totalcount is ignored */
+ if (xdr_stream_decode_u32(xdr, &totalcount) < 0)
return 0;
- dlen = head->iov_len + rqstp->rq_arg.page_len - hdr;
- /*
- * Round the length of the data which was specified up to
- * the next multiple of XDR units and then compare that
- * against the length which was actually received.
- * Note that when RPCSEC/GSS (for example) is used, the
- * data buffer can be padded so dlen might be larger
- * than required. It must never be smaller.
- */
- if (dlen < XDR_QUADLEN(len)*4)
+ /* opaque data */
+ if (xdr_stream_decode_u32(xdr, &args->len) < 0)
return 0;
+ if (args->len > NFSSVC_MAXBLKSIZE_V2)
+ return 0;
+ remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
+ remaining -= xdr_stream_pos(xdr);
+ if (remaining < xdr_align_size(args->len))
+ return 0;
+ args->first.iov_base = xdr->p;
+ args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
- args->first.iov_base = (void *)p;
- args->first.iov_len = head->iov_len - hdr;
return 1;
}
int
nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_createargs *args = rqstp->rq_argp;
- if ( !(p = decode_fh(p, &args->fh))
- || !(p = decode_filename(p, &args->name, &args->len)))
- return 0;
- p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
-
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_diropargs(xdr, &args->fh,
+ &args->name, &args->len) &&
+ svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
}
int
nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_renameargs *args = rqstp->rq_argp;
- if (!(p = decode_fh(p, &args->ffh))
- || !(p = decode_filename(p, &args->fname, &args->flen))
- || !(p = decode_fh(p, &args->tfh))
- || !(p = decode_filename(p, &args->tname, &args->tlen)))
- return 0;
-
- return xdr_argsize_check(rqstp, p);
-}
-
-int
-nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p)
-{
- struct nfsd_readlinkargs *args = rqstp->rq_argp;
-
- p = decode_fh(p, &args->fh);
- if (!p)
- return 0;
- args->buffer = page_address(*(rqstp->rq_next_page++));
-
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_diropargs(xdr, &args->ffh,
+ &args->fname, &args->flen) &&
+ svcxdr_decode_diropargs(xdr, &args->tfh,
+ &args->tname, &args->tlen);
}
int
nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_linkargs *args = rqstp->rq_argp;
- if (!(p = decode_fh(p, &args->ffh))
- || !(p = decode_fh(p, &args->tfh))
- || !(p = decode_filename(p, &args->tname, &args->tlen)))
- return 0;
-
- return xdr_argsize_check(rqstp, p);
+ return svcxdr_decode_fhandle(xdr, &args->ffh) &&
+ svcxdr_decode_diropargs(xdr, &args->tfh,
+ &args->tname, &args->tlen);
}
int
nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_symlinkargs *args = rqstp->rq_argp;
- char *base = (char *)p;
- size_t xdrlen;
+ struct kvec *head = rqstp->rq_arg.head;
- if ( !(p = decode_fh(p, &args->ffh))
- || !(p = decode_filename(p, &args->fname, &args->flen)))
+ if (!svcxdr_decode_diropargs(xdr, &args->ffh, &args->fname, &args->flen))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
return 0;
-
- args->tlen = ntohl(*p++);
if (args->tlen == 0)
return 0;
- args->first.iov_base = p;
- args->first.iov_len = rqstp->rq_arg.head[0].iov_len;
- args->first.iov_len -= (char *)p - base;
-
- /* This request is never larger than a page. Therefore,
- * transport will deliver either:
- * 1. pathname in the pagelist -> sattr is in the tail.
- * 2. everything in the head buffer -> sattr is in the head.
- */
- if (rqstp->rq_arg.page_len) {
- if (args->tlen != rqstp->rq_arg.page_len)
- return 0;
- p = rqstp->rq_arg.tail[0].iov_base;
- } else {
- xdrlen = XDR_QUADLEN(args->tlen);
- if (xdrlen > args->first.iov_len - (8 * sizeof(__be32)))
- return 0;
- p += xdrlen;
- }
- decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
-
- return 1;
+ args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
+ args->first.iov_base = xdr_inline_decode(xdr, args->tlen);
+ if (!args->first.iov_base)
+ return 0;
+ return svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
}
int
nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
{
+ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
struct nfsd_readdirargs *args = rqstp->rq_argp;
- p = decode_fh(p, &args->fh);
- if (!p)
+ if (!svcxdr_decode_fhandle(xdr, &args->fh))
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->cookie) < 0)
+ return 0;
+ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
return 0;
- args->cookie = ntohl(*p++);
- args->count = ntohl(*p++);
- args->count = min_t(u32, args->count, PAGE_SIZE);
- args->buffer = page_address(*(rqstp->rq_next_page++));
- return xdr_argsize_check(rqstp, p);
+ return 1;
}
/*
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9eae11a9d21c..73deea353169 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -649,8 +649,7 @@ void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *)
extern void nfs4_release_reclaim(struct nfsd_net *);
extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct xdr_netobj name,
struct nfsd_net *nn);
-extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
- struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
+extern __be32 nfs4_check_open_reclaim(struct nfs4_client *);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index b1bc582b0493..1d3b881e7382 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -7,16 +7,14 @@
* Format:
* rc <hits> <misses> <nocache>
* Statistsics for the reply cache
- * fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache>
+ * fh <stale> <deprecated filehandle cache stats>
* statistics for filehandle lookup
* io <bytes-read> <bytes-written>
* statistics for IO throughput
- * th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%>
- * time (seconds) when nfsd thread usage above thresholds
- * and number of times that all threads were in use
- * ra cache-size <10% <20% <30% ... <100% not-found
- * number of times that read-ahead entry was found that deep in
- * the cache.
+ * th <threads> <deprecated thread usage histogram stats>
+ * number of threads
+ * ra <deprecated ra-cache stats>
+ *
* plus generic RPC stats (see net/sunrpc/stats.c)
*
* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
@@ -38,31 +36,24 @@ static int nfsd_proc_show(struct seq_file *seq, void *v)
{
int i;
- seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n",
- nfsdstats.rchits,
- nfsdstats.rcmisses,
- nfsdstats.rcnocache,
- nfsdstats.fh_stale,
- nfsdstats.fh_lookup,
- nfsdstats.fh_anon,
- nfsdstats.fh_nocache_dir,
- nfsdstats.fh_nocache_nondir,
- nfsdstats.io_read,
- nfsdstats.io_write);
+ seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n",
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]),
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]),
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]),
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]),
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]),
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE]));
+
/* thread usage: */
- seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt);
- for (i=0; i<10; i++) {
- unsigned int jifs = nfsdstats.th_usage[i];
- unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ;
- seq_printf(seq, " %u.%03u", sec, msec);
- }
+ seq_printf(seq, "th %u 0", nfsdstats.th_cnt);
+
+ /* deprecated thread usage histogram stats */
+ for (i = 0; i < 10; i++)
+ seq_puts(seq, " 0.000");
+
+ /* deprecated ra-cache stats */
+ seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n");
- /* newline and ra-cache */
- seq_printf(seq, "\nra %u", nfsdstats.ra_size);
- for (i=0; i<11; i++)
- seq_printf(seq, " %u", nfsdstats.ra_depth[i]);
- seq_putc(seq, '\n');
-
/* show my rpc info */
svc_seq_show(seq, &nfsd_svcstats);
@@ -70,8 +61,10 @@ static int nfsd_proc_show(struct seq_file *seq, void *v)
/* Show count for individual nfsv4 operations */
/* Writing operation numbers 0 1 2 also for maintaining uniformity */
seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1);
- for (i = 0; i <= LAST_NFS4_OP; i++)
- seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]);
+ for (i = 0; i <= LAST_NFS4_OP; i++) {
+ seq_printf(seq, " %lld",
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)]));
+ }
seq_putc(seq, '\n');
#endif
@@ -91,14 +84,63 @@ static const struct proc_ops nfsd_proc_ops = {
.proc_release = single_release,
};
-void
-nfsd_stat_init(void)
+int nfsd_percpu_counters_init(struct percpu_counter counters[], int num)
+{
+ int i, err = 0;
+
+ for (i = 0; !err && i < num; i++)
+ err = percpu_counter_init(&counters[i], 0, GFP_KERNEL);
+
+ if (!err)
+ return 0;
+
+ for (; i > 0; i--)
+ percpu_counter_destroy(&counters[i-1]);
+
+ return err;
+}
+
+void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num)
+{
+ int i;
+
+ for (i = 0; i < num; i++)
+ percpu_counter_set(&counters[i], 0);
+}
+
+void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num)
{
+ int i;
+
+ for (i = 0; i < num; i++)
+ percpu_counter_destroy(&counters[i]);
+}
+
+static int nfsd_stat_counters_init(void)
+{
+ return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
+}
+
+static void nfsd_stat_counters_destroy(void)
+{
+ nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
+}
+
+int nfsd_stat_init(void)
+{
+ int err;
+
+ err = nfsd_stat_counters_init();
+ if (err)
+ return err;
+
svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops);
+
+ return 0;
}
-void
-nfsd_stat_shutdown(void)
+void nfsd_stat_shutdown(void)
{
+ nfsd_stat_counters_destroy();
svc_proc_unregister(&init_net, "nfsd");
}
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index b23fdac69820..51ecda852e23 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -8,37 +8,91 @@
#define _NFSD_STATS_H
#include <uapi/linux/nfsd/stats.h>
+#include <linux/percpu_counter.h>
-struct nfsd_stats {
- unsigned int rchits; /* repcache hits */
- unsigned int rcmisses; /* repcache hits */
- unsigned int rcnocache; /* uncached reqs */
- unsigned int fh_stale; /* FH stale error */
- unsigned int fh_lookup; /* dentry cached */
- unsigned int fh_anon; /* anon file dentry returned */
- unsigned int fh_nocache_dir; /* filehandle not found in dcache */
- unsigned int fh_nocache_nondir; /* filehandle not found in dcache */
- unsigned int io_read; /* bytes returned to read requests */
- unsigned int io_write; /* bytes passed in write requests */
- unsigned int th_cnt; /* number of available threads */
- unsigned int th_usage[10]; /* number of ticks during which n perdeciles
- * of available threads were in use */
- unsigned int th_fullcnt; /* number of times last free thread was used */
- unsigned int ra_size; /* size of ra cache */
- unsigned int ra_depth[11]; /* number of times ra entry was found that deep
- * in the cache (10percentiles). [10] = not found */
+enum {
+ NFSD_STATS_RC_HITS, /* repcache hits */
+ NFSD_STATS_RC_MISSES, /* repcache misses */
+ NFSD_STATS_RC_NOCACHE, /* uncached reqs */
+ NFSD_STATS_FH_STALE, /* FH stale error */
+ NFSD_STATS_IO_READ, /* bytes returned to read requests */
+ NFSD_STATS_IO_WRITE, /* bytes passed in write requests */
#ifdef CONFIG_NFSD_V4
- unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */
+ NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */
+ NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP,
+#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op))
#endif
+ NFSD_STATS_COUNTERS_NUM
+};
+
+struct nfsd_stats {
+ struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM];
+ /* Protected by nfsd_mutex */
+ unsigned int th_cnt; /* number of available threads */
};
extern struct nfsd_stats nfsdstats;
+
extern struct svc_stat nfsd_svcstats;
-void nfsd_stat_init(void);
-void nfsd_stat_shutdown(void);
+int nfsd_percpu_counters_init(struct percpu_counter counters[], int num);
+void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num);
+void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num);
+int nfsd_stat_init(void);
+void nfsd_stat_shutdown(void);
+
+static inline void nfsd_stats_rc_hits_inc(void)
+{
+ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]);
+}
+
+static inline void nfsd_stats_rc_misses_inc(void)
+{
+ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]);
+}
+
+static inline void nfsd_stats_rc_nocache_inc(void)
+{
+ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]);
+}
+
+static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp)
+{
+ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]);
+ if (exp)
+ percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]);
+}
+
+static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount)
+{
+ percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount);
+ if (exp)
+ percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount);
+}
+
+static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
+{
+ percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
+ if (exp)
+ percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount);
+}
+
+static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)
+{
+ percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]);
+}
+
+static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount)
+{
+ percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
+}
+
+static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount)
+{
+ percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
+}
#endif /* _NFSD_STATS_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fab873178140..fd6be35a1642 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -890,7 +890,7 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned long *count, u32 *eof, ssize_t host_err)
{
if (host_err >= 0) {
- nfsdstats.io_read += host_err;
+ nfsd_stats_io_read_add(fhp->fh_export, host_err);
*eof = nfsd_eof_on_read(file, offset, host_err, *count);
*count = host_err;
fsnotify_access(file);
@@ -1041,7 +1041,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
goto out_nfserr;
}
*cnt = host_err;
- nfsdstats.io_write += *cnt;
+ nfsd_stats_io_write_add(exp, *cnt);
fsnotify_modify(file);
if (stable && use_wgather) {
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index ad77387734cc..3018b52b6d5e 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -27,7 +27,6 @@ struct nfsd_readargs {
struct svc_fh fh;
__u32 offset;
__u32 count;
- int vlen;
};
struct nfsd_writeargs {
@@ -53,11 +52,6 @@ struct nfsd_renameargs {
unsigned int tlen;
};
-struct nfsd_readlinkargs {
- struct svc_fh fh;
- char * buffer;
-};
-
struct nfsd_linkargs {
struct svc_fh ffh;
struct svc_fh tfh;
@@ -79,7 +73,6 @@ struct nfsd_readdirargs {
struct svc_fh fh;
__u32 cookie;
__u32 count;
- __be32 * buffer;
};
struct nfsd_stat {
@@ -144,14 +137,13 @@ union nfsd_xdrstore {
#define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore)
-int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *);
+int nfssvc_decode_fhandleargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_readargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_createargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *);
-int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *);
int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *);
@@ -172,6 +164,6 @@ void nfssvc_release_readres(struct svc_rqst *rqstp);
/* Helper functions for NFSv2 ACL code */
__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat);
-__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp);
+bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp);
#endif /* LINUX_NFSD_H */
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 456fcd7a1038..3e1578953f54 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -25,14 +25,13 @@ struct nfsd3_diropargs {
struct nfsd3_accessargs {
struct svc_fh fh;
- unsigned int access;
+ __u32 access;
};
struct nfsd3_readargs {
struct svc_fh fh;
__u64 offset;
__u32 count;
- int vlen;
};
struct nfsd3_writeargs {
@@ -71,11 +70,6 @@ struct nfsd3_renameargs {
unsigned int tlen;
};
-struct nfsd3_readlinkargs {
- struct svc_fh fh;
- char * buffer;
-};
-
struct nfsd3_linkargs {
struct svc_fh ffh;
struct svc_fh tfh;
@@ -96,10 +90,8 @@ struct nfsd3_symlinkargs {
struct nfsd3_readdirargs {
struct svc_fh fh;
__u64 cookie;
- __u32 dircount;
__u32 count;
__be32 * verf;
- __be32 * buffer;
};
struct nfsd3_commitargs {
@@ -110,13 +102,13 @@ struct nfsd3_commitargs {
struct nfsd3_getaclargs {
struct svc_fh fh;
- int mask;
+ __u32 mask;
};
struct posix_acl;
struct nfsd3_setaclargs {
struct svc_fh fh;
- int mask;
+ __u32 mask;
struct posix_acl *acl_access;
struct posix_acl *acl_default;
};
@@ -273,7 +265,7 @@ union nfsd3_xdrstore {
#define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore)
-int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *);
+int nfs3svc_decode_fhandleargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *);
@@ -283,7 +275,6 @@ int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *);
-int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *);
int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *);
@@ -316,7 +307,6 @@ int nfs3svc_encode_entry_plus(void *, const char *name,
/* Helper functions for NFSv3 ACL code */
__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p,
struct svc_fh *fhp);
-__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp);
-
+bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp);
#endif /* _LINUX_NFSD_XDR3_H */
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 64bc81363c6c..e1bd592ce700 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -141,6 +141,7 @@ const struct file_operations nilfs_file_operations = {
/* .release = nilfs_release_file, */
.fsync = nilfs_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
};
const struct inode_operations nilfs_file_inode_operations = {
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1a8729eded8b..1e75417bfe6e 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -386,10 +386,6 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
struct bio *bio;
bio = bio_alloc(GFP_NOIO, nr_vecs);
- if (bio == NULL) {
- while (!bio && (nr_vecs >>= 1))
- bio = bio_alloc(GFP_NOIO, nr_vecs);
- }
if (likely(bio)) {
bio_set_dev(bio, nilfs->ns_bdev);
bio->bi_iter.bi_sector =
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index b55cdeb4d169..987c8ab02aee 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -375,7 +375,7 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs)
*/
smp_wmb();
- err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL);
+ err = blkdev_issue_flush(nilfs->ns_bdev);
if (err != -EIO)
err = 0;
return err;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 64cfc1a3015d..9e0c1afac8bd 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -976,7 +976,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
f_flags |= O_NONBLOCK;
/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
- group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
+ group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
if (IS_ERR(group)) {
free_uid(user);
return PTR_ERR(group);
diff --git a/fs/notify/group.c b/fs/notify/group.c
index a4a4b1c64d32..ffd723ffe46d 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -111,14 +111,12 @@ void fsnotify_put_group(struct fsnotify_group *group)
}
EXPORT_SYMBOL_GPL(fsnotify_put_group);
-/*
- * Create a new fsnotify_group and hold a reference for the group returned.
- */
-struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+static struct fsnotify_group *__fsnotify_alloc_group(
+ const struct fsnotify_ops *ops, gfp_t gfp)
{
struct fsnotify_group *group;
- group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
+ group = kzalloc(sizeof(struct fsnotify_group), gfp);
if (!group)
return ERR_PTR(-ENOMEM);
@@ -139,8 +137,25 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
return group;
}
+
+/*
+ * Create a new fsnotify_group and hold a reference for the group returned.
+ */
+struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+{
+ return __fsnotify_alloc_group(ops, GFP_KERNEL);
+}
EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
+/*
+ * Create a new fsnotify_group and hold a reference for the group returned.
+ */
+struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops)
+{
+ return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT);
+}
+EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group);
+
int fsnotify_fasync(int fd, struct file *file, int on)
{
struct fsnotify_group *group = file->private_data;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e1155d32ef6f..c71be4fb7dc5 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -632,11 +632,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
struct fsnotify_group *group;
struct inotify_event_info *oevent;
- group = fsnotify_alloc_group(&inotify_fsnotify_ops);
+ group = fsnotify_alloc_user_group(&inotify_fsnotify_ops);
if (IS_ERR(group))
return group;
- oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
+ oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT);
if (unlikely(!oevent)) {
fsnotify_destroy_group(group);
return ERR_PTR(-ENOMEM);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e3039d973acd..6611c64ca0be 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -194,7 +194,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
needs_barrier = true;
err = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
- ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev);
if (!err)
err = ret;
}
diff --git a/fs/open.c b/fs/open.c
index 4ec3979d0466..e53af13b5835 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1100,6 +1100,12 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
lookup_flags |= LOOKUP_BENEATH;
if (how->resolve & RESOLVE_IN_ROOT)
lookup_flags |= LOOKUP_IN_ROOT;
+ if (how->resolve & RESOLVE_CACHED) {
+ /* Don't bother even trying for create/truncate/tmpfile open */
+ if (flags & (O_TRUNC | O_CREAT | O_TMPFILE))
+ return -EAGAIN;
+ lookup_flags |= LOOKUP_CACHED;
+ }
op->lookup_flags = lookup_flags;
return 0;
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index ec8ae4257975..9b28a7132466 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -487,10 +487,7 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
return ret;
gossip_debug(GOSSIP_FILE_DEBUG,
- "orangefs_file_mmap: called on %s\n",
- (file ?
- (char *)file->f_path.dentry->d_name.name :
- (char *)"Unknown"));
+ "orangefs_file_mmap: called on %pD\n", file);
/* set the sequential readahead hint */
vma->vm_flags |= VM_SEQ_READ;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index f81b836c2256..0b2891c6c71e 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -84,6 +84,14 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old,
if (ovl_is_private_xattr(sb, name))
continue;
+
+ error = security_inode_copy_up_xattr(name);
+ if (error < 0 && error != -EOPNOTSUPP)
+ break;
+ if (error == 1) {
+ error = 0;
+ continue; /* Discard */
+ }
retry:
size = vfs_getxattr(&init_user_ns, old, name, value, value_size);
if (size == -ERANGE)
@@ -107,13 +115,6 @@ retry:
goto retry;
}
- error = security_inode_copy_up_xattr(name);
- if (error < 0 && error != -EOPNOTSUPP)
- break;
- if (error == 1) {
- error = 0;
- continue; /* Discard */
- }
error = vfs_setxattr(&init_user_ns, new, name, value, size, 0);
if (error) {
if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 8b3be7342a8c..836f14b9d3a6 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -993,8 +993,8 @@ static char *ovl_get_redirect(struct dentry *dentry, bool abs_redirect)
buflen -= thislen;
memcpy(&buf[buflen], name, thislen);
- tmp = dget_dlock(d->d_parent);
spin_unlock(&d->d_lock);
+ tmp = dget_parent(d);
dput(d);
d = tmp;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 7d8b84c715b3..dbfb35fb0ff7 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -398,8 +398,9 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
const struct cred *old_cred;
int ret;
- if (!ovl_should_sync(OVL_FS(file_inode(file)->i_sb)))
- return 0;
+ ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
+ if (ret <= 0)
+ return ret;
ret = ovl_real_fdget_meta(file, &real, !datasync);
if (ret)
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index e78d45dfeaee..003cf83bf78a 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -354,7 +354,9 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
goto out;
if (!value && !upperdentry) {
+ old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0);
+ revert_creds(old_cred);
if (err < 0)
goto out_drop_write;
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 78b9d93a33c9..95cff83786a5 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -333,6 +333,7 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
int padding);
+int ovl_sync_status(struct ovl_fs *ofs);
static inline bool ovl_is_impuredir(struct super_block *sb,
struct dentry *dentry)
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index fbd5e27ce66b..63efee554f69 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -81,6 +81,8 @@ struct ovl_fs {
atomic_long_t last_ino;
/* Whiteout dentry cache */
struct dentry *whiteout;
+ /* r/o snapshot of upperdir sb's s_wb_err, only taken on volatile mounts */
+ errseq_t errseq;
};
static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs)
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 01620ebae1bd..f404a78e6b60 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -865,7 +865,7 @@ struct file *ovl_dir_real_file(const struct file *file, bool want_upper)
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
- struct file *realfile = od->realfile;
+ struct file *old, *realfile = od->realfile;
if (!OVL_TYPE_UPPER(ovl_path_type(dentry)))
return want_upper ? NULL : realfile;
@@ -874,29 +874,20 @@ struct file *ovl_dir_real_file(const struct file *file, bool want_upper)
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper) {
- struct inode *inode = file_inode(file);
-
realfile = READ_ONCE(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_dir_open_realfile(file, &upperpath);
+ if (IS_ERR(realfile))
+ return realfile;
- inode_lock(inode);
- if (!od->upperfile) {
- if (IS_ERR(realfile)) {
- inode_unlock(inode);
- return realfile;
- }
- smp_store_release(&od->upperfile, realfile);
- } else {
- /* somebody has beaten us to it */
- if (!IS_ERR(realfile))
- fput(realfile);
- realfile = od->upperfile;
+ old = cmpxchg_release(&od->upperfile, NULL, realfile);
+ if (old) {
+ fput(realfile);
+ realfile = old;
}
- inode_unlock(inode);
}
}
@@ -909,8 +900,9 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
struct file *realfile;
int err;
- if (!ovl_should_sync(OVL_FS(file->f_path.dentry->d_sb)))
- return 0;
+ err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb));
+ if (err <= 0)
+ return err;
realfile = ovl_dir_real_file(file, true);
err = PTR_ERR_OR_ZERO(realfile);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index b702c576e783..fdd72f1a9c5e 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -264,11 +264,20 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
struct super_block *upper_sb;
int ret;
- if (!ovl_upper_mnt(ofs))
- return 0;
+ ret = ovl_sync_status(ofs);
+ /*
+ * We have to always set the err, because the return value isn't
+ * checked in syncfs, and instead indirectly return an error via
+ * the sb's writeback errseq, which VFS inspects after this call.
+ */
+ if (ret < 0) {
+ errseq_set(&sb->s_wb_err, -EIO);
+ return -EIO;
+ }
+
+ if (!ret)
+ return ret;
- if (!ovl_should_sync(ofs))
- return 0;
/*
* Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC).
* All the super blocks will be iterated, including upper_sb.
@@ -1932,6 +1941,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
unsigned int numlower;
int err;
+ err = -EIO;
+ if (WARN_ON(sb->s_user_ns != current_user_ns()))
+ goto out;
+
sb->s_d_op = &ovl_dentry_operations;
err = -ENOMEM;
@@ -1998,6 +2011,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &ovl_super_operations;
if (ofs->config.upperdir) {
+ struct super_block *upper_sb;
+
if (!ofs->config.workdir) {
pr_err("missing 'workdir'\n");
goto out_err;
@@ -2007,6 +2022,16 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto out_err;
+ upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
+ if (!ovl_should_sync(ofs)) {
+ ofs->errseq = errseq_sample(&upper_sb->s_wb_err);
+ if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) {
+ err = -EIO;
+ pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n");
+ goto out_err;
+ }
+ }
+
err = ovl_get_workdir(sb, ofs, &upperpath);
if (err)
goto out_err;
@@ -2014,9 +2039,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!ofs->workdir)
sb->s_flags |= SB_RDONLY;
- sb->s_stack_depth = ovl_upper_mnt(ofs)->mnt_sb->s_stack_depth;
- sb->s_time_gran = ovl_upper_mnt(ofs)->mnt_sb->s_time_gran;
-
+ sb->s_stack_depth = upper_sb->s_stack_depth;
+ sb->s_time_gran = upper_sb->s_time_gran;
}
oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers);
err = PTR_ERR(oe);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 06013b7b1e87..7f5a01a11f97 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -962,3 +962,30 @@ err_free:
kfree(buf);
return ERR_PTR(res);
}
+
+/*
+ * ovl_sync_status() - Check fs sync status for volatile mounts
+ *
+ * Returns 1 if this is not a volatile mount and a real sync is required.
+ *
+ * Returns 0 if syncing can be skipped because mount is volatile, and no errors
+ * have occurred on the upperdir since the mount.
+ *
+ * Returns -errno if it is a volatile mount and an error has occurred on the
+ * upperdir since the mount. If the error code changes, it'll return the
+ * latest error code.
+ */
+
+int ovl_sync_status(struct ovl_fs *ofs)
+{
+ struct vfsmount *mnt;
+
+ if (ovl_should_sync(ofs))
+ return 1;
+
+ mnt = ovl_upper_mnt(ofs);
+ if (!mnt)
+ return 0;
+
+ return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq);
+}
diff --git a/fs/pipe.c b/fs/pipe.c
index c5989cfd564d..39c96845a72f 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1206,6 +1206,7 @@ const struct file_operations pipefifo_fops = {
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
.fasync = pipe_fasync,
+ .splice_write = iter_file_splice_write,
};
/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 2daac06727d0..656ba24c317d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1773,6 +1773,12 @@ static int process_sysctl_arg(char *param, char *val,
return 0;
}
+ if (!val)
+ return -EINVAL;
+ len = strlen(val);
+ if (len == 0)
+ return -EINVAL;
+
/*
* To set sysctl options, we use a temporary mount of proc, look up the
* respective sys/ file and write to it. To avoid mounting it when no
@@ -1814,7 +1820,6 @@ static int process_sysctl_arg(char *param, char *val,
file, param, val);
goto out;
}
- len = strlen(val);
wret = kernel_write(file, val, len, &pos);
if (wret < 0) {
err = wret;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index cc71ce3466dc..a4012154e109 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -20,7 +20,7 @@ static const char *proc_self_get_link(struct dentry *dentry,
* Not currently supported. Once we can inherit all of struct pid,
* we can allow this.
*/
- if (current->flags & PF_KTHREAD)
+ if (current->flags & PF_IO_WORKER)
return ERR_PTR(-EOPNOTSUPP);
if (!tgid)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 602e3a52884d..3cec6fbef725 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1210,7 +1210,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
- struct mmu_gather tlb;
int itype;
int rv;
@@ -1249,7 +1248,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
goto out_unlock;
}
- tlb_gather_mmu(&tlb, mm, 0, -1);
if (type == CLEAR_REFS_SOFT_DIRTY) {
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (!(vma->vm_flags & VM_SOFTDIRTY))
@@ -1258,15 +1256,18 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
vma_set_page_prot(vma);
}
+ inc_tlb_flush_pending(mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
&cp);
- if (type == CLEAR_REFS_SOFT_DIRTY)
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb, 0, -1);
+ flush_tlb_mm(mm);
+ dec_tlb_flush_pending(mm);
+ }
out_unlock:
mmap_write_unlock(mm);
out_mm:
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index a553273fbd41..d56681d86d28 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -17,6 +17,13 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
+ /*
+ * Not currently supported. Once we can inherit all of struct pid,
+ * we can allow this.
+ */
+ if (current->flags & PF_IO_WORKER)
+ return ERR_PTR(-EOPNOTSUPP);
+
if (!pid)
return ERR_PTR(-ENOENT);
name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 32f64abc277c..d963ae7902f9 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -269,7 +269,7 @@ static int pstore_compress(const void *in, void *out,
{
int ret;
- if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION))
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS))
return -EINVAL;
ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
@@ -671,7 +671,7 @@ static void decompress_record(struct pstore_record *record)
int unzipped_len;
char *unzipped, *workspace;
- if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION) || !record->compressed)
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed)
return;
/* Only PSTORE_TYPE_DMESG support compression. */
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index 5266ccbec007..7c8f8feac6c3 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -23,7 +23,7 @@
#include "internal.h"
/**
- * struct psz_head - header of zone to flush to storage
+ * struct psz_buffer - header of zone to flush to storage
*
* @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value)
* @datalen: length of data in @data
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index c21106557a37..b1467f3921c2 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -164,19 +164,24 @@ static int v2_read_file_info(struct super_block *sb, int type)
quota_error(sb, "Number of blocks too big for quota file size (%llu > %llu).",
(loff_t)qinfo->dqi_blocks << qinfo->dqi_blocksize_bits,
i_size_read(sb_dqopt(sb)->files[type]));
- goto out;
+ goto out_free;
}
if (qinfo->dqi_free_blk >= qinfo->dqi_blocks) {
quota_error(sb, "Free block number too big (%u >= %u).",
qinfo->dqi_free_blk, qinfo->dqi_blocks);
- goto out;
+ goto out_free;
}
if (qinfo->dqi_free_entry >= qinfo->dqi_blocks) {
quota_error(sb, "Block with free entry too big (%u >= %u).",
qinfo->dqi_free_entry, qinfo->dqi_blocks);
- goto out;
+ goto out_free;
}
ret = 0;
+out_free:
+ if (ret) {
+ kfree(info->dqi_priv);
+ info->dqi_priv = NULL;
+ }
out:
up_read(&dqopt->dqio_sem);
return ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..9db7adf160d2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1188,6 +1188,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
{
struct fd in, out;
struct inode *in_inode, *out_inode;
+ struct pipe_inode_info *opipe;
loff_t pos;
loff_t out_pos;
ssize_t retval;
@@ -1228,9 +1229,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
in_inode = file_inode(in.file);
out_inode = file_inode(out.file);
out_pos = out.file->f_pos;
- retval = rw_verify_area(WRITE, out.file, &out_pos, count);
- if (retval < 0)
- goto fput_out;
if (!max)
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
@@ -1253,9 +1251,18 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (in.file->f_flags & O_NONBLOCK)
fl = SPLICE_F_NONBLOCK;
#endif
- file_start_write(out.file);
- retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
- file_end_write(out.file);
+ opipe = get_pipe_info(out.file, true);
+ if (!opipe) {
+ retval = rw_verify_area(WRITE, out.file, &out_pos, count);
+ if (retval < 0)
+ goto fput_out;
+ file_start_write(out.file);
+ retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
+ count, fl);
+ file_end_write(out.file);
+ } else {
+ retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
+ }
if (retval > 0) {
add_rchar(current, retval);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 0b641ae694f1..1db0254bc38b 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -159,7 +159,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
barrier_done = reiserfs_commit_for_inode(inode);
reiserfs_write_unlock(inode->i_sb);
if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ blkdev_issue_flush(inode->i_sb->s_bdev);
inode_unlock(inode);
if (barrier_done < 0)
return barrier_done;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 03a369ccd28c..cb11a34fb871 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -669,7 +669,8 @@ void seq_puts(struct seq_file *m, const char *s)
EXPORT_SYMBOL(seq_puts);
/**
- * A helper routine for putting decimal numbers without rich format of printf().
+ * seq_put_decimal_ull_width - A helper routine for putting decimal numbers
+ * without rich format of printf().
* only 'unsigned long long' is supported.
* @m: seq_file identifying the buffer to which data should be written
* @delimiter: a string which is printed before the number
@@ -1044,7 +1045,7 @@ struct hlist_node *seq_hlist_next_rcu(void *v,
EXPORT_SYMBOL(seq_hlist_next_rcu);
/**
- * seq_hlist_start_precpu - start an iteration of a percpu hlist array
+ * seq_hlist_start_percpu - start an iteration of a percpu hlist array
* @head: pointer to percpu array of struct hlist_heads
* @cpu: pointer to cpu "cursor"
* @pos: start position of sequence
diff --git a/fs/splice.c b/fs/splice.c
index 866d5c2367b2..5dbce4dcc1a7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -662,12 +662,14 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
/* build the vector */
left = sd.total_len;
- for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) {
+ for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t this_len = buf->len;
- if (this_len > left)
- this_len = left;
+ /* zero-length bvecs are not supported, skip them */
+ if (!this_len)
+ continue;
+ this_len = min(this_len, left);
ret = pipe_buf_confirm(pipe, buf);
if (unlikely(ret)) {
@@ -680,6 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
array[n].bv_len = this_len;
array[n].bv_offset = buf->offset;
left -= this_len;
+ n++;
}
iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
@@ -771,11 +774,16 @@ static long do_splice_to(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
+ unsigned int p_space;
int ret;
if (unlikely(!(in->f_mode & FMODE_READ)))
return -EBADF;
+ /* Don't try to read more than the pipe has space for. */
+ p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
+ len = min_t(size_t, len, p_space << PAGE_SHIFT);
+
ret = rw_verify_area(READ, in, ppos, len);
if (unlikely(ret < 0))
return ret;
@@ -856,15 +864,10 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
while (len) {
- unsigned int p_space;
size_t read_len;
loff_t pos = sd->pos, prev_pos = pos;
- /* Don't try to read more the pipe has space for. */
- p_space = pipe->max_usage -
- pipe_occupancy(pipe->head, pipe->tail);
- read_len = min_t(size_t, len, p_space << PAGE_SHIFT);
- ret = do_splice_to(in, &pos, pipe, read_len, flags);
+ ret = do_splice_to(in, &pos, pipe, len, flags);
if (unlikely(ret <= 0))
goto out_release;
@@ -1002,6 +1005,23 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
size_t len, unsigned int flags);
+long splice_file_to_pipe(struct file *in,
+ struct pipe_inode_info *opipe,
+ loff_t *offset,
+ size_t len, unsigned int flags)
+{
+ long ret;
+
+ pipe_lock(opipe);
+ ret = wait_for_space(opipe, flags);
+ if (!ret)
+ ret = do_splice_to(in, offset, opipe, len, flags);
+ pipe_unlock(opipe);
+ if (ret > 0)
+ wakeup_pipe_readers(opipe);
+ return ret;
+}
+
/*
* Determine where to splice to/from.
*/
@@ -1081,20 +1101,7 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
if (out->f_flags & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;
- pipe_lock(opipe);
- ret = wait_for_space(opipe, flags);
- if (!ret) {
- unsigned int p_space;
-
- /* Don't try to read more the pipe has space for. */
- p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);
- len = min_t(size_t, len, p_space << PAGE_SHIFT);
-
- ret = do_splice_to(in, &offset, opipe, len, flags);
- }
- pipe_unlock(opipe);
- if (ret > 0)
- wakeup_pipe_readers(opipe);
+ ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
if (!off_in)
in->f_pos = offset;
else
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 8a19773b5a0b..45f44425d856 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -196,9 +196,15 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
length = SQUASHFS_COMPRESSED_SIZE(length);
index += 2;
- TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
+ TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2,
compressed ? "" : "un", length);
}
+ if (length < 0 || length > output->length ||
+ (index + length) > msblk->bytes_used) {
+ res = -EIO;
+ goto out;
+ }
+
if (next_index)
*next_index = index + length;
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index ae2c87bb0fbe..eb02072d28dd 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -41,12 +41,17 @@ static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
struct squashfs_sb_info *msblk = sb->s_fs_info;
int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
- u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]);
+ u64 start;
__le64 ino;
int err;
TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
+ if (ino_num == 0 || (ino_num - 1) >= msblk->inodes)
+ return -EINVAL;
+
+ start = le64_to_cpu(msblk->inode_lookup_table[blk]);
+
err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
if (err < 0)
return err;
@@ -111,7 +116,10 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
u64 lookup_table_start, u64 next_table, unsigned int inodes)
{
unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
+ unsigned int indexes = SQUASHFS_LOOKUP_BLOCKS(inodes);
+ int n;
__le64 *table;
+ u64 start, end;
TRACE("In read_inode_lookup_table, length %d\n", length);
@@ -121,20 +129,37 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
if (inodes == 0)
return ERR_PTR(-EINVAL);
- /* length bytes should not extend into the next table - this check
- * also traps instances where lookup_table_start is incorrectly larger
- * than the next table start
+ /*
+ * The computed size of the lookup table (length bytes) should exactly
+ * match the table start and end points
*/
- if (lookup_table_start + length > next_table)
+ if (length != (next_table - lookup_table_start))
return ERR_PTR(-EINVAL);
table = squashfs_read_table(sb, lookup_table_start, length);
+ if (IS_ERR(table))
+ return table;
/*
- * table[0] points to the first inode lookup table metadata block,
- * this should be less than lookup_table_start
+ * table[0], table[1], ... table[indexes - 1] store the locations
+ * of the compressed inode lookup blocks. Each entry should be
+ * less than the next (i.e. table[0] < table[1]), and the difference
+ * between them should be SQUASHFS_METADATA_SIZE or less.
+ * table[indexes - 1] should be less than lookup_table_start, and
+ * again the difference should be SQUASHFS_METADATA_SIZE or less
*/
- if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
+ for (n = 0; n < (indexes - 1); n++) {
+ start = le64_to_cpu(table[n]);
+ end = le64_to_cpu(table[n + 1]);
+
+ if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+ kfree(table);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ start = le64_to_cpu(table[indexes - 1]);
+ if (start >= lookup_table_start || (lookup_table_start - start) > SQUASHFS_METADATA_SIZE) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 6be5afe7287d..11581bf31af4 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -35,10 +35,15 @@ int squashfs_get_id(struct super_block *sb, unsigned int index,
struct squashfs_sb_info *msblk = sb->s_fs_info;
int block = SQUASHFS_ID_BLOCK(index);
int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
- u64 start_block = le64_to_cpu(msblk->id_table[block]);
+ u64 start_block;
__le32 disk_id;
int err;
+ if (index >= msblk->ids)
+ return -EINVAL;
+
+ start_block = le64_to_cpu(msblk->id_table[block]);
+
err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset,
sizeof(disk_id));
if (err < 0)
@@ -56,7 +61,10 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
u64 id_table_start, u64 next_table, unsigned short no_ids)
{
unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
+ unsigned int indexes = SQUASHFS_ID_BLOCKS(no_ids);
+ int n;
__le64 *table;
+ u64 start, end;
TRACE("In read_id_index_table, length %d\n", length);
@@ -67,20 +75,36 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
return ERR_PTR(-EINVAL);
/*
- * length bytes should not extend into the next table - this check
- * also traps instances where id_table_start is incorrectly larger
- * than the next table start
+ * The computed size of the index table (length bytes) should exactly
+ * match the table start and end points
*/
- if (id_table_start + length > next_table)
+ if (length != (next_table - id_table_start))
return ERR_PTR(-EINVAL);
table = squashfs_read_table(sb, id_table_start, length);
+ if (IS_ERR(table))
+ return table;
/*
- * table[0] points to the first id lookup table metadata block, this
- * should be less than id_table_start
+ * table[0], table[1], ... table[indexes - 1] store the locations
+ * of the compressed id blocks. Each entry should be less than
+ * the next (i.e. table[0] < table[1]), and the difference between them
+ * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1]
+ * should be less than id_table_start, and again the difference
+ * should be SQUASHFS_METADATA_SIZE or less
*/
- if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
+ for (n = 0; n < (indexes - 1); n++) {
+ start = le64_to_cpu(table[n]);
+ end = le64_to_cpu(table[n + 1]);
+
+ if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+ kfree(table);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ start = le64_to_cpu(table[indexes - 1]);
+ if (start >= id_table_start || (id_table_start - start) > SQUASHFS_METADATA_SIZE) {
kfree(table);
return ERR_PTR(-EINVAL);
}
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 34c21ffb6df3..166e98806265 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -64,5 +64,6 @@ struct squashfs_sb_info {
unsigned int inodes;
unsigned int fragments;
int xattr_ids;
+ unsigned int ids;
};
#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index d6c6593ec169..88cc94be1076 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -166,6 +166,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
msblk->inodes = le32_to_cpu(sblk->inodes);
msblk->fragments = le32_to_cpu(sblk->fragments);
+ msblk->ids = le16_to_cpu(sblk->no_ids);
flags = le16_to_cpu(sblk->flags);
TRACE("Found valid superblock on %pg\n", sb->s_bdev);
@@ -177,7 +178,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
TRACE("Block size %d\n", msblk->block_size);
TRACE("Number of inodes %d\n", msblk->inodes);
TRACE("Number of fragments %d\n", msblk->fragments);
- TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
+ TRACE("Number of ids %d\n", msblk->ids);
TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
TRACE("sblk->fragment_table_start %llx\n",
@@ -236,8 +237,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
allocate_id_index_table:
/* Allocate and read id index table */
msblk->id_table = squashfs_read_id_index_table(sb,
- le64_to_cpu(sblk->id_table_start), next_table,
- le16_to_cpu(sblk->no_ids));
+ le64_to_cpu(sblk->id_table_start), next_table, msblk->ids);
if (IS_ERR(msblk->id_table)) {
errorf(fc, "unable to read id index table");
err = PTR_ERR(msblk->id_table);
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 184129afd456..d8a270d3ac4c 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -17,8 +17,16 @@ extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
u64 start, u64 *xattr_table_start, int *xattr_ids)
{
+ struct squashfs_xattr_id_table *id_table;
+
+ id_table = squashfs_read_table(sb, start, sizeof(*id_table));
+ if (IS_ERR(id_table))
+ return (__le64 *) id_table;
+
+ *xattr_table_start = le64_to_cpu(id_table->xattr_table_start);
+ kfree(id_table);
+
ERROR("Xattrs in filesystem, these will be ignored\n");
- *xattr_table_start = start;
return ERR_PTR(-ENOTSUPP);
}
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index d99e08464554..ead66670b41a 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -31,10 +31,15 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
struct squashfs_sb_info *msblk = sb->s_fs_info;
int block = SQUASHFS_XATTR_BLOCK(index);
int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
- u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+ u64 start_block;
struct squashfs_xattr_id id;
int err;
+ if (index >= msblk->xattr_ids)
+ return -EINVAL;
+
+ start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+
err = squashfs_read_metadata(sb, &id, &start_block, &offset,
sizeof(id));
if (err < 0)
@@ -50,13 +55,17 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
/*
* Read uncompressed xattr id lookup table indexes from disk into memory
*/
-__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
+__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
u64 *xattr_table_start, int *xattr_ids)
{
- unsigned int len;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ unsigned int len, indexes;
struct squashfs_xattr_id_table *id_table;
+ __le64 *table;
+ u64 start, end;
+ int n;
- id_table = squashfs_read_table(sb, start, sizeof(*id_table));
+ id_table = squashfs_read_table(sb, table_start, sizeof(*id_table));
if (IS_ERR(id_table))
return (__le64 *) id_table;
@@ -70,13 +79,52 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
if (*xattr_ids == 0)
return ERR_PTR(-EINVAL);
- /* xattr_table should be less than start */
- if (*xattr_table_start >= start)
+ len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+ indexes = SQUASHFS_XATTR_BLOCKS(*xattr_ids);
+
+ /*
+ * The computed size of the index table (len bytes) should exactly
+ * match the table start and end points
+ */
+ start = table_start + sizeof(*id_table);
+ end = msblk->bytes_used;
+
+ if (len != (end - start))
return ERR_PTR(-EINVAL);
- len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+ table = squashfs_read_table(sb, start, len);
+ if (IS_ERR(table))
+ return table;
+
+ /* table[0], table[1], ... table[indexes - 1] store the locations
+ * of the compressed xattr id blocks. Each entry should be less than
+ * the next (i.e. table[0] < table[1]), and the difference between them
+ * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1]
+ * should be less than table_start, and again the difference
+ * should be SQUASHFS_METADATA_SIZE or less.
+ *
+ * Finally xattr_table_start should be less than table[0].
+ */
+ for (n = 0; n < (indexes - 1); n++) {
+ start = le64_to_cpu(table[n]);
+ end = le64_to_cpu(table[n + 1]);
+
+ if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+ kfree(table);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ start = le64_to_cpu(table[indexes - 1]);
+ if (start >= table_start || (table_start - start) > SQUASHFS_METADATA_SIZE) {
+ kfree(table);
+ return ERR_PTR(-EINVAL);
+ }
- TRACE("In read_xattr_index_table, length %d\n", len);
+ if (*xattr_table_start >= le64_to_cpu(table[0])) {
+ kfree(table);
+ return ERR_PTR(-EINVAL);
+ }
- return squashfs_read_table(sb, start + sizeof(*id_table), len);
+ return table;
}
diff --git a/fs/statfs.c b/fs/statfs.c
index 68cb07788750..0ba34c135593 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -255,7 +255,10 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
memset(&tmp,0,sizeof(struct ustat));
tmp.f_tfree = sbuf.f_bfree;
- tmp.f_tinode = sbuf.f_ffree;
+ if (IS_ENABLED(CONFIG_ARCH_32BIT_USTAT_F_TINODE))
+ tmp.f_tinode = min_t(u64, sbuf.f_ffree, UINT_MAX);
+ else
+ tmp.f_tinode = sbuf.f_ffree;
return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
}
diff --git a/fs/super.c b/fs/super.c
index 2c6cdea2ab2d..8c1baca35c16 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -865,7 +865,8 @@ int reconfigure_super(struct fs_context *fc)
if (fc->sb_flags_mask & SB_RDONLY) {
#ifdef CONFIG_BLOCK
- if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
+ if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev &&
+ bdev_read_only(sb->s_bdev))
return -EACCES;
#endif
@@ -1718,12 +1719,6 @@ int freeze_super(struct super_block *sb)
}
EXPORT_SYMBOL(freeze_super);
-/**
- * thaw_super -- unlock filesystem
- * @sb: the super to thaw
- *
- * Unlocks the filesystem and marks it writeable again after freeze_super().
- */
static int thaw_super_locked(struct super_block *sb)
{
int error;
@@ -1759,6 +1754,12 @@ out:
return 0;
}
+/**
+ * thaw_super -- unlock filesystem
+ * @sb: the super to thaw
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_super().
+ */
int thaw_super(struct super_block *sb)
{
down_write(&sb->s_umount);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 96d0da65e088..9aefa7779b29 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -170,6 +170,16 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
return battr->mmap(of->file, kobj, battr, vma);
}
+static int sysfs_kf_bin_open(struct kernfs_open_file *of)
+{
+ struct bin_attribute *battr = of->kn->priv;
+
+ if (battr->mapping)
+ of->file->f_mapping = battr->mapping;
+
+ return 0;
+}
+
void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
{
struct kernfs_node *kn = kobj->sd, *tmp;
@@ -241,6 +251,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = {
.read = sysfs_kf_bin_read,
.write = sysfs_kf_bin_write,
.mmap = sysfs_kf_bin_mmap,
+ .open = sysfs_kf_bin_open,
};
int sysfs_add_file_mode_ns(struct kernfs_node *parent,
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 51a7c8c2c3f0..e564d5ff8781 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -327,7 +327,7 @@ int ubifs_init_authentication(struct ubifs_info *c)
ubifs_err(c, "hmac %s is bigger than maximum allowed hmac size (%d > %d)",
hmac_name, c->hmac_desc_len, UBIFS_HMAC_ARR_SZ);
err = -EINVAL;
- goto out_free_hash;
+ goto out_free_hmac;
}
err = crypto_shash_setkey(c->hmac_tfm, ukp->data, ukp->datalen);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 03410ae0813a..2857e64d673d 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -881,7 +881,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
struct inode *xino;
struct ubifs_dent_node *xent, *pxent = NULL;
- if (ui->xattr_cnt >= ubifs_xattr_max_cnt(c)) {
+ if (ui->xattr_cnt > ubifs_xattr_max_cnt(c)) {
ubifs_err(c, "Cannot delete inode, it has too much xattrs!");
goto out_release;
}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 79801c9a5b87..0f8a6a16421b 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -559,7 +559,9 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
}
/* authenticate_sleb_hash is split out for stack usage */
-static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_hash, u8 *hash)
+static int noinline_for_stack
+authenticate_sleb_hash(struct ubifs_info *c,
+ struct shash_desc *log_hash, u8 *hash)
{
SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 138b9426c6c1..ddb2ca636c93 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -838,8 +838,10 @@ static int alloc_wbufs(struct ubifs_info *c)
c->jheads[i].wbuf.jhead = i;
c->jheads[i].grouped = 1;
c->jheads[i].log_hash = ubifs_hash_get_desc(c);
- if (IS_ERR(c->jheads[i].log_hash))
+ if (IS_ERR(c->jheads[i].log_hash)) {
+ err = PTR_ERR(c->jheads[i].log_hash);
goto out;
+ }
}
/*
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 8f4135c22ab6..6b1e9830b274 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -498,7 +498,7 @@ int ubifs_purge_xattrs(struct inode *host)
struct fscrypt_name nm = {0};
int err;
- if (ubifs_inode(host)->xattr_cnt < ubifs_xattr_max_cnt(c))
+ if (ubifs_inode(host)->xattr_cnt <= ubifs_xattr_max_cnt(c))
return 0;
ubifs_warn(c, "inode %lu has too many xattrs, doing a non-atomic deletion",
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index bb89c3e43212..0dd2f93ac048 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -544,11 +544,14 @@ static int udf_do_extend_file(struct inode *inode,
udf_write_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
+
/*
- * We've rewritten the last extent but there may be empty
- * indirect extent after it - enter it.
+ * We've rewritten the last extent. If we are going to add
+ * more extents, we may need to enter possible following
+ * empty indirect extent.
*/
- udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
+ if (new_block_bytes || prealloc_len)
+ udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
}
/* Managed to do everything necessary? */
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 5bef3a68395d..2f83c1204e20 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -459,6 +459,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
{
char *p;
int option;
+ unsigned int uv;
uopt->novrs = 0;
uopt->session = 0xFFFFFFFF;
@@ -508,17 +509,17 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD);
break;
case Opt_gid:
- if (match_int(args, &option))
+ if (match_uint(args, &uv))
return 0;
- uopt->gid = make_kgid(current_user_ns(), option);
+ uopt->gid = make_kgid(current_user_ns(), uv);
if (!gid_valid(uopt->gid))
return 0;
uopt->flags |= (1 << UDF_FLAG_GID_SET);
break;
case Opt_uid:
- if (match_int(args, &option))
+ if (match_uint(args, &uv))
return 0;
- uopt->uid = make_kuid(current_user_ns(), option);
+ uopt->uid = make_kuid(current_user_ns(), uv);
if (!uid_valid(uopt->uid))
return 0;
uopt->flags |= (1 << UDF_FLAG_UID_SET);
@@ -705,6 +706,7 @@ static int udf_check_vsd(struct super_block *sb)
struct buffer_head *bh = NULL;
int nsr = 0;
struct udf_sb_info *sbi;
+ loff_t session_offset;
sbi = UDF_SB(sb);
if (sb->s_blocksize < sizeof(struct volStructDesc))
@@ -712,7 +714,8 @@ static int udf_check_vsd(struct super_block *sb)
else
sectorsize = sb->s_blocksize;
- sector += (((loff_t)sbi->s_session) << sb->s_blocksize_bits);
+ session_offset = (loff_t)sbi->s_session << sb->s_blocksize_bits;
+ sector += session_offset;
udf_debug("Starting at sector %u (%lu byte sectors)\n",
(unsigned int)(sector >> sb->s_blocksize_bits),
@@ -757,8 +760,7 @@ static int udf_check_vsd(struct super_block *sb)
if (nsr > 0)
return 1;
- else if (!bh && sector - (sbi->s_session << sb->s_blocksize_bits) ==
- VSD_FIRST_SECTOR_OFFSET)
+ else if (!bh && sector - session_offset == VSD_FIRST_SECTOR_OFFSET)
return -1;
else
return 0;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 894cc28142e7..0be8cdd4425a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -979,14 +979,14 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
static const struct file_operations userfaultfd_fops;
-static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
- struct userfaultfd_ctx *new,
+static int resolve_userfault_fork(struct userfaultfd_ctx *new,
+ struct inode *inode,
struct uffd_msg *msg)
{
int fd;
- fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
- O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
+ fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
+ O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
if (fd < 0)
return fd;
@@ -996,7 +996,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
}
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
- struct uffd_msg *msg)
+ struct uffd_msg *msg, struct inode *inode)
{
ssize_t ret;
DECLARE_WAITQUEUE(wait, current);
@@ -1107,7 +1107,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
spin_unlock_irq(&ctx->fd_wqh.lock);
if (!ret && msg->event == UFFD_EVENT_FORK) {
- ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+ ret = resolve_userfault_fork(fork_nctx, inode, msg);
spin_lock_irq(&ctx->event_wqh.lock);
if (!list_empty(&fork_event)) {
/*
@@ -1167,6 +1167,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
ssize_t _ret, ret = 0;
struct uffd_msg msg;
int no_wait = file->f_flags & O_NONBLOCK;
+ struct inode *inode = file_inode(file);
if (ctx->state == UFFD_STATE_WAIT_API)
return -EINVAL;
@@ -1174,7 +1175,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
for (;;) {
if (count < sizeof(msg))
return ret ? ret : -EINVAL;
- _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
+ _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
if (_ret < 0)
return ret ? ret : _ret;
if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
@@ -1999,8 +2000,8 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
- fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
- O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+ fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
if (fd < 0) {
mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
diff --git a/fs/verity/Makefile b/fs/verity/Makefile
index 570e9136334d..435559a4fa9e 100644
--- a/fs/verity/Makefile
+++ b/fs/verity/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_FS_VERITY) += enable.o \
init.o \
measure.o \
open.o \
+ read_metadata.o \
verify.o
obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 6413d28664d6..a7920434bae5 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -122,12 +122,17 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
const u8 *salt, size_t salt_size);
struct fsverity_info *fsverity_create_info(const struct inode *inode,
- void *desc, size_t desc_size);
+ struct fsverity_descriptor *desc,
+ size_t desc_size);
void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);
void fsverity_free_info(struct fsverity_info *vi);
+int fsverity_get_descriptor(struct inode *inode,
+ struct fsverity_descriptor **desc_ret,
+ size_t *desc_size_ret);
+
int __init fsverity_init_info_cache(void);
void __init fsverity_exit_info_cache(void);
@@ -135,15 +140,13 @@ void __init fsverity_exit_info_cache(void);
#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
int fsverity_verify_signature(const struct fsverity_info *vi,
- const struct fsverity_descriptor *desc,
- size_t desc_size);
+ const u8 *signature, size_t sig_size);
int __init fsverity_init_signature(void);
#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
static inline int
fsverity_verify_signature(const struct fsverity_info *vi,
- const struct fsverity_descriptor *desc,
- size_t desc_size)
+ const u8 *signature, size_t sig_size)
{
return 0;
}
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 228d0eca3e2e..60ff8af7219f 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -142,45 +142,17 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg,
}
/*
- * Validate the given fsverity_descriptor and create a new fsverity_info from
- * it. The signature (if present) is also checked.
+ * Create a new fsverity_info from the given fsverity_descriptor (with optional
+ * appended signature), and check the signature if present. The
+ * fsverity_descriptor must have already undergone basic validation.
*/
struct fsverity_info *fsverity_create_info(const struct inode *inode,
- void *_desc, size_t desc_size)
+ struct fsverity_descriptor *desc,
+ size_t desc_size)
{
- struct fsverity_descriptor *desc = _desc;
struct fsverity_info *vi;
int err;
- if (desc_size < sizeof(*desc)) {
- fsverity_err(inode, "Unrecognized descriptor size: %zu bytes",
- desc_size);
- return ERR_PTR(-EINVAL);
- }
-
- if (desc->version != 1) {
- fsverity_err(inode, "Unrecognized descriptor version: %u",
- desc->version);
- return ERR_PTR(-EINVAL);
- }
-
- if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
- fsverity_err(inode, "Reserved bits set in descriptor");
- return ERR_PTR(-EINVAL);
- }
-
- if (desc->salt_size > sizeof(desc->salt)) {
- fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
- return ERR_PTR(-EINVAL);
- }
-
- if (le64_to_cpu(desc->data_size) != inode->i_size) {
- fsverity_err(inode,
- "Wrong data_size: %llu (desc) != %lld (inode)",
- le64_to_cpu(desc->data_size), inode->i_size);
- return ERR_PTR(-EINVAL);
- }
-
vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL);
if (!vi)
return ERR_PTR(-ENOMEM);
@@ -209,7 +181,8 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
vi->tree_params.hash_alg->name,
vi->tree_params.digest_size, vi->file_digest);
- err = fsverity_verify_signature(vi, desc, desc_size);
+ err = fsverity_verify_signature(vi, desc->signature,
+ le32_to_cpu(desc->sig_size));
out:
if (err) {
fsverity_free_info(vi);
@@ -245,15 +218,57 @@ void fsverity_free_info(struct fsverity_info *vi)
kmem_cache_free(fsverity_info_cachep, vi);
}
-/* Ensure the inode has an ->i_verity_info */
-static int ensure_verity_info(struct inode *inode)
+static bool validate_fsverity_descriptor(struct inode *inode,
+ const struct fsverity_descriptor *desc,
+ size_t desc_size)
{
- struct fsverity_info *vi = fsverity_get_info(inode);
- struct fsverity_descriptor *desc;
- int res;
+ if (desc_size < sizeof(*desc)) {
+ fsverity_err(inode, "Unrecognized descriptor size: %zu bytes",
+ desc_size);
+ return false;
+ }
- if (vi)
- return 0;
+ if (desc->version != 1) {
+ fsverity_err(inode, "Unrecognized descriptor version: %u",
+ desc->version);
+ return false;
+ }
+
+ if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
+ fsverity_err(inode, "Reserved bits set in descriptor");
+ return false;
+ }
+
+ if (desc->salt_size > sizeof(desc->salt)) {
+ fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
+ return false;
+ }
+
+ if (le64_to_cpu(desc->data_size) != inode->i_size) {
+ fsverity_err(inode,
+ "Wrong data_size: %llu (desc) != %lld (inode)",
+ le64_to_cpu(desc->data_size), inode->i_size);
+ return false;
+ }
+
+ if (le32_to_cpu(desc->sig_size) > desc_size - sizeof(*desc)) {
+ fsverity_err(inode, "Signature overflows verity descriptor");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Read the inode's fsverity_descriptor (with optional appended signature) from
+ * the filesystem, and do basic validation of it.
+ */
+int fsverity_get_descriptor(struct inode *inode,
+ struct fsverity_descriptor **desc_ret,
+ size_t *desc_size_ret)
+{
+ int res;
+ struct fsverity_descriptor *desc;
res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0);
if (res < 0) {
@@ -272,20 +287,46 @@ static int ensure_verity_info(struct inode *inode)
res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res);
if (res < 0) {
fsverity_err(inode, "Error %d reading verity descriptor", res);
- goto out_free_desc;
+ kfree(desc);
+ return res;
+ }
+
+ if (!validate_fsverity_descriptor(inode, desc, res)) {
+ kfree(desc);
+ return -EINVAL;
}
- vi = fsverity_create_info(inode, desc, res);
+ *desc_ret = desc;
+ *desc_size_ret = res;
+ return 0;
+}
+
+/* Ensure the inode has an ->i_verity_info */
+static int ensure_verity_info(struct inode *inode)
+{
+ struct fsverity_info *vi = fsverity_get_info(inode);
+ struct fsverity_descriptor *desc;
+ size_t desc_size;
+ int err;
+
+ if (vi)
+ return 0;
+
+ err = fsverity_get_descriptor(inode, &desc, &desc_size);
+ if (err)
+ return err;
+
+ vi = fsverity_create_info(inode, desc, desc_size);
if (IS_ERR(vi)) {
- res = PTR_ERR(vi);
+ err = PTR_ERR(vi);
goto out_free_desc;
}
fsverity_set_info(inode, vi);
- res = 0;
+ err = 0;
out_free_desc:
kfree(desc);
- return res;
+ return err;
}
/**
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
new file mode 100644
index 000000000000..7e2d0c7bdf0d
--- /dev/null
+++ b/fs/verity/read_metadata.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Ioctl to read verity metadata
+ *
+ * Copyright 2021 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/backing-dev.h>
+#include <linux/highmem.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+
+static int fsverity_read_merkle_tree(struct inode *inode,
+ const struct fsverity_info *vi,
+ void __user *buf, u64 offset, int length)
+{
+ const struct fsverity_operations *vops = inode->i_sb->s_vop;
+ u64 end_offset;
+ unsigned int offs_in_page;
+ pgoff_t index, last_index;
+ int retval = 0;
+ int err = 0;
+
+ end_offset = min(offset + length, vi->tree_params.tree_size);
+ if (offset >= end_offset)
+ return 0;
+ offs_in_page = offset_in_page(offset);
+ last_index = (end_offset - 1) >> PAGE_SHIFT;
+
+ /*
+ * Iterate through each Merkle tree page in the requested range and copy
+ * the requested portion to userspace. Note that the Merkle tree block
+ * size isn't important here, as we are returning a byte stream; i.e.,
+ * we can just work with pages even if the tree block size != PAGE_SIZE.
+ */
+ for (index = offset >> PAGE_SHIFT; index <= last_index; index++) {
+ unsigned long num_ra_pages =
+ min_t(unsigned long, last_index - index + 1,
+ inode->i_sb->s_bdi->io_pages);
+ unsigned int bytes_to_copy = min_t(u64, end_offset - offset,
+ PAGE_SIZE - offs_in_page);
+ struct page *page;
+ const void *virt;
+
+ page = vops->read_merkle_tree_page(inode, index, num_ra_pages);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ fsverity_err(inode,
+ "Error %d reading Merkle tree page %lu",
+ err, index);
+ break;
+ }
+
+ virt = kmap(page);
+ if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) {
+ kunmap(page);
+ put_page(page);
+ err = -EFAULT;
+ break;
+ }
+ kunmap(page);
+ put_page(page);
+
+ retval += bytes_to_copy;
+ buf += bytes_to_copy;
+ offset += bytes_to_copy;
+
+ if (fatal_signal_pending(current)) {
+ err = -EINTR;
+ break;
+ }
+ cond_resched();
+ offs_in_page = 0;
+ }
+ return retval ? retval : err;
+}
+
+/* Copy the requested portion of the buffer to userspace. */
+static int fsverity_read_buffer(void __user *dst, u64 offset, int length,
+ const void *src, size_t src_length)
+{
+ if (offset >= src_length)
+ return 0;
+ src += offset;
+ src_length -= offset;
+
+ length = min_t(size_t, length, src_length);
+
+ if (copy_to_user(dst, src, length))
+ return -EFAULT;
+
+ return length;
+}
+
+static int fsverity_read_descriptor(struct inode *inode,
+ void __user *buf, u64 offset, int length)
+{
+ struct fsverity_descriptor *desc;
+ size_t desc_size;
+ int res;
+
+ res = fsverity_get_descriptor(inode, &desc, &desc_size);
+ if (res)
+ return res;
+
+ /* don't include the signature */
+ desc_size = offsetof(struct fsverity_descriptor, signature);
+ desc->sig_size = 0;
+
+ res = fsverity_read_buffer(buf, offset, length, desc, desc_size);
+
+ kfree(desc);
+ return res;
+}
+
+static int fsverity_read_signature(struct inode *inode,
+ void __user *buf, u64 offset, int length)
+{
+ struct fsverity_descriptor *desc;
+ size_t desc_size;
+ int res;
+
+ res = fsverity_get_descriptor(inode, &desc, &desc_size);
+ if (res)
+ return res;
+
+ if (desc->sig_size == 0) {
+ res = -ENODATA;
+ goto out;
+ }
+
+ /*
+ * Include only the signature. Note that fsverity_get_descriptor()
+ * already verified that sig_size is in-bounds.
+ */
+ res = fsverity_read_buffer(buf, offset, length, desc->signature,
+ le32_to_cpu(desc->sig_size));
+out:
+ kfree(desc);
+ return res;
+}
+
+/**
+ * fsverity_ioctl_read_metadata() - read verity metadata from a file
+ * @filp: file to read the metadata from
+ * @uarg: user pointer to fsverity_read_metadata_arg
+ *
+ * Return: length read on success, 0 on EOF, -errno on failure
+ */
+int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg)
+{
+ struct inode *inode = file_inode(filp);
+ const struct fsverity_info *vi;
+ struct fsverity_read_metadata_arg arg;
+ int length;
+ void __user *buf;
+
+ vi = fsverity_get_info(inode);
+ if (!vi)
+ return -ENODATA; /* not a verity file */
+ /*
+ * Note that we don't have to explicitly check that the file is open for
+ * reading, since verity files can only be opened for reading.
+ */
+
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.__reserved)
+ return -EINVAL;
+
+ /* offset + length must not overflow. */
+ if (arg.offset + arg.length < arg.offset)
+ return -EINVAL;
+
+ /* Ensure that the return value will fit in INT_MAX. */
+ length = min_t(u64, arg.length, INT_MAX);
+
+ buf = u64_to_user_ptr(arg.buf_ptr);
+
+ switch (arg.metadata_type) {
+ case FS_VERITY_METADATA_TYPE_MERKLE_TREE:
+ return fsverity_read_merkle_tree(inode, vi, buf, arg.offset,
+ length);
+ case FS_VERITY_METADATA_TYPE_DESCRIPTOR:
+ return fsverity_read_descriptor(inode, buf, arg.offset, length);
+ case FS_VERITY_METADATA_TYPE_SIGNATURE:
+ return fsverity_read_signature(inode, buf, arg.offset, length);
+ default:
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL_GPL(fsverity_ioctl_read_metadata);
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
index 012468eda2a7..143a530a8008 100644
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@@ -29,21 +29,19 @@ static struct key *fsverity_keyring;
/**
* fsverity_verify_signature() - check a verity file's signature
* @vi: the file's fsverity_info
- * @desc: the file's fsverity_descriptor
- * @desc_size: size of @desc
+ * @signature: the file's built-in signature
+ * @sig_size: size of signature in bytes, or 0 if no signature
*
- * If the file's fs-verity descriptor includes a signature of the file digest,
- * verify it against the certificates in the fs-verity keyring.
+ * If the file includes a signature of its fs-verity file digest, verify it
+ * against the certificates in the fs-verity keyring.
*
* Return: 0 on success (signature valid or not required); -errno on failure
*/
int fsverity_verify_signature(const struct fsverity_info *vi,
- const struct fsverity_descriptor *desc,
- size_t desc_size)
+ const u8 *signature, size_t sig_size)
{
const struct inode *inode = vi->inode;
const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg;
- const u32 sig_size = le32_to_cpu(desc->sig_size);
struct fsverity_formatted_digest *d;
int err;
@@ -56,11 +54,6 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
return 0;
}
- if (sig_size > desc_size - sizeof(*desc)) {
- fsverity_err(inode, "Signature overflows verity descriptor");
- return -EBADMSG;
- }
-
d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL);
if (!d)
return -ENOMEM;
@@ -70,8 +63,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
memcpy(d->digest, vi->file_digest, hash_alg->digest_size);
err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size,
- desc->signature, sig_size,
- fsverity_keyring,
+ signature, sig_size, fsverity_keyring,
VERIFYING_UNSPECIFIED_SIGNATURE,
NULL, NULL);
kfree(d);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 7cb9f064ac64..0c623d3c1036 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2474,6 +2474,47 @@ xfs_defer_agfl_block(
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
}
+#ifdef DEBUG
+/*
+ * Check if an AGF has a free extent record whose length is equal to
+ * args->minlen.
+ */
+STATIC int
+xfs_exact_minlen_extent_available(
+ struct xfs_alloc_arg *args,
+ struct xfs_buf *agbp,
+ int *stat)
+{
+ struct xfs_btree_cur *cnt_cur;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error = 0;
+
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
+ args->agno, XFS_BTNUM_CNT);
+ error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
+ if (error)
+ goto out;
+
+ if (*stat == 0) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
+ if (error)
+ goto out;
+
+ if (*stat == 1 && flen != args->minlen)
+ *stat = 0;
+
+out:
+ xfs_btree_del_cursor(cnt_cur, error);
+
+ return error;
+}
+#endif
+
/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
@@ -2545,6 +2586,15 @@ xfs_alloc_fix_freelist(
if (!xfs_alloc_space_available(args, need, flags))
goto out_agbp_relse;
+#ifdef DEBUG
+ if (args->alloc_minlen_only) {
+ int stat;
+
+ error = xfs_exact_minlen_extent_available(args, agbp, &stat);
+ if (error || !stat)
+ goto out_agbp_relse;
+ }
+#endif
/*
* Make the freelist shorter if it's too long.
*
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6c22b12176b8..a4427c5775c2 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -75,6 +75,9 @@ typedef struct xfs_alloc_arg {
char wasfromfl; /* set if allocation is from freelist */
struct xfs_owner_info oinfo; /* owner of blocks being allocated */
enum xfs_ag_resv_type resv; /* block reservation to use */
+#ifdef DEBUG
+ bool alloc_minlen_only; /* allocate exact minlen extent */
+#endif
} xfs_alloc_arg_t;
/*
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index fd8e6418a0d3..472b3039eabb 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -396,6 +396,7 @@ xfs_attr_set(
struct xfs_trans_res tres;
bool rsvd = (args->attr_filter & XFS_ATTR_ROOT);
int error, local;
+ int rmt_blks = 0;
unsigned int total;
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
@@ -442,34 +443,33 @@ xfs_attr_set(
tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
total = args->total;
+
+ if (!local)
+ rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
} else {
XFS_STATS_INC(mp, xs_attr_remove);
tres = M_RES(mp)->tr_attrrm;
total = XFS_ATTRRM_SPACE_RES(mp);
+ rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
}
/*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
- error = xfs_trans_alloc(mp, &tres, total, 0,
- rsvd ? XFS_TRANS_RESERVE : 0, &args->trans);
+ error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
if (error)
return error;
- xfs_ilock(dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(args->trans, dp, 0);
- if (args->value) {
- unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS;
-
- if (rsvd)
- quota_flags |= XFS_QMOPT_FORCE_RES;
- error = xfs_trans_reserve_quota_nblks(args->trans, dp,
- args->total, 0, quota_flags);
+ if (args->value || xfs_inode_hasattr(dp)) {
+ error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error)
goto out_trans_cancel;
+ }
+ if (args->value) {
error = xfs_has_attr(args);
if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
goto out_trans_cancel;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index bc446418e227..e0905ad171f0 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1079,21 +1079,13 @@ xfs_bmap_add_attrfork(
blks = XFS_ADDAFORK_SPACE_RES(mp);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0,
- rsvd ? XFS_TRANS_RESERVE : 0, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_addafork, blks, 0,
+ rsvd, &tp);
if (error)
return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
- XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto trans_cancel;
if (XFS_IFORK_Q(ip))
goto trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_bmap_set_attrforkoff(ip, size, &version);
if (error)
@@ -3463,34 +3455,16 @@ xfs_bmap_btalloc_accounting(
args->len);
}
-STATIC int
-xfs_bmap_btalloc(
- struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+static int
+xfs_bmap_compute_alignments(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args)
{
- xfs_mount_t *mp; /* mount point structure */
- xfs_alloctype_t atype = 0; /* type for allocation routines */
- xfs_extlen_t align = 0; /* minimum allocation alignment */
- xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
- xfs_agnumber_t ag;
- xfs_alloc_arg_t args;
- xfs_fileoff_t orig_offset;
- xfs_extlen_t orig_length;
- xfs_extlen_t blen;
- xfs_extlen_t nextminlen = 0;
- int nullfb; /* true if ap->firstblock isn't set */
- int isaligned;
- int tryagain;
- int error;
- int stripe_align;
-
- ASSERT(ap->length);
- orig_offset = ap->offset;
- orig_length = ap->length;
-
- mp = ap->ip->i_mount;
+ struct xfs_mount *mp = args->mp;
+ xfs_extlen_t align = 0; /* minimum allocation alignment */
+ int stripe_align = 0;
/* stripe alignment for allocation is determined by mount parameters */
- stripe_align = 0;
if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
stripe_align = mp->m_swidth;
else if (mp->m_dalign)
@@ -3501,13 +3475,171 @@ xfs_bmap_btalloc(
else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
if (align) {
- error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
- align, 0, ap->eof, 0, ap->conv,
- &ap->offset, &ap->length);
- ASSERT(!error);
+ if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
+ ap->eof, 0, ap->conv, &ap->offset,
+ &ap->length))
+ ASSERT(0);
ASSERT(ap->length);
}
+ /* apply extent size hints if obtained earlier */
+ if (align) {
+ args->prod = align;
+ div_u64_rem(ap->offset, args->prod, &args->mod);
+ if (args->mod)
+ args->mod = args->prod - args->mod;
+ } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
+ args->prod = 1;
+ args->mod = 0;
+ } else {
+ args->prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
+ div_u64_rem(ap->offset, args->prod, &args->mod);
+ if (args->mod)
+ args->mod = args->prod - args->mod;
+ }
+
+ return stripe_align;
+}
+
+static void
+xfs_bmap_process_allocated_extent( /* copy a successful allocation into *ap */
+ struct xfs_bmalloca *ap, /* request; blkno/length/offset updated */
+ struct xfs_alloc_arg *args, /* allocation result (fsbno, len) */
+ xfs_fileoff_t orig_offset, /* ap->offset saved by the caller */
+ xfs_extlen_t orig_length) /* ap->length saved by the caller */
+{
+ int nullfb; /* true if tp->t_firstblock isn't set */
+
+ nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
+
+ /*
+ * check the allocation happened at the same or higher AG than
+ * the first block that was allocated.
+ */
+ ASSERT(nullfb ||
+ XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <=
+ XFS_FSB_TO_AGNO(args->mp, args->fsbno));
+
+ ap->blkno = args->fsbno;
+ if (nullfb) /* record this block as the transaction's first block */
+ ap->tp->t_firstblock = args->fsbno;
+ ap->length = args->len;
+ /*
+ * If the extent size hint is active, we tried to round the
+ * caller's allocation request offset down to extsz and the
+ * length up to another extsz boundary. If we found a free
+ * extent we mapped it in starting at this new offset. If the
+ * newly mapped space isn't long enough to cover any of the
+ * range of offsets that was originally requested, move the
+ * mapping up so that we can fill as much of the caller's
+ * original request as possible. Free space is apparently
+ * very fragmented so we're unlikely to be able to satisfy the
+ * hints anyway.
+ */
+ if (ap->length <= orig_length)
+ ap->offset = orig_offset;
+ else if (ap->offset + ap->length < orig_offset + orig_length)
+ ap->offset = orig_offset + orig_length - ap->length;
+ xfs_bmap_btalloc_accounting(ap, args);
+}
+
+#ifdef DEBUG
+static int /* 0, or the error returned by xfs_alloc_vextent() */
+xfs_bmap_exact_minlen_extent_alloc( /* DEBUG only: allocate exactly ap->minlen */
+ struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ int error;
+
+ ASSERT(ap->length);
+
+ if (ap->minlen != 1) { /* only minlen == 1 is served; report no allocation */
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ return 0;
+ }
+
+ orig_offset = ap->offset; /* saved: compute_alignments may change ap */
+ orig_length = ap->length;
+
+ args.alloc_minlen_only = 1;
+
+ xfs_bmap_compute_alignments(ap, &args); /* returned stripe align unused */
+
+ if (ap->tp->t_firstblock == NULLFSBLOCK) {
+ /*
+ * Unlike the longest extent available in an AG, we don't track
+ * the length of an AG's shortest extent.
+ * XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT is a debug only knob and
+ * hence we can afford to start traversing from the 0th AG since
+ * we need not be concerned about a drop in performance in
+ * "debug only" code paths.
+ */
+ ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0);
+ } else {
+ ap->blkno = ap->tp->t_firstblock;
+ }
+
+ args.fsbno = ap->blkno;
+ args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.total = args.minlen = args.maxlen = ap->minlen; /* exactly minlen */
+
+ args.alignment = 1;
+ args.minalignslop = 0;
+
+ args.minleft = ap->minleft;
+ args.wasdel = ap->wasdel;
+ args.resv = XFS_AG_RESV_NONE;
+ args.datatype = ap->datatype;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ if (args.fsbno != NULLFSBLOCK) {
+ xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
+ orig_length);
+ } else { /* nothing was allocated; not treated as an error */
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ }
+
+ return 0;
+}
+#else
+
+#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED)
+
+#endif
+
+STATIC int
+xfs_bmap_btalloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp };
+ xfs_alloctype_t atype = 0;
+ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
+ xfs_agnumber_t ag;
+ xfs_fileoff_t orig_offset;
+ xfs_extlen_t orig_length;
+ xfs_extlen_t blen;
+ xfs_extlen_t nextminlen = 0;
+ int nullfb; /* true if ap->firstblock isn't set */
+ int isaligned;
+ int tryagain;
+ int error;
+ int stripe_align;
+
+ ASSERT(ap->length);
+ orig_offset = ap->offset;
+ orig_length = ap->length;
+
+ stripe_align = xfs_bmap_compute_alignments(ap, &args);
nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
@@ -3538,9 +3670,6 @@ xfs_bmap_btalloc(
* Normal allocation, done through xfs_alloc_vextent.
*/
tryagain = isaligned = 0;
- memset(&args, 0, sizeof(args));
- args.tp = ap->tp;
- args.mp = mp;
args.fsbno = ap->blkno;
args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
@@ -3571,21 +3700,7 @@ xfs_bmap_btalloc(
args.total = ap->total;
args.minlen = ap->minlen;
}
- /* apply extent size hints if obtained earlier */
- if (align) {
- args.prod = align;
- div_u64_rem(ap->offset, args.prod, &args.mod);
- if (args.mod)
- args.mod = args.prod - args.mod;
- } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
- args.prod = 1;
- args.mod = 0;
- } else {
- args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
- div_u64_rem(ap->offset, args.prod, &args.mod);
- if (args.mod)
- args.mod = args.prod - args.mod;
- }
+
/*
* If we are not low on available data blocks, and the underlying
* logical volume manager is a stripe, and the file offset is zero then
@@ -3687,37 +3802,10 @@ xfs_bmap_btalloc(
return error;
ap->tp->t_flags |= XFS_TRANS_LOWMODE;
}
+
if (args.fsbno != NULLFSBLOCK) {
- /*
- * check the allocation happened at the same or higher AG than
- * the first block that was allocated.
- */
- ASSERT(ap->tp->t_firstblock == NULLFSBLOCK ||
- XFS_FSB_TO_AGNO(mp, ap->tp->t_firstblock) <=
- XFS_FSB_TO_AGNO(mp, args.fsbno));
-
- ap->blkno = args.fsbno;
- if (ap->tp->t_firstblock == NULLFSBLOCK)
- ap->tp->t_firstblock = args.fsbno;
- ASSERT(nullfb || fb_agno <= args.agno);
- ap->length = args.len;
- /*
- * If the extent size hint is active, we tried to round the
- * caller's allocation request offset down to extsz and the
- * length up to another extsz boundary. If we found a free
- * extent we mapped it in starting at this new offset. If the
- * newly mapped space isn't long enough to cover any of the
- * range of offsets that was originally requested, move the
- * mapping up so that we can fill as much of the caller's
- * original request as possible. Free space is apparently
- * very fragmented so we're unlikely to be able to satisfy the
- * hints anyway.
- */
- if (ap->length <= orig_length)
- ap->offset = orig_offset;
- else if (ap->offset + ap->length < orig_offset + orig_length)
- ap->offset = orig_offset + orig_length - ap->length;
- xfs_bmap_btalloc_accounting(ap, &args);
+ xfs_bmap_process_allocated_extent(ap, &args, orig_offset,
+ orig_length);
} else {
ap->blkno = NULLFSBLOCK;
ap->length = 0;
@@ -4001,8 +4089,7 @@ xfs_bmapi_reserve_delalloc(
* blocks. This number gets adjusted later. We return if we haven't
* allocated blocks already inside this loop.
*/
- error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0,
- XFS_QMOPT_RES_REGBLKS);
+ error = xfs_quota_reserve_blkres(ip, alen);
if (error)
return error;
@@ -4048,8 +4135,7 @@ out_unreserve_blocks:
xfs_mod_fdblocks(mp, alen, false);
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
- xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0,
- XFS_QMOPT_RES_REGBLKS);
+ xfs_quota_unreserve_blkres(ip, alen);
return error;
}
@@ -4083,6 +4169,10 @@ xfs_bmap_alloc_userdata(
return xfs_bmap_rtalloc(bma);
}
+ if (unlikely(XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ return xfs_bmap_exact_minlen_extent_alloc(bma);
+
return xfs_bmap_btalloc(bma);
}
@@ -4119,10 +4209,15 @@ xfs_bmapi_allocate(
else
bma->minlen = 1;
- if (bma->flags & XFS_BMAPI_METADATA)
- error = xfs_bmap_btalloc(bma);
- else
+ if (bma->flags & XFS_BMAPI_METADATA) {
+ if (unlikely(XFS_TEST_ERROR(false, mp,
+ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ error = xfs_bmap_exact_minlen_extent_alloc(bma);
+ else
+ error = xfs_bmap_btalloc(bma);
+ } else {
error = xfs_bmap_alloc_userdata(bma);
+ }
if (error || bma->blkno == NULLFSBLOCK)
return error;
@@ -4527,6 +4622,12 @@ xfs_bmapi_convert_delalloc(
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ error = xfs_iext_count_may_overflow(ip, whichfork,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto out_trans_cancel;
+
xfs_trans_ijoin(tp, ip, 0);
if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
@@ -4826,9 +4927,8 @@ xfs_bmap_del_extent_delay(
* sb counters as we might have to borrow some blocks for the
* indirect block accounting.
*/
- error = xfs_trans_reserve_quota_nblks(NULL, ip,
- -((long)del->br_blockcount), 0,
- isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ ASSERT(!isrt);
+ error = xfs_quota_unreserve_blkres(ip, del->br_blockcount);
if (error)
return error;
ip->i_delayed_blks -= del->br_blockcount;
@@ -5145,6 +5245,27 @@ xfs_bmap_del_extent_real(
/*
* Deleting the middle of the extent.
*/
+
+ /*
+ * For directories, -ENOSPC is returned since a directory entry
+ * remove operation must not fail due to low extent count
+ * availability. -ENOSPC will be handled by higher layers of XFS
+ * by letting the corresponding empty Data/Free blocks linger
+ * until a future remove operation. Dabtree blocks would be
+ * swapped with the last block in the leaf space and then the
+ * new last block will be unmapped.
+ *
+ * The above logic also applies to the source directory entry of
+ * a rename operation.
+ */
+ error = xfs_iext_count_may_overflow(ip, whichfork, 1);
+ if (error) {
+ ASSERT(S_ISDIR(VFS_I(ip)->i_mode) &&
+ whichfork == XFS_DATA_FORK);
+ error = -ENOSPC;
+ goto done;
+ }
+
old = got;
got.br_blockcount = del->br_startoff - got.br_startoff;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c4d7a9241dc3..b56ff451adce 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -353,20 +353,17 @@ xfs_btree_free_block(
*/
void
xfs_btree_del_cursor(
- xfs_btree_cur_t *cur, /* btree cursor */
- int error) /* del because of error */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int error) /* del because of error */
{
- int i; /* btree level */
+ int i; /* btree level */
/*
- * Clear the buffer pointers, and release the buffers.
- * If we're doing this in the face of an error, we
- * need to make sure to inspect all of the entries
- * in the bc_bufs array for buffers to be unlocked.
- * This is because some of the btree code works from
- * level n down to 0, and if we get an error along
- * the way we won't have initialized all the entries
- * down to 0.
+ * Clear the buffer pointers and release the buffers. If we're doing
+ * this because of an error, inspect all of the entries in the bc_bufs
+ * array for buffers to be unlocked. This is because some of the btree
+ * code works from level n down to 0, and if we get an error along the
+ * way we won't have initialized all the entries down to 0.
*/
for (i = 0; i < cur->bc_nlevels; i++) {
if (cur->bc_bufs[i])
@@ -374,17 +371,11 @@ xfs_btree_del_cursor(
else if (!error)
break;
}
- /*
- * Can't free a bmap cursor without having dealt with the
- * allocated indirect blocks' accounting.
- */
- ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
- cur->bc_ino.allocated == 0);
- /*
- * Free the cursor.
- */
+
+ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
+ XFS_FORCED_SHUTDOWN(cur->bc_mp));
if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
- kmem_free((void *)cur->bc_ops);
+ kmem_free(cur->bc_ops);
kmem_cache_free(xfs_btree_cur_zone, cur);
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index e55378640b05..d03e6098ded9 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -47,8 +47,6 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
-extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp,
- xfs_ino_t inum);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 2463b5d73447..8c4f76bba88b 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -1018,7 +1018,7 @@ xfs_dir2_sf_removename(
/*
* Check whether the sf dir replace operation need more blocks.
*/
-bool
+static bool
xfs_dir2_sf_replace_needblock(
struct xfs_inode *dp,
xfs_ino_t inum)
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 53b305dea381..6ca9084b6934 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -56,7 +56,9 @@
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
#define XFS_ERRTAG_IUNLINK_FALLBACK 34
#define XFS_ERRTAG_BUF_IOERROR 35
-#define XFS_ERRTAG_MAX 36
+#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36
+#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37
+#define XFS_ERRTAG_MAX 38
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -97,5 +99,7 @@
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
+#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 2a2e3cfd94f0..6fad140d4c8e 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -250,6 +250,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
+#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
/*
* Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 7575de5cecb1..e080d7e07643 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -23,6 +23,8 @@
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr_leaf.h"
+#include "xfs_types.h"
+#include "xfs_errortag.h"
kmem_zone_t *xfs_ifork_zone;
@@ -728,3 +730,28 @@ xfs_ifork_verify_local_attr(
return 0;
}
+
+int /* 0 if nr_to_add more extents fit in the fork, else -EFBIG */
+xfs_iext_count_may_overflow(
+ struct xfs_inode *ip,
+ int whichfork, /* XFS_DATA_FORK, XFS_ATTR_FORK or XFS_COW_FORK */
+ int nr_to_add) /* worst-case extent count increase */
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ uint64_t max_exts; /* extent count limit for this fork */
+ uint64_t nr_exts; /* extent count after the addition */
+
+ if (whichfork == XFS_COW_FORK) /* the CoW fork is never limited here */
+ return 0;
+
+ max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM;
+
+ if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ max_exts = 10; /* error injection: artificially low limit */
+
+ nr_exts = ifp->if_nextents + nr_to_add;
+ if (nr_exts < ifp->if_nextents || nr_exts > max_exts) /* wrapped or too big */
+ return -EFBIG;
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index a4953e95c4f3..9e2137cd7372 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -35,6 +35,67 @@ struct xfs_ifork {
#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
/*
+ * Worst-case increase in the fork extent count when we're adding a single
+ * extent to a fork and there's no possibility of splitting an existing mapping.
+ */
+#define XFS_IEXT_ADD_NOSPLIT_CNT (1)
+
+/*
+ * Punching out an extent from the middle of an existing extent can cause the
+ * extent count to increase by 1.
+ * i.e. | Old extent | Hole | Old extent |
+ */
+#define XFS_IEXT_PUNCH_HOLE_CNT (1)
+
+/*
+ * Directory entry addition can cause the following,
+ * 1. Data block can be added/removed.
+ * A new extent can cause extent count to increase by 1.
+ * 2. Free disk block can be added/removed.
+ * Same behaviour as described above for Data block.
+ * 3. Dabtree blocks.
+ * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new
+ * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH.
+ */
+#define XFS_IEXT_DIR_MANIP_CNT(mp) \
+ ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount)
+
+/*
+ * Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
+ * be added. One extra extent for dabtree in case a local attr is
+ * large enough to cause a double split. It can also cause extent
+ * count to increase proportional to the size of a remote xattr's
+ * value.
+ */
+#define XFS_IEXT_ATTR_MANIP_CNT(rmt_blks) \
+ (XFS_DA_NODE_MAXDEPTH + max(1, rmt_blks))
+
+/*
+ * A write to a sub-interval of an existing unwritten extent causes the original
+ * extent to be split into 3 extents
+ * i.e. | Unwritten | Real | Unwritten |
+ * Hence extent count can increase by 2.
+ */
+#define XFS_IEXT_WRITE_UNWRITTEN_CNT (2)
+
+
+/*
+ * Moving an extent to data fork can cause a sub-interval of an existing extent
+ * to be unmapped. This will increase extent count by 1. Mapping in the new
+ * extent can increase the extent count by 1 again i.e.
+ * | Old extent | New extent | Old extent |
+ * Hence number of extents increases by 2.
+ */
+#define XFS_IEXT_REFLINK_END_COW_CNT (2)
+
+/*
+ * Removing an initial range of source/donor file's extent and adding a new
+ * extent (from donor/source file) in its place will cause extent count to
+ * increase by 1.
+ */
+#define XFS_IEXT_SWAP_RMAP_CNT (1)
+
+/*
* Fork handling.
*/
@@ -172,5 +233,7 @@ extern void xfs_ifork_init_cow(struct xfs_inode *ip);
int xfs_ifork_verify_local_data(struct xfs_inode *ip);
int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
+int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
+ int nr_to_add);
#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index bbda117e5d85..60e6d255e5e2 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1138,6 +1138,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK;
if (xfs_sb_version_hasbigtime(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
+ if (xfs_sb_version_hasinobtcounts(sbp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT;
if (xfs_sb_version_hassector(sbp))
geo->logsectsize = sbp->sb_logsectsize;
else
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 8ea6d4aa3f55..53456f3de881 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -888,7 +888,7 @@ xchk_stop_reaping(
struct xfs_scrub *sc)
{
sc->flags |= XCHK_REAPING_DISABLED;
- xfs_stop_block_reaping(sc->mp);
+ xfs_blockgc_stop(sc->mp);
}
/* Restart background reaping of resources. */
@@ -896,6 +896,6 @@ void
xchk_start_reaping(
struct xfs_scrub *sc)
{
- xfs_start_block_reaping(sc->mp);
+ xfs_blockgc_start(sc->mp);
sc->flags &= ~XCHK_REAPING_DISABLED;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 93e4d8ae6e92..2344757ede63 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -471,6 +471,7 @@ xfs_bui_item_recover(
xfs_exntst_t state;
unsigned int bui_type;
int whichfork;
+ int iext_delta;
int error = 0;
if (!xfs_bui_validate(mp, buip)) {
@@ -508,6 +509,15 @@ xfs_bui_item_recover(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ if (bui_type == XFS_BMAP_MAP)
+ iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
+ else
+ iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
+
+ error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+ if (error)
+ goto err_cancel;
+
count = bmap->me_len;
error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
whichfork, bmap->me_startoff, bmap->me_startblock,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 7371a7f7c652..e7d68318e6a5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -727,11 +727,9 @@ xfs_alloc_file_space(
xfs_fileoff_t startoffset_fsb;
xfs_fileoff_t endoffset_fsb;
int nimaps;
- int quota_flag;
int rt;
xfs_trans_t *tp;
xfs_bmbt_irec_t imaps[1], *imapp;
- uint qblocks, resblks, resrtextents;
int error;
trace_xfs_alloc_file_space(ip);
@@ -761,6 +759,7 @@ xfs_alloc_file_space(
*/
while (allocatesize_fsb && !error) {
xfs_fileoff_t s, e;
+ unsigned int dblocks, rblocks, resblks;
/*
* Determine space reservations for data/realtime.
@@ -790,45 +789,31 @@ xfs_alloc_file_space(
*/
resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
if (unlikely(rt)) {
- resrtextents = qblocks = resblks;
- resrtextents /= mp->m_sb.sb_rextsize;
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- quota_flag = XFS_QMOPT_RES_RTBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resblks;
} else {
- resrtextents = 0;
- resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
- quota_flag = XFS_QMOPT_RES_REGBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+ rblocks = 0;
}
/*
* Allocate and setup the transaction.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
- resrtextents, 0, &tp);
-
- /*
- * Check for running out of space
- */
- if (error) {
- /*
- * Free the transaction structure.
- */
- ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- break;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
- 0, quota_flag);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ dblocks, rblocks, false, &tp);
if (error)
- goto error1;
+ break;
- xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto error;
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
allocatesize_fsb, alloc_type, 0, imapp,
&nimaps);
if (error)
- goto error0;
+ goto error;
/*
* Complete the transaction
@@ -851,10 +836,7 @@ xfs_alloc_file_space(
return error;
-error0: /* unlock inode, unreserve quota blocks, cancel trans */
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
-
-error1: /* Just cancel transaction */
+error:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -872,20 +854,16 @@ xfs_unmap_extent(
uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
int error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
- if (error) {
- ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
+ false, &tp);
+ if (error)
return error;
- }
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
- ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
-
error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done);
if (error)
goto out_trans_cancel;
@@ -1163,6 +1141,11 @@ xfs_insert_file_space(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error)
+ goto out_trans_cancel;
+
/*
* The extent shifting code works on extent granularity. So, if stop_fsb
* is not the starting block of extent, we need to split the extent at
@@ -1384,6 +1367,22 @@ xfs_swap_extent_rmap(
irec.br_blockcount);
trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
+ if (xfs_bmap_is_real_extent(&uirec)) {
+ error = xfs_iext_count_may_overflow(ip,
+ XFS_DATA_FORK,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error)
+ goto out;
+ }
+
+ if (xfs_bmap_is_real_extent(&irec)) {
+ error = xfs_iext_count_may_overflow(tip,
+ XFS_DATA_FORK,
+ XFS_IEXT_SWAP_RMAP_CNT);
+ if (error)
+ goto out;
+ }
+
/* Remove the mapping from the donor file. */
xfs_bmap_unmap_extent(tp, tip, &uirec);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f8400bbd6473..f6e5235df7c9 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -43,7 +43,7 @@ static kmem_zone_t *xfs_buf_zone;
* pag_buf_lock
* lru_lock
*
- * xfs_buftarg_wait_rele
+ * xfs_buftarg_drain_rele
* lru_lock
* b_lock (trylock due to inversion)
*
@@ -88,7 +88,7 @@ xfs_buf_vmap_len(
* because the corresponding decrement is deferred to buffer release. Buffers
* can undergo I/O multiple times in a hold-release cycle and per buffer I/O
* tracking adds unnecessary overhead. This is used for sychronization purposes
- * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
+ * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
* in-flight buffers.
*
* Buffers that are never released (e.g., superblock, iclog buffers) must set
@@ -1786,7 +1786,7 @@ __xfs_buf_mark_corrupt(
* while freeing all the buffers only held by the LRU.
*/
static enum lru_status
-xfs_buftarg_wait_rele(
+xfs_buftarg_drain_rele(
struct list_head *item,
struct list_lru_one *lru,
spinlock_t *lru_lock,
@@ -1798,7 +1798,7 @@ xfs_buftarg_wait_rele(
if (atomic_read(&bp->b_hold) > 1) {
/* need to wait, so skip it this pass */
- trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+ trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
if (!spin_trylock(&bp->b_lock))
@@ -1815,14 +1815,13 @@ xfs_buftarg_wait_rele(
return LRU_REMOVED;
}
+/*
+ * Wait for outstanding I/O on the buftarg to complete.
+ */
void
-xfs_wait_buftarg(
+xfs_buftarg_wait(
struct xfs_buftarg *btp)
{
- LIST_HEAD(dispose);
- int loop = 0;
- bool write_fail = false;
-
/*
* First wait on the buftarg I/O count for all in-flight buffers to be
* released. This is critical as new buffers do not make the LRU until
@@ -1838,10 +1837,21 @@ xfs_wait_buftarg(
while (percpu_counter_sum(&btp->bt_io_count))
delay(100);
flush_workqueue(btp->bt_mount->m_buf_workqueue);
+}
+
+void
+xfs_buftarg_drain(
+ struct xfs_buftarg *btp)
+{
+ LIST_HEAD(dispose);
+ int loop = 0;
+ bool write_fail = false;
+
+ xfs_buftarg_wait(btp);
/* loop until there is nothing left on the lru list. */
while (list_lru_count(&btp->bt_lru)) {
- list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+ list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
&dispose, LONG_MAX);
while (!list_empty(&dispose)) {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 5d91a31298a4..459ca34f26f5 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -152,7 +152,7 @@ struct xfs_buf {
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
struct xfs_mount *b_mount;
- xfs_buftarg_t *b_target; /* buffer target (device) */
+ struct xfs_buftarg *b_target; /* buffer target (device) */
void *b_addr; /* virtual address of buffer */
struct work_struct b_ioend_work;
struct completion b_iowait; /* queue for I/O waiters */
@@ -344,11 +344,12 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
/*
* Handling of buftargs.
*/
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
- struct block_device *, struct dax_device *);
+extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *,
+ struct block_device *, struct dax_device *);
extern void xfs_free_buftarg(struct xfs_buftarg *);
-extern void xfs_wait_buftarg(xfs_buftarg_t *);
-extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
+extern void xfs_buftarg_wait(struct xfs_buftarg *);
+extern void xfs_buftarg_drain(struct xfs_buftarg *);
+extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1d95ed387d66..bd8379b98374 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -314,8 +314,14 @@ xfs_dquot_disk_alloc(
return -ESRCH;
}
- /* Create the block mapping. */
xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
+
+ error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ return error;
+
+ /* Create the block mapping. */
error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
&nmaps);
@@ -500,6 +506,42 @@ xfs_dquot_alloc(
return dqp;
}
+/* Check the ondisk dquot's id and type match what the incore dquot expects. */
+static bool /* true if id and type are acceptable, false otherwise */
+xfs_dquot_check_type(
+ struct xfs_dquot *dqp, /* incore dquot */
+ struct xfs_disk_dquot *ddqp) /* ondisk dquot being loaded */
+{
+ uint8_t ddqp_type; /* ondisk d_type masked with XFS_DQTYPE_REC_MASK */
+ uint8_t dqp_type; /* type reported by xfs_dquot_type(dqp) */
+
+ ddqp_type = ddqp->d_type & XFS_DQTYPE_REC_MASK;
+ dqp_type = xfs_dquot_type(dqp);
+
+ if (be32_to_cpu(ddqp->d_id) != dqp->q_id) /* the id must always match */
+ return false;
+
+ /*
+ * V5 filesystems always expect an exact type match. V4 filesystems
+ * expect an exact match for user dquots and for non-root group and
+ * project dquots.
+ */
+ if (xfs_sb_version_hascrc(&dqp->q_mount->m_sb) ||
+ dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0)
+ return ddqp_type == dqp_type;
+
+ /*
+ * V4 filesystems support either group or project quotas, but not both
+ * at the same time. The non-user quota file can be switched between
+ * group and project quota uses depending on the mount options, which
+ * means that we can encounter the other type when we try to load quota
+ * defaults. Quotacheck will soon reset the entire quota file
+ * (including the root dquot) anyway, but don't log scary corruption
+ * reports to dmesg.
+ */
+ return ddqp_type == XFS_DQTYPE_GROUP || ddqp_type == XFS_DQTYPE_PROJ;
+}
+
/* Copy the in-core quota fields in from the on-disk buffer. */
STATIC int
xfs_dquot_from_disk(
@@ -512,8 +554,7 @@ xfs_dquot_from_disk(
* Ensure that we got the type and ID we were looking for.
* Everything else was checked by the dquot buffer verifier.
*/
- if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) ||
- be32_to_cpu(ddqp->d_id) != dqp->q_id) {
+ if (!xfs_dquot_check_type(dqp, ddqp)) {
xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
"Metadata corruption detected at %pS, quota %u",
__this_address, dqp->q_id);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 7f6e20899473..185b4915b7bf 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -54,6 +54,8 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_FORCE_SUMMARY_RECALC,
XFS_RANDOM_IUNLINK_FALLBACK,
XFS_RANDOM_BUF_IOERROR,
+ XFS_RANDOM_REDUCE_MAX_IEXTENTS,
+ XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
};
struct xfs_errortag_attr {
@@ -164,6 +166,8 @@ XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
+XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
+XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -202,6 +206,8 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(bad_summary),
XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
+ XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
+ XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
NULL,
};
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1bdc3560aed9..a007ca0711d9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -119,6 +119,54 @@ xfs_dir_fsync(
return xfs_log_force_inode(ip);
}
+static xfs_lsn_t
+xfs_fsync_lsn(
+ struct xfs_inode *ip,
+ bool datasync)
+{
+ if (!xfs_ipincount(ip))
+ return 0;
+ if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ return 0;
+ return ip->i_itemp->ili_last_lsn;
+}
+
+/*
+ * All metadata updates are logged, which means that we just have to flush the
+ * log up to the latest LSN that touched the inode.
+ *
+ * If we have concurrent fsync/fdatasync() calls, we need them to all block on
+ * the log force before we clear the ili_fsync_fields field. This ensures that
+ * we don't get a racing sync operation that does not wait for the metadata to
+ * hit the journal before returning. If we race with clearing ili_fsync_fields,
+ * then all that will happen is the log force will do nothing as the lsn will
+ * already be on disk. We can't race with setting ili_fsync_fields because that
+ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
+ * shared until after the ili_fsync_fields is cleared.
+ */
+static int
+xfs_fsync_flush_log(
+ struct xfs_inode *ip,
+ bool datasync,
+ int *log_flushed)
+{
+ int error = 0;
+ xfs_lsn_t lsn;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ lsn = xfs_fsync_lsn(ip, datasync);
+ if (lsn) {
+ error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC,
+ log_flushed);
+
+ spin_lock(&ip->i_itemp->ili_lock);
+ ip->i_itemp->ili_fsync_fields = 0;
+ spin_unlock(&ip->i_itemp->ili_lock);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
STATIC int
xfs_file_fsync(
struct file *file,
@@ -126,13 +174,10 @@ xfs_file_fsync(
loff_t end,
int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
int error = 0;
int log_flushed = 0;
- xfs_lsn_t lsn = 0;
trace_xfs_file_fsync(ip);
@@ -157,32 +202,13 @@ xfs_file_fsync(
xfs_blkdev_issue_flush(mp->m_ddev_targp);
/*
- * All metadata updates are logged, which means that we just have to
- * flush the log up to the latest LSN that touched the inode. If we have
- * concurrent fsync/fdatasync() calls, we need them to all block on the
- * log force before we clear the ili_fsync_fields field. This ensures
- * that we don't get a racing sync operation that does not wait for the
- * metadata to hit the journal before returning. If we race with
- * clearing the ili_fsync_fields, then all that will happen is the log
- * force will do nothing as the lsn will already be on disk. We can't
- * race with setting ili_fsync_fields because that is done under
- * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
- * until after the ili_fsync_fields is cleared.
+ * Any inode that has dirty modifications in the log is pinned. The
+ * racy check here for a pinned inode will not catch modifications
+ * that happen concurrently with the fsync call, but fsync semantics
+ * only require syncing previously completed I/O.
*/
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip)) {
- if (!datasync ||
- (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- lsn = iip->ili_last_lsn;
- }
-
- if (lsn) {
- error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
- spin_lock(&iip->ili_lock);
- iip->ili_fsync_fields = 0;
- spin_unlock(&iip->ili_lock);
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
/*
* If we only have a single device, and the log force above was
@@ -198,30 +224,42 @@ xfs_file_fsync(
return error;
}
+static int
+xfs_ilock_iocb(
+ struct kiocb *iocb,
+ unsigned int lock_mode)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, lock_mode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, lock_mode);
+ }
+
+ return 0;
+}
+
STATIC ssize_t
-xfs_file_dio_aio_read(
+xfs_file_dio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
- size_t count = iov_iter_count(to);
ssize_t ret;
- trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
+ trace_xfs_file_direct_read(iocb, to);
- if (!count)
+ if (!iov_iter_count(to))
return 0; /* skip atime */
file_accessed(iocb->ki_filp);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
- ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
- is_sync_kiocb(iocb));
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -233,21 +271,16 @@ xfs_file_dax_read(
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
- size_t count = iov_iter_count(to);
ssize_t ret = 0;
- trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+ trace_xfs_file_dax_read(iocb, to);
- if (!count)
+ if (!iov_iter_count(to))
return 0; /* skip atime */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
-
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -256,21 +289,18 @@ xfs_file_dax_read(
}
STATIC ssize_t
-xfs_file_buffered_aio_read(
+xfs_file_buffered_read(
struct kiocb *iocb,
struct iov_iter *to)
{
struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
ssize_t ret;
- trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+ trace_xfs_file_buffered_read(iocb, to);
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
+ ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ if (ret)
+ return ret;
ret = generic_file_read_iter(iocb, to);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -294,9 +324,9 @@ xfs_file_read_iter(
if (IS_DAX(inode))
ret = xfs_file_dax_read(iocb, to);
else if (iocb->ki_flags & IOCB_DIRECT)
- ret = xfs_file_dio_aio_read(iocb, to);
+ ret = xfs_file_dio_read(iocb, to);
else
- ret = xfs_file_buffered_aio_read(iocb, to);
+ ret = xfs_file_buffered_read(iocb, to);
if (ret > 0)
XFS_STATS_ADD(mp, xs_read_bytes, ret);
@@ -311,7 +341,7 @@ xfs_file_read_iter(
* if called for a direct write beyond i_size.
*/
STATIC ssize_t
-xfs_file_aio_write_checks(
+xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
int *iolock)
@@ -329,7 +359,14 @@ restart:
if (error <= 0)
return error;
- error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ error = break_layout(inode, false);
+ if (error == -EWOULDBLOCK)
+ error = -EAGAIN;
+ } else {
+ error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ }
+
if (error)
return error;
@@ -340,7 +377,11 @@ restart:
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, *iolock);
+ error = xfs_ilock_iocb(iocb, *iolock);
+ if (error) {
+ *iolock = 0;
+ return error;
+ }
goto restart;
}
/*
@@ -362,6 +403,10 @@ restart:
isize = i_size_read(inode);
if (iocb->ki_pos > isize) {
spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_iunlock(ip, *iolock);
@@ -390,12 +435,6 @@ restart:
} else
spin_unlock(&ip->i_flags_lock);
- /*
- * Updating the timestamps will grab the ilock again from
- * xfs_fs_dirty_inode, so we have to call it after dropping the
- * lock above. Eventually we should look into a way to avoid
- * the pointless lock roundtrip.
- */
return file_modified(file);
}
@@ -481,122 +520,149 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
};
/*
- * xfs_file_dio_aio_write - handle direct IO writes
- *
- * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By separating it from the buffered write path we remove all the tricky to
- * follow locking changes and looping.
- *
- * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
- * until we're sure the bytes at the new EOF have been zeroed and/or the cached
- * pages are flushed out.
- *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- * allowing them to be done in parallel with reads and other direct IO writes.
- * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
- * direct IOs to the same block. In this case we need to serialise the
- * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer. To avoid the problem with aio, we also need to wait for
- * outstanding IOs to complete so that unwritten extent conversion is completed
- * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. inode_dio_wait()).
- *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * Handle block aligned direct I/O writes
*/
-STATIC ssize_t
-xfs_file_dio_aio_write(
+static noinline ssize_t
+xfs_file_dio_write_aligned(
+ struct xfs_inode *ip,
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- ssize_t ret = 0;
- int unaligned_io = 0;
- int iolock;
- size_t count = iov_iter_count(from);
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ int iolock = XFS_IOLOCK_SHARED;
+ ssize_t ret;
- /* DIO must be aligned to device logical sector size */
- if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
- return -EINVAL;
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
/*
- * Don't take the exclusive iolock here unless the I/O is unaligned to
- * the file system block size. We don't need to consider the EOF
- * extension case here because xfs_file_aio_write_checks() will relock
- * the inode as necessary for EOF zeroing cases and fill out the new
- * inode size as appropriate.
+ * We don't need to hold the IOLOCK exclusively across the IO, so demote
+ * the iolock back to shared if we had to take the exclusive lock in
+ * xfs_file_write_checks() for other reasons.
*/
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + count) & mp->m_blockmask)) {
- unaligned_io = 1;
-
- /*
- * We can't properly handle unaligned direct I/O to reflink
- * files yet, as we can't unshare a partial block.
- */
- if (xfs_is_cow_inode(ip)) {
- trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
- return -ENOTBLK;
- }
- iolock = XFS_IOLOCK_EXCL;
- } else {
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ &xfs_dio_write_ops, 0);
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
- if (iocb->ki_flags & IOCB_NOWAIT) {
- /* unaligned dio always waits, bail */
- if (unaligned_io)
- return -EAGAIN;
- if (!xfs_ilock_nowait(ip, iolock))
+/*
+ * Handle block unaligned direct I/O writes
+ *
+ * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
+ * them to be done in parallel with reads and other direct I/O writes. However,
+ * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
+ * to do sub-block zeroing and that requires serialisation against other direct
+ * I/O to the same block. In this case we need to serialise the submission of
+ * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
+ * In the case where sub-block zeroing is not required, we can do concurrent
+ * sub-block dios to the same block successfully.
+ *
+ * Optimistically submit the I/O using the shared lock first, but use the
+ * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
+ * if block allocation or partial block zeroing would be required. In that case
+ * we try again with the exclusive lock.
+ */
+static noinline ssize_t
+xfs_file_dio_write_unaligned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ size_t isize = i_size_read(VFS_I(ip));
+ size_t count = iov_iter_count(from);
+ int iolock = XFS_IOLOCK_SHARED;
+ unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
+ ssize_t ret;
+
+ /*
+ * Extending writes need exclusivity because of the sub-block zeroing
+ * that the DIO code always does for partial tail blocks beyond EOF, so
+ * don't even bother trying the fast path in this case.
+ */
+ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+retry_exclusive:
+ if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
+ iolock = XFS_IOLOCK_EXCL;
+ flags = IOMAP_DIO_FORCE_WAIT;
}
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
- goto out;
- count = iov_iter_count(from);
+ return ret;
/*
- * If we are doing unaligned IO, we can't allow any other overlapping IO
- * in-flight at the same time or we risk data corruption. Wait for all
- * other IO to drain before we submit. If the IO is aligned, demote the
- * iolock if we had to take the exclusive lock in
- * xfs_file_aio_write_checks() for other reasons.
+ * We can't properly handle unaligned direct I/O to reflink files yet,
+ * as we can't unshare a partial block.
*/
- if (unaligned_io) {
- inode_dio_wait(inode);
- } else if (iolock == XFS_IOLOCK_EXCL) {
- xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
- iolock = XFS_IOLOCK_SHARED;
+ if (xfs_is_cow_inode(ip)) {
+ trace_xfs_reflink_bounce_dio_write(iocb, from);
+ ret = -ENOTBLK;
+ goto out_unlock;
}
- trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+ ret = xfs_file_write_checks(iocb, from, &iolock);
+ if (ret)
+ goto out_unlock;
+
/*
- * If unaligned, this is the only IO in-flight. Wait on it before we
- * release the iolock to prevent subsequent overlapping IO.
+ * If we are doing exclusive unaligned I/O, this must be the only I/O
+ * in-flight. Otherwise we risk data corruption due to unwritten extent
+ * conversions from the AIO end_io handler. Wait for all other I/O to
+ * drain first.
*/
+ if (flags & IOMAP_DIO_FORCE_WAIT)
+ inode_dio_wait(VFS_I(ip));
+
+ trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_io);
-out:
- xfs_iunlock(ip, iolock);
+ &xfs_dio_write_ops, flags);
/*
- * No fallback to buffered IO after short writes for XFS, direct I/O
- * will either complete fully or return an error.
+ * Retry unaligned I/O with exclusive blocking semantics if the DIO
+ * layer rejected it for mapping or locking reasons. If we are doing
+ * nonblocking user I/O, propagate the error.
*/
- ASSERT(ret < 0 || ret == count);
+ if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+ ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
+ xfs_iunlock(ip, iolock);
+ goto retry_exclusive;
+ }
+
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
return ret;
}
+static ssize_t
+xfs_file_dio_write(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ size_t count = iov_iter_count(from);
+
+ /* direct I/O must be aligned to device logical sector size */
+ if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+ return -EINVAL;
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+ return xfs_file_dio_write_unaligned(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from);
+}
+
static noinline ssize_t
xfs_file_dax_write(
struct kiocb *iocb,
@@ -606,31 +672,26 @@ xfs_file_dax_write(
struct xfs_inode *ip = XFS_I(inode);
int iolock = XFS_IOLOCK_EXCL;
ssize_t ret, error = 0;
- size_t count;
loff_t pos;
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!xfs_ilock_nowait(ip, iolock))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, iolock);
- }
-
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ return ret;
+ ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
goto out;
pos = iocb->ki_pos;
- count = iov_iter_count(from);
- trace_xfs_file_dax_write(ip, count, pos);
+ trace_xfs_file_dax_write(iocb, from);
ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
}
out:
- xfs_iunlock(ip, iolock);
+ if (iolock)
+ xfs_iunlock(ip, iolock);
if (error)
return error;
@@ -644,7 +705,7 @@ out:
}
STATIC ssize_t
-xfs_file_buffered_aio_write(
+xfs_file_buffered_write(
struct kiocb *iocb,
struct iov_iter *from)
{
@@ -653,7 +714,7 @@ xfs_file_buffered_aio_write(
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int enospc = 0;
+ bool cleared_space = false;
int iolock;
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -663,14 +724,14 @@ write_retry:
iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
- ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret)
goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
- trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
&xfs_buffered_write_iomap_ops);
if (likely(ret >= 0))
@@ -683,27 +744,23 @@ write_retry:
* metadata space. This reduces the chances that the eofblocks scan
* waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
* also behaves as a filter to prevent too many eofblocks scans from
- * running at the same time.
+ * running at the same time. Use a synchronous scan to increase the
+ * effectiveness of the scan.
*/
- if (ret == -EDQUOT && !enospc) {
+ if (ret == -EDQUOT && !cleared_space) {
xfs_iunlock(ip, iolock);
- enospc = xfs_inode_free_quota_eofblocks(ip);
- if (enospc)
- goto write_retry;
- enospc = xfs_inode_free_quota_cowblocks(ip);
- if (enospc)
- goto write_retry;
- iolock = 0;
- } else if (ret == -ENOSPC && !enospc) {
+ xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC);
+ cleared_space = true;
+ goto write_retry;
+ } else if (ret == -ENOSPC && !cleared_space) {
struct xfs_eofblocks eofb = {0};
- enospc = 1;
+ cleared_space = true;
xfs_flush_inodes(ip->i_mount);
xfs_iunlock(ip, iolock);
eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
- xfs_icache_free_eofblocks(ip->i_mount, &eofb);
- xfs_icache_free_cowblocks(ip->i_mount, &eofb);
+ xfs_blockgc_free_space(ip->i_mount, &eofb);
goto write_retry;
}
@@ -750,12 +807,12 @@ xfs_file_write_iter(
* CoW. In all other directio scenarios we do not
* allow an operation to fall back to buffered mode.
*/
- ret = xfs_file_dio_aio_write(iocb, from);
+ ret = xfs_file_dio_write(iocb, from);
if (ret != -ENOTBLK)
return ret;
}
- return xfs_file_buffered_aio_write(iocb, from);
+ return xfs_file_buffered_write(iocb, from);
}
static void
@@ -1321,17 +1378,19 @@ xfs_filemap_pfn_mkwrite(
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
-static void
+static vm_fault_t
xfs_filemap_map_pages(
struct vm_fault *vmf,
pgoff_t start_pgoff,
pgoff_t end_pgoff)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
+ vm_fault_t ret;
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- filemap_map_pages(vmf, start_pgoff, end_pgoff);
+ ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ return ret;
}
static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 959ce91a3755..a2a407039227 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -25,17 +25,17 @@
*/
static int
xfs_growfs_data_private(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_growfs_data_t *in) /* growfs data input struct */
+ struct xfs_mount *mp, /* mount point for filesystem */
+ struct xfs_growfs_data *in) /* growfs data input struct */
{
struct xfs_buf *bp;
int error;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
- xfs_rfsblock_t nb, nb_mod;
- xfs_rfsblock_t new;
+ xfs_rfsblock_t nb, nb_div, nb_mod;
+ xfs_rfsblock_t delta;
xfs_agnumber_t oagcount;
- xfs_trans_t *tp;
+ struct xfs_trans *tp;
struct aghdr_init_data id = {};
nb = in->newblocks;
@@ -50,16 +50,16 @@ xfs_growfs_data_private(
return error;
xfs_buf_relse(bp);
- new = nb; /* use new as a temporary here */
- nb_mod = do_div(new, mp->m_sb.sb_agblocks);
- nagcount = new + (nb_mod != 0);
+ nb_div = nb;
+ nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
+ nagcount = nb_div + (nb_mod != 0);
if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
nagcount--;
nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
if (nb < mp->m_sb.sb_dblocks)
return -EINVAL;
}
- new = nb - mp->m_sb.sb_dblocks;
+ delta = nb - mp->m_sb.sb_dblocks;
oagcount = mp->m_sb.sb_agcount;
/* allocate the new per-ag structures */
@@ -89,7 +89,7 @@ xfs_growfs_data_private(
INIT_LIST_HEAD(&id.buffer_list);
for (id.agno = nagcount - 1;
id.agno >= oagcount;
- id.agno--, new -= id.agsize) {
+ id.agno--, delta -= id.agsize) {
if (id.agno == nagcount - 1)
id.agsize = nb -
@@ -110,8 +110,8 @@ xfs_growfs_data_private(
xfs_trans_agblocks_delta(tp, id.nfree);
/* If there are new blocks in the old last AG, extend it. */
- if (new) {
- error = xfs_ag_extend_space(mp, tp, &id, new);
+ if (delta) {
+ error = xfs_ag_extend_space(mp, tp, &id, delta);
if (error)
goto out_trans_cancel;
}
@@ -143,7 +143,7 @@ xfs_growfs_data_private(
* If we expanded the last AG, free the per-AG reservation
* so we can reinitialize it with the new size.
*/
- if (new) {
+ if (delta) {
struct xfs_perag *pag;
pag = xfs_perag_get(mp, id.agno);
@@ -170,8 +170,8 @@ out_trans_cancel:
static int
xfs_growfs_log_private(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_growfs_log_t *in) /* growfs log input struct */
+ struct xfs_mount *mp, /* mount point for filesystem */
+ struct xfs_growfs_log *in) /* growfs log input struct */
{
xfs_extlen_t nb;
@@ -268,7 +268,7 @@ out_error:
int
xfs_growfs_log(
xfs_mount_t *mp,
- xfs_growfs_log_t *in)
+ struct xfs_growfs_log *in)
{
int error;
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 92869f6ec8d3..2cffe51a31e8 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -6,8 +6,8 @@
#ifndef __XFS_FSOPS_H__
#define __XFS_FSOPS_H__
-extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in);
-extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in);
+extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
+extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
xfs_fsop_resblks_t *outval);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index fa55ab8b8d80..f62fa652c2fd 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -8,8 +8,8 @@
/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
* other XFS code uses these values. Times are measured in centisecs (i.e.
- * 100ths of a second) with the exception of eofb_timer and cowb_timer, which
- * are measured in seconds.
+ * 100ths of a second) with the exception of blockgc_timer, which is measured
+ * in seconds.
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
@@ -28,8 +28,7 @@ xfs_param_t xfs_params = {
.rotorstep = { 1, 1, 255 },
.inherit_nodfrg = { 0, 1, 1 },
.fstrm_timer = { 1, 30*100, 3600*100},
- .eofb_timer = { 1, 300, 3600*24},
- .cowb_timer = { 1, 1800, 3600*24},
+ .blockgc_timer = { 1, 300, 3600*24},
};
struct xfs_globals xfs_globals = {
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index deb99300d171..1d7720a0c068 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -916,69 +916,6 @@ xfs_inode_walk(
}
/*
- * Background scanning to trim post-EOF preallocated space. This is queued
- * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
- */
-void
-xfs_queue_eofblocks(
- struct xfs_mount *mp)
-{
- rcu_read_lock();
- if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
- queue_delayed_work(mp->m_eofblocks_workqueue,
- &mp->m_eofblocks_work,
- msecs_to_jiffies(xfs_eofb_secs * 1000));
- rcu_read_unlock();
-}
-
-void
-xfs_eofblocks_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_eofblocks_work);
-
- if (!sb_start_write_trylock(mp->m_super))
- return;
- xfs_icache_free_eofblocks(mp, NULL);
- sb_end_write(mp->m_super);
-
- xfs_queue_eofblocks(mp);
-}
-
-/*
- * Background scanning to trim preallocated CoW space. This is queued
- * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
- * (We'll just piggyback on the post-EOF prealloc space workqueue.)
- */
-void
-xfs_queue_cowblocks(
- struct xfs_mount *mp)
-{
- rcu_read_lock();
- if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
- queue_delayed_work(mp->m_eofblocks_workqueue,
- &mp->m_cowblocks_work,
- msecs_to_jiffies(xfs_cowb_secs * 1000));
- rcu_read_unlock();
-}
-
-void
-xfs_cowblocks_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_cowblocks_work);
-
- if (!sb_start_write_trylock(mp->m_super))
- return;
- xfs_icache_free_cowblocks(mp, NULL);
- sb_end_write(mp->m_super);
-
- xfs_queue_cowblocks(mp);
-}
-
-/*
* Grab the inode for reclaim exclusively.
*
* We have found this inode via a lookup under RCU, so the inode may have
@@ -1346,14 +1283,17 @@ xfs_reclaim_worker(
STATIC int
xfs_inode_free_eofblocks(
struct xfs_inode *ip,
- void *args)
+ void *args,
+ unsigned int *lockflags)
{
struct xfs_eofblocks *eofb = args;
bool wait;
- int ret;
wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
+ if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
+ return 0;
+
if (!xfs_can_free_eofblocks(ip, false)) {
/* inode could be preallocated or append-only */
trace_xfs_inode_free_eofblocks_invalid(ip);
@@ -1380,130 +1320,68 @@ xfs_inode_free_eofblocks(
return -EAGAIN;
return 0;
}
+ *lockflags |= XFS_IOLOCK_EXCL;
- ret = xfs_free_eofblocks(ip);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-
- return ret;
-}
-
-int
-xfs_icache_free_eofblocks(
- struct xfs_mount *mp,
- struct xfs_eofblocks *eofb)
-{
- return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb,
- XFS_ICI_EOFBLOCKS_TAG);
+ return xfs_free_eofblocks(ip);
}
/*
- * Run eofblocks scans on the quotas applicable to the inode. For inodes with
- * multiple quotas, we don't know exactly which quota caused an allocation
- * failure. We make a best effort by including each quota under low free space
- * conditions (less than 1% free space) in the scan.
+ * Background scanning to trim preallocated space. This is queued based on the
+ * 'speculative_prealloc_lifetime' tunable (5m by default).
*/
-static int
-__xfs_inode_free_quota_eofblocks(
- struct xfs_inode *ip,
- int (*execute)(struct xfs_mount *mp,
- struct xfs_eofblocks *eofb))
-{
- int scan = 0;
- struct xfs_eofblocks eofb = {0};
- struct xfs_dquot *dq;
-
- /*
- * Run a sync scan to increase effectiveness and use the union filter to
- * cover all applicable quotas in a single scan.
- */
- eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
-
- if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
- dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER);
- if (dq && xfs_dquot_lowsp(dq)) {
- eofb.eof_uid = VFS_I(ip)->i_uid;
- eofb.eof_flags |= XFS_EOF_FLAGS_UID;
- scan = 1;
- }
- }
-
- if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
- dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP);
- if (dq && xfs_dquot_lowsp(dq)) {
- eofb.eof_gid = VFS_I(ip)->i_gid;
- eofb.eof_flags |= XFS_EOF_FLAGS_GID;
- scan = 1;
- }
- }
-
- if (scan)
- execute(ip->i_mount, &eofb);
-
- return scan;
-}
-
-int
-xfs_inode_free_quota_eofblocks(
- struct xfs_inode *ip)
-{
- return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
-}
-
-static inline unsigned long
-xfs_iflag_for_tag(
- int tag)
+static inline void
+xfs_blockgc_queue(
+ struct xfs_perag *pag)
{
- switch (tag) {
- case XFS_ICI_EOFBLOCKS_TAG:
- return XFS_IEOFBLOCKS;
- case XFS_ICI_COWBLOCKS_TAG:
- return XFS_ICOWBLOCKS;
- default:
- ASSERT(0);
- return 0;
- }
+ rcu_read_lock();
+ if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
+ queue_delayed_work(pag->pag_mount->m_blockgc_workqueue,
+ &pag->pag_blockgc_work,
+ msecs_to_jiffies(xfs_blockgc_secs * 1000));
+ rcu_read_unlock();
}
static void
-__xfs_inode_set_blocks_tag(
- xfs_inode_t *ip,
- void (*execute)(struct xfs_mount *mp),
- void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
- int error, unsigned long caller_ip),
- int tag)
+xfs_blockgc_set_iflag(
+ struct xfs_inode *ip,
+ unsigned long iflag)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
- int tagged;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ int tagged;
+
+ ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
/*
* Don't bother locking the AG and looking up in the radix trees
* if we already know that we have the tag set.
*/
- if (ip->i_flags & xfs_iflag_for_tag(tag))
+ if (ip->i_flags & iflag)
return;
spin_lock(&ip->i_flags_lock);
- ip->i_flags |= xfs_iflag_for_tag(tag);
+ ip->i_flags |= iflag;
spin_unlock(&ip->i_flags_lock);
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
- tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
+ tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG);
radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_BLOCKGC_TAG);
if (!tagged) {
- /* propagate the eofblocks tag up into the perag radix tree */
+ /* propagate the blockgc tag up into the perag radix tree */
spin_lock(&ip->i_mount->m_perag_lock);
radix_tree_tag_set(&ip->i_mount->m_perag_tree,
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- tag);
+ XFS_ICI_BLOCKGC_TAG);
spin_unlock(&ip->i_mount->m_perag_lock);
/* kick off background trimming */
- execute(ip->i_mount);
+ xfs_blockgc_queue(pag);
- set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
+ trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1,
+ _RET_IP_);
}
spin_unlock(&pag->pag_ici_lock);
@@ -1515,38 +1393,43 @@ xfs_inode_set_eofblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_set_eofblocks_tag(ip);
- return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
- trace_xfs_perag_set_eofblocks,
- XFS_ICI_EOFBLOCKS_TAG);
+ return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
}
static void
-__xfs_inode_clear_blocks_tag(
- xfs_inode_t *ip,
- void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
- int error, unsigned long caller_ip),
- int tag)
+xfs_blockgc_clear_iflag(
+ struct xfs_inode *ip,
+ unsigned long iflag)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ bool clear_tag;
+
+ ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
spin_lock(&ip->i_flags_lock);
- ip->i_flags &= ~xfs_iflag_for_tag(tag);
+ ip->i_flags &= ~iflag;
+ clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
spin_unlock(&ip->i_flags_lock);
+ if (!clear_tag)
+ return;
+
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
- if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
- /* clear the eofblocks tag from the perag radix tree */
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_BLOCKGC_TAG);
+ if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) {
+ /* clear the blockgc tag from the perag radix tree */
spin_lock(&ip->i_mount->m_perag_lock);
radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- tag);
+ XFS_ICI_BLOCKGC_TAG);
spin_unlock(&ip->i_mount->m_perag_lock);
- clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
+ trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1,
+ _RET_IP_);
}
spin_unlock(&pag->pag_ici_lock);
@@ -1558,8 +1441,7 @@ xfs_inode_clear_eofblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_clear_eofblocks_tag(ip);
- return __xfs_inode_clear_blocks_tag(ip,
- trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
+ return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
}
/*
@@ -1609,20 +1491,42 @@ xfs_prep_free_cowblocks(
STATIC int
xfs_inode_free_cowblocks(
struct xfs_inode *ip,
- void *args)
+ void *args,
+ unsigned int *lockflags)
{
struct xfs_eofblocks *eofb = args;
+ bool wait;
int ret = 0;
+ wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
+
+ if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
+ return 0;
+
if (!xfs_prep_free_cowblocks(ip))
return 0;
if (!xfs_inode_matches_eofb(ip, eofb))
return 0;
- /* Free the CoW blocks */
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ /*
+ * If the caller is waiting, return -EAGAIN to keep the background
+ * scanner moving and revisit the inode in a subsequent pass.
+ */
+ if (!(*lockflags & XFS_IOLOCK_EXCL) &&
+ !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ if (wait)
+ return -EAGAIN;
+ return 0;
+ }
+ *lockflags |= XFS_IOLOCK_EXCL;
+
+ if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
+ if (wait)
+ return -EAGAIN;
+ return 0;
+ }
+ *lockflags |= XFS_MMAPLOCK_EXCL;
/*
* Check again, nobody else should be able to dirty blocks or change
@@ -1630,37 +1534,15 @@ xfs_inode_free_cowblocks(
*/
if (xfs_prep_free_cowblocks(ip))
ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
-
- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-
return ret;
}
-int
-xfs_icache_free_cowblocks(
- struct xfs_mount *mp,
- struct xfs_eofblocks *eofb)
-{
- return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb,
- XFS_ICI_COWBLOCKS_TAG);
-}
-
-int
-xfs_inode_free_quota_cowblocks(
- struct xfs_inode *ip)
-{
- return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
-}
-
void
xfs_inode_set_cowblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_set_cowblocks_tag(ip);
- return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
- trace_xfs_perag_set_cowblocks,
- XFS_ICI_COWBLOCKS_TAG);
+ return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
}
void
@@ -1668,24 +1550,158 @@ xfs_inode_clear_cowblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_clear_cowblocks_tag(ip);
- return __xfs_inode_clear_blocks_tag(ip,
- trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
+ return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
}
+#define for_each_perag_tag(mp, next_agno, pag, tag) \
+ for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
+ (pag) != NULL; \
+ (next_agno) = (pag)->pag_agno + 1, \
+ xfs_perag_put(pag), \
+ (pag) = xfs_perag_get_tag((mp), (next_agno), (tag)))
+
+
/* Disable post-EOF and CoW block auto-reclamation. */
void
-xfs_stop_block_reaping(
+xfs_blockgc_stop(
struct xfs_mount *mp)
{
- cancel_delayed_work_sync(&mp->m_eofblocks_work);
- cancel_delayed_work_sync(&mp->m_cowblocks_work);
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
}
/* Enable post-EOF and CoW block auto-reclamation. */
void
-xfs_start_block_reaping(
+xfs_blockgc_start(
struct xfs_mount *mp)
{
- xfs_queue_eofblocks(mp);
- xfs_queue_cowblocks(mp);
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ xfs_blockgc_queue(pag);
+}
+
+/* Scan one incore inode for block preallocations that we can remove. */
+static int
+xfs_blockgc_scan_inode(
+ struct xfs_inode *ip,
+ void *args)
+{
+ unsigned int lockflags = 0;
+ int error;
+
+ error = xfs_inode_free_eofblocks(ip, args, &lockflags);
+ if (error)
+ goto unlock;
+
+ error = xfs_inode_free_cowblocks(ip, args, &lockflags);
+unlock:
+ if (lockflags)
+ xfs_iunlock(ip, lockflags);
+ return error;
+}
+
+/* Background worker that trims preallocated space. */
+void
+xfs_blockgc_worker(
+ struct work_struct *work)
+{
+ struct xfs_perag *pag = container_of(to_delayed_work(work),
+ struct xfs_perag, pag_blockgc_work);
+ struct xfs_mount *mp = pag->pag_mount;
+ int error;
+
+ if (!sb_start_write_trylock(mp->m_super))
+ return;
+ error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL,
+ XFS_ICI_BLOCKGC_TAG);
+ if (error)
+ xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
+ pag->pag_agno, error);
+ sb_end_write(mp->m_super);
+ xfs_blockgc_queue(pag);
+}
+
+/*
+ * Try to free space in the filesystem by purging eofblocks and cowblocks.
+ */
+int
+xfs_blockgc_free_space(
+ struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb)
+{
+ trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_);
+
+ return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb,
+ XFS_ICI_BLOCKGC_TAG);
+}
+
+/*
+ * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which
+ * quota caused an allocation failure, so we make a best effort by including
+ * each quota under low free space conditions (less than 1% free space) in the
+ * scan.
+ *
+ * Callers must not hold any inode's ILOCK. If requesting a synchronous scan
+ * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or
+ * MMAPLOCK.
+ */
+int
+xfs_blockgc_free_dquots(
+ struct xfs_mount *mp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ unsigned int eof_flags)
+{
+ struct xfs_eofblocks eofb = {0};
+ bool do_work = false;
+
+ if (!udqp && !gdqp && !pdqp)
+ return 0;
+
+ /*
+ * Run a scan to free blocks using the union filter to cover all
+ * applicable quotas in a single scan.
+ */
+ eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags;
+
+ if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
+ eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
+ eofb.eof_flags |= XFS_EOF_FLAGS_UID;
+ do_work = true;
+ }
+
+ if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
+ eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
+ eofb.eof_flags |= XFS_EOF_FLAGS_GID;
+ do_work = true;
+ }
+
+ if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
+ eofb.eof_prid = pdqp->q_id;
+ eofb.eof_flags |= XFS_EOF_FLAGS_PRID;
+ do_work = true;
+ }
+
+ if (!do_work)
+ return 0;
+
+ return xfs_blockgc_free_space(mp, &eofb);
+}
+
+/* Run cow/eofblocks scans on the quotas attached to the inode. */
+int
+xfs_blockgc_free_quota(
+ struct xfs_inode *ip,
+ unsigned int eof_flags)
+{
+ return xfs_blockgc_free_dquots(ip->i_mount,
+ xfs_inode_dquot(ip, XFS_DQTYPE_USER),
+ xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
+ xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags);
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 3a4c8b382cd0..d1fddb152420 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -23,8 +23,8 @@ struct xfs_eofblocks {
#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
in xfs_inode_walk */
#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
-#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
-#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */
+/* Inode has speculative preallocations (posteof or cow) to clean. */
+#define XFS_ICI_BLOCKGC_TAG 1
/*
* Flags for xfs_iget()
@@ -54,19 +54,19 @@ long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
+ unsigned int eof_flags);
+int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags);
+int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb);
+
void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
-int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
-int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
-void xfs_eofblocks_worker(struct work_struct *);
-void xfs_queue_eofblocks(struct xfs_mount *);
void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
-int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *);
-int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip);
-void xfs_cowblocks_worker(struct work_struct *);
-void xfs_queue_cowblocks(struct xfs_mount *);
+
+void xfs_blockgc_worker(struct work_struct *work);
int xfs_inode_walk(struct xfs_mount *mp, int iter_flags,
int (*execute)(struct xfs_inode *ip, void *args),
@@ -75,7 +75,7 @@ int xfs_inode_walk(struct xfs_mount *mp, int iter_flags,
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, bool *inuse);
-void xfs_stop_block_reaping(struct xfs_mount *mp);
-void xfs_start_block_reaping(struct xfs_mount *mp);
+void xfs_blockgc_stop(struct xfs_mount *mp);
+void xfs_blockgc_start(struct xfs_mount *mp);
#endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 95b7f2ba4e06..46a861d55e48 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -776,6 +776,7 @@ xfs_init_new_inode(
prid_t prid,
struct xfs_inode **ipp)
{
+ struct inode *dir = pip ? VFS_I(pip) : NULL;
struct xfs_mount *mp = tp->t_mountp;
struct xfs_inode *ip;
unsigned int flags;
@@ -805,18 +806,17 @@ xfs_init_new_inode(
ASSERT(ip != NULL);
inode = VFS_I(ip);
- inode->i_mode = mode;
set_nlink(inode, nlink);
- inode->i_uid = fsuid_into_mnt(mnt_userns);
inode->i_rdev = rdev;
ip->i_d.di_projid = prid;
- if (pip && XFS_INHERIT_GID(pip)) {
- inode->i_gid = VFS_I(pip)->i_gid;
- if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
+ if (dir && !(dir->i_mode & S_ISGID) &&
+ (mp->m_flags & XFS_MOUNT_GRPID)) {
+ inode->i_uid = fsuid_into_mnt(mnt_userns);
+ inode->i_gid = dir->i_gid;
+ inode->i_mode = mode;
} else {
- inode->i_gid = fsgid_into_mnt(mnt_userns);
+ inode_init_owner(mnt_userns, inode, dir, mode);
}
/*
@@ -1027,23 +1027,22 @@ xfs_create(
* the case we'll drop the one we have and get a more
* appropriate transaction later.
*/
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
if (error == -ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(mp);
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
+ resblks, &tp);
}
if (error)
- goto out_release_inode;
+ goto out_release_dquots;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
- /*
- * Reserve disk quota and the inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
+ error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
goto out_trans_cancel;
@@ -1122,7 +1121,7 @@ xfs_create(
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+ out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
@@ -1167,14 +1166,10 @@ xfs_create_tmpfile(
resblks = XFS_IALLOC_SPACE_RES(mp);
tres = &M_RES(mp)->tr_create_tmpfile;
- error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
- if (error)
- goto out_release_inode;
-
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
+ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ &tp);
if (error)
- goto out_trans_cancel;
+ goto out_release_dquots;
error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, &ip);
if (error)
@@ -1217,7 +1212,7 @@ xfs_create_tmpfile(
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+ out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
@@ -1265,6 +1260,11 @@ xfs_link(
xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+ error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
+ if (error)
+ goto error_return;
+
/*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
@@ -3027,7 +3027,7 @@ xfs_rename(
struct xfs_trans *tp;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
- struct xfs_buf *agibp;
+ int i;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
@@ -3116,6 +3116,35 @@ xfs_rename(
/*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
+ *
+ * Extent count overflow check:
+ *
+ * From the perspective of src_dp, a rename operation is essentially a
+ * directory entry remove operation. Hence the only place where we check
+ * for extent count overflow for src_dp is in
+ * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
+ * -ENOSPC when it detects a possible extent count overflow and in
+ * response, the higher layers of directory handling code do the
+ * following:
+ * 1. Data/Free blocks: XFS lets these blocks linger until a
+ * future remove operation removes them.
+ * 2. Dabtree blocks: XFS swaps the blocks with the last block in the
+ * Leaf space and unmaps the last block.
+ *
+ * For target_dp, there are two cases depending on whether the
+ * destination directory entry exists or not.
+ *
+ * When destination directory entry does not exist (i.e. target_ip ==
+ * NULL), extent count overflow check is performed only when transaction
+ * has a non-zero sized space reservation associated with it. With a
+ * zero-sized space reservation, XFS allows a rename operation to
+ * continue only when the directory has sufficient free space in its
+ * data/leaf/free space blocks to hold the new entry.
+ *
+ * When destination directory entry exists (i.e. target_ip != NULL), all
+ * we need to do is change the inode number associated with the already
+ * existing entry. Hence there is no need to perform an extent count
+ * overflow check.
*/
if (target_ip == NULL) {
/*
@@ -3126,6 +3155,12 @@ xfs_rename(
error = xfs_dir_canenter(tp, target_dp, target_name);
if (error)
goto out_trans_cancel;
+ } else {
+ error = xfs_iext_count_may_overflow(target_dp,
+ XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
+ if (error)
+ goto out_trans_cancel;
}
} else {
/*
@@ -3141,6 +3176,30 @@ xfs_rename(
}
/*
+ * Lock the AGI buffers we need to handle bumping the nlink of the
+ * whiteout inode off the unlinked list and to handle dropping the
+ * nlink of the target inode. Per locking order rules, do this in
+ * increasing AG order and before directory block allocation tries to
+ * grab AGFs because we grab AGIs before AGFs.
+ *
+ * The (vfs) caller must ensure that if src is a directory then
+ * target_ip is either null or an empty directory.
+ */
+ for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
+ if (inodes[i] == wip ||
+ (inodes[i] == target_ip &&
+ (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
+ struct xfs_buf *bp;
+ xfs_agnumber_t agno;
+
+ agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
+ error = xfs_read_agi(mp, tp, agno, &bp);
+ if (error)
+ goto out_trans_cancel;
+ }
+ }
+
+ /*
* Directory entry creation below may acquire the AGF. Remove
* the whiteout from the unlinked list first to preserve correct
* AGI/AGF locking order. This dirties the transaction so failures
@@ -3192,22 +3251,6 @@ xfs_rename(
* In case there is already an entry with the same
* name at the destination directory, remove it first.
*/
-
- /*
- * Check whether the replace operation will need to allocate
- * blocks. This happens when the shortform directory lacks
- * space and we have to convert it to a block format directory.
- * When more blocks are necessary, we must lock the AGI first
- * to preserve locking order (AGI -> AGF).
- */
- if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) {
- error = xfs_read_agi(mp, tp,
- XFS_INO_TO_AGNO(mp, target_ip->i_ino),
- &agibp);
- if (error)
- goto out_trans_cancel;
- }
-
error = xfs_dir_replace(tp, target_dp, target_name,
src_ip->i_ino, spaceres);
if (error)
@@ -3283,9 +3326,16 @@ xfs_rename(
if (wip) {
error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
spaceres);
- } else
+ } else {
+ /*
+ * NOTE: We don't need to check for extent count overflow here
+ * because the dir remove name code will leave the dir block in
+ * place if the extent count would overflow.
+ */
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
spaceres);
+ }
+
if (error)
goto out_trans_cancel;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3d4c7ca080fb..99dfe89a8d08 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1280,7 +1280,8 @@ xfs_ioctl_setattr_prepare_dax(
*/
static struct xfs_trans *
xfs_ioctl_setattr_get_trans(
- struct file *file)
+ struct file *file,
+ struct xfs_dquot *pdqp)
{
struct xfs_inode *ip = XFS_I(file_inode(file));
struct xfs_mount *mp = ip->i_mount;
@@ -1288,17 +1289,15 @@ xfs_ioctl_setattr_get_trans(
int error = -EROFS;
if (mp->m_flags & XFS_MOUNT_RDONLY)
- goto out_unlock;
+ goto out_error;
error = -EIO;
if (XFS_FORCED_SHUTDOWN(mp))
- goto out_unlock;
+ goto out_error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp,
+ capable(CAP_FOWNER), &tp);
if (error)
- goto out_unlock;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ goto out_error;
/*
* CAP_FOWNER overrides the following restrictions:
@@ -1318,7 +1317,7 @@ xfs_ioctl_setattr_get_trans(
out_cancel:
xfs_trans_cancel(tp);
-out_unlock:
+out_error:
return ERR_PTR(error);
}
@@ -1444,13 +1443,13 @@ xfs_ioctl_setattr(
struct xfs_trans *tp;
struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
- int code;
+ int error;
trace_xfs_ioctl_setattr(ip);
- code = xfs_ioctl_setattr_check_projid(ip, fa);
- if (code)
- return code;
+ error = xfs_ioctl_setattr_check_projid(ip, fa);
+ if (error)
+ return error;
/*
* If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1461,44 +1460,36 @@ xfs_ioctl_setattr(
* because the i_*dquot fields will get updated anyway.
*/
if (XFS_IS_QUOTA_ON(mp)) {
- code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
+ error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
VFS_I(ip)->i_gid, fa->fsx_projid,
XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp);
- if (code)
- return code;
+ if (error)
+ return error;
}
xfs_ioctl_setattr_prepare_dax(ip, fa);
- tp = xfs_ioctl_setattr_get_trans(file);
+ tp = xfs_ioctl_setattr_get_trans(file, pdqp);
if (IS_ERR(tp)) {
- code = PTR_ERR(tp);
+ error = PTR_ERR(tp);
goto error_free_dquots;
}
- if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
- ip->i_d.di_projid != fa->fsx_projid) {
- code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp,
- capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
- if (code) /* out of quota */
- goto error_trans_cancel;
- }
-
xfs_fill_fsxattr(ip, false, &old_fa);
- code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
- if (code)
+ error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_check_extsize(ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_check_extsize(ip, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_check_cowextsize(ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_check_cowextsize(ip, fa);
+ if (error)
goto error_trans_cancel;
- code = xfs_ioctl_setattr_xflags(tp, ip, fa);
- if (code)
+ error = xfs_ioctl_setattr_xflags(tp, ip, fa);
+ if (error)
goto error_trans_cancel;
/*
@@ -1538,7 +1529,7 @@ xfs_ioctl_setattr(
else
ip->i_d.di_cowextsize = 0;
- code = xfs_trans_commit(tp);
+ error = xfs_trans_commit(tp);
/*
* Release any dquot(s) the inode had kept before chown.
@@ -1546,13 +1537,13 @@ xfs_ioctl_setattr(
xfs_qm_dqrele(olddquot);
xfs_qm_dqrele(pdqp);
- return code;
+ return error;
error_trans_cancel:
xfs_trans_cancel(tp);
error_free_dquots:
xfs_qm_dqrele(pdqp);
- return code;
+ return error;
}
STATIC int
@@ -1615,7 +1606,7 @@ xfs_ioc_setxflags(
xfs_ioctl_setattr_prepare_dax(ip, &fa);
- tp = xfs_ioctl_setattr_get_trans(filp);
+ tp = xfs_ioctl_setattr_get_trans(filp, NULL);
if (IS_ERR(tp)) {
error = PTR_ERR(tp);
goto out_drop_write;
@@ -2267,7 +2258,7 @@ xfs_file_ioctl(
}
case XFS_IOC_FSGROWFSDATA: {
- xfs_growfs_data_t in;
+ struct xfs_growfs_data in;
if (copy_from_user(&in, arg, sizeof(in)))
return -EFAULT;
@@ -2281,7 +2272,7 @@ xfs_file_ioctl(
}
case XFS_IOC_FSGROWFSLOG: {
- xfs_growfs_log_t in;
+ struct xfs_growfs_log in;
if (copy_from_user(&in, arg, sizeof(in)))
return -EFAULT;
@@ -2355,8 +2346,10 @@ xfs_file_ioctl(
if (error)
return error;
+ trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_);
+
sb_start_write(mp->m_super);
- error = xfs_icache_free_eofblocks(mp, &keofb);
+ error = xfs_blockgc_free_space(mp, &keofb);
sb_end_write(mp->m_super);
return error;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7b9ff824e82d..e17ab7f42928 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -194,25 +194,21 @@ xfs_iomap_write_direct(
struct xfs_trans *tp;
xfs_filblks_t resaligned;
int nimaps;
- int quota_flag;
- uint qblocks, resblks;
- unsigned int resrtextents = 0;
+ unsigned int dblocks, rblocks;
+ bool force = false;
int error;
int bmapi_flags = XFS_BMAPI_PREALLOC;
- uint tflags = 0;
ASSERT(count_fsb > 0);
resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
xfs_get_extsz_hint(ip));
if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
- resrtextents = qblocks = resaligned;
- resrtextents /= mp->m_sb.sb_rextsize;
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
- quota_flag = XFS_QMOPT_RES_RTBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resaligned;
} else {
- resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
- quota_flag = XFS_QMOPT_RES_REGBLKS;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ rblocks = 0;
}
error = xfs_qm_dqattach(ip);
@@ -235,23 +231,21 @@ xfs_iomap_write_direct(
if (IS_DAX(VFS_I(ip))) {
bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
if (imap->br_state == XFS_EXT_UNWRITTEN) {
- tflags |= XFS_TRANS_RESERVE;
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ force = true;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents,
- tflags, &tp);
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+ rblocks, force, &tp);
if (error)
return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* From this point onwards we overwrite the imap pointer that the
* caller gave to us.
@@ -260,7 +254,7 @@ xfs_iomap_write_direct(
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
imap, &nimaps);
if (error)
- goto out_res_cancel;
+ goto out_trans_cancel;
/*
* Complete the transaction
@@ -284,8 +278,6 @@ out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
-out_res_cancel:
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
out_trans_cancel:
xfs_trans_cancel(tp);
goto out_unlock;
@@ -548,16 +540,13 @@ xfs_iomap_write_unwritten(
* here as we might be asked to write out the same inode that we
* complete here and might deadlock on the iolock.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
- XFS_TRANS_RESERVE, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
+ 0, true, &tp);
if (error)
return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
- XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
@@ -784,15 +773,28 @@ xfs_direct_write_iomap_begin(
goto allocate_blocks;
/*
- * NOWAIT IO needs to span the entire requested IO with a single map so
- * that we avoid partial IO failures due to the rest of the IO range not
- * covered by this map triggering an EAGAIN condition when it is
- * subsequently mapped and aborting the IO.
+ * NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with
+ * a single map so that we avoid partial IO failures due to the rest of
+ * the I/O range not covered by this map triggering an EAGAIN condition
+ * when it is subsequently mapped and aborting the I/O.
*/
- if ((flags & IOMAP_NOWAIT) &&
- !imap_spans_range(&imap, offset_fsb, end_fsb)) {
+ if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) {
error = -EAGAIN;
- goto out_unlock;
+ if (!imap_spans_range(&imap, offset_fsb, end_fsb))
+ goto out_unlock;
+ }
+
+ /*
+ * For overwrite only I/O, we cannot convert unwritten extents without
+ * requiring sub-block zeroing. This can only be done under an
+ * exclusive IOLOCK, hence return -EAGAIN if this is not a written
+ * extent to tell the caller to try again.
+ */
+ if (flags & IOMAP_OVERWRITE_ONLY) {
+ error = -EAGAIN;
+ if (imap.br_state != XFS_EXT_NORM &&
+ ((offset | length) & mp->m_blockmask))
+ goto out_unlock;
}
xfs_iunlock(ip, lockmode);
@@ -801,7 +803,7 @@ xfs_direct_write_iomap_begin(
allocate_blocks:
error = -EAGAIN;
- if (flags & IOMAP_NOWAIT)
+ if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY))
goto out_unlock;
/*
@@ -842,7 +844,8 @@ out_found_cow:
return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
out_unlock:
- xfs_iunlock(ip, lockmode);
+ if (lockmode)
+ xfs_iunlock(ip, lockmode);
return error;
}
@@ -870,6 +873,9 @@ xfs_buffered_write_iomap_begin(
int allocfork = XFS_DATA_FORK;
int error = 0;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
return xfs_direct_write_iomap_begin(inode, offset, count,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 816a0f77a39f..66ebccb5a6ff 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -711,13 +711,11 @@ xfs_setattr_nonsize(
return error;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
+ capable(CAP_FOWNER), &tp);
if (error)
goto out_dqrele;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* Change file ownership. Must be the owner or privileged.
*/
@@ -734,21 +732,6 @@ xfs_setattr_nonsize(
uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
/*
- * Do a quota reservation only if uid/gid is actually
- * going to change.
- */
- if (XFS_IS_QUOTA_RUNNING(mp) &&
- ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
- (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
- ASSERT(tp);
- error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- NULL, capable(CAP_FOWNER) ?
- XFS_QMOPT_FORCE_RES : 0);
- if (error) /* out of quota */
- goto out_cancel;
- }
-
- /*
* CAP_FSETID overrides the following restrictions:
*
* The set-user-ID and set-group-ID bits of a file will be
@@ -797,8 +780,6 @@ xfs_setattr_nonsize(
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
/*
* Release any dquot(s) the inode had kept before chown.
*/
@@ -825,9 +806,6 @@ xfs_setattr_nonsize(
return 0;
-out_cancel:
- xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -858,7 +836,7 @@ xfs_setattr_size(
ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
- ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+ ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
oldsize = inode->i_size;
newsize = iattr->ia_size;
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index eae3aff9bc97..c4a340f1f1e1 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -618,15 +618,12 @@ xfs_iwalk_threaded(
{
struct xfs_pwork_ctl pctl;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
- unsigned int nr_threads;
int error;
ASSERT(agno < mp->m_sb.sb_agcount);
ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
- nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
- error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
- nr_threads);
+ error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk");
if (error)
return error;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 5b7a1e201559..af6be9b9ccdf 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -98,8 +98,7 @@ typedef __u32 xfs_nlink_t;
#define xfs_rotorstep xfs_params.rotorstep.val
#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
-#define xfs_eofb_secs xfs_params.eofb_timer.val
-#define xfs_cowb_secs xfs_params.cowb_timer.val
+#define xfs_blockgc_secs xfs_params.blockgc_timer.val
#define current_cpu() (raw_smp_processor_id())
#define current_set_flags_nested(sp, f) \
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index fa2d05e65ff1..06041834daa3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -91,6 +91,9 @@ STATIC int
xlog_iclogs_empty(
struct xlog *log);
+static int
+xfs_log_cover(struct xfs_mount *);
+
static void
xlog_grant_sub_space(
struct xlog *log,
@@ -347,6 +350,25 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
tic->t_res_num++;
}
+bool
+xfs_log_writable(
+ struct xfs_mount *mp)
+{
+ /*
+ * Never write to the log on norecovery mounts, if the block device is
+ * read-only, or if the filesystem is shutdown. Read-only mounts still
+ * allow internal writes for log recovery and unmount purposes, so don't
+ * restrict that case here.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return false;
+ if (xfs_readonly_buftarg(mp->m_log->l_targ))
+ return false;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return false;
+ return true;
+}
+
/*
* Replenish the byte reservation required by moving the grant write head.
*/
@@ -741,7 +763,7 @@ xfs_log_mount_finish(
xfs_log_force(mp, XFS_LOG_SYNC);
xfs_ail_push_all_sync(mp->m_ail);
}
- xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buftarg_drain(mp->m_ddev_targp);
if (readonly)
mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -886,15 +908,8 @@ xfs_log_unmount_write(
{
struct xlog *log = mp->m_log;
- /*
- * Don't write out unmount record on norecovery mounts or ro devices.
- * Or, if we are doing a forced umount (typically because of IO errors).
- */
- if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
- xfs_readonly_buftarg(log->l_targ)) {
- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ if (!xfs_log_writable(mp))
return;
- }
xfs_log_force(mp, XFS_LOG_SYNC);
@@ -924,10 +939,9 @@ xfs_log_unmount_write(
* To do this, we first need to shut down the background log work so it is not
* trying to cover the log as we clean up. We then need to unpin all objects in
* the log so we can then flush them out. Once they have completed their IO and
- * run the callbacks removing themselves from the AIL, we can write the unmount
- * record.
+ * run the callbacks removing themselves from the AIL, we can cover the log.
*/
-void
+int
xfs_log_quiesce(
struct xfs_mount *mp)
{
@@ -936,16 +950,24 @@ xfs_log_quiesce(
/*
* The superblock buffer is uncached and while xfs_ail_push_all_sync()
- * will push it, xfs_wait_buftarg() will not wait for it. Further,
+ * will push it, xfs_buftarg_wait() will not wait for it. Further,
* xfs_buf_iowait() cannot be used because it was pushed with the
* XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
* the IO to complete.
*/
xfs_ail_push_all_sync(mp->m_ail);
- xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buftarg_wait(mp->m_ddev_targp);
xfs_buf_lock(mp->m_sb_bp);
xfs_buf_unlock(mp->m_sb_bp);
+ return xfs_log_cover(mp);
+}
+
+void
+xfs_log_clean(
+ struct xfs_mount *mp)
+{
+ xfs_log_quiesce(mp);
xfs_log_unmount_write(mp);
}
@@ -960,7 +982,9 @@ void
xfs_log_unmount(
struct xfs_mount *mp)
{
- xfs_log_quiesce(mp);
+ xfs_log_clean(mp);
+
+ xfs_buftarg_drain(mp->m_ddev_targp);
xfs_trans_ail_destroy(mp);
@@ -1037,17 +1061,15 @@ xfs_log_space_wake(
* there's no point in running a dummy transaction at this point because we
* can't start trying to idle the log until both the CIL and AIL are empty.
*/
-static int
-xfs_log_need_covered(xfs_mount_t *mp)
+static bool
+xfs_log_need_covered(
+ struct xfs_mount *mp)
{
- struct xlog *log = mp->m_log;
- int needed = 0;
-
- if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
- return 0;
+ struct xlog *log = mp->m_log;
+ bool needed = false;
if (!xlog_cil_empty(log))
- return 0;
+ return false;
spin_lock(&log->l_icloglock);
switch (log->l_covered_state) {
@@ -1062,14 +1084,14 @@ xfs_log_need_covered(xfs_mount_t *mp)
if (!xlog_iclogs_empty(log))
break;
- needed = 1;
+ needed = true;
if (log->l_covered_state == XLOG_STATE_COVER_NEED)
log->l_covered_state = XLOG_STATE_COVER_DONE;
else
log->l_covered_state = XLOG_STATE_COVER_DONE2;
break;
default:
- needed = 1;
+ needed = true;
break;
}
spin_unlock(&log->l_icloglock);
@@ -1077,6 +1099,60 @@ xfs_log_need_covered(xfs_mount_t *mp)
}
/*
+ * Explicitly cover the log. This is similar to background log covering but
+ * intended for usage in quiesce codepaths. The caller is responsible to ensure
+ * the log is idle and suitable for covering. The CIL, iclog buffers and AIL
+ * must all be empty.
+ */
+static int
+xfs_log_cover(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+ bool need_covered;
+
+ ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) &&
+ !xfs_ail_min_lsn(mp->m_log->l_ailp)) ||
+ XFS_FORCED_SHUTDOWN(mp));
+
+ if (!xfs_log_writable(mp))
+ return 0;
+
+ /*
+ * xfs_log_need_covered() is not idempotent because it progresses the
+ * state machine if the log requires covering. Therefore, we must call
+ * this function once and use the result until we've issued an sb sync.
+ * Do so first to make that abundantly clear.
+ *
+ * Fall into the covering sequence if the log needs covering or the
+ * mount has lazy superblock accounting to sync to disk. The sb sync
+ * used for covering accumulates the in-core counters, so covering
+ * handles this for us.
+ */
+ need_covered = xfs_log_need_covered(mp);
+ if (!need_covered && !xfs_sb_version_haslazysbcount(&mp->m_sb))
+ return 0;
+
+ /*
+ * To cover the log, commit the superblock twice (at most) in
+ * independent checkpoints. The first serves as a reference for the
+ * tail pointer. The sync transaction and AIL push empties the AIL and
+ * updates the in-core tail to the LSN of the first checkpoint. The
+ * second commit updates the on-disk tail with the in-core LSN,
+ * covering the log. Push the AIL one more time to leave it empty, as
+ * we found it.
+ */
+ do {
+ error = xfs_sync_sb(mp, true);
+ if (error)
+ break;
+ xfs_ail_push_all_sync(mp->m_ail);
+ } while (xfs_log_need_covered(mp));
+
+ return error;
+}
+
+/*
* We may be holding the log iclog lock upon entering this routine.
*/
xfs_lsn_t
@@ -1259,7 +1335,7 @@ xfs_log_worker(
struct xfs_mount *mp = log->l_mp;
/* dgc: errors ignored - not fatal and nowhere to report them */
- if (xfs_log_need_covered(mp)) {
+ if (xfs_fs_writable(mp, SB_FREEZE_WRITE) && xfs_log_need_covered(mp)) {
/*
* Dump a transaction into the log that contains no real change.
* This is needed to stamp the current tail LSN into the log
@@ -1416,8 +1492,9 @@ xlog_alloc_log(
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
- mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM |
+ WQ_HIGHPRI),
+ 0, mp->m_super->s_id);
if (!log->l_ioend_workqueue)
goto out_free_iclog;
@@ -2538,12 +2615,15 @@ xlog_covered_state(
int iclogs_changed)
{
/*
- * We usually go to NEED. But we go to NEED2 if the changed indicates we
- * are done writing the dummy record. If we are done with the second
- * dummy recored (DONE2), then we go to IDLE.
+ * We go to NEED for any non-covering writes. We go to NEED2 if we just
+ * wrote the first covering record (DONE). We go to IDLE if we just
+ * wrote the second covering record (DONE2) and remain in IDLE until a
+ * non-covering write occurs.
*/
switch (prev_state) {
case XLOG_STATE_COVER_IDLE:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_IDLE;
case XLOG_STATE_COVER_NEED:
case XLOG_STATE_COVER_NEED2:
break;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 58c3fcbec94a..044e02cb8921 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -127,6 +127,7 @@ int xfs_log_reserve(struct xfs_mount *mp,
int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
+bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
@@ -137,7 +138,8 @@ void xlog_cil_process_committed(struct list_head *list);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
-void xfs_log_quiesce(struct xfs_mount *mp);
+int xfs_log_quiesce(struct xfs_mount *mp);
+void xfs_log_clean(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
bool xfs_log_in_recovery(struct xfs_mount *);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 7110507a2b6b..52370d0a3f43 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -126,6 +126,7 @@ __xfs_free_perag(
{
struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
+ ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
ASSERT(atomic_read(&pag->pag_ref) == 0);
kmem_free(pag);
}
@@ -146,6 +147,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ cancel_delayed_work_sync(&pag->pag_blockgc_work);
xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -201,6 +203,7 @@ xfs_initialize_perag(
pag->pag_agno = index;
pag->pag_mount = mp;
spin_lock_init(&pag->pag_ici_lock);
+ INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
error = xfs_buf_hash_init(pag);
@@ -946,7 +949,7 @@ xfs_mountfs(
*/
if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) ==
XFS_MOUNT_RDONLY) {
- xfs_quiesce_attr(mp);
+ xfs_log_clean(mp);
}
/*
@@ -1023,8 +1026,8 @@ xfs_mountfs(
xfs_log_mount_cancel(mp);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_wait_buftarg(mp->m_logdev_targp);
- xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buftarg_drain(mp->m_logdev_targp);
+ xfs_buftarg_drain(mp->m_ddev_targp);
out_free_perag:
xfs_free_perag(mp);
out_free_dir:
@@ -1054,7 +1057,7 @@ xfs_unmountfs(
uint64_t resblks;
int error;
- xfs_stop_block_reaping(mp);
+ xfs_blockgc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
@@ -1124,12 +1127,6 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
- error = xfs_log_sbcount(mp);
- if (error)
- xfs_warn(mp, "Unable to update superblock counters. "
- "Freespace may not be correct on next mount.");
-
-
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1165,32 +1162,6 @@ xfs_fs_writable(
}
/*
- * xfs_log_sbcount
- *
- * Sync the superblock counters to disk.
- *
- * Note this code can be called during the process of freezing, so we use the
- * transaction allocator that does not block when the transaction subsystem is
- * in its frozen state.
- */
-int
-xfs_log_sbcount(xfs_mount_t *mp)
-{
- /* allow this to proceed during the freeze sequence... */
- if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
- return 0;
-
- /*
- * we don't need to do this if we are updating the superblock
- * counters on every modification.
- */
- if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
- return 0;
-
- return xfs_sync_sb(mp, true);
-}
-
-/*
* Deltas for the block count can vary from 1 to very large, but lock contention
* only occurs on frequent small block count updates such as in the delayed
* allocation path for buffered writes (page a time updates). Hence we set
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index dfa429b77ee2..659ad95fe3e0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -93,7 +93,7 @@ typedef struct xfs_mount {
struct workqueue_struct *m_unwritten_workqueue;
struct workqueue_struct *m_cil_workqueue;
struct workqueue_struct *m_reclaim_workqueue;
- struct workqueue_struct *m_eofblocks_workqueue;
+ struct workqueue_struct *m_blockgc_workqueue;
struct workqueue_struct *m_sync_workqueue;
int m_bsize; /* fs logical block size */
@@ -177,10 +177,6 @@ typedef struct xfs_mount {
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
struct delayed_work m_reclaim_work; /* background inode reclaim */
- struct delayed_work m_eofblocks_work; /* background eof blocks
- trimming */
- struct delayed_work m_cowblocks_work; /* background cow blocks
- trimming */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
struct xfs_kobj m_error_meta_kobj;
@@ -369,6 +365,9 @@ typedef struct xfs_perag {
/* Blocks reserved for the reverse mapping btree. */
struct xfs_ag_resv pag_rmapbt_resv;
+ /* background prealloc block trimming */
+ struct delayed_work pag_blockgc_work;
+
/* reference count */
uint8_t pagf_refcount_level;
@@ -399,7 +398,6 @@ int xfs_buf_hash_init(xfs_perag_t *pag);
void xfs_buf_hash_destroy(xfs_perag_t *pag);
extern void xfs_uuid_table_free(void);
-extern int xfs_log_sbcount(xfs_mount_t *);
extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index a06661dac5be..34c3b16f834f 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -294,7 +294,7 @@ int
xfs_mru_cache_init(void)
{
xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 1);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
if (!xfs_mru_reap_wq)
return -ENOMEM;
return 0;
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
index b03333f1c84a..c283b801cc5d 100644
--- a/fs/xfs/xfs_pwork.c
+++ b/fs/xfs/xfs_pwork.c
@@ -61,16 +61,18 @@ xfs_pwork_init(
struct xfs_mount *mp,
struct xfs_pwork_ctl *pctl,
xfs_pwork_work_fn work_fn,
- const char *tag,
- unsigned int nr_threads)
+ const char *tag)
{
+ unsigned int nr_threads = 0;
+
#ifdef DEBUG
if (xfs_globals.pwork_threads >= 0)
nr_threads = xfs_globals.pwork_threads;
#endif
trace_xfs_pwork_init(mp, nr_threads, current->pid);
- pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag,
+ pctl->wq = alloc_workqueue("%s-%d",
+ WQ_UNBOUND | WQ_SYSFS | WQ_FREEZABLE, nr_threads, tag,
current->pid);
if (!pctl->wq)
return -ENOMEM;
@@ -117,20 +119,3 @@ xfs_pwork_poll(
atomic_read(&pctl->nr_work) == 0, HZ) == 0)
touch_softlockup_watchdog();
}
-
-/*
- * Return the amount of parallelism that the data device can handle, or 0 for
- * no limit.
- */
-unsigned int
-xfs_pwork_guess_datadev_parallelism(
- struct xfs_mount *mp)
-{
- struct xfs_buftarg *btp = mp->m_ddev_targp;
-
- /*
- * For now we'll go with the most conservative setting possible,
- * which is two threads for an SSD and 1 thread everywhere else.
- */
- return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1;
-}
diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
index 8133124cf3bb..c0ef81fc85dd 100644
--- a/fs/xfs/xfs_pwork.h
+++ b/fs/xfs/xfs_pwork.h
@@ -51,11 +51,9 @@ xfs_pwork_want_abort(
}
int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
- xfs_pwork_work_fn work_fn, const char *tag,
- unsigned int nr_threads);
+ xfs_pwork_work_fn work_fn, const char *tag);
void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
void xfs_pwork_poll(struct xfs_pwork_ctl *pctl);
-unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
#endif /* __XFS_PWORK_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 1b7b1393cab2..bfa4164990b1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1787,105 +1787,35 @@ xfs_qm_vop_chown(
xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
- * Take an extra reference, because the inode is going to keep
- * this dquot pointer even after the trans_commit.
+ * Back when we made quota reservations for the chown, we reserved the
+ * ondisk blocks + delalloc blocks with the new dquot. Now that we've
+ * switched the dquots, decrease the new dquot's block reservation
+ * (having already bumped up the real counter) so that we don't have
+ * any reservation to give back when we commit.
*/
- *IO_olddq = xfs_qm_dqhold(newdq);
+ xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS,
+ -ip->i_delayed_blks);
- return prevdq;
-}
-
-/*
- * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
- */
-int
-xfs_qm_vop_chown_reserve(
- struct xfs_trans *tp,
- struct xfs_inode *ip,
- struct xfs_dquot *udqp,
- struct xfs_dquot *gdqp,
- struct xfs_dquot *pdqp,
- uint flags)
-{
- struct xfs_mount *mp = ip->i_mount;
- uint64_t delblks;
- unsigned int blkflags;
- struct xfs_dquot *udq_unres = NULL;
- struct xfs_dquot *gdq_unres = NULL;
- struct xfs_dquot *pdq_unres = NULL;
- struct xfs_dquot *udq_delblks = NULL;
- struct xfs_dquot *gdq_delblks = NULL;
- struct xfs_dquot *pdq_delblks = NULL;
- int error;
-
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
- delblks = ip->i_delayed_blks;
- blkflags = XFS_IS_REALTIME_INODE(ip) ?
- XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
-
- if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- i_uid_read(VFS_I(ip)) != udqp->q_id) {
- udq_delblks = udqp;
- /*
- * If there are delayed allocation blocks, then we have to
- * unreserve those from the old dquot, and add them to the
- * new dquot.
- */
- if (delblks) {
- ASSERT(ip->i_udquot);
- udq_unres = ip->i_udquot;
- }
- }
- if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
- i_gid_read(VFS_I(ip)) != gdqp->q_id) {
- gdq_delblks = gdqp;
- if (delblks) {
- ASSERT(ip->i_gdquot);
- gdq_unres = ip->i_gdquot;
- }
- }
-
- if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
- ip->i_d.di_projid != pdqp->q_id) {
- pdq_delblks = pdqp;
- if (delblks) {
- ASSERT(ip->i_pdquot);
- pdq_unres = ip->i_pdquot;
- }
- }
-
- error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- udq_delblks, gdq_delblks, pdq_delblks,
- ip->i_d.di_nblocks, 1, flags | blkflags);
- if (error)
- return error;
+ /*
+ * Give the incore reservation for delalloc blocks back to the old
+ * dquot. We don't normally handle delalloc quota reservations
+ * transactionally, so just lock the dquot and subtract from the
+ * reservation. Dirty the transaction because it's too late to turn
+ * back now.
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ xfs_dqlock(prevdq);
+ ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
+ prevdq->q_blk.reserved -= ip->i_delayed_blks;
+ xfs_dqunlock(prevdq);
/*
- * Do the delayed blks reservations/unreservations now. Since, these
- * are done without the help of a transaction, if a reservation fails
- * its previous reservations won't be automatically undone by trans
- * code. So, we have to do it manually here.
+ * Take an extra reference, because the inode is going to keep
+ * this dquot pointer even after the trans_commit.
*/
- if (delblks) {
- /*
- * Do the reservations first. Unreservation can't fail.
- */
- ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
- ASSERT(udq_unres || gdq_unres || pdq_unres);
- error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- udq_delblks, gdq_delblks, pdq_delblks,
- (xfs_qcnt_t)delblks, 0, flags | blkflags);
- if (error)
- return error;
- xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- udq_unres, gdq_unres, pdq_unres,
- -((xfs_qcnt_t)delblks), 0, blkflags);
- }
+ *IO_olddq = xfs_qm_dqhold(newdq);
- return 0;
+ return prevdq;
}
int
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 5a62398940d0..d00d01302545 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -81,11 +81,14 @@ extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
-extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
- struct xfs_inode *, int64_t, long, uint);
+int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, struct xfs_inode *ip,
+ int64_t dblocks, int64_t rblocks, bool force);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
struct xfs_mount *, struct xfs_dquot *,
struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint);
+int xfs_trans_reserve_quota_icreate(struct xfs_trans *tp,
+ struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp, int64_t dblocks);
extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t,
prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
@@ -95,9 +98,6 @@ extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
-extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
- struct xfs_dquot *, struct xfs_dquot *,
- struct xfs_dquot *, uint);
extern int xfs_qm_dqattach(struct xfs_inode *);
extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc);
extern void xfs_qm_dqdetach(struct xfs_inode *);
@@ -108,6 +108,11 @@ extern void xfs_qm_mount_quotas(struct xfs_mount *);
extern void xfs_qm_unmount(struct xfs_mount *);
extern void xfs_qm_unmount_quotas(struct xfs_mount *);
+static inline int
+xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
+}
#else
static inline int
xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
@@ -121,11 +126,12 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
}
#define xfs_trans_dup_dqinfo(tp, tp2)
#define xfs_trans_free_dqinfo(tp)
-#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
+#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0)
#define xfs_trans_apply_dquot_deltas(tp)
#define xfs_trans_unreserve_and_mod_dquots(tp)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
- struct xfs_inode *ip, int64_t nblks, long ninos, uint flags)
+ struct xfs_inode *ip, int64_t dblocks, int64_t rblocks,
+ bool force)
{
return 0;
}
@@ -136,26 +142,39 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
{
return 0;
}
+
+static inline int
+xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return 0;
+}
+
+static inline int
+xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, int64_t dblocks)
+{
+ return 0;
+}
+
#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
#define xfs_qm_vop_rename_dqattach(it) (0)
#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
-#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
#define xfs_qm_dqattach(ip) (0)
#define xfs_qm_dqattach_locked(ip, fl) (0)
#define xfs_qm_dqdetach(ip)
-#define xfs_qm_dqrele(d)
-#define xfs_qm_statvfs(ip, s)
+#define xfs_qm_dqrele(d) do { (d) = (d); } while(0)
+#define xfs_qm_statvfs(ip, s) do { } while(0)
#define xfs_qm_newmount(mp, a, b) (0)
#define xfs_qm_mount_quotas(mp)
#define xfs_qm_unmount(mp)
#define xfs_qm_unmount_quotas(mp)
#endif /* CONFIG_XFS_QUOTA */
-#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
- xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
-#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \
- xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
- f | XFS_QMOPT_RES_REGBLKS)
+static inline int
+xfs_quota_unreserve_blkres(struct xfs_inode *ip, int64_t blocks)
+{
+ return xfs_quota_reserve_blkres(ip, -blocks);
+}
extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 6fa05fb78189..725c7d8e4438 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -376,16 +376,14 @@ xfs_reflink_allocate_cow(
resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
xfs_iunlock(ip, *lockmode);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
- *lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, *lockmode);
+ *lockmode = 0;
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
+ false, &tp);
if (error)
return error;
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out_trans_cancel;
+ *lockmode = XFS_ILOCK_EXCL;
/*
* Check for an overlapping extent again now that we dropped the ilock.
@@ -398,20 +396,13 @@ xfs_reflink_allocate_cow(
goto convert;
}
- error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
-
- xfs_trans_ijoin(tp, ip, 0);
-
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
&nimaps);
if (error)
- goto out_unreserve;
+ goto out_trans_cancel;
xfs_inode_set_cowblocks_tag(ip);
error = xfs_trans_commit(tp);
@@ -436,9 +427,6 @@ convert:
trace_xfs_reflink_convert_cow(ip, cmap);
return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
-out_unreserve:
- xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
@@ -508,9 +496,8 @@ xfs_reflink_cancel_cow_blocks(
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
/* Remove the quota reservation */
- error = xfs_trans_reserve_quota_nblks(NULL, ip,
- -(long)del.br_blockcount, 0,
- XFS_QMOPT_RES_REGBLKS);
+ error = xfs_quota_unreserve_blkres(ip,
+ del.br_blockcount);
if (error)
break;
} else {
@@ -628,6 +615,11 @@ xfs_reflink_end_cow_extent(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ goto out_cancel;
+
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
@@ -997,22 +989,47 @@ xfs_reflink_remap_extent(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
xfs_off_t newlen;
- int64_t qres, qdelta;
+ int64_t qdelta = 0;
unsigned int resblks;
+ bool quota_reserved = true;
bool smap_real;
bool dmap_written = xfs_bmap_is_written_extent(dmap);
+ int iext_delta = 0;
int nimaps;
int error;
- /* Start a rolling transaction to switch the mappings */
+ /*
+ * Start a rolling transaction to switch the mappings.
+ *
+ * Adding a written extent to the extent map can cause a bmbt split,
+ * and removing a mapped extent from the extent can cause a bmbt split.
+ * The two operations cannot both cause a split since they operate on
+ * the same index in the bmap btree, so we only need a reservation for
+ * one bmbt split if either thing is happening. However, we haven't
+ * locked the inode yet, so we reserve assuming this is the case.
+ *
+ * The first allocation call tries to reserve enough space to handle
+ * mapping dmap into a sparse part of the file plus the bmbt split. We
+ * haven't locked the inode or read the existing mapping yet, so we do
+ * not know for sure that we need the space. This should succeed most
+ * of the time.
+ *
+ * If the first attempt fails, try again but reserving only enough
+ * space to handle a bmbt split. This is the hard minimum requirement,
+ * and we revisit quota reservations later when we know more about what
+ * we're remapping.
+ */
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ resblks + dmap->br_blockcount, 0, false, &tp);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ quota_reserved = false;
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ resblks, 0, false, &tp);
+ }
if (error)
goto out;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* Read what's currently mapped in the destination file into smap.
* If smap isn't a hole, we will have to remove it before we can add
@@ -1060,15 +1077,9 @@ xfs_reflink_remap_extent(
}
/*
- * Compute quota reservation if we think the quota block counter for
+ * Increase quota reservation if we think the quota block counter for
* this file could increase.
*
- * Adding a written extent to the extent map can cause a bmbt split,
- * and removing a mapped extent from the extent can cause a bmbt split.
- * The two operations cannot both cause a split since they operate on
- * the same index in the bmap btree, so we only need a reservation for
- * one bmbt split if either thing is happening.
- *
* If we are mapping a written extent into the file, we need to have
* enough quota block count reservation to handle the blocks in that
* extent. We log only the delta to the quota block counts, so if the
@@ -1081,19 +1092,29 @@ xfs_reflink_remap_extent(
* count. This is suboptimal, but the VFS flushed the dest range
* before we started. That should have removed all the delalloc
* reservations, but we code defensively.
+ *
+ * xfs_trans_alloc_inode above already tried to grab an even larger
+ * quota reservation, and kicked off a blockgc scan if it couldn't.
+ * If we can't get a potentially smaller quota reservation now, we're
+ * done.
*/
- qres = qdelta = 0;
- if (smap_real || dmap_written)
- qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- if (!smap_real && dmap_written)
- qres += dmap->br_blockcount;
- if (qres > 0) {
- error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
- XFS_QMOPT_RES_REGBLKS);
+ if (!quota_reserved && !smap_real && dmap_written) {
+ error = xfs_trans_reserve_quota_nblks(tp, ip,
+ dmap->br_blockcount, 0, false);
if (error)
goto out_cancel;
}
+ if (smap_real)
+ ++iext_delta;
+
+ if (dmap_written)
+ ++iext_delta;
+
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
+ if (error)
+ goto out_cancel;
+
if (smap_real) {
/*
* If the extent we're unmapping is backed by storage (written
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index b4999fb01ff7..161b0e8992ba 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -804,6 +804,11 @@ xfs_growfs_rt_alloc(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error)
+ goto out_trans_cancel;
+
/*
* Allocate blocks to the bitmap file.
*/
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e95c1eff95e0..e5e0713bebcd 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -35,6 +35,7 @@
#include "xfs_refcount_item.h"
#include "xfs_bmap_item.h"
#include "xfs_reflink.h"
+#include "xfs_pwork.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -342,7 +343,7 @@ void
xfs_blkdev_issue_flush(
xfs_buftarg_t *buftarg)
{
- blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS);
+ blkdev_issue_flush(buftarg->bt_bdev);
}
STATIC void
@@ -495,40 +496,44 @@ xfs_init_mount_workqueues(
struct xfs_mount *mp)
{
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 1, mp->m_super->s_id);
if (!mp->m_buf_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
- WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
0, mp->m_super->s_id);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ 0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
- mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
- if (!mp->m_eofblocks_workqueue)
+ mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s",
+ WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM,
+ 0, mp->m_super->s_id);
+ if (!mp->m_blockgc_workqueue)
goto out_destroy_reclaim;
- mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
- mp->m_super->s_id);
+ mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
+ XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
if (!mp->m_sync_workqueue)
goto out_destroy_eofb;
return 0;
out_destroy_eofb:
- destroy_workqueue(mp->m_eofblocks_workqueue);
+ destroy_workqueue(mp->m_blockgc_workqueue);
out_destroy_reclaim:
destroy_workqueue(mp->m_reclaim_workqueue);
out_destroy_cil:
@@ -546,7 +551,7 @@ xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
destroy_workqueue(mp->m_sync_workqueue);
- destroy_workqueue(mp->m_eofblocks_workqueue);
+ destroy_workqueue(mp->m_blockgc_workqueue);
destroy_workqueue(mp->m_reclaim_workqueue);
destroy_workqueue(mp->m_cil_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
@@ -868,39 +873,6 @@ xfs_restore_resvblks(struct xfs_mount *mp)
}
/*
- * Trigger writeback of all the dirty metadata in the file system.
- *
- * This ensures that the metadata is written to their location on disk rather
- * than just existing in transactions in the log. This means after a quiesce
- * there is no log replay required to write the inodes to disk - this is the
- * primary difference between a sync and a quiesce.
- *
- * We cancel log work early here to ensure all transactions the log worker may
- * run have finished before we clean up and log the superblock and write an
- * unmount record. The unfreeze process is responsible for restarting the log
- * worker correctly.
- */
-void
-xfs_quiesce_attr(
- struct xfs_mount *mp)
-{
- int error = 0;
-
- cancel_delayed_work_sync(&mp->m_log->l_work);
-
- /* force the log to unpin objects from the now complete transactions */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
-
- /* Push the superblock and write an unmount record */
- error = xfs_log_sbcount(mp);
- if (error)
- xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
- "Frozen image may not be consistent.");
- xfs_log_quiesce(mp);
-}
-
-/*
* Second stage of a freeze. The data is already frozen so we only
* need to take care of the metadata. Once that's done sync the superblock
* to the log to dirty it in case of a crash while frozen. This ensures that we
@@ -920,10 +892,9 @@ xfs_fs_freeze(
* set a GFP_NOFS context here to avoid recursion deadlocks.
*/
flags = memalloc_nofs_save();
- xfs_stop_block_reaping(mp);
+ xfs_blockgc_stop(mp);
xfs_save_resvblks(mp);
- xfs_quiesce_attr(mp);
- ret = xfs_sync_sb(mp, true);
+ ret = xfs_log_quiesce(mp);
memalloc_nofs_restore(flags);
return ret;
}
@@ -936,7 +907,7 @@ xfs_fs_unfreeze(
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
- xfs_start_block_reaping(mp);
+ xfs_blockgc_start(mp);
return 0;
}
@@ -1720,7 +1691,7 @@ xfs_remount_rw(
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
- xfs_start_block_reaping(mp);
+ xfs_blockgc_start(mp);
/* Create the per-AG metadata reservation pool .*/
error = xfs_fs_reserve_ag_blocks(mp);
@@ -1740,10 +1711,10 @@ xfs_remount_ro(
* Cancel background eofb scanning so it cannot race with the final
* log force+buftarg wait and deadlock the remount.
*/
- xfs_stop_block_reaping(mp);
+ xfs_blockgc_stop(mp);
/* Get rid of any leftover CoW reservations... */
- error = xfs_icache_free_cowblocks(mp, NULL);
+ error = xfs_blockgc_free_space(mp, NULL);
if (error) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
@@ -1765,7 +1736,7 @@ xfs_remount_ro(
*/
xfs_save_resvblks(mp);
- xfs_quiesce_attr(mp);
+ xfs_log_clean(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
return 0;
@@ -1872,8 +1843,6 @@ static int xfs_init_fs_context(
mutex_init(&mp->m_growlock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
- INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
- INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
/*
* We don't create the finobt per-ag space reservation until after log
@@ -2119,11 +2088,12 @@ xfs_init_workqueues(void)
* max_active value for this workqueue.
*/
xfs_alloc_wq = alloc_workqueue("xfsalloc",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
if (!xfs_alloc_wq)
return -ENOMEM;
- xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0);
+ xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND),
+ 0);
if (!xfs_discard_wq)
goto out_free_alloc_wq;
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index b552cf6d3379..1ca484b8357f 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -75,6 +75,12 @@ extern void xfs_qm_exit(void);
XFS_ASSERT_FATAL_STRING \
XFS_DBG_STRING /* DBG must be last */
+#ifdef DEBUG
+# define XFS_WQFLAGS(wqflags) (WQ_SYSFS | (wqflags))
+#else
+# define XFS_WQFLAGS(wqflags) (wqflags)
+#endif
+
struct xfs_inode;
struct xfs_mount;
struct xfs_buftarg;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 77c8ea3229f1..1379013d74b8 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -198,9 +198,10 @@ xfs_symlink(
fs_blocks = xfs_symlink_blocks(mp, pathlen);
resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp,
+ pdqp, resblks, &tp);
if (error)
- goto out_release_inode;
+ goto out_release_dquots;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -213,11 +214,8 @@ xfs_symlink(
goto out_trans_cancel;
}
- /*
- * Reserve disk quota : blocks and inode.
- */
- error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- pdqp, resblks, 1, 0);
+ error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
+ XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
goto out_trans_cancel;
@@ -301,6 +299,7 @@ xfs_symlink(
}
ASSERT(pathlen == 0);
}
+ i_size_write(VFS_I(ip), ip->i_d.di_size);
/*
* Create the directory entry for the symlink.
@@ -343,7 +342,7 @@ out_release_inode:
xfs_finish_inode_setup(ip);
xfs_irele(ip);
}
-
+out_release_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index fac9de7ee6d0..145e06c47744 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -194,21 +194,12 @@ static struct ctl_table xfs_table[] = {
},
{
.procname = "speculative_prealloc_lifetime",
- .data = &xfs_params.eofb_timer.val,
+ .data = &xfs_params.blockgc_timer.val,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &xfs_params.eofb_timer.min,
- .extra2 = &xfs_params.eofb_timer.max,
- },
- {
- .procname = "speculative_cow_prealloc_lifetime",
- .data = &xfs_params.cowb_timer.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &xfs_params.cowb_timer.min,
- .extra2 = &xfs_params.cowb_timer.max,
+ .extra1 = &xfs_params.blockgc_timer.min,
+ .extra2 = &xfs_params.blockgc_timer.max,
},
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 8abf4640f1d5..7692e76ead33 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -35,8 +35,7 @@ typedef struct xfs_param {
xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
- xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */
- xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */
+ xfs_sysctl_val_t blockgc_timer; /* Interval between blockgc scans */
} xfs_param_t;
/*
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 120398a37c2a..9b8d703dc9fd 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -29,6 +29,7 @@
#include "xfs_filestream.h"
#include "xfs_fsmap.h"
#include "xfs_btree_staging.h"
+#include "xfs_icache.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 5a263ae3d4f0..e74bbb648f83 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -37,6 +37,7 @@ struct xfs_trans_res;
struct xfs_inobt_rec_incore;
union xfs_btree_ptr;
struct xfs_dqtrx;
+struct xfs_eofblocks;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -154,10 +155,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_put);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
-DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
-DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
-DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks);
-DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_blockgc);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_blockgc);
DECLARE_EVENT_CLASS(xfs_ag_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
@@ -358,7 +357,7 @@ DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
-DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
+DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
/* not really buffer traces, but the buf provides useful information */
@@ -1287,8 +1286,8 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
)
DECLARE_EVENT_CLASS(xfs_file_class,
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
- TP_ARGS(ip, count, offset),
+ TP_PROTO(struct kiocb *iocb, struct iov_iter *iter),
+ TP_ARGS(iocb, iter),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
@@ -1297,11 +1296,11 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__field(size_t, count)
),
TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->size = ip->i_d.di_size;
- __entry->offset = offset;
- __entry->count = count;
+ __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
+ __entry->ino = XFS_I(file_inode(iocb->ki_filp))->i_ino;
+ __entry->size = XFS_I(file_inode(iocb->ki_filp))->i_d.di_size;
+ __entry->offset = iocb->ki_pos;
+ __entry->count = iov_iter_count(iter);
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1313,14 +1312,16 @@ DECLARE_EVENT_CLASS(xfs_file_class,
#define DEFINE_RW_EVENT(name) \
DEFINE_EVENT(xfs_file_class, name, \
- TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
- TP_ARGS(ip, count, offset))
+ TP_PROTO(struct kiocb *iocb, struct iov_iter *iter), \
+ TP_ARGS(iocb, iter))
DEFINE_RW_EVENT(xfs_file_buffered_read);
DEFINE_RW_EVENT(xfs_file_direct_read);
DEFINE_RW_EVENT(xfs_file_dax_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
+DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
+
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
@@ -3294,8 +3295,6 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
-DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
-
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
@@ -3888,6 +3887,47 @@ DEFINE_EVENT(xfs_timestamp_range_class, name, \
DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range);
DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
+DECLARE_EVENT_CLASS(xfs_eofblocks_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb,
+ unsigned long caller_ip),
+ TP_ARGS(mp, eofb, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(__u32, flags)
+ __field(uint32_t, uid)
+ __field(uint32_t, gid)
+ __field(prid_t, prid)
+ __field(__u64, min_file_size)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->flags = eofb ? eofb->eof_flags : 0;
+ __entry->uid = eofb ? from_kuid(mp->m_super->s_user_ns,
+ eofb->eof_uid) : 0;
+ __entry->gid = eofb ? from_kgid(mp->m_super->s_user_ns,
+ eofb->eof_gid) : 0;
+ __entry->prid = eofb ? eofb->eof_prid : 0;
+ __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->flags,
+ __entry->uid,
+ __entry->gid,
+ __entry->prid,
+ __entry->min_file_size,
+ (char *)__entry->caller_ip)
+);
+#define DEFINE_EOFBLOCKS_EVENT(name) \
+DEFINE_EVENT(xfs_eofblocks_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, eofb, caller_ip))
+DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks);
+DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index e72730f85af1..44f72c09c203 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -20,6 +20,10 @@
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_icache.h"
kmem_zone_t *xfs_trans_zone;
@@ -285,6 +289,17 @@ xfs_trans_alloc(
tp->t_firstblock = NULLFSBLOCK;
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+ if (error == -ENOSPC) {
+ /*
+ * We weren't able to reserve enough space for the transaction.
+ * Flush the other speculative space allocations to free space.
+ * Do not perform a synchronous scan because callers can hold
+ * other locks.
+ */
+ error = xfs_blockgc_free_space(mp, NULL);
+ if (!error)
+ error = xfs_trans_reserve(tp, resp, blocks, rtextents);
+ }
if (error) {
xfs_trans_cancel(tp);
return error;
@@ -1024,3 +1039,183 @@ xfs_trans_roll(
tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
return xfs_trans_reserve(*tpp, &tres, 0, 0);
}
+
+/*
+ * Allocate a transaction, lock and join the inode to it, and reserve quota.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The caller is responsible for
+ * releasing ILOCK_EXCL if a new transaction is returned.
+ */
+int
+xfs_trans_alloc_inode(
+ struct xfs_inode *ip,
+ struct xfs_trans_res *resv,
+ unsigned int dblocks,
+ unsigned int rblocks,
+ bool force,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, resv, dblocks,
+ rblocks / mp->m_sb.sb_rextsize,
+ force ? XFS_TRANS_RESERVE : 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, force);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_blockgc_free_quota(ip, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_cancel;
+
+ *tpp = tp;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * Allocate a transaction in preparation for inode creation by reserving quota
+ * against the given dquots. Callers are not required to hold any inode locks.
+ */
+int
+xfs_trans_alloc_icreate(
+ struct xfs_mount *mp,
+ struct xfs_trans_res *resv,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ unsigned int dblocks,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp);
+ if (error)
+ return error;
+
+ error = xfs_trans_reserve_quota_icreate(tp, udqp, gdqp, pdqp, dblocks);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ *tpp = tp;
+ return 0;
+}
+
+/*
+ * Allocate a transaction, lock and join the inode to it, and reserve quota
+ * in preparation for inode attribute changes that include uid, gid, or prid
+ * changes.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The ILOCK will be dropped when the
+ * transaction is committed or cancelled.
+ */
+int
+xfs_trans_alloc_ichange(
+ struct xfs_inode *ip,
+ struct xfs_dquot *new_udqp,
+ struct xfs_dquot *new_gdqp,
+ struct xfs_dquot *new_pdqp,
+ bool force,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *udqp;
+ struct xfs_dquot *gdqp;
+ struct xfs_dquot *pdqp;
+ bool retried = false;
+ int error;
+
+retry:
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ /*
+ * For each quota type, skip quota reservations if the inode's dquots
+ * now match the ones that came from the caller, or the caller didn't
+ * pass one in. The inode's dquots can change if we drop the ILOCK to
+ * perform a blockgc scan, so we must preserve the caller's arguments.
+ */
+ udqp = (new_udqp != ip->i_udquot) ? new_udqp : NULL;
+ gdqp = (new_gdqp != ip->i_gdquot) ? new_gdqp : NULL;
+ pdqp = (new_pdqp != ip->i_pdquot) ? new_pdqp : NULL;
+ if (udqp || gdqp || pdqp) {
+ unsigned int qflags = XFS_QMOPT_RES_REGBLKS;
+
+ if (force)
+ qflags |= XFS_QMOPT_FORCE_RES;
+
+ /*
+ * Reserve enough quota to handle blocks on disk and reserved
+ * for a delayed allocation. We'll actually transfer the
+ * delalloc reservation between dquots at chown time, even
+ * though that part is only semi-transactional.
+ */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp,
+ pdqp, ip->i_d.di_nblocks + ip->i_delayed_blks,
+ 1, qflags);
+ if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_dquots(mp, udqp, gdqp, pdqp, 0);
+ retried = true;
+ goto retry;
+ }
+ if (error)
+ goto out_cancel;
+ }
+
+ *tpp = tp;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 084658946cc8..8b03fbfe9a1b 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -268,4 +268,17 @@ xfs_trans_item_relog(
return lip->li_ops->iop_relog(lip, tp);
}
+struct xfs_dquot;
+
+int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv,
+ unsigned int dblocks, unsigned int rblocks, bool force,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
+ struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp, unsigned int dblocks,
+ struct xfs_trans **tpp);
+int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
+ struct xfs_trans **tpp);
+
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 28b8ac701919..48e09ea30ee5 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -16,6 +16,7 @@
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
+#include "xfs_error.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -691,9 +692,11 @@ xfs_trans_dqresv(
nblks);
xfs_trans_mod_dquot(tp, dqp, XFS_TRANS_DQ_RES_INOS, ninos);
}
- ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
- ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
- ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
+
+ if (XFS_IS_CORRUPT(mp, dqp->q_blk.reserved < dqp->q_blk.count) ||
+ XFS_IS_CORRUPT(mp, dqp->q_rtb.reserved < dqp->q_rtb.count) ||
+ XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count))
+ goto error_corrupt;
xfs_dqunlock(dqp);
return 0;
@@ -703,6 +706,10 @@ error_return:
if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ)
return -ENOSPC;
return -EDQUOT;
+error_corrupt:
+ xfs_dqunlock(dqp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return -EFSCORRUPTED;
}
@@ -780,28 +787,60 @@ int
xfs_trans_reserve_quota_nblks(
struct xfs_trans *tp,
struct xfs_inode *ip,
- int64_t nblks,
- long ninos,
- uint flags)
+ int64_t dblocks,
+ int64_t rblocks,
+ bool force)
{
struct xfs_mount *mp = ip->i_mount;
+ unsigned int qflags = 0;
+ int error;
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return 0;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
-
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT((flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_RTBLKS ||
- (flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_BLKS);
- /*
- * Reserve nblks against these dquots, with trans as the mediator.
- */
- return xfs_trans_reserve_quota_bydquots(tp, mp,
- ip->i_udquot, ip->i_gdquot,
- ip->i_pdquot,
- nblks, ninos, flags);
+ if (force)
+ qflags |= XFS_QMOPT_FORCE_RES;
+
+ /* Reserve data device quota against the inode's dquots. */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, dblocks, 0,
+ XFS_QMOPT_RES_REGBLKS | qflags);
+ if (error)
+ return error;
+
+ /* Do the same but for realtime blocks. */
+ error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, rblocks, 0,
+ XFS_QMOPT_RES_RTBLKS | qflags);
+ if (error) {
+ xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot, -dblocks, 0,
+ XFS_QMOPT_RES_REGBLKS);
+ return error;
+ }
+
+ return 0;
+}
+
+/* Change the quota reservations for an inode creation activity. */
+int
+xfs_trans_reserve_quota_icreate(
+ struct xfs_trans *tp,
+ struct xfs_dquot *udqp,
+ struct xfs_dquot *gdqp,
+ struct xfs_dquot *pdqp,
+ int64_t dblocks)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+ return 0;
+
+ return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp,
+ dblocks, 1, XFS_QMOPT_RES_REGBLKS);
}
/*
diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile
index 75a380aa1ae1..33c1a4f1132e 100644
--- a/fs/zonefs/Makefile
+++ b/fs/zonefs/Makefile
@@ -1,4 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
+ccflags-y += -I$(src)
+
obj-$(CONFIG_ZONEFS_FS) += zonefs.o
zonefs-y := super.o
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 76e45d66d4ce..b6ff4a21abac 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -24,6 +24,9 @@
#include "zonefs.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
static inline int zonefs_zone_mgmt(struct inode *inode,
enum req_opf op)
{
@@ -32,6 +35,7 @@ static inline int zonefs_zone_mgmt(struct inode *inode,
lockdep_assert_held(&zi->i_truncate_mutex);
+ trace_zonefs_zone_mgmt(inode, op);
ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
if (ret) {
@@ -100,6 +104,8 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->bdev = inode->i_sb->s_bdev;
iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+ trace_zonefs_iomap_begin(inode, iomap);
+
return 0;
}
@@ -250,6 +256,9 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
}
inode->i_mode &= ~0222;
return i_size_read(inode);
+ case BLK_ZONE_COND_FULL:
+ /* The write pointer of full zones is invalid. */
+ return zi->i_max_size;
default:
if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
return zi->i_max_size;
@@ -542,7 +551,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
ret = file_write_and_wait_range(file, start, end);
if (!ret)
- ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev);
if (ret)
zonefs_io_error(inode, true);
@@ -679,7 +688,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
if (!nr_pages)
return 0;
- bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
+ bio = bio_alloc(GFP_NOFS, nr_pages);
if (!bio)
return -ENOMEM;
@@ -704,6 +713,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
ret = submit_bio_wait(bio);
zonefs_file_write_dio_end_io(iocb, size, ret, 0);
+ trace_zonefs_file_dio_append(inode, size, ret);
out_release:
bio_release_pages(bio, false);
@@ -781,7 +791,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = zonefs_file_dio_append(iocb, from);
else
ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- &zonefs_write_dio_ops, sync);
+ &zonefs_write_dio_ops, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
@@ -918,7 +928,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
file_accessed(iocb->ki_filp);
ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
- &zonefs_read_dio_ops, is_sync_kiocb(iocb));
+ &zonefs_read_dio_ops, 0);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)
@@ -1582,12 +1592,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
/*
- * The block size is set to the device physical sector size to ensure
- * that write operations on 512e devices (512B logical block and 4KB
- * physical block) are always aligned to the device physical blocks,
- * as mandated by the ZBC/ZAC specifications.
+ * The block size is set to the device zone write granularity to ensure
+ * that write operations are always aligned according to the device
+ * interface constraints.
*/
- sb_set_blocksize(sb, bdev_physical_block_size(sb->s_bdev));
+ sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev));
sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev));
sbi->s_uid = GLOBAL_ROOT_UID;
sbi->s_gid = GLOBAL_ROOT_GID;
diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h
new file mode 100644
index 000000000000..f369d7d50303
--- /dev/null
+++ b/fs/zonefs/trace.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * zonefs filesystem driver tracepoints.
+ *
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM zonefs
+
+#if !defined(_TRACE_ZONEFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ZONEFS_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+#include <linux/blkdev.h>
+
+#include "zonefs.h"
+
+#define show_dev(dev) MAJOR(dev), MINOR(dev)
+
+TRACE_EVENT(zonefs_zone_mgmt,
+ TP_PROTO(struct inode *inode, enum req_opf op),
+ TP_ARGS(inode, op),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(int, op)
+ __field(sector_t, sector)
+ __field(sector_t, nr_sectors)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->op = op;
+ __entry->sector = ZONEFS_I(inode)->i_zsector;
+ __entry->nr_sectors =
+ ZONEFS_I(inode)->i_zone_size >> SECTOR_SHIFT;
+ ),
+ TP_printk("bdev=(%d,%d), ino=%lu op=%s, sector=%llu, nr_sectors=%llu",
+ show_dev(__entry->dev), (unsigned long)__entry->ino,
+ blk_op_str(__entry->op), __entry->sector,
+ __entry->nr_sectors
+ )
+);
+
+TRACE_EVENT(zonefs_file_dio_append,
+ TP_PROTO(struct inode *inode, ssize_t size, ssize_t ret),
+ TP_ARGS(inode, size, ret),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(sector_t, sector)
+ __field(ssize_t, size)
+ __field(loff_t, wpoffset)
+ __field(ssize_t, ret) /* signed: the caller passes the submit_bio_wait() result */
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->sector = ZONEFS_I(inode)->i_zsector;
+ __entry->size = size;
+ __entry->wpoffset = ZONEFS_I(inode)->i_wpoffset;
+ __entry->ret = ret;
+ ),
+ /* size and ret are ssize_t, so print them signed (%zd), not as %zu */
+ TP_printk("bdev=(%d, %d), ino=%lu, sector=%llu, size=%zd, wpoffset=%llu, ret=%zd",
+ show_dev(__entry->dev), (unsigned long)__entry->ino,
+ __entry->sector, __entry->size, __entry->wpoffset, __entry->ret
+ )
+);
+
+TRACE_EVENT(zonefs_iomap_begin,
+ TP_PROTO(struct inode *inode, struct iomap *iomap),
+ TP_ARGS(inode, iomap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(u64, addr)
+ __field(loff_t, offset)
+ __field(u64, length)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->addr = iomap->addr;
+ __entry->offset = iomap->offset;
+ __entry->length = iomap->length;
+ ),
+ TP_printk("bdev=(%d,%d), ino=%lu, addr=%llu, offset=%llu, length=%llu",
+ show_dev(__entry->dev), (unsigned long)__entry->ino,
+ __entry->addr, __entry->offset, __entry->length
+ )
+);
+
+#endif /* _TRACE_ZONEFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>