summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/adfs.h32
-rw-r--r--fs/adfs/dir.c314
-rw-r--r--fs/adfs/dir_f.c302
-rw-r--r--fs/adfs/dir_f.h52
-rw-r--r--fs/adfs/dir_fplus.c346
-rw-r--r--fs/adfs/dir_fplus.h6
-rw-r--r--fs/adfs/inode.c64
-rw-r--r--fs/adfs/map.c247
-rw-r--r--fs/adfs/super.c267
-rw-r--r--fs/afs/cell.c11
-rw-r--r--fs/afs/dir.c18
-rw-r--r--fs/binfmt_elf.c144
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/block-group.c212
-rw-r--r--fs/btrfs/block-group.h40
-rw-r--r--fs/btrfs/check-integrity.c4
-rw-r--r--fs/btrfs/compression.c13
-rw-r--r--fs/btrfs/ctree.h81
-rw-r--r--fs/btrfs/dev-replace.c6
-rw-r--r--fs/btrfs/discard.c702
-rw-r--r--fs/btrfs/discard.h41
-rw-r--r--fs/btrfs/disk-io.c37
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c50
-rw-r--r--fs/btrfs/extent_io.c54
-rw-r--r--fs/btrfs/extent_io.h6
-rw-r--r--fs/btrfs/file-item.c41
-rw-r--r--fs/btrfs/file.c23
-rw-r--r--fs/btrfs/free-space-cache.c619
-rw-r--r--fs/btrfs/free-space-cache.h41
-rw-r--r--fs/btrfs/inode-map.c13
-rw-r--r--fs/btrfs/inode.c913
-rw-r--r--fs/btrfs/ioctl.c19
-rw-r--r--fs/btrfs/ordered-data.c81
-rw-r--r--fs/btrfs/ordered-data.h26
-rw-r--r--fs/btrfs/print-tree.c2
-rw-r--r--fs/btrfs/qgroup.c50
-rw-r--r--fs/btrfs/relocation.c71
-rw-r--r--fs/btrfs/root-tree.c10
-rw-r--r--fs/btrfs/scrub.c40
-rw-r--r--fs/btrfs/space-info.c42
-rw-r--r--fs/btrfs/super.c39
-rw-r--r--fs/btrfs/sysfs.c394
-rw-r--r--fs/btrfs/sysfs.h5
-rw-r--r--fs/btrfs/tests/btrfs-tests.c29
-rw-r--r--fs/btrfs/tests/btrfs-tests.h1
-rw-r--r--fs/btrfs/tests/extent-map-tests.c154
-rw-r--r--fs/btrfs/tests/inode-tests.c44
-rw-r--r--fs/btrfs/transaction.c30
-rw-r--r--fs/btrfs/tree-checker.c225
-rw-r--r--fs/btrfs/tree-log.c455
-rw-r--r--fs/btrfs/volumes.c290
-rw-r--r--fs/btrfs/volumes.h12
-rw-r--r--fs/btrfs/zlib.c135
-rw-r--r--fs/buffer.c35
-rw-r--r--fs/ceph/mds_client.c8
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/cifs/cifs_dfs_ref.c97
-rw-r--r--fs/cifs/cifsacl.c20
-rw-r--r--fs/cifs/cifsfs.h3
-rw-r--r--fs/cifs/cifsglob.h1
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c4
-rw-r--r--fs/cifs/connect.c6
-rw-r--r--fs/cifs/dfs_cache.c1112
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/smb2misc.c2
-rw-r--r--fs/cifs/smb2ops.c171
-rw-r--r--fs/cifs/smb2pdu.c182
-rw-r--r--fs/cifs/smb2pdu.h2
-rw-r--r--fs/cifs/smb2proto.h5
-rw-r--r--fs/cifs/smb2transport.c2
-rw-r--r--fs/cifs/transport.c3
-rw-r--r--fs/cifs/xattr.c128
-rw-r--r--fs/compat_ioctl.c261
-rw-r--r--fs/crypto/Kconfig22
-rw-r--r--fs/crypto/bio.c114
-rw-r--r--fs/crypto/crypto.c57
-rw-r--r--fs/crypto/fname.c316
-rw-r--r--fs/crypto/fscrypt_private.h58
-rw-r--r--fs/crypto/hkdf.c2
-rw-r--r--fs/crypto/hooks.c47
-rw-r--r--fs/crypto/keyring.c147
-rw-r--r--fs/crypto/keysetup.c102
-rw-r--r--fs/crypto/keysetup_v1.c19
-rw-r--r--fs/crypto/policy.c170
-rw-r--r--fs/debugfs/file.c38
-rw-r--r--fs/debugfs/inode.c9
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/lowcomms.c6
-rw-r--r--fs/ecryptfs/crypto.c2
-rw-r--r--fs/ecryptfs/keystore.c4
-rw-r--r--fs/erofs/decompressor.c22
-rw-r--r--fs/erofs/internal.h4
-rw-r--r--fs/erofs/utils.c15
-rw-r--r--fs/erofs/xattr.h17
-rw-r--r--fs/erofs/zdata.c123
-rw-r--r--fs/eventpoll.c87
-rw-r--r--fs/exec.c6
-rw-r--r--fs/ext2/super.c7
-rw-r--r--fs/ext4/Kconfig9
-rw-r--r--fs/ext4/Makefile3
-rw-r--r--fs/ext4/balloc.c5
-rw-r--r--fs/ext4/dir.c12
-rw-r--r--fs/ext4/ext4.h81
-rw-r--r--fs/ext4/ext4_extents.h5
-rw-r--r--fs/ext4/ext4_jbd2.c25
-rw-r--r--fs/ext4/ext4_jbd2.h22
-rw-r--r--fs/ext4/extents.c205
-rw-r--r--fs/ext4/extents_status.h6
-rw-r--r--fs/ext4/file.c203
-rw-r--r--fs/ext4/ialloc.c6
-rw-r--r--fs/ext4/indirect.c26
-rw-r--r--fs/ext4/inline.c4
-rw-r--r--fs/ext4/inode-test.c4
-rw-r--r--fs/ext4/inode.c53
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/mballoc.c4
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/namei.c20
-rw-r--r--fs/ext4/page-io.c19
-rw-r--r--fs/ext4/readpage.c42
-rw-r--r--fs/ext4/resize.c10
-rw-r--r--fs/ext4/super.c122
-rw-r--r--fs/ext4/sysfs.c88
-rw-r--r--fs/ext4/verity.c47
-rw-r--r--fs/ext4/xattr.c6
-rw-r--r--fs/f2fs/Kconfig28
-rw-r--r--fs/f2fs/Makefile1
-rw-r--r--fs/f2fs/checkpoint.c6
-rw-r--r--fs/f2fs/compress.c1176
-rw-r--r--fs/f2fs/data.c736
-rw-r--r--fs/f2fs/debug.c88
-rw-r--r--fs/f2fs/dir.c27
-rw-r--r--fs/f2fs/f2fs.h329
-rw-r--r--fs/f2fs/file.c251
-rw-r--r--fs/f2fs/gc.c18
-rw-r--r--fs/f2fs/inline.c44
-rw-r--r--fs/f2fs/inode.c41
-rw-r--r--fs/f2fs/namei.c120
-rw-r--r--fs/f2fs/recovery.c20
-rw-r--r--fs/f2fs/segment.c271
-rw-r--r--fs/f2fs/segment.h19
-rw-r--r--fs/f2fs/super.c182
-rw-r--r--fs/f2fs/sysfs.c158
-rw-r--r--fs/f2fs/verity.c47
-rw-r--r--fs/fat/inode.c3
-rw-r--r--fs/file.c35
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/hfs/hfs_fs.h28
-rw-r--r--fs/hfs/inode.c4
-rw-r--r--fs/hfsplus/hfsplus_fs.h28
-rw-r--r--fs/hfsplus/inode.c12
-rw-r--r--fs/hostfs/hostfs.h22
-rw-r--r--fs/hostfs/hostfs_kern.c15
-rw-r--r--fs/hugetlbfs/inode.c4
-rw-r--r--fs/inode.c3
-rw-r--r--fs/internal.h16
-rw-r--r--fs/io-wq.c99
-rw-r--r--fs/io-wq.h11
-rw-r--r--fs/io_uring.c2282
-rw-r--r--fs/ioctl.c131
-rw-r--r--fs/jbd2/checkpoint.c2
-rw-r--r--fs/jbd2/commit.c4
-rw-r--r--fs/jbd2/journal.c119
-rw-r--r--fs/jbd2/transaction.c4
-rw-r--r--fs/kernfs/dir.c2
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/namei.c306
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/nfs/fscache-index.c6
-rw-r--r--fs/nfs/fscache.c18
-rw-r--r--fs/nfs/fscache.h8
-rw-r--r--fs/nfs/nfs4xdr.c10
-rw-r--r--fs/nfs/nfstrace.h2
-rw-r--r--fs/nsfs.c32
-rw-r--r--fs/ocfs2/cluster/quorum.c2
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlm/dlmast.c8
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c8
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c8
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c8
-rw-r--r--fs/ocfs2/dlm/dlmlock.c8
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c10
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c10
-rw-r--r--fs/ocfs2/dlm/dlmthread.c8
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c8
-rw-r--r--fs/ocfs2/dlmfs/Makefile2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c6
-rw-r--r--fs/ocfs2/dlmglue.c3
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/journal.h8
-rw-r--r--fs/ocfs2/namei.c3
-rw-r--r--fs/open.c146
-rw-r--r--fs/posix_acl.c7
-rw-r--r--fs/proc/Kconfig4
-rw-r--r--fs/proc/base.c104
-rw-r--r--fs/proc/namespaces.c24
-rw-r--r--fs/proc/uptime.c3
-rw-r--r--fs/pstore/ram.c13
-rw-r--r--fs/pstore/ram_core.c2
-rw-r--r--fs/quota/quota_v2.c2
-rw-r--r--fs/quota/quotaio_v1.h6
-rw-r--r--fs/read_write.c10
-rw-r--r--fs/readdir.c79
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/procfs.c1
-rw-r--r--fs/reiserfs/stree.c9
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/reiserfs/xattr.c8
-rw-r--r--fs/stack.c6
-rw-r--r--fs/stat.c34
-rw-r--r--fs/timerfd.c3
-rw-r--r--fs/ubifs/Kconfig1
-rw-r--r--fs/ubifs/dir.c16
-rw-r--r--fs/ubifs/file.c8
-rw-r--r--fs/ubifs/ioctl.c14
-rw-r--r--fs/ubifs/journal.c10
-rw-r--r--fs/ubifs/key.h1
-rw-r--r--fs/ubifs/orphan.c4
-rw-r--r--fs/ubifs/sb.c2
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/ubifs/ubifs.h7
-rw-r--r--fs/udf/ecma_167.h46
-rw-r--r--fs/udf/inode.c6
-rw-r--r--fs/udf/osta_udf.h100
-rw-r--r--fs/udf/super.c40
-rw-r--r--fs/udf/truncate.c2
-rw-r--r--fs/verity/enable.c67
-rw-r--r--fs/verity/fsverity_private.h17
-rw-r--r--fs/verity/hash_algs.c98
-rw-r--r--fs/verity/open.c5
-rw-r--r--fs/verity/verify.c47
-rw-r--r--fs/xfs/libxfs/xfs_attr.c14
-rw-r--r--fs/xfs/libxfs/xfs_attr.h15
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h9
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c89
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h2
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h4
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h4
-rw-r--r--fs/xfs/libxfs/xfs_format.h7
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h19
-rw-r--r--fs/xfs/scrub/repair.h1
-rw-r--r--fs/xfs/scrub/trace.h6
-rw-r--r--fs/xfs/xfs_acl.c11
-rw-r--r--fs/xfs/xfs_attr_inactive.c149
-rw-r--r--fs/xfs/xfs_buf_item.c45
-rw-r--r--fs/xfs/xfs_buf_item.h1
-rw-r--r--fs/xfs/xfs_dquot.c6
-rw-r--r--fs/xfs/xfs_file.c7
-rw-r--r--fs/xfs/xfs_inode.c25
-rw-r--r--fs/xfs/xfs_ioctl.c20
-rw-r--r--fs/xfs/xfs_ioctl32.c9
-rw-r--r--fs/xfs/xfs_ioctl32.h2
-rw-r--r--fs/xfs/xfs_iomap.c2
-rw-r--r--fs/xfs/xfs_iops.c6
-rw-r--r--fs/xfs/xfs_log_recover.c6
-rw-r--r--fs/xfs/xfs_ondisk.h1
-rw-r--r--fs/xfs/xfs_qm.h6
-rw-r--r--fs/xfs/xfs_quotaops.c6
-rw-r--r--fs/xfs/xfs_reflink.c9
-rw-r--r--fs/xfs/xfs_reflink.h2
-rw-r--r--fs/xfs/xfs_super.c48
-rw-r--r--fs/xfs/xfs_trace.h4
-rw-r--r--fs/xfs/xfs_trans_dquot.c8
-rw-r--r--fs/xfs/xfs_xattr.c14
273 files changed, 13543 insertions, 6228 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 1148c555c4d3..98be354fdb61 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -37,7 +37,7 @@ obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
-obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
+obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index b7e844d2f321..699c4fa8b78b 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -26,14 +26,13 @@ static inline u16 adfs_filetype(u32 loadaddr)
#define ADFS_NDA_PUBLIC_READ (1 << 5)
#define ADFS_NDA_PUBLIC_WRITE (1 << 6)
-#include "dir_f.h"
-
/*
* adfs file system inode data in memory
*/
struct adfs_inode_info {
loff_t mmu_private;
__u32 parent_id; /* parent indirect disc address */
+ __u32 indaddr; /* object indirect disc address */
__u32 loadaddr; /* RISC OS load address */
__u32 execaddr; /* RISC OS exec address */
unsigned int attr; /* RISC OS permissions */
@@ -93,15 +92,19 @@ struct adfs_dir {
int nr_buffers;
struct buffer_head *bh[4];
-
- /* big directories need allocated buffers */
- struct buffer_head **bh_fplus;
+ struct buffer_head **bhs;
unsigned int pos;
__u32 parent_id;
- struct adfs_dirheader dirhead;
- union adfs_dirtail dirtail;
+ union {
+ struct adfs_dirheader *dirhead;
+ struct adfs_bigdirheader *bighead;
+ };
+ union {
+ struct adfs_newdirtail *newtail;
+ struct adfs_bigdirtail *bigtail;
+ };
};
/*
@@ -122,13 +125,13 @@ struct object_info {
struct adfs_dir_ops {
int (*read)(struct super_block *sb, unsigned int indaddr,
unsigned int size, struct adfs_dir *dir);
+ int (*iterate)(struct adfs_dir *dir, struct dir_context *ctx);
int (*setpos)(struct adfs_dir *dir, unsigned int fpos);
int (*getnext)(struct adfs_dir *dir, struct object_info *obj);
int (*update)(struct adfs_dir *dir, struct object_info *obj);
int (*create)(struct adfs_dir *dir, struct object_info *obj);
int (*remove)(struct adfs_dir *dir, struct object_info *obj);
- int (*sync)(struct adfs_dir *dir);
- void (*free)(struct adfs_dir *dir);
+ int (*commit)(struct adfs_dir *dir);
};
struct adfs_discmap {
@@ -145,7 +148,9 @@ int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
/* map.c */
int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset);
-extern unsigned int adfs_map_free(struct super_block *sb);
+void adfs_map_statfs(struct super_block *sb, struct kstatfs *buf);
+struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr);
+void adfs_free_map(struct super_block *sb);
/* Misc */
__printf(3, 4)
@@ -167,6 +172,13 @@ extern const struct dentry_operations adfs_dentry_operations;
extern const struct adfs_dir_ops adfs_f_dir_ops;
extern const struct adfs_dir_ops adfs_fplus_dir_ops;
+int adfs_dir_copyfrom(void *dst, struct adfs_dir *dir, unsigned int offset,
+ size_t len);
+int adfs_dir_copyto(struct adfs_dir *dir, unsigned int offset, const void *src,
+ size_t len);
+void adfs_dir_relse(struct adfs_dir *dir);
+int adfs_dir_read_buffers(struct super_block *sb, u32 indaddr,
+ unsigned int size, struct adfs_dir *dir);
void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj);
extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
int wait);
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index a54c53244992..77fbd196008f 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -6,12 +6,196 @@
*
* Common directory handling for ADFS
*/
+#include <linux/slab.h>
#include "adfs.h"
/*
* For future. This should probably be per-directory.
*/
-static DEFINE_RWLOCK(adfs_dir_lock);
+static DECLARE_RWSEM(adfs_dir_rwsem);
+
+int adfs_dir_copyfrom(void *dst, struct adfs_dir *dir, unsigned int offset,
+ size_t len)
+{
+ struct super_block *sb = dir->sb;
+ unsigned int index, remain;
+
+ index = offset >> sb->s_blocksize_bits;
+ offset &= sb->s_blocksize - 1;
+ remain = sb->s_blocksize - offset;
+ if (index + (remain < len) >= dir->nr_buffers)
+ return -EINVAL;
+
+ if (remain < len) {
+ memcpy(dst, dir->bhs[index]->b_data + offset, remain);
+ dst += remain;
+ len -= remain;
+ index += 1;
+ offset = 0;
+ }
+
+ memcpy(dst, dir->bhs[index]->b_data + offset, len);
+
+ return 0;
+}
+
+int adfs_dir_copyto(struct adfs_dir *dir, unsigned int offset, const void *src,
+ size_t len)
+{
+ struct super_block *sb = dir->sb;
+ unsigned int index, remain;
+
+ index = offset >> sb->s_blocksize_bits;
+ offset &= sb->s_blocksize - 1;
+ remain = sb->s_blocksize - offset;
+ if (index + (remain < len) >= dir->nr_buffers)
+ return -EINVAL;
+
+ if (remain < len) {
+ memcpy(dir->bhs[index]->b_data + offset, src, remain);
+ src += remain;
+ len -= remain;
+ index += 1;
+ offset = 0;
+ }
+
+ memcpy(dir->bhs[index]->b_data + offset, src, len);
+
+ return 0;
+}
+
+static void __adfs_dir_cleanup(struct adfs_dir *dir)
+{
+ dir->nr_buffers = 0;
+
+ if (dir->bhs != dir->bh)
+ kfree(dir->bhs);
+ dir->bhs = NULL;
+ dir->sb = NULL;
+}
+
+void adfs_dir_relse(struct adfs_dir *dir)
+{
+ unsigned int i;
+
+ for (i = 0; i < dir->nr_buffers; i++)
+ brelse(dir->bhs[i]);
+
+ __adfs_dir_cleanup(dir);
+}
+
+static void adfs_dir_forget(struct adfs_dir *dir)
+{
+ unsigned int i;
+
+ for (i = 0; i < dir->nr_buffers; i++)
+ bforget(dir->bhs[i]);
+
+ __adfs_dir_cleanup(dir);
+}
+
+int adfs_dir_read_buffers(struct super_block *sb, u32 indaddr,
+ unsigned int size, struct adfs_dir *dir)
+{
+ struct buffer_head **bhs;
+ unsigned int i, num;
+ int block;
+
+ num = ALIGN(size, sb->s_blocksize) >> sb->s_blocksize_bits;
+ if (num > ARRAY_SIZE(dir->bh)) {
+ /* We only allow one extension */
+ if (dir->bhs != dir->bh)
+ return -EINVAL;
+
+ bhs = kcalloc(num, sizeof(*bhs), GFP_KERNEL);
+ if (!bhs)
+ return -ENOMEM;
+
+ if (dir->nr_buffers)
+ memcpy(bhs, dir->bhs, dir->nr_buffers * sizeof(*bhs));
+
+ dir->bhs = bhs;
+ }
+
+ for (i = dir->nr_buffers; i < num; i++) {
+ block = __adfs_block_map(sb, indaddr, i);
+ if (!block) {
+ adfs_error(sb, "dir %06x has a hole at offset %u",
+ indaddr, i);
+ goto error;
+ }
+
+ dir->bhs[i] = sb_bread(sb, block);
+ if (!dir->bhs[i]) {
+ adfs_error(sb,
+ "dir %06x failed read at offset %u, mapped block 0x%08x",
+ indaddr, i, block);
+ goto error;
+ }
+
+ dir->nr_buffers++;
+ }
+ return 0;
+
+error:
+ adfs_dir_relse(dir);
+
+ return -EIO;
+}
+
+static int adfs_dir_read(struct super_block *sb, u32 indaddr,
+ unsigned int size, struct adfs_dir *dir)
+{
+ dir->sb = sb;
+ dir->bhs = dir->bh;
+ dir->nr_buffers = 0;
+
+ return ADFS_SB(sb)->s_dir->read(sb, indaddr, size, dir);
+}
+
+static int adfs_dir_read_inode(struct super_block *sb, struct inode *inode,
+ struct adfs_dir *dir)
+{
+ int ret;
+
+ ret = adfs_dir_read(sb, ADFS_I(inode)->indaddr, inode->i_size, dir);
+ if (ret)
+ return ret;
+
+ if (ADFS_I(inode)->parent_id != dir->parent_id) {
+ adfs_error(sb,
+ "parent directory id changed under me! (%06x but got %06x)\n",
+ ADFS_I(inode)->parent_id, dir->parent_id);
+ adfs_dir_relse(dir);
+ ret = -EIO;
+ }
+
+ return ret;
+}
+
+static void adfs_dir_mark_dirty(struct adfs_dir *dir)
+{
+ unsigned int i;
+
+ /* Mark the buffers dirty */
+ for (i = 0; i < dir->nr_buffers; i++)
+ mark_buffer_dirty(dir->bhs[i]);
+}
+
+static int adfs_dir_sync(struct adfs_dir *dir)
+{
+ int err = 0;
+ int i;
+
+ for (i = dir->nr_buffers - 1; i >= 0; i--) {
+ struct buffer_head *bh = dir->bhs[i];
+ sync_dirty_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh))
+ err = -EIO;
+ }
+
+ return err;
+}
void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj)
{
@@ -51,87 +235,90 @@ void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj)
}
}
-static int
-adfs_readdir(struct file *file, struct dir_context *ctx)
+static int adfs_iterate(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
- struct object_info obj;
struct adfs_dir dir;
- int ret = 0;
-
- if (ctx->pos >> 32)
- return 0;
+ int ret;
- ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
+ down_read(&adfs_dir_rwsem);
+ ret = adfs_dir_read_inode(sb, inode, &dir);
if (ret)
- return ret;
+ goto unlock;
if (ctx->pos == 0) {
if (!dir_emit_dot(file, ctx))
- goto free_out;
+ goto unlock_relse;
ctx->pos = 1;
}
if (ctx->pos == 1) {
if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR))
- goto free_out;
+ goto unlock_relse;
ctx->pos = 2;
}
- read_lock(&adfs_dir_lock);
+ ret = ops->iterate(&dir, ctx);
- ret = ops->setpos(&dir, ctx->pos - 2);
- if (ret)
- goto unlock_out;
- while (ops->getnext(&dir, &obj) == 0) {
- if (!dir_emit(ctx, obj.name, obj.name_len,
- obj.indaddr, DT_UNKNOWN))
- break;
- ctx->pos++;
- }
-
-unlock_out:
- read_unlock(&adfs_dir_lock);
+unlock_relse:
+ up_read(&adfs_dir_rwsem);
+ adfs_dir_relse(&dir);
+ return ret;
-free_out:
- ops->free(&dir);
+unlock:
+ up_read(&adfs_dir_rwsem);
return ret;
}
int
adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
{
- int ret = -EINVAL;
-#ifdef CONFIG_ADFS_FS_RW
const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct adfs_dir dir;
+ int ret;
- printk(KERN_INFO "adfs_dir_update: object %06x in dir %06x\n",
- obj->indaddr, obj->parent_id);
+ if (!IS_ENABLED(CONFIG_ADFS_FS_RW))
+ return -EINVAL;
- if (!ops->update) {
- ret = -EINVAL;
- goto out;
- }
+ if (!ops->update)
+ return -EINVAL;
- ret = ops->read(sb, obj->parent_id, 0, &dir);
+ down_write(&adfs_dir_rwsem);
+ ret = adfs_dir_read(sb, obj->parent_id, 0, &dir);
if (ret)
- goto out;
+ goto unlock;
- write_lock(&adfs_dir_lock);
ret = ops->update(&dir, obj);
- write_unlock(&adfs_dir_lock);
+ if (ret)
+ goto forget;
- if (wait) {
- int err = ops->sync(&dir);
- if (!ret)
- ret = err;
- }
+ ret = ops->commit(&dir);
+ if (ret)
+ goto forget;
+ up_write(&adfs_dir_rwsem);
+
+ adfs_dir_mark_dirty(&dir);
+
+ if (wait)
+ ret = adfs_dir_sync(&dir);
+
+ adfs_dir_relse(&dir);
+ return ret;
+
+ /*
+ * If the updated failed because the entry wasn't found, we can
+ * just release the buffers. If it was any other error, forget
+ * the dirtied buffers so they aren't written back to the media.
+ */
+forget:
+ if (ret == -ENOENT)
+ adfs_dir_relse(&dir);
+ else
+ adfs_dir_forget(&dir);
+unlock:
+ up_write(&adfs_dir_rwsem);
- ops->free(&dir);
-out:
-#endif
return ret;
}
@@ -167,25 +354,14 @@ static int adfs_dir_lookup_byname(struct inode *inode, const struct qstr *qstr,
u32 name_len;
int ret;
- ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
+ down_read(&adfs_dir_rwsem);
+ ret = adfs_dir_read_inode(sb, inode, &dir);
if (ret)
- goto out;
-
- if (ADFS_I(inode)->parent_id != dir.parent_id) {
- adfs_error(sb,
- "parent directory changed under me! (%06x but got %06x)\n",
- ADFS_I(inode)->parent_id, dir.parent_id);
- ret = -EIO;
- goto free_out;
- }
-
- obj->parent_id = inode->i_ino;
-
- read_lock(&adfs_dir_lock);
+ goto unlock;
ret = ops->setpos(&dir, 0);
if (ret)
- goto unlock_out;
+ goto unlock_relse;
ret = -ENOENT;
name = qstr->name;
@@ -196,20 +372,22 @@ static int adfs_dir_lookup_byname(struct inode *inode, const struct qstr *qstr,
break;
}
}
+ obj->parent_id = ADFS_I(inode)->indaddr;
-unlock_out:
- read_unlock(&adfs_dir_lock);
+unlock_relse:
+ up_read(&adfs_dir_rwsem);
+ adfs_dir_relse(&dir);
+ return ret;
-free_out:
- ops->free(&dir);
-out:
+unlock:
+ up_read(&adfs_dir_rwsem);
return ret;
}
const struct file_operations adfs_dir_operations = {
.read = generic_read_dir,
.llseek = generic_file_llseek,
- .iterate = adfs_readdir,
+ .iterate_shared = adfs_iterate,
.fsync = generic_file_fsync,
};
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index c1a950c7400a..30d526fecc3f 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -9,8 +9,6 @@
#include "adfs.h"
#include "dir_f.h"
-static void adfs_f_free(struct adfs_dir *dir);
-
/*
* Read an (unaligned) value of length 1..4 bytes
*/
@@ -60,7 +58,7 @@ static inline void adfs_writeval(unsigned char *p, int len, unsigned int val)
#define bufoff(_bh,_idx) \
({ int _buf = _idx >> blocksize_bits; \
int _off = _idx - (_buf << blocksize_bits);\
- (u8 *)(_bh[_buf]->b_data + _off); \
+ (void *)(_bh[_buf]->b_data + _off); \
})
/*
@@ -123,65 +121,49 @@ adfs_dir_checkbyte(const struct adfs_dir *dir)
return (dircheck ^ (dircheck >> 8) ^ (dircheck >> 16) ^ (dircheck >> 24)) & 0xff;
}
-/* Read and check that a directory is valid */
-static int adfs_dir_read(struct super_block *sb, u32 indaddr,
- unsigned int size, struct adfs_dir *dir)
+static int adfs_f_validate(struct adfs_dir *dir)
{
- const unsigned int blocksize_bits = sb->s_blocksize_bits;
- int blk = 0;
-
- /*
- * Directories which are not a multiple of 2048 bytes
- * are considered bad v2 [3.6]
- */
- if (size & 2047)
- goto bad_dir;
-
- size >>= blocksize_bits;
-
- dir->nr_buffers = 0;
- dir->sb = sb;
-
- for (blk = 0; blk < size; blk++) {
- int phys;
+ struct adfs_dirheader *head = dir->dirhead;
+ struct adfs_newdirtail *tail = dir->newtail;
+
+ if (head->startmasseq != tail->endmasseq ||
+ tail->dirlastmask || tail->reserved[0] || tail->reserved[1] ||
+ (memcmp(&head->startname, "Nick", 4) &&
+ memcmp(&head->startname, "Hugo", 4)) ||
+ memcmp(&head->startname, &tail->endname, 4) ||
+ adfs_dir_checkbyte(dir) != tail->dircheckbyte)
+ return -EIO;
- phys = __adfs_block_map(sb, indaddr, blk);
- if (!phys) {
- adfs_error(sb, "dir %06x has a hole at offset %d",
- indaddr, blk);
- goto release_buffers;
- }
+ return 0;
+}
- dir->bh[blk] = sb_bread(sb, phys);
- if (!dir->bh[blk])
- goto release_buffers;
- }
+/* Read and check that a directory is valid */
+static int adfs_f_read(struct super_block *sb, u32 indaddr, unsigned int size,
+ struct adfs_dir *dir)
+{
+ const unsigned int blocksize_bits = sb->s_blocksize_bits;
+ int ret;
- memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead));
- memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail));
+ if (size && size != ADFS_NEWDIR_SIZE)
+ return -EIO;
- if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq ||
- memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4))
- goto bad_dir;
+ ret = adfs_dir_read_buffers(sb, indaddr, ADFS_NEWDIR_SIZE, dir);
+ if (ret)
+ return ret;
- if (memcmp(&dir->dirhead.startname, "Nick", 4) &&
- memcmp(&dir->dirhead.startname, "Hugo", 4))
- goto bad_dir;
+ dir->dirhead = bufoff(dir->bh, 0);
+ dir->newtail = bufoff(dir->bh, 2007);
- if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte)
+ if (adfs_f_validate(dir))
goto bad_dir;
- dir->nr_buffers = blk;
+ dir->parent_id = adfs_readval(dir->newtail->dirparent, 3);
return 0;
bad_dir:
adfs_error(sb, "dir %06x is corrupted", indaddr);
-release_buffers:
- for (blk -= 1; blk >= 0; blk -= 1)
- brelse(dir->bh[blk]);
-
- dir->sb = NULL;
+ adfs_dir_relse(dir);
return -EIO;
}
@@ -232,24 +214,12 @@ adfs_obj2dir(struct adfs_direntry *de, struct object_info *obj)
static int
__adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj)
{
- struct super_block *sb = dir->sb;
struct adfs_direntry de;
- int thissize, buffer, offset;
-
- buffer = pos >> sb->s_blocksize_bits;
-
- if (buffer > dir->nr_buffers)
- return -EINVAL;
-
- offset = pos & (sb->s_blocksize - 1);
- thissize = sb->s_blocksize - offset;
- if (thissize > 26)
- thissize = 26;
+ int ret;
- memcpy(&de, dir->bh[buffer]->b_data + offset, thissize);
- if (thissize != 26)
- memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data,
- 26 - thissize);
+ ret = adfs_dir_copyfrom(&de, dir, pos, 26);
+ if (ret)
+ return ret;
if (!de.dirobname[0])
return -ENOENT;
@@ -260,89 +230,6 @@ __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj)
}
static int
-__adfs_dir_put(struct adfs_dir *dir, int pos, struct object_info *obj)
-{
- struct super_block *sb = dir->sb;
- struct adfs_direntry de;
- int thissize, buffer, offset;
-
- buffer = pos >> sb->s_blocksize_bits;
-
- if (buffer > dir->nr_buffers)
- return -EINVAL;
-
- offset = pos & (sb->s_blocksize - 1);
- thissize = sb->s_blocksize - offset;
- if (thissize > 26)
- thissize = 26;
-
- /*
- * Get the entry in total
- */
- memcpy(&de, dir->bh[buffer]->b_data + offset, thissize);
- if (thissize != 26)
- memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data,
- 26 - thissize);
-
- /*
- * update it
- */
- adfs_obj2dir(&de, obj);
-
- /*
- * Put the new entry back
- */
- memcpy(dir->bh[buffer]->b_data + offset, &de, thissize);
- if (thissize != 26)
- memcpy(dir->bh[buffer + 1]->b_data, ((char *)&de) + thissize,
- 26 - thissize);
-
- return 0;
-}
-
-/*
- * the caller is responsible for holding the necessary
- * locks.
- */
-static int adfs_dir_find_entry(struct adfs_dir *dir, u32 indaddr)
-{
- int pos, ret;
-
- ret = -ENOENT;
-
- for (pos = 5; pos < ADFS_NUM_DIR_ENTRIES * 26 + 5; pos += 26) {
- struct object_info obj;
-
- if (!__adfs_dir_get(dir, pos, &obj))
- break;
-
- if (obj.indaddr == indaddr) {
- ret = pos;
- break;
- }
- }
-
- return ret;
-}
-
-static int adfs_f_read(struct super_block *sb, u32 indaddr, unsigned int size,
- struct adfs_dir *dir)
-{
- int ret;
-
- if (size != ADFS_NEWDIR_SIZE)
- return -EIO;
-
- ret = adfs_dir_read(sb, indaddr, size, dir);
- if (ret)
- adfs_error(sb, "unable to read directory");
- else
- dir->parent_id = adfs_readval(dir->dirtail.new.dirparent, 3);
-
- return ret;
-}
-
-static int
adfs_f_setpos(struct adfs_dir *dir, unsigned int fpos)
{
if (fpos >= ADFS_NUM_DIR_ENTRIES)
@@ -364,99 +251,74 @@ adfs_f_getnext(struct adfs_dir *dir, struct object_info *obj)
return ret;
}
-static int
-adfs_f_update(struct adfs_dir *dir, struct object_info *obj)
+static int adfs_f_iterate(struct adfs_dir *dir, struct dir_context *ctx)
{
- struct super_block *sb = dir->sb;
- int ret, i;
+ struct object_info obj;
+ int pos = 5 + (ctx->pos - 2) * 26;
- ret = adfs_dir_find_entry(dir, obj->indaddr);
- if (ret < 0) {
- adfs_error(dir->sb, "unable to locate entry to update");
- goto out;
+ while (ctx->pos < 2 + ADFS_NUM_DIR_ENTRIES) {
+ if (__adfs_dir_get(dir, pos, &obj))
+ break;
+ if (!dir_emit(ctx, obj.name, obj.name_len,
+ obj.indaddr, DT_UNKNOWN))
+ break;
+ pos += 26;
+ ctx->pos++;
}
+ return 0;
+}
- __adfs_dir_put(dir, ret, obj);
-
- /*
- * Increment directory sequence number
- */
- dir->bh[0]->b_data[0] += 1;
- dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 6] += 1;
-
- ret = adfs_dir_checkbyte(dir);
- /*
- * Update directory check byte
- */
- dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 1] = ret;
-
-#if 1
- {
- const unsigned int blocksize_bits = sb->s_blocksize_bits;
-
- memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead));
- memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail));
+static int adfs_f_update(struct adfs_dir *dir, struct object_info *obj)
+{
+ struct adfs_direntry de;
+ int offset, ret;
- if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq ||
- memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4))
- goto bad_dir;
+ offset = 5 - (int)sizeof(de);
- if (memcmp(&dir->dirhead.startname, "Nick", 4) &&
- memcmp(&dir->dirhead.startname, "Hugo", 4))
- goto bad_dir;
+ do {
+ offset += sizeof(de);
+ ret = adfs_dir_copyfrom(&de, dir, offset, sizeof(de));
+ if (ret) {
+ adfs_error(dir->sb, "error reading directory entry");
+ return -ENOENT;
+ }
+ if (!de.dirobname[0]) {
+ adfs_error(dir->sb, "unable to locate entry to update");
+ return -ENOENT;
+ }
+ } while (adfs_readval(de.dirinddiscadd, 3) != obj->indaddr);
- if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte)
- goto bad_dir;
- }
-#endif
- for (i = dir->nr_buffers - 1; i >= 0; i--)
- mark_buffer_dirty(dir->bh[i]);
+ /* Update the directory entry with the new object state */
+ adfs_obj2dir(&de, obj);
- ret = 0;
-out:
- return ret;
-#if 1
-bad_dir:
- adfs_error(dir->sb, "whoops! I broke a directory!");
- return -EIO;
-#endif
+ /* Write the directory entry back to the directory */
+ return adfs_dir_copyto(dir, offset, &de, 26);
}
-static int
-adfs_f_sync(struct adfs_dir *dir)
+static int adfs_f_commit(struct adfs_dir *dir)
{
- int err = 0;
- int i;
-
- for (i = dir->nr_buffers - 1; i >= 0; i--) {
- struct buffer_head *bh = dir->bh[i];
- sync_dirty_buffer(bh);
- if (buffer_req(bh) && !buffer_uptodate(bh))
- err = -EIO;
- }
+ int ret;
- return err;
-}
+ /* Increment directory sequence number */
+ dir->dirhead->startmasseq += 1;
+ dir->newtail->endmasseq += 1;
-static void
-adfs_f_free(struct adfs_dir *dir)
-{
- int i;
+ /* Update directory check byte */
+ dir->newtail->dircheckbyte = adfs_dir_checkbyte(dir);
- for (i = dir->nr_buffers - 1; i >= 0; i--) {
- brelse(dir->bh[i]);
- dir->bh[i] = NULL;
- }
+ /* Make sure the directory still validates correctly */
+ ret = adfs_f_validate(dir);
+ if (ret)
+ adfs_msg(dir->sb, KERN_ERR, "error: update broke directory");
- dir->nr_buffers = 0;
- dir->sb = NULL;
+ return ret;
}
const struct adfs_dir_ops adfs_f_dir_ops = {
.read = adfs_f_read,
+ .iterate = adfs_f_iterate,
.setpos = adfs_f_setpos,
.getnext = adfs_f_getnext,
.update = adfs_f_update,
- .sync = adfs_f_sync,
- .free = adfs_f_free
+ .commit = adfs_f_commit,
};
diff --git a/fs/adfs/dir_f.h b/fs/adfs/dir_f.h
index 5aec332b90f5..a5393e6cf9f4 100644
--- a/fs/adfs/dir_f.h
+++ b/fs/adfs/dir_f.h
@@ -13,9 +13,9 @@
* Directory header
*/
struct adfs_dirheader {
- unsigned char startmasseq;
- unsigned char startname[4];
-};
+ __u8 startmasseq;
+ __u8 startname[4];
+} __attribute__((packed));
#define ADFS_NEWDIR_SIZE 2048
#define ADFS_NUM_DIR_ENTRIES 77
@@ -31,32 +31,36 @@ struct adfs_direntry {
__u8 dirlen[4];
__u8 dirinddiscadd[3];
__u8 newdiratts;
-};
+} __attribute__((packed));
/*
* Directory tail
*/
+struct adfs_olddirtail {
+ __u8 dirlastmask;
+ char dirname[10];
+ __u8 dirparent[3];
+ char dirtitle[19];
+ __u8 reserved[14];
+ __u8 endmasseq;
+ __u8 endname[4];
+ __u8 dircheckbyte;
+} __attribute__((packed));
+
+struct adfs_newdirtail {
+ __u8 dirlastmask;
+ __u8 reserved[2];
+ __u8 dirparent[3];
+ char dirtitle[19];
+ char dirname[10];
+ __u8 endmasseq;
+ __u8 endname[4];
+ __u8 dircheckbyte;
+} __attribute__((packed));
+
union adfs_dirtail {
- struct {
- unsigned char dirlastmask;
- char dirname[10];
- unsigned char dirparent[3];
- char dirtitle[19];
- unsigned char reserved[14];
- unsigned char endmasseq;
- unsigned char endname[4];
- unsigned char dircheckbyte;
- } old;
- struct {
- unsigned char dirlastmask;
- unsigned char reserved[2];
- unsigned char dirparent[3];
- char dirtitle[19];
- char dirname[10];
- unsigned char endmasseq;
- unsigned char endname[4];
- unsigned char dircheckbyte;
- } new;
+ struct adfs_olddirtail old;
+ struct adfs_newdirtail new;
};
#endif
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index d56924c11b17..4a15924014da 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -4,123 +4,163 @@
*
* Copyright (C) 1997-1999 Russell King
*/
-#include <linux/slab.h>
#include "adfs.h"
#include "dir_fplus.h"
-static int
-adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir)
+/* Return the byte offset to directory entry pos */
+static unsigned int adfs_fplus_offset(const struct adfs_bigdirheader *h,
+ unsigned int pos)
{
- struct adfs_bigdirheader *h;
- struct adfs_bigdirtail *t;
- unsigned long block;
- unsigned int blk, size;
- int i, ret = -EIO;
+ return offsetof(struct adfs_bigdirheader, bigdirname) +
+ ALIGN(le32_to_cpu(h->bigdirnamelen), 4) +
+ pos * sizeof(struct adfs_bigdirentry);
+}
- dir->nr_buffers = 0;
+static int adfs_fplus_validate_header(const struct adfs_bigdirheader *h)
+{
+ unsigned int size = le32_to_cpu(h->bigdirsize);
+ unsigned int len;
- /* start off using fixed bh set - only alloc for big dirs */
- dir->bh_fplus = &dir->bh[0];
+ if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 ||
+ h->bigdirversion[2] != 0 ||
+ h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME) ||
+ !size || size & 2047 || size > SZ_4M)
+ return -EIO;
- block = __adfs_block_map(sb, id, 0);
- if (!block) {
- adfs_error(sb, "dir object %X has a hole at offset 0", id);
- goto out;
- }
+ size -= sizeof(struct adfs_bigdirtail) +
+ offsetof(struct adfs_bigdirheader, bigdirname);
- dir->bh_fplus[0] = sb_bread(sb, block);
- if (!dir->bh_fplus[0])
- goto out;
- dir->nr_buffers += 1;
+ /* Check that bigdirnamelen fits within the directory */
+ len = ALIGN(le32_to_cpu(h->bigdirnamelen), 4);
+ if (len > size)
+ return -EIO;
- h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data;
- size = le32_to_cpu(h->bigdirsize);
- if (size != sz) {
- adfs_msg(sb, KERN_WARNING,
- "directory header size %X does not match directory size %X",
- size, sz);
+ size -= len;
+
+ /* Check that bigdirnamesize fits within the directory */
+ len = le32_to_cpu(h->bigdirnamesize);
+ if (len > size)
+ return -EIO;
+
+ size -= len;
+
+ /*
+ * Avoid division, we know that absolute maximum number of entries
+ * can not be so large to cause overflow of the multiplication below.
+ */
+ len = le32_to_cpu(h->bigdirentries);
+ if (len > SZ_4M / sizeof(struct adfs_bigdirentry) ||
+ len * sizeof(struct adfs_bigdirentry) > size)
+ return -EIO;
+
+ return 0;
+}
+
+static int adfs_fplus_validate_tail(const struct adfs_bigdirheader *h,
+ const struct adfs_bigdirtail *t)
+{
+ if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) ||
+ t->bigdirendmasseq != h->startmasseq ||
+ t->reserved[0] != 0 || t->reserved[1] != 0)
+ return -EIO;
+
+ return 0;
+}
+
+static u8 adfs_fplus_checkbyte(struct adfs_dir *dir)
+{
+ struct adfs_bigdirheader *h = dir->bighead;
+ struct adfs_bigdirtail *t = dir->bigtail;
+ unsigned int end, bs, bi, i;
+ __le32 *bp;
+ u32 dircheck;
+
+ end = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries)) +
+ le32_to_cpu(h->bigdirnamesize);
+
+ /* Accumulate the contents of the header, entries and names */
+ for (dircheck = 0, bi = 0; end; bi++) {
+ bp = (void *)dir->bhs[bi]->b_data;
+ bs = dir->bhs[bi]->b_size;
+ if (bs > end)
+ bs = end;
+
+ for (i = 0; i < bs; i += sizeof(u32))
+ dircheck = ror32(dircheck, 13) ^ le32_to_cpup(bp++);
+
+ end -= bs;
}
- if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 ||
- h->bigdirversion[2] != 0 || size & 2047 ||
- h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) {
- adfs_error(sb, "dir %06x has malformed header", id);
+ /* Accumulate the contents of the tail except for the check byte */
+ dircheck = ror32(dircheck, 13) ^ le32_to_cpu(t->bigdirendname);
+ dircheck = ror32(dircheck, 13) ^ t->bigdirendmasseq;
+ dircheck = ror32(dircheck, 13) ^ t->reserved[0];
+ dircheck = ror32(dircheck, 13) ^ t->reserved[1];
+
+ return dircheck ^ dircheck >> 8 ^ dircheck >> 16 ^ dircheck >> 24;
+}
+
+static int adfs_fplus_read(struct super_block *sb, u32 indaddr,
+ unsigned int size, struct adfs_dir *dir)
+{
+ struct adfs_bigdirheader *h;
+ struct adfs_bigdirtail *t;
+ unsigned int dirsize;
+ int ret;
+
+ /* Read first buffer */
+ ret = adfs_dir_read_buffers(sb, indaddr, sb->s_blocksize, dir);
+ if (ret)
+ return ret;
+
+ dir->bighead = h = (void *)dir->bhs[0]->b_data;
+ ret = adfs_fplus_validate_header(h);
+ if (ret) {
+ adfs_error(sb, "dir %06x has malformed header", indaddr);
goto out;
}
- size >>= sb->s_blocksize_bits;
- if (size > ARRAY_SIZE(dir->bh)) {
- /* this directory is too big for fixed bh set, must allocate */
- struct buffer_head **bh_fplus =
- kcalloc(size, sizeof(struct buffer_head *),
- GFP_KERNEL);
- if (!bh_fplus) {
- adfs_msg(sb, KERN_ERR,
- "not enough memory for dir object %X (%d blocks)",
- id, size);
- ret = -ENOMEM;
- goto out;
- }
- dir->bh_fplus = bh_fplus;
- /* copy over the pointer to the block that we've already read */
- dir->bh_fplus[0] = dir->bh[0];
+ dirsize = le32_to_cpu(h->bigdirsize);
+ if (size && dirsize != size) {
+ adfs_msg(sb, KERN_WARNING,
+ "dir %06x header size %X does not match directory size %X",
+ indaddr, dirsize, size);
}
- for (blk = 1; blk < size; blk++) {
- block = __adfs_block_map(sb, id, blk);
- if (!block) {
- adfs_error(sb, "dir object %X has a hole at offset %d", id, blk);
- goto out;
- }
+ /* Read remaining buffers */
+ ret = adfs_dir_read_buffers(sb, indaddr, dirsize, dir);
+ if (ret)
+ return ret;
- dir->bh_fplus[blk] = sb_bread(sb, block);
- if (!dir->bh_fplus[blk]) {
- adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX",
- id, blk, block);
- goto out;
- }
+ dir->bigtail = t = (struct adfs_bigdirtail *)
+ (dir->bhs[dir->nr_buffers - 1]->b_data + (sb->s_blocksize - 8));
- dir->nr_buffers += 1;
+ ret = adfs_fplus_validate_tail(h, t);
+ if (ret) {
+ adfs_error(sb, "dir %06x has malformed tail", indaddr);
+ goto out;
}
- t = (struct adfs_bigdirtail *)
- (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8));
-
- if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) ||
- t->bigdirendmasseq != h->startmasseq ||
- t->reserved[0] != 0 || t->reserved[1] != 0) {
- adfs_error(sb, "dir %06x has malformed tail", id);
+ if (adfs_fplus_checkbyte(dir) != t->bigdircheckbyte) {
+ adfs_error(sb, "dir %06x checkbyte mismatch\n", indaddr);
goto out;
}
dir->parent_id = le32_to_cpu(h->bigdirparent);
- dir->sb = sb;
return 0;
out:
- if (dir->bh_fplus) {
- for (i = 0; i < dir->nr_buffers; i++)
- brelse(dir->bh_fplus[i]);
-
- if (&dir->bh[0] != dir->bh_fplus)
- kfree(dir->bh_fplus);
+ adfs_dir_relse(dir);
- dir->bh_fplus = NULL;
- }
-
- dir->nr_buffers = 0;
- dir->sb = NULL;
return ret;
}
static int
adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos)
{
- struct adfs_bigdirheader *h =
- (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
int ret = -ENOENT;
- if (fpos <= le32_to_cpu(h->bigdirentries)) {
+ if (fpos <= le32_to_cpu(dir->bighead->bigdirentries)) {
dir->pos = fpos;
ret = 0;
}
@@ -128,51 +168,23 @@ adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos)
return ret;
}
-static void
-dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len)
-{
- struct super_block *sb = dir->sb;
- unsigned int buffer, partial, remainder;
-
- buffer = offset >> sb->s_blocksize_bits;
- offset &= sb->s_blocksize - 1;
-
- partial = sb->s_blocksize - offset;
-
- if (partial >= len)
- memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len);
- else {
- char *c = (char *)to;
-
- remainder = len - partial;
-
- memcpy(c,
- dir->bh_fplus[buffer]->b_data + offset,
- partial);
-
- memcpy(c + partial,
- dir->bh_fplus[buffer + 1]->b_data,
- remainder);
- }
-}
-
static int
adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
{
- struct adfs_bigdirheader *h =
- (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
+ struct adfs_bigdirheader *h = dir->bighead;
struct adfs_bigdirentry bde;
unsigned int offset;
- int ret = -ENOENT;
+ int ret;
if (dir->pos >= le32_to_cpu(h->bigdirentries))
- goto out;
+ return -ENOENT;
- offset = offsetof(struct adfs_bigdirheader, bigdirname);
- offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3);
- offset += dir->pos * sizeof(struct adfs_bigdirentry);
+ offset = adfs_fplus_offset(h, dir->pos);
- dir_memcpy(dir, offset, &bde, sizeof(struct adfs_bigdirentry));
+ ret = adfs_dir_copyfrom(&bde, dir, offset,
+ sizeof(struct adfs_bigdirentry));
+ if (ret)
+ return ret;
obj->loadaddr = le32_to_cpu(bde.bigdirload);
obj->execaddr = le32_to_cpu(bde.bigdirexec);
@@ -181,59 +193,95 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
obj->attr = le32_to_cpu(bde.bigdirattr);
obj->name_len = le32_to_cpu(bde.bigdirobnamelen);
- offset = offsetof(struct adfs_bigdirheader, bigdirname);
- offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3);
- offset += le32_to_cpu(h->bigdirentries) * sizeof(struct adfs_bigdirentry);
+ offset = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries));
offset += le32_to_cpu(bde.bigdirobnameptr);
- dir_memcpy(dir, offset, obj->name, obj->name_len);
+ ret = adfs_dir_copyfrom(obj->name, dir, offset, obj->name_len);
+ if (ret)
+ return ret;
+
adfs_object_fixup(dir, obj);
dir->pos += 1;
- ret = 0;
-out:
- return ret;
+
+ return 0;
}
-static int
-adfs_fplus_sync(struct adfs_dir *dir)
+static int adfs_fplus_iterate(struct adfs_dir *dir, struct dir_context *ctx)
{
- int err = 0;
- int i;
-
- for (i = dir->nr_buffers - 1; i >= 0; i--) {
- struct buffer_head *bh = dir->bh_fplus[i];
- sync_dirty_buffer(bh);
- if (buffer_req(bh) && !buffer_uptodate(bh))
- err = -EIO;
+ struct object_info obj;
+
+ if ((ctx->pos - 2) >> 32)
+ return 0;
+
+ if (adfs_fplus_setpos(dir, ctx->pos - 2))
+ return 0;
+
+ while (!adfs_fplus_getnext(dir, &obj)) {
+ if (!dir_emit(ctx, obj.name, obj.name_len,
+ obj.indaddr, DT_UNKNOWN))
+ break;
+ ctx->pos++;
}
- return err;
+ return 0;
}
-static void
-adfs_fplus_free(struct adfs_dir *dir)
+static int adfs_fplus_update(struct adfs_dir *dir, struct object_info *obj)
{
- int i;
+ struct adfs_bigdirheader *h = dir->bighead;
+ struct adfs_bigdirentry bde;
+ int offset, end, ret;
- if (dir->bh_fplus) {
- for (i = 0; i < dir->nr_buffers; i++)
- brelse(dir->bh_fplus[i]);
+ offset = adfs_fplus_offset(h, 0) - sizeof(bde);
+ end = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries));
- if (&dir->bh[0] != dir->bh_fplus)
- kfree(dir->bh_fplus);
+ do {
+ offset += sizeof(bde);
+ if (offset >= end) {
+ adfs_error(dir->sb, "unable to locate entry to update");
+ return -ENOENT;
+ }
+ ret = adfs_dir_copyfrom(&bde, dir, offset, sizeof(bde));
+ if (ret) {
+ adfs_error(dir->sb, "error reading directory entry");
+ return -ENOENT;
+ }
+ } while (le32_to_cpu(bde.bigdirindaddr) != obj->indaddr);
- dir->bh_fplus = NULL;
- }
+ bde.bigdirload = cpu_to_le32(obj->loadaddr);
+ bde.bigdirexec = cpu_to_le32(obj->execaddr);
+ bde.bigdirlen = cpu_to_le32(obj->size);
+ bde.bigdirindaddr = cpu_to_le32(obj->indaddr);
+ bde.bigdirattr = cpu_to_le32(obj->attr);
+
+ return adfs_dir_copyto(dir, offset, &bde, sizeof(bde));
+}
+
+static int adfs_fplus_commit(struct adfs_dir *dir)
+{
+ int ret;
- dir->nr_buffers = 0;
- dir->sb = NULL;
+ /* Increment directory sequence number */
+ dir->bighead->startmasseq += 1;
+ dir->bigtail->bigdirendmasseq += 1;
+
+ /* Update directory check byte */
+ dir->bigtail->bigdircheckbyte = adfs_fplus_checkbyte(dir);
+
+ /* Make sure the directory still validates correctly */
+ ret = adfs_fplus_validate_header(dir->bighead);
+ if (ret == 0)
+ ret = adfs_fplus_validate_tail(dir->bighead, dir->bigtail);
+
+ return ret;
}
const struct adfs_dir_ops adfs_fplus_dir_ops = {
.read = adfs_fplus_read,
+ .iterate = adfs_fplus_iterate,
.setpos = adfs_fplus_setpos,
.getnext = adfs_fplus_getnext,
- .sync = adfs_fplus_sync,
- .free = adfs_fplus_free
+ .update = adfs_fplus_update,
+ .commit = adfs_fplus_commit,
};
diff --git a/fs/adfs/dir_fplus.h b/fs/adfs/dir_fplus.h
index 4ec0931e36ad..d729b1591e5e 100644
--- a/fs/adfs/dir_fplus.h
+++ b/fs/adfs/dir_fplus.h
@@ -22,7 +22,7 @@ struct adfs_bigdirheader {
__le32 bigdirnamesize;
__le32 bigdirparent;
char bigdirname[1];
-};
+} __attribute__((packed, aligned(4)));
struct adfs_bigdirentry {
__le32 bigdirload;
@@ -32,11 +32,11 @@ struct adfs_bigdirentry {
__le32 bigdirattr;
__le32 bigdirobnamelen;
__le32 bigdirobnameptr;
-};
+} __attribute__((packed, aligned(4)));
struct adfs_bigdirtail {
__le32 bigdirendname;
__u8 bigdirendmasseq;
__u8 reserved[2];
__u8 bigdircheckbyte;
-};
+} __attribute__((packed, aligned(4)));
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 124de75413a5..32620f4a7623 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -20,7 +20,8 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
if (block >= inode->i_blocks)
goto abort_toobig;
- block = __adfs_block_map(inode->i_sb, inode->i_ino, block);
+ block = __adfs_block_map(inode->i_sb, ADFS_I(inode)->indaddr,
+ block);
if (block)
map_bh(bh, inode->i_sb, block);
return 0;
@@ -126,29 +127,29 @@ adfs_atts2mode(struct super_block *sb, struct inode *inode)
* Convert Linux permission to ADFS attribute. We try to do the reverse
* of atts2mode, but there is not a 1:1 translation.
*/
-static int
-adfs_mode2atts(struct super_block *sb, struct inode *inode)
+static int adfs_mode2atts(struct super_block *sb, struct inode *inode,
+ umode_t ia_mode)
{
+ struct adfs_sb_info *asb = ADFS_SB(sb);
umode_t mode;
int attr;
- struct adfs_sb_info *asb = ADFS_SB(sb);
/* FIXME: should we be able to alter a link? */
if (S_ISLNK(inode->i_mode))
return ADFS_I(inode)->attr;
+ /* Directories do not have read/write permissions on the media */
if (S_ISDIR(inode->i_mode))
- attr = ADFS_NDA_DIRECTORY;
- else
- attr = 0;
+ return ADFS_NDA_DIRECTORY;
- mode = inode->i_mode & asb->s_owner_mask;
+ attr = 0;
+ mode = ia_mode & asb->s_owner_mask;
if (mode & S_IRUGO)
attr |= ADFS_NDA_OWNER_READ;
if (mode & S_IWUGO)
attr |= ADFS_NDA_OWNER_WRITE;
- mode = inode->i_mode & asb->s_other_mask;
+ mode = ia_mode & asb->s_other_mask;
mode &= ~asb->s_owner_mask;
if (mode & S_IRUGO)
attr |= ADFS_NDA_PUBLIC_READ;
@@ -158,6 +159,8 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode)
return attr;
}
+static const s64 nsec_unix_epoch_diff_risc_os_epoch = 2208988800000000000LL;
+
/*
* Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time
* referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds
@@ -170,8 +173,6 @@ adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode)
/* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since
* 01 Jan 1900 00:00:00 (RISC OS epoch)
*/
- static const s64 nsec_unix_epoch_diff_risc_os_epoch =
- 2208988800000000000LL;
s64 nsec;
if (!adfs_inode_is_stamped(inode))
@@ -204,24 +205,23 @@ adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode)
return;
}
-/*
- * Convert an Unix time to ADFS time. We only do this if the entry has a
- * time/date stamp already.
- */
-static void
-adfs_unix2adfs_time(struct inode *inode, unsigned int secs)
+/* Convert an Unix time to ADFS time for an entry that is already stamped. */
+static void adfs_unix2adfs_time(struct inode *inode,
+ const struct timespec64 *ts)
{
- unsigned int high, low;
+ s64 cs, nsec = timespec64_to_ns(ts);
- if (adfs_inode_is_stamped(inode)) {
- /* convert 32-bit seconds to 40-bit centi-seconds */
- low = (secs & 255) * 100;
- high = (secs / 256) * 100 + (low >> 8) + 0x336e996a;
+ /* convert from Unix to RISC OS epoch */
+ nsec += nsec_unix_epoch_diff_risc_os_epoch;
- ADFS_I(inode)->loadaddr = (high >> 24) |
- (ADFS_I(inode)->loadaddr & ~0xff);
- ADFS_I(inode)->execaddr = (low & 255) | (high << 8);
- }
+ /* convert from nanoseconds to centiseconds */
+ cs = div_s64(nsec, 10000000);
+
+ cs = clamp_t(s64, cs, 0, 0xffffffffff);
+
+ ADFS_I(inode)->loadaddr &= ~0xff;
+ ADFS_I(inode)->loadaddr |= (cs >> 32) & 0xff;
+ ADFS_I(inode)->execaddr = cs;
}
/*
@@ -260,6 +260,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
* for cross-directory renames.
*/
ADFS_I(inode)->parent_id = obj->parent_id;
+ ADFS_I(inode)->indaddr = obj->indaddr;
ADFS_I(inode)->loadaddr = obj->loadaddr;
ADFS_I(inode)->execaddr = obj->execaddr;
ADFS_I(inode)->attr = obj->attr;
@@ -315,10 +316,11 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
if (ia_valid & ATTR_SIZE)
truncate_setsize(inode, attr->ia_size);
- if (ia_valid & ATTR_MTIME) {
- inode->i_mtime = attr->ia_mtime;
- adfs_unix2adfs_time(inode, attr->ia_mtime.tv_sec);
+ if (ia_valid & ATTR_MTIME && adfs_inode_is_stamped(inode)) {
+ adfs_unix2adfs_time(inode, &attr->ia_mtime);
+ adfs_adfs2unix_time(&inode->i_mtime, inode);
}
+
/*
* FIXME: should we make these == to i_mtime since we don't
* have the ability to represent them in our filesystem?
@@ -328,7 +330,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
if (ia_valid & ATTR_CTIME)
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
- ADFS_I(inode)->attr = adfs_mode2atts(sb, inode);
+ ADFS_I(inode)->attr = adfs_mode2atts(sb, inode, attr->ia_mode);
inode->i_mode = adfs_atts2mode(sb, inode);
}
@@ -353,7 +355,7 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
struct object_info obj;
int ret;
- obj.indaddr = inode->i_ino;
+ obj.indaddr = ADFS_I(inode)->indaddr;
obj.name_len = 0;
obj.parent_id = ADFS_I(inode)->parent_id;
obj.loadaddr = ADFS_I(inode)->loadaddr;
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index f44d12cef5be..a81de80c45c1 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -4,6 +4,8 @@
*
* Copyright (C) 1997-2002 Russell King
*/
+#include <linux/slab.h>
+#include <linux/statfs.h>
#include <asm/unaligned.h>
#include "adfs.h"
@@ -66,54 +68,41 @@ static DEFINE_RWLOCK(adfs_map_lock);
static int lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen,
const u32 frag_id, unsigned int *offset)
{
- const unsigned int mapsize = dm->dm_endbit;
+ const unsigned int endbit = dm->dm_endbit;
const u32 idmask = (1 << idlen) - 1;
- unsigned char *map = dm->dm_bh->b_data + 4;
+ unsigned char *map = dm->dm_bh->b_data;
unsigned int start = dm->dm_startbit;
- unsigned int mapptr;
+ unsigned int freelink, fragend;
u32 frag;
+ frag = GET_FRAG_ID(map, 8, idmask & 0x7fff);
+ freelink = frag ? 8 + frag : 0;
+
do {
frag = GET_FRAG_ID(map, start, idmask);
- mapptr = start + idlen;
-
- /*
- * find end of fragment
- */
- {
- __le32 *_map = (__le32 *)map;
- u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31);
- while (v == 0) {
- mapptr = (mapptr & ~31) + 32;
- if (mapptr >= mapsize)
- goto error;
- v = le32_to_cpu(_map[mapptr >> 5]);
- }
-
- mapptr += 1 + ffz(~v);
+
+ fragend = find_next_bit_le(map, endbit, start + idlen);
+ if (fragend >= endbit)
+ goto error;
+
+ if (start == freelink) {
+ freelink += frag & 0x7fff;
+ } else if (frag == frag_id) {
+ unsigned int length = fragend + 1 - start;
+
+ if (*offset < length)
+ return start + *offset;
+ *offset -= length;
}
- if (frag == frag_id)
- goto found;
-again:
- start = mapptr;
- } while (mapptr < mapsize);
+ start = fragend + 1;
+ } while (start < endbit);
return -1;
error:
printk(KERN_ERR "adfs: oversized fragment 0x%x at 0x%x-0x%x\n",
- frag, start, mapptr);
+ frag, start, fragend);
return -1;
-
-found:
- {
- int length = mapptr - start;
- if (*offset >= length) {
- *offset -= length;
- goto again;
- }
- }
- return start + *offset;
}
/*
@@ -125,12 +114,12 @@ found:
static unsigned int
scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm)
{
- const unsigned int mapsize = dm->dm_endbit + 32;
+ const unsigned int endbit = dm->dm_endbit;
const unsigned int idlen = asb->s_idlen;
const unsigned int frag_idlen = idlen <= 15 ? idlen : 15;
const u32 idmask = (1 << frag_idlen) - 1;
unsigned char *map = dm->dm_bh->b_data;
- unsigned int start = 8, mapptr;
+ unsigned int start = 8, fragend;
u32 frag;
unsigned long total = 0;
@@ -149,29 +138,13 @@ scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm)
do {
start += frag;
- /*
- * get fragment id
- */
frag = GET_FRAG_ID(map, start, idmask);
- mapptr = start + idlen;
-
- /*
- * find end of fragment
- */
- {
- __le32 *_map = (__le32 *)map;
- u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31);
- while (v == 0) {
- mapptr = (mapptr & ~31) + 32;
- if (mapptr >= mapsize)
- goto error;
- v = le32_to_cpu(_map[mapptr >> 5]);
- }
-
- mapptr += 1 + ffz(~v);
- }
- total += mapptr - start;
+ fragend = find_next_bit_le(map, endbit, start + idlen);
+ if (fragend >= endbit)
+ goto error;
+
+ total += fragend + 1 - start;
} while (frag >= idlen + 1);
if (frag != 0)
@@ -220,10 +193,10 @@ found:
* total_free = E(free_in_zone_n)
* nzones
*/
-unsigned int
-adfs_map_free(struct super_block *sb)
+void adfs_map_statfs(struct super_block *sb, struct kstatfs *buf)
{
struct adfs_sb_info *asb = ADFS_SB(sb);
+ struct adfs_discrecord *dr = adfs_map_discrecord(asb->s_map);
struct adfs_discmap *dm;
unsigned int total = 0;
unsigned int zone;
@@ -235,7 +208,10 @@ adfs_map_free(struct super_block *sb)
total += scan_free_map(asb, dm++);
} while (--zone > 0);
- return signed_asl(total, asb->s_map2blk);
+ buf->f_blocks = adfs_disc_size(dr) >> sb->s_blocksize_bits;
+ buf->f_files = asb->s_ids_per_zone * asb->s_map_size;
+ buf->f_bavail =
+ buf->f_bfree = signed_asl(total, asb->s_map2blk);
}
int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset)
@@ -280,3 +256,152 @@ bad_fragment:
frag_id, zone, asb->s_map_size);
return 0;
}
+
+static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map)
+{
+ unsigned int v0, v1, v2, v3;
+ int i;
+
+ v0 = v1 = v2 = v3 = 0;
+ for (i = sb->s_blocksize - 4; i; i -= 4) {
+ v0 += map[i] + (v3 >> 8);
+ v3 &= 0xff;
+ v1 += map[i + 1] + (v0 >> 8);
+ v0 &= 0xff;
+ v2 += map[i + 2] + (v1 >> 8);
+ v1 &= 0xff;
+ v3 += map[i + 3] + (v2 >> 8);
+ v2 &= 0xff;
+ }
+ v0 += v3 >> 8;
+ v1 += map[1] + (v0 >> 8);
+ v2 += map[2] + (v1 >> 8);
+ v3 += map[3] + (v2 >> 8);
+
+ return v0 ^ v1 ^ v2 ^ v3;
+}
+
+static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm)
+{
+ unsigned char crosscheck = 0, zonecheck = 1;
+ int i;
+
+ for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) {
+ unsigned char *map;
+
+ map = dm[i].dm_bh->b_data;
+
+ if (adfs_calczonecheck(sb, map) != map[0]) {
+ adfs_error(sb, "zone %d fails zonecheck", i);
+ zonecheck = 0;
+ }
+ crosscheck ^= map[3];
+ }
+ if (crosscheck != 0xff)
+ adfs_error(sb, "crosscheck != 0xff");
+ return crosscheck == 0xff && zonecheck;
+}
+
+/*
+ * Layout the map - the first zone contains a copy of the disc record,
+ * and the last zone must be limited to the size of the filesystem.
+ */
+static void adfs_map_layout(struct adfs_discmap *dm, unsigned int nzones,
+ struct adfs_discrecord *dr)
+{
+ unsigned int zone, zone_size;
+ u64 size;
+
+ zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare);
+
+ dm[0].dm_bh = NULL;
+ dm[0].dm_startblk = 0;
+ dm[0].dm_startbit = 32 + ADFS_DR_SIZE_BITS;
+ dm[0].dm_endbit = 32 + zone_size;
+
+ for (zone = 1; zone < nzones; zone++) {
+ dm[zone].dm_bh = NULL;
+ dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS;
+ dm[zone].dm_startbit = 32;
+ dm[zone].dm_endbit = 32 + zone_size;
+ }
+
+ size = adfs_disc_size(dr) >> dr->log2bpmb;
+ size -= (nzones - 1) * zone_size - ADFS_DR_SIZE_BITS;
+ dm[nzones - 1].dm_endbit = 32 + size;
+}
+
+static int adfs_map_read(struct adfs_discmap *dm, struct super_block *sb,
+ unsigned int map_addr, unsigned int nzones)
+{
+ unsigned int zone;
+
+ for (zone = 0; zone < nzones; zone++) {
+ dm[zone].dm_bh = sb_bread(sb, map_addr + zone);
+ if (!dm[zone].dm_bh)
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void adfs_map_relse(struct adfs_discmap *dm, unsigned int nzones)
+{
+ unsigned int zone;
+
+ for (zone = 0; zone < nzones; zone++)
+ brelse(dm[zone].dm_bh);
+}
+
+struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr)
+{
+ struct adfs_sb_info *asb = ADFS_SB(sb);
+ struct adfs_discmap *dm;
+ unsigned int map_addr, zone_size, nzones;
+ int ret;
+
+ nzones = dr->nzones | dr->nzones_high << 8;
+ zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare);
+
+ asb->s_idlen = dr->idlen;
+ asb->s_map_size = nzones;
+ asb->s_map2blk = dr->log2bpmb - dr->log2secsize;
+ asb->s_log2sharesize = dr->log2sharesize;
+ asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1);
+
+ map_addr = (nzones >> 1) * zone_size -
+ ((nzones > 1) ? ADFS_DR_SIZE_BITS : 0);
+ map_addr = signed_asl(map_addr, asb->s_map2blk);
+
+ dm = kmalloc_array(nzones, sizeof(*dm), GFP_KERNEL);
+ if (dm == NULL) {
+ adfs_error(sb, "not enough memory");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ adfs_map_layout(dm, nzones, dr);
+
+ ret = adfs_map_read(dm, sb, map_addr, nzones);
+ if (ret) {
+ adfs_error(sb, "unable to read map");
+ goto error_free;
+ }
+
+ if (adfs_checkmap(sb, dm))
+ return dm;
+
+ adfs_error(sb, "map corrupted");
+
+error_free:
+ adfs_map_relse(dm, nzones);
+ kfree(dm);
+ return ERR_PTR(-EIO);
+}
+
+void adfs_free_map(struct super_block *sb)
+{
+ struct adfs_sb_info *asb = ADFS_SB(sb);
+
+ adfs_map_relse(asb->s_map, asb->s_map_size);
+ kfree(asb->s_map);
+}
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 65b04ebb51c3..a3cc8ecb50da 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -88,59 +88,11 @@ static int adfs_checkdiscrecord(struct adfs_discrecord *dr)
return 0;
}
-static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map)
-{
- unsigned int v0, v1, v2, v3;
- int i;
-
- v0 = v1 = v2 = v3 = 0;
- for (i = sb->s_blocksize - 4; i; i -= 4) {
- v0 += map[i] + (v3 >> 8);
- v3 &= 0xff;
- v1 += map[i + 1] + (v0 >> 8);
- v0 &= 0xff;
- v2 += map[i + 2] + (v1 >> 8);
- v1 &= 0xff;
- v3 += map[i + 3] + (v2 >> 8);
- v2 &= 0xff;
- }
- v0 += v3 >> 8;
- v1 += map[1] + (v0 >> 8);
- v2 += map[2] + (v1 >> 8);
- v3 += map[3] + (v2 >> 8);
-
- return v0 ^ v1 ^ v2 ^ v3;
-}
-
-static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm)
-{
- unsigned char crosscheck = 0, zonecheck = 1;
- int i;
-
- for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) {
- unsigned char *map;
-
- map = dm[i].dm_bh->b_data;
-
- if (adfs_calczonecheck(sb, map) != map[0]) {
- adfs_error(sb, "zone %d fails zonecheck", i);
- zonecheck = 0;
- }
- crosscheck ^= map[3];
- }
- if (crosscheck != 0xff)
- adfs_error(sb, "crosscheck != 0xff");
- return crosscheck == 0xff && zonecheck;
-}
-
static void adfs_put_super(struct super_block *sb)
{
- int i;
struct adfs_sb_info *asb = ADFS_SB(sb);
- for (i = 0; i < asb->s_map_size; i++)
- brelse(asb->s_map[i].dm_bh);
- kfree(asb->s_map);
+ adfs_free_map(sb);
kfree_rcu(asb, rcu);
}
@@ -249,16 +201,13 @@ static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct adfs_sb_info *sbi = ADFS_SB(sb);
- struct adfs_discrecord *dr = adfs_map_discrecord(sbi->s_map);
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+ adfs_map_statfs(sb, buf);
+
buf->f_type = ADFS_SUPER_MAGIC;
buf->f_namelen = sbi->s_namelen;
buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = adfs_disc_size(dr) >> sb->s_blocksize_bits;
- buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size;
- buf->f_bavail =
- buf->f_bfree = adfs_map_free(sb);
buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
@@ -282,6 +231,12 @@ static void adfs_free_inode(struct inode *inode)
kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
}
+static int adfs_drop_inode(struct inode *inode)
+{
+ /* always drop inodes if we are read-only */
+ return !IS_ENABLED(CONFIG_ADFS_FS_RW) || IS_RDONLY(inode);
+}
+
static void init_once(void *foo)
{
struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
@@ -314,7 +269,7 @@ static void destroy_inodecache(void)
static const struct super_operations adfs_sops = {
.alloc_inode = adfs_alloc_inode,
.free_inode = adfs_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = adfs_drop_inode,
.write_inode = adfs_write_inode,
.put_super = adfs_put_super,
.statfs = adfs_statfs,
@@ -322,66 +277,94 @@ static const struct super_operations adfs_sops = {
.show_options = adfs_show_options,
};
-static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr)
+static int adfs_probe(struct super_block *sb, unsigned int offset, int silent,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh,
+ struct adfs_discrecord **bhp))
{
- struct adfs_discmap *dm;
- unsigned int map_addr, zone_size, nzones;
- int i, zone;
struct adfs_sb_info *asb = ADFS_SB(sb);
+ struct adfs_discrecord *dr;
+ struct buffer_head *bh;
+ unsigned int blocksize = BLOCK_SIZE;
+ int ret, try;
+
+ for (try = 0; try < 2; try++) {
+ /* try to set the requested block size */
+ if (sb->s_blocksize != blocksize &&
+ !sb_set_blocksize(sb, blocksize)) {
+ if (!silent)
+ adfs_msg(sb, KERN_ERR,
+ "error: unsupported blocksize");
+ return -EINVAL;
+ }
- nzones = asb->s_map_size;
- zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare);
- map_addr = (nzones >> 1) * zone_size -
- ((nzones > 1) ? ADFS_DR_SIZE_BITS : 0);
- map_addr = signed_asl(map_addr, asb->s_map2blk);
+ /* read the buffer */
+ bh = sb_bread(sb, offset >> sb->s_blocksize_bits);
+ if (!bh) {
+ adfs_msg(sb, KERN_ERR,
+ "error: unable to read block %u, try %d",
+ offset >> sb->s_blocksize_bits, try);
+ return -EIO;
+ }
+
+ /* validate it */
+ ret = validate(sb, bh, &dr);
+ if (ret) {
+ brelse(bh);
+ return ret;
+ }
- asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1);
+ /* does the block size match the filesystem block size? */
+ blocksize = 1 << dr->log2secsize;
+ if (sb->s_blocksize == blocksize) {
+ asb->s_map = adfs_read_map(sb, dr);
+ brelse(bh);
+ return PTR_ERR_OR_ZERO(asb->s_map);
+ }
- dm = kmalloc_array(nzones, sizeof(*dm), GFP_KERNEL);
- if (dm == NULL) {
- adfs_error(sb, "not enough memory");
- return ERR_PTR(-ENOMEM);
+ brelse(bh);
}
- for (zone = 0; zone < nzones; zone++, map_addr++) {
- dm[zone].dm_startbit = 0;
- dm[zone].dm_endbit = zone_size;
- dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS;
- dm[zone].dm_bh = sb_bread(sb, map_addr);
+ return -EIO;
+}
- if (!dm[zone].dm_bh) {
- adfs_error(sb, "unable to read map");
- goto error_free;
- }
- }
+static int adfs_validate_bblk(struct super_block *sb, struct buffer_head *bh,
+ struct adfs_discrecord **drp)
+{
+ struct adfs_discrecord *dr;
+ unsigned char *b_data;
- /* adjust the limits for the first and last map zones */
- i = zone - 1;
- dm[0].dm_startblk = 0;
- dm[0].dm_startbit = ADFS_DR_SIZE_BITS;
- dm[i].dm_endbit = (adfs_disc_size(dr) >> dr->log2bpmb) +
- (ADFS_DR_SIZE_BITS - i * zone_size);
+ b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize);
+ if (adfs_checkbblk(b_data))
+ return -EILSEQ;
- if (adfs_checkmap(sb, dm))
- return dm;
+ /* Do some sanity checks on the ADFS disc record */
+ dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
+ if (adfs_checkdiscrecord(dr))
+ return -EILSEQ;
+
+ *drp = dr;
+ return 0;
+}
- adfs_error(sb, "map corrupted");
+static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh,
+ struct adfs_discrecord **drp)
+{
+ struct adfs_discrecord *dr;
-error_free:
- while (--zone >= 0)
- brelse(dm[zone].dm_bh);
+ /* Do some sanity checks on the ADFS disc record */
+ dr = (struct adfs_discrecord *)(bh->b_data + 4);
+ if (adfs_checkdiscrecord(dr) || dr->nzones_high || dr->nzones != 1)
+ return -EILSEQ;
- kfree(dm);
- return ERR_PTR(-EIO);
+ *drp = dr;
+ return 0;
}
static int adfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct adfs_discrecord *dr;
- struct buffer_head *bh;
struct object_info root_obj;
- unsigned char *b_data;
- unsigned int blocksize;
struct adfs_sb_info *asb;
struct inode *root;
int ret = -EINVAL;
@@ -391,7 +374,10 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
asb = kzalloc(sizeof(*asb), GFP_KERNEL);
if (!asb)
return -ENOMEM;
+
sb->s_fs_info = asb;
+ sb->s_magic = ADFS_SUPER_MAGIC;
+ sb->s_time_gran = 10000000;
/* set default options */
asb->s_uid = GLOBAL_ROOT_UID;
@@ -403,78 +389,21 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
if (parse_options(sb, asb, data))
goto error;
- sb_set_blocksize(sb, BLOCK_SIZE);
- if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) {
- adfs_msg(sb, KERN_ERR, "error: unable to read superblock");
- ret = -EIO;
- goto error;
- }
-
- b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE);
-
- if (adfs_checkbblk(b_data)) {
- ret = -EINVAL;
- goto error_badfs;
- }
-
- dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
-
- /*
- * Do some sanity checks on the ADFS disc record
- */
- if (adfs_checkdiscrecord(dr)) {
- ret = -EINVAL;
- goto error_badfs;
- }
-
- blocksize = 1 << dr->log2secsize;
- brelse(bh);
-
- if (sb_set_blocksize(sb, blocksize)) {
- bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize);
- if (!bh) {
- adfs_msg(sb, KERN_ERR,
- "error: couldn't read superblock on 2nd try.");
- ret = -EIO;
- goto error;
- }
- b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize);
- if (adfs_checkbblk(b_data)) {
- adfs_msg(sb, KERN_ERR,
- "error: disc record mismatch, very weird!");
- ret = -EINVAL;
- goto error_free_bh;
- }
- dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
- } else {
+ /* Try to probe the filesystem boot block */
+ ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk);
+ if (ret == -EILSEQ)
+ ret = adfs_probe(sb, 0, silent, adfs_validate_dr0);
+ if (ret == -EILSEQ) {
if (!silent)
adfs_msg(sb, KERN_ERR,
- "error: unsupported blocksize");
+ "error: can't find an ADFS filesystem on dev %s.",
+ sb->s_id);
ret = -EINVAL;
- goto error;
}
+ if (ret)
+ goto error;
- /*
- * blocksize on this device should now be set to the ADFS log2secsize
- */
-
- sb->s_magic = ADFS_SUPER_MAGIC;
- asb->s_idlen = dr->idlen;
- asb->s_map_size = dr->nzones | (dr->nzones_high << 8);
- asb->s_map2blk = dr->log2bpmb - dr->log2secsize;
- asb->s_log2sharesize = dr->log2sharesize;
-
- asb->s_map = adfs_read_map(sb, dr);
- if (IS_ERR(asb->s_map)) {
- ret = PTR_ERR(asb->s_map);
- goto error_free_bh;
- }
-
- brelse(bh);
-
- /*
- * set up enough so that we can read an inode
- */
+ /* set up enough so that we can read an inode */
sb->s_op = &adfs_sops;
dr = adfs_map_discrecord(asb->s_map);
@@ -511,23 +440,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
root = adfs_iget(sb, &root_obj);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
- int i;
- for (i = 0; i < asb->s_map_size; i++)
- brelse(asb->s_map[i].dm_bh);
- kfree(asb->s_map);
+ adfs_free_map(sb);
adfs_error(sb, "get root inode failed\n");
ret = -EIO;
goto error;
}
return 0;
-error_badfs:
- if (!silent)
- adfs_msg(sb, KERN_ERR,
- "error: can't find an ADFS filesystem on dev %s.",
- sb->s_id);
-error_free_bh:
- brelse(bh);
error:
sb->s_fs_info = NULL;
kfree(asb);
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index fd5133e26a38..78ba5f932287 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -134,8 +134,17 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
_leave(" = -ENAMETOOLONG");
return ERR_PTR(-ENAMETOOLONG);
}
- if (namelen == 5 && memcmp(name, "@cell", 5) == 0)
+
+ /* Prohibit cell names that contain unprintable chars, '/' and '@' or
+ * that begin with a dot. This also precludes "@cell".
+ */
+ if (name[0] == '.')
return ERR_PTR(-EINVAL);
+ for (i = 0; i < namelen; i++) {
+ char ch = name[i];
+ if (!isprint(ch) || ch == '/' || ch == '@')
+ return ERR_PTR(-EINVAL);
+ }
_enter("%*.*s,%s", namelen, namelen, name, addresses);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 497f979018c2..5c794f4b051a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -908,6 +908,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct afs_vnode *dvnode = AFS_FS_I(dir);
+ struct afs_fid fid = {};
struct inode *inode;
struct dentry *d;
struct key *key;
@@ -951,21 +952,18 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
afs_stat_v(dvnode, n_lookup);
inode = afs_do_lookup(dir, dentry, key);
key_put(key);
- if (inode == ERR_PTR(-ENOENT)) {
+ if (inode == ERR_PTR(-ENOENT))
inode = afs_try_auto_mntpt(dentry, dir);
- } else {
- dentry->d_fsdata =
- (void *)(unsigned long)dvnode->status.data_version;
- }
+
+ if (!IS_ERR_OR_NULL(inode))
+ fid = AFS_FS_I(inode)->fid;
+
d = d_splice_alias(inode, dentry);
if (!IS_ERR_OR_NULL(d)) {
d->d_fsdata = dentry->d_fsdata;
- trace_afs_lookup(dvnode, &d->d_name,
- inode ? AFS_FS_I(inode) : NULL);
+ trace_afs_lookup(dvnode, &d->d_name, &fid);
} else {
- trace_afs_lookup(dvnode, &dentry->d_name,
- IS_ERR_OR_NULL(inode) ? NULL
- : AFS_FS_I(inode));
+ trace_afs_lookup(dvnode, &dentry->d_name, &fid);
}
return d;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index ecd8d2698515..f4713ea76e82 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -97,7 +97,7 @@ static struct linux_binfmt elf_format = {
.min_coredump = ELF_EXEC_PAGESIZE,
};
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
static int set_brk(unsigned long start, unsigned long end, int prot)
{
@@ -161,9 +161,11 @@ static int padzero(unsigned long elf_bss)
#endif
static int
-create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
- unsigned long load_addr, unsigned long interp_load_addr)
+create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
+ unsigned long load_addr, unsigned long interp_load_addr,
+ unsigned long e_entry)
{
+ struct mm_struct *mm = current->mm;
unsigned long p = bprm->p;
int argc = bprm->argc;
int envc = bprm->envc;
@@ -176,7 +178,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
unsigned char k_rand_bytes[16];
int items;
elf_addr_t *elf_info;
- int ei_index = 0;
+ int ei_index;
const struct cred *cred = current_cred();
struct vm_area_struct *vma;
@@ -226,12 +228,12 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return -EFAULT;
/* Create the ELF interpreter info */
- elf_info = (elf_addr_t *)current->mm->saved_auxv;
+ elf_info = (elf_addr_t *)mm->saved_auxv;
/* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
#define NEW_AUX_ENT(id, val) \
do { \
- elf_info[ei_index++] = id; \
- elf_info[ei_index++] = val; \
+ *elf_info++ = id; \
+ *elf_info++ = val; \
} while (0)
#ifdef ARCH_DLINFO
@@ -251,7 +253,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
NEW_AUX_ENT(AT_BASE, interp_load_addr);
NEW_AUX_ENT(AT_FLAGS, 0);
- NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
+ NEW_AUX_ENT(AT_ENTRY, e_entry);
NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid));
NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid));
NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid));
@@ -275,12 +277,13 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
}
#undef NEW_AUX_ENT
/* AT_NULL is zero; clear the rest too */
- memset(&elf_info[ei_index], 0,
- sizeof current->mm->saved_auxv - ei_index * sizeof elf_info[0]);
+ memset(elf_info, 0, (char *)mm->saved_auxv +
+ sizeof(mm->saved_auxv) - (char *)elf_info);
/* And advance past the AT_NULL entry. */
- ei_index += 2;
+ elf_info += 2;
+ ei_index = elf_info - (elf_addr_t *)mm->saved_auxv;
sp = STACK_ADD(p, ei_index);
items = (argc + 1) + (envc + 1) + 1;
@@ -299,7 +302,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
* Grow the stack manually; some architectures have a limit on how
* far ahead a user-space access may be in order to grow the stack.
*/
- vma = find_extend_vma(current->mm, bprm->p);
+ vma = find_extend_vma(mm, bprm->p);
if (!vma)
return -EFAULT;
@@ -308,7 +311,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return -EFAULT;
/* Populate list of argv pointers back to argv strings. */
- p = current->mm->arg_end = current->mm->arg_start;
+ p = mm->arg_end = mm->arg_start;
while (argc-- > 0) {
size_t len;
if (__put_user((elf_addr_t)p, sp++))
@@ -320,10 +323,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
}
if (__put_user(0, sp++))
return -EFAULT;
- current->mm->arg_end = p;
+ mm->arg_end = p;
/* Populate list of envp pointers back to envp strings. */
- current->mm->env_end = current->mm->env_start = p;
+ mm->env_end = mm->env_start = p;
while (envc-- > 0) {
size_t len;
if (__put_user((elf_addr_t)p, sp++))
@@ -335,10 +338,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
}
if (__put_user(0, sp++))
return -EFAULT;
- current->mm->env_end = p;
+ mm->env_end = p;
/* Put the elf_info on the stack in the right place. */
- if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t)))
+ if (copy_to_user(sp, mm->saved_auxv, ei_index * sizeof(elf_addr_t)))
return -EFAULT;
return 0;
}
@@ -689,15 +692,17 @@ static int load_elf_binary(struct linux_binprm *bprm)
int bss_prot = 0;
int retval, i;
unsigned long elf_entry;
+ unsigned long e_entry;
unsigned long interp_load_addr = 0;
unsigned long start_code, end_code, start_data, end_data;
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
+ struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
struct {
- struct elfhdr elf_ex;
struct elfhdr interp_elf_ex;
} *loc;
struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
+ struct mm_struct *mm;
struct pt_regs *regs;
loc = kmalloc(sizeof(*loc), GFP_KERNEL);
@@ -705,30 +710,27 @@ static int load_elf_binary(struct linux_binprm *bprm)
retval = -ENOMEM;
goto out_ret;
}
-
- /* Get the exec-header */
- loc->elf_ex = *((struct elfhdr *)bprm->buf);
retval = -ENOEXEC;
/* First of all, some simple consistency checks */
- if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+ if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
goto out;
- if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
+ if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN)
goto out;
- if (!elf_check_arch(&loc->elf_ex))
+ if (!elf_check_arch(elf_ex))
goto out;
- if (elf_check_fdpic(&loc->elf_ex))
+ if (elf_check_fdpic(elf_ex))
goto out;
if (!bprm->file->f_op->mmap)
goto out;
- elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file);
+ elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
if (!elf_phdata)
goto out;
elf_ppnt = elf_phdata;
- for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+ for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) {
char *elf_interpreter;
if (elf_ppnt->p_type != PT_INTERP)
@@ -782,7 +784,7 @@ out_free_interp:
}
elf_ppnt = elf_phdata;
- for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
+ for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++)
switch (elf_ppnt->p_type) {
case PT_GNU_STACK:
if (elf_ppnt->p_flags & PF_X)
@@ -792,7 +794,7 @@ out_free_interp:
break;
case PT_LOPROC ... PT_HIPROC:
- retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt,
+ retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
bprm->file, false,
&arch_state);
if (retval)
@@ -836,7 +838,7 @@ out_free_interp:
* still possible to return an error to the code that invoked
* the exec syscall.
*/
- retval = arch_check_elf(&loc->elf_ex,
+ retval = arch_check_elf(elf_ex,
!!interpreter, &loc->interp_elf_ex,
&arch_state);
if (retval)
@@ -849,8 +851,8 @@ out_free_interp:
/* Do this immediately, since STACK_TOP as used in setup_arg_pages
may depend on the personality. */
- SET_PERSONALITY2(loc->elf_ex, &arch_state);
- if (elf_read_implies_exec(loc->elf_ex, executable_stack))
+ SET_PERSONALITY2(*elf_ex, &arch_state);
+ if (elf_read_implies_exec(*elf_ex, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
@@ -877,7 +879,7 @@ out_free_interp:
/* Now we do a little grungy work by mmapping the ELF image into
the correct location in memory. */
for(i = 0, elf_ppnt = elf_phdata;
- i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+ i < elf_ex->e_phnum; i++, elf_ppnt++) {
int elf_prot, elf_flags;
unsigned long k, vaddr;
unsigned long total_size = 0;
@@ -921,9 +923,9 @@ out_free_interp:
* If we are loading ET_EXEC or we have already performed
* the ET_DYN load_addr calculations, proceed normally.
*/
- if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
+ if (elf_ex->e_type == ET_EXEC || load_addr_set) {
elf_flags |= MAP_FIXED;
- } else if (loc->elf_ex.e_type == ET_DYN) {
+ } else if (elf_ex->e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
* Header for ET_DYN binaries to calculate the
@@ -972,7 +974,7 @@ out_free_interp:
load_bias = ELF_PAGESTART(load_bias - vaddr);
total_size = total_mapping_size(elf_phdata,
- loc->elf_ex.e_phnum);
+ elf_ex->e_phnum);
if (!total_size) {
retval = -EINVAL;
goto out_free_dentry;
@@ -990,7 +992,7 @@ out_free_interp:
if (!load_addr_set) {
load_addr_set = 1;
load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
- if (loc->elf_ex.e_type == ET_DYN) {
+ if (elf_ex->e_type == ET_DYN) {
load_bias += error -
ELF_PAGESTART(load_bias + vaddr);
load_addr += load_bias;
@@ -998,7 +1000,7 @@ out_free_interp:
}
}
k = elf_ppnt->p_vaddr;
- if (k < start_code)
+ if ((elf_ppnt->p_flags & PF_X) && k < start_code)
start_code = k;
if (start_data < k)
start_data = k;
@@ -1031,7 +1033,7 @@ out_free_interp:
}
}
- loc->elf_ex.e_entry += load_bias;
+ e_entry = elf_ex->e_entry + load_bias;
elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
@@ -1074,7 +1076,7 @@ out_free_interp:
allow_write_access(interpreter);
fput(interpreter);
} else {
- elf_entry = loc->elf_ex.e_entry;
+ elf_entry = e_entry;
if (BAD_ADDR(elf_entry)) {
retval = -EINVAL;
goto out_free_dentry;
@@ -1092,15 +1094,17 @@ out_free_interp:
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
- retval = create_elf_tables(bprm, &loc->elf_ex,
- load_addr, interp_load_addr);
+ retval = create_elf_tables(bprm, elf_ex,
+ load_addr, interp_load_addr, e_entry);
if (retval < 0)
goto out;
- current->mm->end_code = end_code;
- current->mm->start_code = start_code;
- current->mm->start_data = start_data;
- current->mm->end_data = end_data;
- current->mm->start_stack = bprm->p;
+
+ mm = current->mm;
+ mm->end_code = end_code;
+ mm->start_code = start_code;
+ mm->start_data = start_data;
+ mm->end_data = end_data;
+ mm->start_stack = bprm->p;
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
/*
@@ -1111,12 +1115,11 @@ out_free_interp:
* growing down), and into the unused ELF_ET_DYN_BASE region.
*/
if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
- loc->elf_ex.e_type == ET_DYN && !interpreter)
- current->mm->brk = current->mm->start_brk =
- ELF_ET_DYN_BASE;
+ elf_ex->e_type == ET_DYN && !interpreter) {
+ mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
+ }
- current->mm->brk = current->mm->start_brk =
- arch_randomize_brk(current->mm);
+ mm->brk = mm->start_brk = arch_randomize_brk(mm);
#ifdef compat_brk_randomized
current->brk_randomized = 1;
#endif
@@ -1574,6 +1577,7 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
*/
static int fill_files_note(struct memelfnote *note)
{
+ struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned count, size, names_ofs, remaining, n;
user_long_t *data;
@@ -1581,7 +1585,7 @@ static int fill_files_note(struct memelfnote *note)
char *name_base, *name_curpos;
/* *Estimated* file count and total data size needed */
- count = current->mm->map_count;
+ count = mm->map_count;
if (count > UINT_MAX / 64)
return -EINVAL;
size = count * 64;
@@ -1591,6 +1595,10 @@ static int fill_files_note(struct memelfnote *note)
if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
return -EINVAL;
size = round_up(size, PAGE_SIZE);
+ /*
+ * "size" can be 0 here legitimately.
+ * Let it ENOMEM and omit NT_FILE section which will be empty anyway.
+ */
data = kvmalloc(size, GFP_KERNEL);
if (ZERO_OR_NULL_PTR(data))
return -ENOMEM;
@@ -1599,7 +1607,7 @@ static int fill_files_note(struct memelfnote *note)
name_base = name_curpos = ((char *)data) + names_ofs;
remaining = size - names_ofs;
count = 0;
- for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
struct file *file;
const char *filename;
@@ -1633,10 +1641,10 @@ static int fill_files_note(struct memelfnote *note)
data[0] = count;
data[1] = PAGE_SIZE;
/*
- * Count usually is less than current->mm->map_count,
+ * Count usually is less than mm->map_count,
* we need to move filenames down.
*/
- n = current->mm->map_count - count;
+ n = mm->map_count - count;
if (n != 0) {
unsigned shift_bytes = n * 3 * sizeof(data[0]);
memmove(name_base - shift_bytes, name_base,
@@ -2182,7 +2190,7 @@ static int elf_core_dump(struct coredump_params *cprm)
int segs, i;
size_t vma_data_size = 0;
struct vm_area_struct *vma, *gate_vma;
- struct elfhdr *elf = NULL;
+ struct elfhdr elf;
loff_t offset = 0, dataoff;
struct elf_note_info info = { };
struct elf_phdr *phdr4note = NULL;
@@ -2203,10 +2211,6 @@ static int elf_core_dump(struct coredump_params *cprm)
* exists while dumping the mm->vm_next areas to the core file.
*/
- /* alloc memory for large data structures: too large to be on stack */
- elf = kmalloc(sizeof(*elf), GFP_KERNEL);
- if (!elf)
- goto out;
/*
* The number of segs are recored into ELF header as 16bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
@@ -2230,7 +2234,7 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs))
+ if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs))
goto cleanup;
has_dumped = 1;
@@ -2238,7 +2242,7 @@ static int elf_core_dump(struct coredump_params *cprm)
fs = get_fs();
set_fs(KERNEL_DS);
- offset += sizeof(*elf); /* Elf header */
+ offset += sizeof(elf); /* Elf header */
offset += segs * sizeof(struct elf_phdr); /* Program headers */
/* Write notes phdr entry */
@@ -2257,11 +2261,13 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz))
- goto end_coredump;
+ /*
+ * Zero vma process will get ZERO_SIZE_PTR here.
+ * Let coredump continue for register state at least.
+ */
vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)),
GFP_KERNEL);
- if (ZERO_OR_NULL_PTR(vma_filesz))
+ if (!vma_filesz)
goto end_coredump;
for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
@@ -2281,12 +2287,12 @@ static int elf_core_dump(struct coredump_params *cprm)
shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
if (!shdr4extnum)
goto end_coredump;
- fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
+ fill_extnum_info(&elf, shdr4extnum, e_shoff, segs);
}
offset = dataoff;
- if (!dump_emit(cprm, elf, sizeof(*elf)))
+ if (!dump_emit(cprm, &elf, sizeof(elf)))
goto end_coredump;
if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
@@ -2370,8 +2376,6 @@ cleanup:
kfree(shdr4extnum);
kvfree(vma_filesz);
kfree(phdr4note);
- kfree(elf);
-out:
return has_dumped;
}
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 82200dbca5ac..9a0ff3384381 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
- block-rsv.o delalloc-space.o block-group.o
+ block-rsv.o delalloc-space.o block-group.o discard.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6934a5b8708f..14851584e245 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -14,6 +14,8 @@
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
+#include "discard.h"
+#include "raid56.h"
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -95,7 +97,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
return extended_to_chunk(flags | allowed);
}
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
unsigned seq;
u64 flags;
@@ -115,11 +117,6 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
return btrfs_reduce_alloc_profile(fs_info, flags);
}
-u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
-{
- return get_alloc_profile(fs_info, orig_flags);
-}
-
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
atomic_inc(&cache->count);
@@ -132,6 +129,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
WARN_ON(cache->reserved > 0);
/*
+ * A block_group shouldn't be on the discard_list anymore.
+ * Remove the block_group from the discard_list to prevent us
+ * from causing a panic due to NULL pointer dereference.
+ */
+ if (WARN_ON(!list_empty(&cache->discard_list)))
+ btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
+ cache);
+
+ /*
* If not empty, someone is still holding mutex of
* full_stripe_lock, which can only be released by caller.
* And it will definitely cause use-after-free when caller
@@ -466,8 +472,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
} else if (extent_start > start && extent_start < end) {
size = extent_start - start;
total_added += size;
- ret = btrfs_add_free_space(block_group, start,
- size);
+ ret = btrfs_add_free_space_async_trimmed(block_group,
+ start, size);
BUG_ON(ret); /* -ENOMEM or logic error */
start = extent_end + 1;
} else {
@@ -478,7 +484,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
if (start < end) {
size = end - start;
total_added += size;
- ret = btrfs_add_free_space(block_group, start, size);
+ ret = btrfs_add_free_space_async_trimmed(block_group, start,
+ size);
BUG_ON(ret); /* -ENOMEM or logic error */
}
@@ -1185,21 +1192,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
u64 sinfo_used;
- u64 min_allocable_bytes;
int ret = -ENOSPC;
- /*
- * We need some metadata space and system metadata space for
- * allocating chunks in some corner cases until we force to set
- * it to be readonly.
- */
- if ((sinfo->flags &
- (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
- !force)
- min_allocable_bytes = SZ_1M;
- else
- min_allocable_bytes = 0;
-
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
@@ -1217,10 +1211,9 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
* sinfo_used + num_bytes should always <= sinfo->total_bytes.
*
* Here we make sure if we mark this bg RO, we still have enough
- * free space as buffer (if min_allocable_bytes is not 0).
+ * free space as buffer.
*/
- if (sinfo_used + num_bytes + min_allocable_bytes <=
- sinfo->total_bytes) {
+ if (sinfo_used + num_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -1233,8 +1226,8 @@ out:
btrfs_info(cache->fs_info,
"unable to make block group %llu ro", cache->start);
btrfs_info(cache->fs_info,
- "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
- sinfo_used, num_bytes, min_allocable_bytes);
+ "sinfo_used=%llu bg_num_bytes=%llu",
+ sinfo_used, num_bytes);
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
}
return ret;
@@ -1249,6 +1242,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
+ const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
int ret = 0;
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
@@ -1272,10 +1266,28 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+
mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
+
+ /*
+ * Async discard moves the final block group discard to be prior
+ * to the unused_bgs code path. Therefore, if it's not fully
+ * trimmed, punt it back to the async discard lists.
+ */
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
+ !btrfs_is_free_space_trimmed(block_group)) {
+ trace_btrfs_skip_unused_block_group(block_group);
+ up_write(&space_info->groups_sem);
+ /* Requeue if we failed because of async discard */
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ block_group);
+ goto next;
+ }
+
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->pinned ||
block_group->used || block_group->ro ||
@@ -1347,6 +1359,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ /*
+ * At this point, the block_group is read only and should fail
+ * new allocations. However, btrfs_finish_extent_commit() can
+ * cause this block_group to be placed back on the discard
+ * lists because now the block_group isn't fully discarded.
+ * Bail here and try again later after discarding everything.
+ */
+ spin_lock(&fs_info->discard_ctl.lock);
+ if (!list_empty(&block_group->discard_list)) {
+ spin_unlock(&fs_info->discard_ctl.lock);
+ btrfs_dec_block_group_ro(block_group);
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ block_group);
+ goto end_trans;
+ }
+ spin_unlock(&fs_info->discard_ctl.lock);
+
/* Reset pinned so btrfs_put_block_group doesn't complain */
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
@@ -1362,8 +1391,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
+ /*
+ * The normal path here is an unused block group is passed here,
+ * then trimming is handled in the transaction commit path.
+ * Async discard interposes before this to do the trimming
+ * before coming down the unused block group path as trimming
+ * will no longer be done later in the transaction commit path.
+ */
+ if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ goto flip_async;
+
/* DISCARD can flip during remount */
- trimming = btrfs_test_opt(fs_info, DISCARD);
+ trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
/* Implicit trim during transaction commit. */
if (trimming)
@@ -1406,6 +1445,13 @@ next:
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
+ return;
+
+flip_async:
+ btrfs_end_transaction(trans);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ btrfs_put_block_group(block_group);
+ btrfs_discard_punt_unused_bgs_list(fs_info);
}
void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
@@ -1516,6 +1562,102 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
write_sequnlock(&fs_info->profiles_lock);
}
+/**
+ * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+ * @chunk_start: logical address of block group
+ * @physical: physical address to map to logical addresses
+ * @logical: return array of logical addresses which map to @physical
+ * @naddrs: length of @logical
+ * @stripe_len: size of IO stripe for the given block group
+ *
+ * Maps a particular @physical disk address to a list of @logical addresses.
+ * Used primarily to exclude those portions of a block group that contain super
+ * block copies.
+ */
+EXPORT_FOR_TESTS
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 *buf;
+ u64 bytenr;
+ u64 data_stripe_length;
+ u64 io_stripe_size;
+ int i, nr = 0;
+ int ret = 0;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(em))
+ return -EIO;
+
+ map = em->map_lookup;
+ data_stripe_length = em->len;
+ io_stripe_size = map->stripe_len;
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ data_stripe_length = div_u64(data_stripe_length,
+ map->num_stripes / map->sub_stripes);
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ data_stripe_length = div_u64(data_stripe_length,
+ nr_data_stripes(map));
+ io_stripe_size = map->stripe_len * nr_data_stripes(map);
+ }
+
+ buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ bool already_inserted = false;
+ u64 stripe_nr;
+ int j;
+
+ if (!in_range(physical, map->stripes[i].physical,
+ data_stripe_length))
+ continue;
+
+ stripe_nr = physical - map->stripes[i].physical;
+ stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ stripe_nr = div_u64(stripe_nr, map->sub_stripes);
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ }
+ /*
+ * The remaining case would be for RAID56, multiply by
+ * nr_data_stripes(). Alternatively, just use rmap_len below
+ * instead of map->stripe_len
+ */
+
+ bytenr = chunk_start + stripe_nr * io_stripe_size;
+
+ /* Ensure we don't add duplicate addresses */
+ for (j = 0; j < nr; j++) {
+ if (buf[j] == bytenr) {
+ already_inserted = true;
+ break;
+ }
+ }
+
+ if (!already_inserted)
+ buf[nr++] = bytenr;
+ }
+
+ *logical = buf;
+ *naddrs = nr;
+ *stripe_len = io_stripe_size;
+out:
+ free_extent_map(em);
+ return ret;
+}
+
static int exclude_super_stripes(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -1610,6 +1752,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
set_free_space_tree_thresholds(cache);
+ cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
+
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
@@ -1617,6 +1761,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
+ INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
@@ -1775,7 +1920,10 @@ static int read_one_block_group(struct btrfs_fs_info *info,
inc_block_group_ro(cache, 1);
} else if (cache->used == 0) {
ASSERT(list_empty(&cache->bg_list));
- btrfs_mark_bg_unused(cache);
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
}
return 0;
error:
@@ -2738,8 +2886,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* dirty list to avoid races between cleaner kthread and space
* cache writeout.
*/
- if (!alloc && old_val == 0)
- btrfs_mark_bg_unused(cache);
+ if (!alloc && old_val == 0) {
+ if (!btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(cache);
+ }
btrfs_put_block_group(cache);
total -= num_bytes;
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 9b409676c4b2..107bb557ca8d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -13,6 +13,19 @@ enum btrfs_disk_cache_state {
};
/*
+ * This describes the state of the block_group for async discard. This is due
+ * to the two pass nature of it where extent discarding is prioritized over
+ * bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting
+ * between lists to prevent contention for discard state variables
+ * (eg. discard_cursor).
+ */
+enum btrfs_discard_state {
+ BTRFS_DISCARD_EXTENTS,
+ BTRFS_DISCARD_BITMAPS,
+ BTRFS_DISCARD_RESET_CURSOR,
+};
+
+/*
* Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to
* only allocate a chunk if we really need one.
*
@@ -116,7 +129,13 @@ struct btrfs_block_group {
/* For read-only block groups */
struct list_head ro_list;
+ /* For discard operations */
atomic_t trimming;
+ struct list_head discard_list;
+ int discard_index;
+ u64 discard_eligible_time;
+ u64 discard_cursor;
+ enum btrfs_discard_state discard_state;
/* For dirty block groups */
struct list_head dirty_list;
@@ -158,6 +177,22 @@ struct btrfs_block_group {
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
};
+static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
+{
+ return (block_group->start + block_group->length);
+}
+
+static inline bool btrfs_is_block_group_data_only(
+ struct btrfs_block_group *block_group)
+{
+ /*
+ * In mixed mode the fragmentation is expected to be high, lowering the
+ * efficiency, so only proper data block groups are considered.
+ */
+ return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
+ !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA);
+}
+
#ifdef CONFIG_BTRFS_DEBUG
static inline int btrfs_should_fragment_free_space(
struct btrfs_block_group *block_group)
@@ -248,4 +283,9 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
cache->cached == BTRFS_CACHE_ERROR;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len);
+#endif
+
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 0b52ab4cb964..a0ce69f2d27c 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -629,7 +629,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
static int btrfsic_process_superblock(struct btrfsic_state *state,
struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_super_block *selected_super;
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
@@ -637,7 +636,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
int ret = 0;
int pass;
- BUG_ON(NULL == state);
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
if (NULL == selected_super) {
pr_info("btrfsic: error, kmalloc failed!\n");
@@ -700,7 +698,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
break;
}
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
+ num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
pr_info("num_copies(log_bytenr=%llu) = %d\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee834ef7beb4..9ab610cc9114 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -447,7 +447,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
if (blkcg_css) {
bio->bi_opf |= REQ_CGROUP_PUNT;
- bio_associate_blkg_from_css(bio, blkcg_css);
+ kthread_associate_blkcg(blkcg_css);
}
refcount_set(&cb->pending_bios, 1);
@@ -491,6 +491,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
+ if (blkcg_css)
+ bio->bi_opf |= REQ_CGROUP_PUNT;
bio_add_page(bio, page, PAGE_SIZE, 0);
}
if (bytes_left < PAGE_SIZE) {
@@ -517,6 +519,9 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_endio(bio);
}
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
+
return 0;
}
@@ -758,7 +763,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(inode, comp_bio,
- sums);
+ (u64)-1, sums);
BUG_ON(ret); /* -ENOMEM */
}
@@ -786,7 +791,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
BUG_ON(ret); /* -ENOMEM */
}
@@ -1285,7 +1290,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
/* copy bytes from the working buffer into the pages */
while (working_bytes > 0) {
bytes = min_t(unsigned long, bvec.bv_len,
- PAGE_SIZE - buf_offset);
+ PAGE_SIZE - (buf_offset % PAGE_SIZE));
bytes = min(bytes, working_bytes);
kaddr = kmap_atomic(bvec.bv_page);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 54efb21c2727..f90b82050d2d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -101,6 +101,14 @@ struct btrfs_ref;
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
+/*
+ * Deltas are an effective way to populate global statistics. Give macro names
+ * to make it clear what we're doing. An example is discard_extents in
+ * btrfs_free_space_ctl.
+ */
+#define BTRFS_STAT_NR_ENTRIES 2
+#define BTRFS_STAT_CURR 0
+#define BTRFS_STAT_PREV 1
/*
* Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
@@ -440,6 +448,36 @@ struct btrfs_full_stripe_locks_tree {
struct mutex lock;
};
+/* Discard control. */
+/*
+ * Async discard uses multiple lists to differentiate the discard filter
+ * parameters. Index 0 is for completely free block groups where we need to
+ * ensure the entire block group is trimmed without being lossy. Indices
+ * afterwards represent monotonically decreasing discard filter sizes to
+ * prioritize what should be discarded next.
+ */
+#define BTRFS_NR_DISCARD_LISTS 3
+#define BTRFS_DISCARD_INDEX_UNUSED 0
+#define BTRFS_DISCARD_INDEX_START 1
+
+struct btrfs_discard_ctl {
+ struct workqueue_struct *discard_workers;
+ struct delayed_work work;
+ spinlock_t lock;
+ struct btrfs_block_group *block_group;
+ struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
+ u64 prev_discard;
+ atomic_t discardable_extents;
+ atomic64_t discardable_bytes;
+ u64 max_discard_size;
+ unsigned long delay;
+ u32 iops_limit;
+ u32 kbps_limit;
+ u64 discard_extent_bytes;
+ u64 discard_bitmap_bytes;
+ atomic64_t discard_bytes_saved;
+};
+
/* delayed seq elem */
struct seq_list {
struct list_head list;
@@ -526,6 +564,9 @@ enum {
* so we don't need to offload checksums to workqueues.
*/
BTRFS_FS_CSUM_IMPL_FAST,
+
+ /* Indicate that the discard workqueue can service discards. */
+ BTRFS_FS_DISCARD_RUNNING,
};
struct btrfs_fs_info {
@@ -816,6 +857,8 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_parity_workers;
+ struct btrfs_discard_ctl discard_ctl;
+
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
#endif
@@ -902,6 +945,11 @@ struct btrfs_fs_info {
spinlock_t ref_verify_lock;
struct rb_root block_tree;
#endif
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct kobject *debug_kobj;
+ struct kobject *discard_debug_kobj;
+#endif
};
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
@@ -1170,7 +1218,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
#define BTRFS_MOUNT_NOSSD (1 << 9)
-#define BTRFS_MOUNT_DISCARD (1 << 10)
+#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10)
#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
@@ -1189,6 +1237,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
+#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
@@ -2449,8 +2498,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
-int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len);
+int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start,
+ u64 len);
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -2789,9 +2838,7 @@ struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u8 *dst);
-blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
- u64 logical_offset);
+ u64 offset, u8 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2877,7 +2924,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
- u64 start, u64 end, int create);
+ u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
@@ -3110,17 +3157,21 @@ do { \
rcu_read_unlock(); \
} while (0)
-__cold
-static inline void assfail(const char *expr, const char *file, int line)
+#ifdef CONFIG_BTRFS_ASSERT
+__cold __noreturn
+static inline void assertfail(const char *expr, const char *file, int line)
{
- if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
- pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
- BUG();
- }
+ pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
+ BUG();
}
-#define ASSERT(expr) \
- (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+#define ASSERT(expr) \
+ (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__))
+
+#else
+static inline void assertfail(const char *expr, const char* file, int line) { }
+#define ASSERT(expr) (void)(expr)
+#endif
/*
* Use that for functions that are conditionally exported for sanity tests but
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f639dde2a679..2ca2a09d0e23 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -500,11 +500,8 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
- if (ret == -EINPROGRESS) {
+ if (ret == -EINPROGRESS)
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
- } else if (ret != -ECANCELED) {
- WARN_ON(ret);
- }
return ret;
@@ -707,6 +704,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* replace the sysfs entry */
btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
+ btrfs_sysfs_update_devid(tgt_device);
btrfs_rm_dev_replace_free_srcdev(src_device);
/* write back the superblocks */
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
new file mode 100644
index 000000000000..5615320fa659
--- /dev/null
+++ b/fs/btrfs/discard.c
@@ -0,0 +1,702 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/math64.h>
+#include <linux/sizes.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "block-group.h"
+#include "discard.h"
+#include "free-space-cache.h"
+
+/*
+ * This contains the logic to handle async discard.
+ *
+ * Async discard manages trimming of free space outside of transaction commit.
+ * Discarding is done by managing the block_groups on a LRU list based on free
+ * space recency. Two passes are used to first prioritize discarding extents
+ * and then allow for trimming in the bitmap the best opportunity to coalesce.
+ * The block_groups are maintained on multiple lists to allow for multiple
+ * passes with different discard filter requirements. A delayed work item is
+ * used to manage discarding with timeout determined by a max of the delay
+ * incurred by the iops rate limit, the byte rate limit, and the max delay of
+ * BTRFS_DISCARD_MAX_DELAY.
+ *
+ * Note, this only keeps track of block_groups that are explicitly for data.
+ * Mixed block_groups are not supported.
+ *
+ * The first list is special to manage discarding of fully free block groups.
+ * This is necessary because we issue a final trim for a full free block group
+ * after forgetting it. When a block group becomes unused, instead of directly
+ * being added to the unused_bgs list, we add it to this first list. Then
+ * from there, if it becomes fully discarded, we place it onto the unused_bgs
+ * list.
+ *
+ * The in-memory free space cache serves as the backing state for discard.
+ * Consequently this means there is no persistence. We opt to load all the
+ * block groups in as not discarded, so the mount case degenerates to the
+ * crashing case.
+ *
+ * As the free space cache uses bitmaps, there exists a tradeoff between
+ * ease/efficiency for find_free_extent() and the accuracy of discard state.
+ * Here we opt to let untrimmed regions merge with everything while only letting
+ * trimmed regions merge with other trimmed regions. This can cause
+ * overtrimming, but the coalescing benefit seems to be worth it. Additionally,
+ * bitmap state is tracked as a whole. If we're able to fully trim a bitmap,
+ * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in,
+ * this resets the state and we will retry trimming the whole bitmap. This is a
+ * tradeoff between discard state accuracy and the cost of accounting.
+ */
+
+/* This is an initial delay to give some chance for block reuse */
+#define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC)
+#define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC)
+
+/* Target completion latency of discarding all discardable extents */
+#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC)
+#define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL)
+#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
+#define BTRFS_DISCARD_MAX_IOPS (10U)
+
+/* Montonically decreasing minimum length filters after index 0 */
+static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
+ 0,
+ BTRFS_ASYNC_DISCARD_MAX_FILTER,
+ BTRFS_ASYNC_DISCARD_MIN_FILTER
+};
+
+static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ return &discard_ctl->discard_list[block_group->discard_index];
+}
+
+static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+
+ if (list_empty(&block_group->discard_list) ||
+ block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
+ if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
+ block_group->discard_index = BTRFS_DISCARD_INDEX_START;
+ block_group->discard_eligible_time = (ktime_get_ns() +
+ BTRFS_DISCARD_DELAY);
+ block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
+ }
+
+ list_move_tail(&block_group->discard_list,
+ get_discard_list(discard_ctl, block_group));
+}
+
+static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (!btrfs_is_block_group_data_only(block_group))
+ return;
+
+ spin_lock(&discard_ctl->lock);
+ __add_to_discard_list(discard_ctl, block_group);
+ spin_unlock(&discard_ctl->lock);
+}
+
+static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ spin_lock(&discard_ctl->lock);
+
+ if (!btrfs_run_discard_work(discard_ctl)) {
+ spin_unlock(&discard_ctl->lock);
+ return;
+ }
+
+ list_del_init(&block_group->discard_list);
+
+ block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
+ block_group->discard_eligible_time = (ktime_get_ns() +
+ BTRFS_DISCARD_UNUSED_DELAY);
+ block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
+ list_add_tail(&block_group->discard_list,
+ &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
+
+ spin_unlock(&discard_ctl->lock);
+}
+
+static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ bool running = false;
+
+ spin_lock(&discard_ctl->lock);
+
+ if (block_group == discard_ctl->block_group) {
+ running = true;
+ discard_ctl->block_group = NULL;
+ }
+
+ block_group->discard_eligible_time = 0;
+ list_del_init(&block_group->discard_list);
+
+ spin_unlock(&discard_ctl->lock);
+
+ return running;
+}
+
+/**
+ * find_next_block_group - find block_group that's up next for discarding
+ * @discard_ctl: discard control
+ * @now: current time
+ *
+ * Iterate over the discard lists to find the next block_group up for
+ * discarding checking the discard_eligible_time of block_group.
+ */
+static struct btrfs_block_group *find_next_block_group(
+ struct btrfs_discard_ctl *discard_ctl,
+ u64 now)
+{
+ struct btrfs_block_group *ret_block_group = NULL, *block_group;
+ int i;
+
+ for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
+ struct list_head *discard_list = &discard_ctl->discard_list[i];
+
+ if (!list_empty(discard_list)) {
+ block_group = list_first_entry(discard_list,
+ struct btrfs_block_group,
+ discard_list);
+
+ if (!ret_block_group)
+ ret_block_group = block_group;
+
+ if (ret_block_group->discard_eligible_time < now)
+ break;
+
+ if (ret_block_group->discard_eligible_time >
+ block_group->discard_eligible_time)
+ ret_block_group = block_group;
+ }
+ }
+
+ return ret_block_group;
+}
+
+/**
+ * peek_discard_list - wrap find_next_block_group()
+ * @discard_ctl: discard control
+ * @discard_state: the discard_state of the block_group after state management
+ * @discard_index: the discard_index of the block_group after state management
+ *
+ * This wraps find_next_block_group() and sets the block_group to be in use.
+ * discard_state's control flow is managed here. Variables related to
+ * discard_state are reset here as needed (eg discard_cursor). @discard_state
+ * and @discard_index are remembered as it may change while we're discarding,
+ * but we want the discard to execute in the context determined here.
+ */
+static struct btrfs_block_group *peek_discard_list(
+ struct btrfs_discard_ctl *discard_ctl,
+ enum btrfs_discard_state *discard_state,
+ int *discard_index)
+{
+ struct btrfs_block_group *block_group;
+ const u64 now = ktime_get_ns();
+
+ spin_lock(&discard_ctl->lock);
+again:
+ block_group = find_next_block_group(discard_ctl, now);
+
+ if (block_group && now > block_group->discard_eligible_time) {
+ if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
+ block_group->used != 0) {
+ if (btrfs_is_block_group_data_only(block_group))
+ __add_to_discard_list(discard_ctl, block_group);
+ else
+ list_del_init(&block_group->discard_list);
+ goto again;
+ }
+ if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
+ block_group->discard_cursor = block_group->start;
+ block_group->discard_state = BTRFS_DISCARD_EXTENTS;
+ }
+ discard_ctl->block_group = block_group;
+ *discard_state = block_group->discard_state;
+ *discard_index = block_group->discard_index;
+ } else {
+ block_group = NULL;
+ }
+
+ spin_unlock(&discard_ctl->lock);
+
+ return block_group;
+}
+
+/**
+ * btrfs_discard_check_filter - updates a block groups filters
+ * @block_group: block group of interest
+ * @bytes: recently freed region size after coalescing
+ *
+ * Async discard maintains multiple lists with progressively smaller filters
+ * to prioritize discarding based on size. Should a free space that matches
+ * a larger filter be returned to the free_space_cache, prioritize that discard
+ * by moving @block_group to the proper filter.
+ */
+void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
+ u64 bytes)
+{
+ struct btrfs_discard_ctl *discard_ctl;
+
+ if (!block_group ||
+ !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
+ return;
+
+ discard_ctl = &block_group->fs_info->discard_ctl;
+
+ if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
+ bytes >= discard_minlen[block_group->discard_index - 1]) {
+ int i;
+
+ remove_from_discard_list(discard_ctl, block_group);
+
+ for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
+ i++) {
+ if (bytes >= discard_minlen[i]) {
+ block_group->discard_index = i;
+ add_to_discard_list(discard_ctl, block_group);
+ break;
+ }
+ }
+ }
+}
+
+/**
+ * btrfs_update_discard_index - moves a block group along the discard lists
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * Increment @block_group's discard_index. If it falls of the list, let it be.
+ * Otherwise add it back to the appropriate list.
+ */
+static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ block_group->discard_index++;
+ if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
+ block_group->discard_index = 1;
+ return;
+ }
+
+ add_to_discard_list(discard_ctl, block_group);
+}
+
+/**
+ * btrfs_discard_cancel_work - remove a block_group from the discard lists
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * This removes @block_group from the discard lists. If necessary, it waits on
+ * the current work and then reschedules the delayed work.
+ */
+void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (remove_from_discard_list(discard_ctl, block_group)) {
+ cancel_delayed_work_sync(&discard_ctl->work);
+ btrfs_discard_schedule_work(discard_ctl, true);
+ }
+}
+
+/**
+ * btrfs_discard_queue_work - handles queuing the block_groups
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * This maintains the LRU order of the discard lists.
+ */
+void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
+ return;
+
+ if (block_group->used == 0)
+ add_to_discard_unused_list(discard_ctl, block_group);
+ else
+ add_to_discard_list(discard_ctl, block_group);
+
+ if (!delayed_work_pending(&discard_ctl->work))
+ btrfs_discard_schedule_work(discard_ctl, false);
+}
+
+/**
+ * btrfs_discard_schedule_work - responsible for scheduling the discard work
+ * @discard_ctl: discard control
+ * @override: override the current timer
+ *
+ * Discards are issued by a delayed workqueue item. @override is used to
+ * update the current delay as the baseline delay interval is reevaluated on
+ * transaction commit. This is also maxed with any other rate limit.
+ */
+void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ bool override)
+{
+ struct btrfs_block_group *block_group;
+ const u64 now = ktime_get_ns();
+
+ spin_lock(&discard_ctl->lock);
+
+ if (!btrfs_run_discard_work(discard_ctl))
+ goto out;
+
+ if (!override && delayed_work_pending(&discard_ctl->work))
+ goto out;
+
+ block_group = find_next_block_group(discard_ctl, now);
+ if (block_group) {
+ unsigned long delay = discard_ctl->delay;
+ u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
+
+ /*
+ * A single delayed workqueue item is responsible for
+ * discarding, so we can manage the bytes rate limit by keeping
+ * track of the previous discard.
+ */
+ if (kbps_limit && discard_ctl->prev_discard) {
+ u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
+ u64 bps_delay = div64_u64(discard_ctl->prev_discard *
+ MSEC_PER_SEC, bps_limit);
+
+ delay = max(delay, msecs_to_jiffies(bps_delay));
+ }
+
+ /*
+ * This timeout is to hopefully prevent immediate discarding
+ * in a recently allocated block group.
+ */
+ if (now < block_group->discard_eligible_time) {
+ u64 bg_timeout = block_group->discard_eligible_time - now;
+
+ delay = max(delay, nsecs_to_jiffies(bg_timeout));
+ }
+
+ mod_delayed_work(discard_ctl->discard_workers,
+ &discard_ctl->work, delay);
+ }
+out:
+ spin_unlock(&discard_ctl->lock);
+}
+
+/**
+ * btrfs_finish_discard_pass - determine next step of a block_group
+ * @discard_ctl: discard control
+ * @block_group: block_group of interest
+ *
+ * This determines the next step for a block group after it's finished going
+ * through a pass on a discard list. If it is unused and fully trimmed, we can
+ * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto
+ * the appropriate filter list or let it fall off.
+ */
+static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group)
+{
+ remove_from_discard_list(discard_ctl, block_group);
+
+ if (block_group->used == 0) {
+ if (btrfs_is_free_space_trimmed(block_group))
+ btrfs_mark_bg_unused(block_group);
+ else
+ add_to_discard_unused_list(discard_ctl, block_group);
+ } else {
+ btrfs_update_discard_index(discard_ctl, block_group);
+ }
+}
+
+/**
+ * btrfs_discard_workfn - discard work function
+ * @work: work
+ *
+ * This finds the next block_group to start discarding and then discards a
+ * single region. It does this in a two-pass fashion: first extents and second
+ * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
+ */
+static void btrfs_discard_workfn(struct work_struct *work)
+{
+ struct btrfs_discard_ctl *discard_ctl;
+ struct btrfs_block_group *block_group;
+ enum btrfs_discard_state discard_state;
+ int discard_index = 0;
+ u64 trimmed = 0;
+ u64 minlen = 0;
+
+ discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
+
+ block_group = peek_discard_list(discard_ctl, &discard_state,
+ &discard_index);
+ if (!block_group || !btrfs_run_discard_work(discard_ctl))
+ return;
+
+ /* Perform discarding */
+ minlen = discard_minlen[discard_index];
+
+ if (discard_state == BTRFS_DISCARD_BITMAPS) {
+ u64 maxlen = 0;
+
+ /*
+ * Use the previous levels minimum discard length as the max
+ * length filter. In the case something is added to make a
+ * region go beyond the max filter, the entire bitmap is set
+ * back to BTRFS_TRIM_STATE_UNTRIMMED.
+ */
+ if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
+ maxlen = discard_minlen[discard_index - 1];
+
+ btrfs_trim_block_group_bitmaps(block_group, &trimmed,
+ block_group->discard_cursor,
+ btrfs_block_group_end(block_group),
+ minlen, maxlen, true);
+ discard_ctl->discard_bitmap_bytes += trimmed;
+ } else {
+ btrfs_trim_block_group_extents(block_group, &trimmed,
+ block_group->discard_cursor,
+ btrfs_block_group_end(block_group),
+ minlen, true);
+ discard_ctl->discard_extent_bytes += trimmed;
+ }
+
+ discard_ctl->prev_discard = trimmed;
+
+ /* Determine next steps for a block_group */
+ if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
+ if (discard_state == BTRFS_DISCARD_BITMAPS) {
+ btrfs_finish_discard_pass(discard_ctl, block_group);
+ } else {
+ block_group->discard_cursor = block_group->start;
+ spin_lock(&discard_ctl->lock);
+ if (block_group->discard_state !=
+ BTRFS_DISCARD_RESET_CURSOR)
+ block_group->discard_state =
+ BTRFS_DISCARD_BITMAPS;
+ spin_unlock(&discard_ctl->lock);
+ }
+ }
+
+ spin_lock(&discard_ctl->lock);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
+
+ btrfs_discard_schedule_work(discard_ctl, false);
+}
+
+/**
+ * btrfs_run_discard_work - determines if async discard should be running
+ * @discard_ctl: discard control
+ *
+ * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
+ */
+bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
+{
+ struct btrfs_fs_info *fs_info = container_of(discard_ctl,
+ struct btrfs_fs_info,
+ discard_ctl);
+
+ return (!(fs_info->sb->s_flags & SB_RDONLY) &&
+ test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
+}
+
+/**
+ * btrfs_discard_calc_delay - recalculate the base delay
+ * @discard_ctl: discard control
+ *
+ * Recalculate the base delay which is based off the total number of
+ * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms)
+ * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
+ */
+void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
+{
+ s32 discardable_extents;
+ s64 discardable_bytes;
+ u32 iops_limit;
+ unsigned long delay;
+ unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC;
+
+ discardable_extents = atomic_read(&discard_ctl->discardable_extents);
+ if (!discardable_extents)
+ return;
+
+ spin_lock(&discard_ctl->lock);
+
+ /*
+ * The following is to fix a potential -1 discrepenancy that we're not
+ * sure how to reproduce. But given that this is the only place that
+ * utilizes these numbers and this is only called by from
+ * btrfs_finish_extent_commit() which is synchronized, we can correct
+ * here.
+ */
+ if (discardable_extents < 0)
+ atomic_add(-discardable_extents,
+ &discard_ctl->discardable_extents);
+
+ discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
+ if (discardable_bytes < 0)
+ atomic64_add(-discardable_bytes,
+ &discard_ctl->discardable_bytes);
+
+ if (discardable_extents <= 0) {
+ spin_unlock(&discard_ctl->lock);
+ return;
+ }
+
+ iops_limit = READ_ONCE(discard_ctl->iops_limit);
+ if (iops_limit)
+ lower_limit = max_t(unsigned long, lower_limit,
+ MSEC_PER_SEC / iops_limit);
+
+ delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
+ delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC);
+ discard_ctl->delay = msecs_to_jiffies(delay);
+
+ spin_unlock(&discard_ctl->lock);
+}
+
+/**
+ * btrfs_discard_update_discardable - propagate discard counters
+ * @block_group: block_group of interest
+ * @ctl: free_space_ctl of @block_group
+ *
+ * This propagates deltas of counters up to the discard_ctl. It maintains a
+ * current counter and a previous counter passing the delta up to the global
+ * stat. Then the current counter value becomes the previous counter value.
+ */
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_discard_ctl *discard_ctl;
+ s32 extents_delta;
+ s64 bytes_delta;
+
+ if (!block_group ||
+ !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
+ !btrfs_is_block_group_data_only(block_group))
+ return;
+
+ discard_ctl = &block_group->fs_info->discard_ctl;
+
+ extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
+ ctl->discardable_extents[BTRFS_STAT_PREV];
+ if (extents_delta) {
+ atomic_add(extents_delta, &discard_ctl->discardable_extents);
+ ctl->discardable_extents[BTRFS_STAT_PREV] =
+ ctl->discardable_extents[BTRFS_STAT_CURR];
+ }
+
+ bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
+ ctl->discardable_bytes[BTRFS_STAT_PREV];
+ if (bytes_delta) {
+ atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
+ ctl->discardable_bytes[BTRFS_STAT_PREV] =
+ ctl->discardable_bytes[BTRFS_STAT_CURR];
+ }
+}
+
+/**
+ * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists
+ * @fs_info: fs_info of interest
+ *
+ * The unused_bgs list needs to be punted to the discard lists because the
+ * order of operations is changed. In the normal sychronous discard path, the
+ * block groups are trimmed via a single large trim in transaction commit. This
+ * is ultimately what we are trying to avoid with asynchronous discard. Thus,
+ * it must be done before going down the unused_bgs path.
+ */
+void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_group *block_group, *next;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ /* We enabled async discard, so punt all to the queue */
+ list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
+ bg_list) {
+ list_del_init(&block_group->bg_list);
+ btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+/**
+ * btrfs_discard_purge_list - purge discard lists
+ * @discard_ctl: discard control
+ *
+ * If we are disabling async discard, we may have intercepted block groups that
+ * are completely free and ready for the unused_bgs path. As discarding will
+ * now happen in transaction commit or not at all, we can safely mark the
+ * corresponding block groups as unused and they will be sent on their merry
+ * way to the unused_bgs list.
+ */
+static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
+{
+ struct btrfs_block_group *block_group, *next;
+ int i;
+
+ spin_lock(&discard_ctl->lock);
+ for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
+ list_for_each_entry_safe(block_group, next,
+ &discard_ctl->discard_list[i],
+ discard_list) {
+ list_del_init(&block_group->discard_list);
+ spin_unlock(&discard_ctl->lock);
+ if (block_group->used == 0)
+ btrfs_mark_bg_unused(block_group);
+ spin_lock(&discard_ctl->lock);
+ }
+ }
+ spin_unlock(&discard_ctl->lock);
+}
+
+void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
+ btrfs_discard_cleanup(fs_info);
+ return;
+ }
+
+ btrfs_discard_punt_unused_bgs_list(fs_info);
+
+ set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
+}
+
+void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
+{
+ clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
+}
+
+void btrfs_discard_init(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ int i;
+
+ spin_lock_init(&discard_ctl->lock);
+ INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
+
+ for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
+ INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
+
+ discard_ctl->prev_discard = 0;
+ atomic_set(&discard_ctl->discardable_extents, 0);
+ atomic64_set(&discard_ctl->discardable_bytes, 0);
+ discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
+ discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC;
+ discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
+ discard_ctl->kbps_limit = 0;
+ discard_ctl->discard_extent_bytes = 0;
+ discard_ctl->discard_bitmap_bytes = 0;
+ atomic64_set(&discard_ctl->discard_bytes_saved, 0);
+}
+
+void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
+{
+ btrfs_discard_stop(fs_info);
+ cancel_delayed_work_sync(&fs_info->discard_ctl.work);
+ btrfs_discard_purge_list(&fs_info->discard_ctl);
+}
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
new file mode 100644
index 000000000000..21a15776dac4
--- /dev/null
+++ b/fs/btrfs/discard.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef BTRFS_DISCARD_H
+#define BTRFS_DISCARD_H
+
+#include <linux/sizes.h>
+
+struct btrfs_fs_info;
+struct btrfs_discard_ctl;
+struct btrfs_block_group;
+
+/* Discard size limits */
+#define BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE (SZ_64M)
+#define BTRFS_ASYNC_DISCARD_MAX_FILTER (SZ_1M)
+#define BTRFS_ASYNC_DISCARD_MIN_FILTER (SZ_32K)
+
+/* List operations */
+void btrfs_discard_check_filter(struct btrfs_block_group *block_group, u64 bytes);
+
+/* Work operations */
+void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group);
+void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
+ struct btrfs_block_group *block_group);
+void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+ bool override);
+bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl);
+
+/* Update operations */
+void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl);
+void btrfs_discard_update_discardable(struct btrfs_block_group *block_group,
+ struct btrfs_free_space_ctl *ctl);
+
+/* Setup/cleanup operations */
+void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info);
+void btrfs_discard_resume(struct btrfs_fs_info *fs_info);
+void btrfs_discard_stop(struct btrfs_fs_info *fs_info);
+void btrfs_discard_init(struct btrfs_fs_info *fs_info);
+void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e0edfdc9c82b..aea48d6ddc0c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -41,6 +41,7 @@
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
+#include "discard.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -202,8 +203,8 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
* that covers the entire device
*/
struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset, u64 start, u64 len,
- int create)
+ struct page *page, size_t pg_offset,
+ u64 start, u64 len)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
@@ -1953,6 +1954,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
+ if (fs_info->discard_ctl.discard_workers)
+ destroy_workqueue(fs_info->discard_ctl.discard_workers);
/*
* Now that all other work queues are destroyed, we can safely destroy
* the queues used for metadata I/O, since tasks from those other work
@@ -2148,6 +2151,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
+ fs_info->discard_ctl.discard_workers =
+ alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
if (!(fs_info->workers && fs_info->delalloc_workers &&
fs_info->flush_workers &&
@@ -2158,7 +2163,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->qgroup_rescan_workers)) {
+ fs_info->qgroup_rescan_workers &&
+ fs_info->discard_ctl.discard_workers)) {
return -ENOMEM;
}
@@ -2792,6 +2798,7 @@ int __cold open_ctree(struct super_block *sb,
btrfs_init_dev_replace_locks(fs_info);
btrfs_init_qgroup(fs_info);
+ btrfs_discard_init(fs_info);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -3082,20 +3089,13 @@ int __cold open_ctree(struct super_block *sb,
btrfs_free_extra_devids(fs_devices, 1);
- ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
+ ret = btrfs_sysfs_add_fsid(fs_devices);
if (ret) {
btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
ret);
goto fail_block_groups;
}
- ret = btrfs_sysfs_add_device(fs_devices);
- if (ret) {
- btrfs_err(fs_info, "failed to init sysfs device interface: %d",
- ret);
- goto fail_fsdev_sysfs;
- }
-
ret = btrfs_sysfs_add_mounted(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
@@ -3262,6 +3262,7 @@ int __cold open_ctree(struct super_block *sb,
}
btrfs_qgroup_rescan_resume(fs_info);
+ btrfs_discard_resume(fs_info);
if (!fs_info->uuid_root) {
btrfs_info(fs_info, "creating UUID tree");
@@ -3978,6 +3979,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
+ /* Cancel or finish ongoing discard work */
+ btrfs_discard_cleanup(fs_info);
+
if (!sb_rdonly(fs_info->sb)) {
/*
* The cleaner kthread is stopped, so do one final pass over
@@ -4026,11 +4030,18 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
- btrfs_free_block_groups(fs_info);
-
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
free_root_pointers(fs_info, true);
+ /*
+ * We must free the block groups after dropping the fs_roots as we could
+ * have had an IO error and have left over tree log blocks that aren't
+ * cleaned up until the fs roots are freed. This makes the block group
+ * accounting appear to be wrong because there's pending reserved bytes,
+ * so make sure we do the block group cleanup afterwards.
+ */
+ btrfs_free_block_groups(fs_info);
+
iput(fs_info->btree_inode);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 76f123ebb292..8c2d6cf1ce59 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -134,8 +134,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset, u64 start, u64 len,
- int create);
+ struct page *page, size_t pg_offset,
+ u64 start, u64 len);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 274318e9114e..0163fdd59f8f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -32,6 +32,7 @@
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "discard.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -2923,7 +2924,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
break;
}
- if (btrfs_test_opt(fs_info, DISCARD))
+ if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
end + 1 - start, NULL);
@@ -2934,6 +2935,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
cond_resched();
}
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
+ btrfs_discard_calc_delay(&fs_info->discard_ctl);
+ btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
+ }
+
/*
* Transaction is finished. We don't need the lock anymore. We
* do need to clean up the block groups in case of a transaction
@@ -3438,7 +3444,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
*/
struct find_free_extent_ctl {
/* Basic allocation info */
- u64 ram_bytes;
u64 num_bytes;
u64 empty_size;
u64 flags;
@@ -3810,7 +3815,6 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
WARN_ON(num_bytes < fs_info->sectorsize);
- ffe_ctl.ram_bytes = ram_bytes;
ffe_ctl.num_bytes = num_bytes;
ffe_ctl.empty_size = empty_size;
ffe_ctl.flags = flags;
@@ -4165,12 +4169,10 @@ again:
return ret;
}
-static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len,
- int pin, int delalloc)
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
+ u64 start, u64 len, int delalloc)
{
struct btrfs_block_group *cache;
- int ret = 0;
cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
@@ -4179,30 +4181,28 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
- if (pin)
- pin_down_extent(cache, start, len, 1);
- else {
- if (btrfs_test_opt(fs_info, DISCARD))
- ret = btrfs_discard_extent(fs_info, start, len, NULL);
- btrfs_add_free_space(cache, start, len);
- btrfs_free_reserved_bytes(cache, len, delalloc);
- trace_btrfs_reserved_extent_free(fs_info, start, len);
- }
+ btrfs_add_free_space(cache, start, len);
+ btrfs_free_reserved_bytes(cache, len, delalloc);
+ trace_btrfs_reserved_extent_free(fs_info, start, len);
btrfs_put_block_group(cache);
- return ret;
+ return 0;
}
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len, int delalloc)
+int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
- return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
-}
+ struct btrfs_block_group *cache;
+ int ret = 0;
-int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len)
-{
- return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
+ cache = btrfs_lookup_block_group(fs_info, start);
+ if (!cache) {
+ btrfs_err(fs_info, "unable to find block group for %llu", start);
+ return -ENOSPC;
+ }
+
+ ret = pin_down_extent(cache, start, len, 1);
+ btrfs_put_block_group(cache);
+ return ret;
}
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2f4802f405a2..e2d30287e2d5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3043,7 +3043,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
+ em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
@@ -3455,11 +3455,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
update_nr_written(wbc, nr_written + 1);
end = page_end;
- if (i_size <= start) {
- btrfs_writepage_endio_finish_ordered(page, start, page_end, 1);
- goto done;
- }
-
blocksize = inode->i_sb->s_blocksize;
while (cur <= end) {
@@ -3471,8 +3466,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page_end, 1);
break;
}
- em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
- end - cur + 1, 1);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur,
+ end - cur + 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
ret = PTR_ERR_OR_ZERO(em);
@@ -3497,22 +3492,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
*/
if (compressed || block_start == EXTENT_MAP_HOLE ||
block_start == EXTENT_MAP_INLINE) {
- /*
- * end_io notification does not happen here for
- * compressed extents
- */
- if (!compressed)
- btrfs_writepage_endio_finish_ordered(page, cur,
- cur + iosize - 1,
- 1);
- else if (compressed) {
- /* we don't want to end_page_writeback on
- * a compressed extent. this happens
- * elsewhere
- */
+ if (compressed)
nr++;
- }
-
+ else
+ btrfs_writepage_endio_finish_ordered(page, cur,
+ cur + iosize - 1, 1);
cur += iosize;
pg_offset += iosize;
continue;
@@ -3540,7 +3524,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
pg_offset += iosize;
nr++;
}
-done:
*nr_ret = nr;
return ret;
}
@@ -3562,7 +3545,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
u64 page_end = start + PAGE_SIZE - 1;
int ret;
int nr = 0;
- size_t pg_offset = 0;
+ size_t pg_offset;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
unsigned long nr_written = 0;
@@ -3591,14 +3574,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
flush_dcache_page(page);
}
- pg_offset = 0;
-
set_page_extent_mapped(page);
if (!epd->extent_locked) {
ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
if (ret == 1)
- goto done_unlocked;
+ return 0;
if (ret)
goto done;
}
@@ -3606,7 +3587,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
ret = __extent_writepage_io(inode, page, wbc, epd,
i_size, nr_written, &nr);
if (ret == 1)
- goto done_unlocked;
+ return 0;
done:
if (nr == 0) {
@@ -3621,9 +3602,6 @@ done:
unlock_page(page);
ASSERT(ret <= 0);
return ret;
-
-done_unlocked:
- return 0;
}
void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
@@ -3941,6 +3919,11 @@ int btree_write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ /*
+ * Start from the beginning does not need to cycle over the
+ * range, mark it as scanned.
+ */
+ scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
@@ -3958,7 +3941,6 @@ retry:
tag))) {
unsigned i;
- scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -4087,6 +4069,11 @@ static int extent_write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ /*
+ * Start from the beginning does not need to cycle over the
+ * range, mark it as scanned.
+ */
+ scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
@@ -4120,7 +4107,6 @@ retry:
&index, end, tag))) {
unsigned i;
- scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a8551a1f56e2..5d205bbaafdc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -183,10 +183,8 @@ static inline int extent_compress_type(unsigned long bio_flags)
struct extent_map_tree;
typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset,
- u64 start, u64 len,
- int create);
+ struct page *page, size_t pg_offset,
+ u64 start, u64 len);
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b1bfdc5c1387..c2f365662d55 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -148,8 +148,19 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
return ret;
}
-static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u64 logical_offset, u8 *dst, int dio)
+/**
+ * btrfs_lookup_bio_sums - Look up checksums for a bio.
+ * @inode: inode that the bio is for.
+ * @bio: bio embedded in btrfs_io_bio.
+ * @offset: Unless (u64)-1, look up checksums for this offset in the file.
+ * If (u64)-1, use the page offsets from the bio instead.
+ * @dst: Buffer of size btrfs_super_csum_size() used to return checksum. If
+ * NULL, the checksum is returned in btrfs_io_bio(bio)->csum instead.
+ *
+ * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
+ */
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
+ u64 offset, u8 *dst)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct bio_vec bvec;
@@ -158,8 +169,8 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
struct btrfs_csum_item *item = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_path *path;
+ const bool page_offsets = (offset == (u64)-1);
u8 *csum;
- u64 offset = 0;
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u64 disk_bytenr;
@@ -205,15 +216,13 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
}
disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
- if (dio)
- offset = logical_offset;
bio_for_each_segment(bvec, bio, iter) {
page_bytes_left = bvec.bv_len;
if (count)
goto next;
- if (!dio)
+ if (page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
csum, nblocks);
@@ -274,7 +283,8 @@ found:
csum += count * csum_size;
nblocks -= count;
next:
- while (count--) {
+ while (count > 0) {
+ count--;
disk_bytenr += fs_info->sectorsize;
offset += fs_info->sectorsize;
page_bytes_left -= fs_info->sectorsize;
@@ -285,18 +295,7 @@ next:
WARN_ON_ONCE(count);
btrfs_free_path(path);
- return 0;
-}
-
-blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u8 *dst)
-{
- return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
-}
-
-blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset)
-{
- return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1);
+ return BLK_STS_OK;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -483,8 +482,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
- 1);
for (i = 0; i < nr_sectors; i++) {
- if (offset >= ordered->file_offset + ordered->len ||
- offset < ordered->file_offset) {
+ if (offset >= ordered->file_offset + ordered->num_bytes ||
+ offset < ordered->file_offset) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8d47c76b7bd1..a16da274c9aa 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -477,8 +477,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
u64 em_len;
int ret = 0;
- em = btrfs_get_extent(inode, NULL, 0, search_start,
- search_len, 0);
+ em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -1501,7 +1500,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
- ordered->file_offset + ordered->len > start_pos &&
+ ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
unlock_extent_cached(&inode->io_tree, start_pos,
last_pos, cached_state);
@@ -2390,7 +2389,7 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
round_down(*start, fs_info->sectorsize),
- round_up(*len, fs_info->sectorsize), 0);
+ round_up(*len, fs_info->sectorsize));
if (IS_ERR(em))
return PTR_ERR(em);
@@ -2426,7 +2425,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
* we need to try again.
*/
if ((!ordered ||
- (ordered->file_offset + ordered->len <= lockstart ||
+ (ordered->file_offset + ordered->num_bytes <= lockstart ||
ordered->file_offset > lockend)) &&
!filemap_range_has_page(inode->i_mapping,
lockstart, lockend)) {
@@ -2957,7 +2956,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode,
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -2990,8 +2989,8 @@ static int btrfs_zero_range(struct inode *inode,
inode_dio_wait(inode);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
- alloc_start, alloc_end - alloc_start, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ alloc_end - alloc_start);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -3034,8 +3033,8 @@ static int btrfs_zero_range(struct inode *inode,
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
- alloc_start, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
+ sectorsize);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -3248,7 +3247,7 @@ static long btrfs_fallocate(struct file *file, int mode,
ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
if (ordered &&
- ordered->file_offset + ordered->len > alloc_start &&
+ ordered->file_offset + ordered->num_bytes > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
@@ -3273,7 +3272,7 @@ static long btrfs_fallocate(struct file *file, int mode,
INIT_LIST_HEAD(&reserve_list);
while (cur_offset < alloc_end) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
- alloc_end - cur_offset, 0);
+ alloc_end - cur_offset);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
break;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 3283da419200..0598fd3c6e3f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -21,9 +21,11 @@
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "discard.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
-#define MAX_CACHE_BYTES_PER_GIG SZ_32K
+#define MAX_CACHE_BYTES_PER_GIG SZ_64K
+#define FORCE_EXTENT_THRESHOLD SZ_1M
struct btrfs_trim_range {
u64 start;
@@ -31,6 +33,8 @@ struct btrfs_trim_range {
struct list_head list;
};
+static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info);
static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -752,6 +756,16 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
goto free_cache;
}
+ /*
+ * Sync discard ensures that the free space cache is always
+ * trimmed. So when reading this in, the state should reflect
+ * that. We also do this for async as a stop gap for lack of
+ * persistence.
+ */
+ if (btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ e->trim_state = BTRFS_TRIM_STATE_TRIMMED;
+
if (!e->bytes) {
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
@@ -805,12 +819,19 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
ret = io_ctl_read_bitmap(&io_ctl, e);
if (ret)
goto free_cache;
+ e->bitmap_extents = count_bitmap_extents(ctl, e);
+ if (!btrfs_free_space_trimmed(e)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] +=
+ e->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes;
+ }
}
io_ctl_drop_pages(&io_ctl);
merge_space_tree(ctl);
ret = 1;
out:
+ btrfs_discard_update_discardable(ctl->private, ctl);
io_ctl_free(&io_ctl);
return ret;
free_cache:
@@ -1624,6 +1645,11 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl,
{
rb_erase(&info->offset_index, &ctl->free_space_offset);
ctl->free_extents--;
+
+ if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]--;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
+ }
}
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -1644,6 +1670,11 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
if (ret)
return ret;
+ if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]++;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
+ }
+
ctl->free_space += info->bytes;
ctl->free_extents++;
return ret;
@@ -1664,26 +1695,17 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
ASSERT(ctl->total_bitmaps <= max_bitmaps);
/*
- * The goal is to keep the total amount of memory used per 1gb of space
- * at or below 32k, so we need to adjust how much memory we allow to be
- * used by extent based free space tracking
+ * We are trying to keep the total amount of memory used per 1GiB of
+ * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
+ * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
+ * bitmaps, we may end up using more memory than this.
*/
if (size < SZ_1G)
max_bytes = MAX_CACHE_BYTES_PER_GIG;
else
max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
- /*
- * we want to account for 1 more bitmap than what we have so we can make
- * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
- * we add more bitmaps.
- */
- bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
-
- if (bitmap_bytes >= max_bytes) {
- ctl->extents_thresh = 0;
- return;
- }
+ bitmap_bytes = ctl->total_bitmaps * ctl->unit;
/*
* we want the extent entry threshold to always be at most 1/2 the max
@@ -1700,17 +1722,31 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info,
u64 offset, u64 bytes)
{
- unsigned long start, count;
+ unsigned long start, count, end;
+ int extent_delta = -1;
start = offset_to_bit(info->offset, ctl->unit, offset);
count = bytes_to_bits(bytes, ctl->unit);
- ASSERT(start + count <= BITS_PER_BITMAP);
+ end = start + count;
+ ASSERT(end <= BITS_PER_BITMAP);
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
if (info->max_extent_size > ctl->unit)
info->max_extent_size = 0;
+
+ if (start && test_bit(start - 1, info->bitmap))
+ extent_delta++;
+
+ if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
+ extent_delta++;
+
+ info->bitmap_extents += extent_delta;
+ if (!btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
+ }
}
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
@@ -1725,16 +1761,30 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes)
{
- unsigned long start, count;
+ unsigned long start, count, end;
+ int extent_delta = 1;
start = offset_to_bit(info->offset, ctl->unit, offset);
count = bytes_to_bits(bytes, ctl->unit);
- ASSERT(start + count <= BITS_PER_BITMAP);
+ end = start + count;
+ ASSERT(end <= BITS_PER_BITMAP);
bitmap_set(info->bitmap, start, count);
info->bytes += bytes;
ctl->free_space += bytes;
+
+ if (start && test_bit(start - 1, info->bitmap))
+ extent_delta--;
+
+ if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
+ extent_delta--;
+
+ info->bitmap_extents += extent_delta;
+ if (!btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
+ }
}
/*
@@ -1870,11 +1920,35 @@ out:
return NULL;
}
+static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *bitmap_info)
+{
+ struct btrfs_block_group *block_group = ctl->private;
+ u64 bytes = bitmap_info->bytes;
+ unsigned int rs, re;
+ int count = 0;
+
+ if (!block_group || !bytes)
+ return count;
+
+ bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0,
+ BITS_PER_BITMAP) {
+ bytes -= (rs - re) * ctl->unit;
+ count++;
+
+ if (!bytes)
+ break;
+ }
+
+ return count;
+}
+
static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset)
{
info->offset = offset_to_bitmap(ctl, offset);
info->bytes = 0;
+ info->bitmap_extents = 0;
INIT_LIST_HEAD(&info->list);
link_free_space(ctl, info);
ctl->total_bitmaps++;
@@ -1885,6 +1959,18 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info)
{
+ /*
+ * Normally when this is called, the bitmap is completely empty. However,
+ * if we are blowing up the free space cache for one reason or another
+ * via __btrfs_remove_free_space_cache(), then it may not be freed and
+ * we may leave stats on the table.
+ */
+ if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] -=
+ bitmap_info->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
+
+ }
unlink_free_space(ctl, bitmap_info);
kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
@@ -1971,11 +2057,24 @@ again:
static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
- u64 bytes)
+ u64 bytes, enum btrfs_trim_state trim_state)
{
u64 bytes_to_set = 0;
u64 end;
+ /*
+ * This is a tradeoff to make bitmap trim state minimal. We mark the
+ * whole bitmap untrimmed if at any point we add untrimmed regions.
+ */
+ if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
+ if (btrfs_free_space_trimmed(info)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] +=
+ info->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
+ }
+ info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ }
+
end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
bytes_to_set = min(end - offset, bytes);
@@ -2004,6 +2103,10 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
forced = true;
#endif
+ /* This is a way to reclaim large regions from the bitmaps. */
+ if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
+ return false;
+
/*
* If we are below the extents threshold then we can add this as an
* extent, and don't have to deal with the bitmap
@@ -2016,8 +2119,8 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* of cache left then go ahead an dadd them, no sense in adding
* the overhead of a bitmap if we don't have to.
*/
- if (info->bytes <= fs_info->sectorsize * 4) {
- if (ctl->free_extents * 2 <= ctl->extents_thresh)
+ if (info->bytes <= fs_info->sectorsize * 8) {
+ if (ctl->free_extents * 3 <= ctl->extents_thresh)
return false;
} else {
return false;
@@ -2050,10 +2153,12 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group *block_group = NULL;
int added = 0;
u64 bytes, offset, bytes_added;
+ enum btrfs_trim_state trim_state;
int ret;
bytes = info->bytes;
offset = info->offset;
+ trim_state = info->trim_state;
if (!ctl->op->use_bitmap(ctl, info))
return 0;
@@ -2088,8 +2193,8 @@ again:
}
if (entry->offset == offset_to_bitmap(ctl, offset)) {
- bytes_added = add_bytes_to_bitmap(ctl, entry,
- offset, bytes);
+ bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
+ bytes, trim_state);
bytes -= bytes_added;
offset += bytes_added;
}
@@ -2108,7 +2213,8 @@ no_cluster_bitmap:
goto new_bitmap;
}
- bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+ bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
+ trim_state);
bytes -= bytes_added;
offset += bytes_added;
added = 0;
@@ -2142,6 +2248,7 @@ new_bitmap:
/* allocate the bitmap */
info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
GFP_NOFS);
+ info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
spin_lock(&ctl->tree_lock);
if (!info->bitmap) {
ret = -ENOMEM;
@@ -2161,6 +2268,22 @@ out:
return ret;
}
+/*
+ * Free space merging rules:
+ * 1) Merge trimmed areas together
+ * 2) Let untrimmed areas coalesce with trimmed areas
+ * 3) Always pull neighboring regions from bitmaps
+ *
+ * The above rules are for when we merge free space based on btrfs_trim_state.
+ * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
+ * same reason: to promote larger extent regions which makes life easier for
+ * find_free_extent(). Rule 2 enables coalescing based on the common path
+ * being returning free space from btrfs_finish_extent_commit(). So when free
+ * space is trimmed, it will prevent aggregating trimmed new region and
+ * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents
+ * and provide find_free_extent() with the largest extents possible hoping for
+ * the reuse path.
+ */
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
@@ -2169,6 +2292,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
bool merged = false;
u64 offset = info->offset;
u64 bytes = info->bytes;
+ const bool is_trimmed = btrfs_free_space_trimmed(info);
/*
* first we want to see if there is free space adjacent to the range we
@@ -2182,7 +2306,9 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
else
left_info = tree_search_offset(ctl, offset - 1, 0, 0);
- if (right_info && !right_info->bitmap) {
+ /* See try_merge_free_space() comment. */
+ if (right_info && !right_info->bitmap &&
+ (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
if (update_stat)
unlink_free_space(ctl, right_info);
else
@@ -2192,8 +2318,10 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
merged = true;
}
+ /* See try_merge_free_space() comment. */
if (left_info && !left_info->bitmap &&
- left_info->offset + left_info->bytes == offset) {
+ left_info->offset + left_info->bytes == offset &&
+ (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
if (update_stat)
unlink_free_space(ctl, left_info);
else
@@ -2229,6 +2357,10 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
bytes = (j - i) * ctl->unit;
info->bytes += bytes;
+ /* See try_merge_free_space() comment. */
+ if (!btrfs_free_space_trimmed(bitmap))
+ info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
if (update_stat)
bitmap_clear_bits(ctl, bitmap, end, bytes);
else
@@ -2282,6 +2414,10 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
info->offset -= bytes;
info->bytes += bytes;
+ /* See try_merge_free_space() comment. */
+ if (!btrfs_free_space_trimmed(bitmap))
+ info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
if (update_stat)
bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
else
@@ -2331,10 +2467,13 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_free_space_ctl *ctl,
- u64 offset, u64 bytes)
+ u64 offset, u64 bytes,
+ enum btrfs_trim_state trim_state)
{
+ struct btrfs_block_group *block_group = ctl->private;
struct btrfs_free_space *info;
int ret = 0;
+ u64 filter_bytes = bytes;
info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
if (!info)
@@ -2342,6 +2481,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
info->offset = offset;
info->bytes = bytes;
+ info->trim_state = trim_state;
RB_CLEAR_NODE(&info->offset_index);
spin_lock(&ctl->tree_lock);
@@ -2370,10 +2510,13 @@ link:
*/
steal_from_bitmap(ctl, info, true);
+ filter_bytes = max(filter_bytes, info->bytes);
+
ret = link_free_space(ctl, info);
if (ret)
kmem_cache_free(btrfs_free_space_cachep, info);
out:
+ btrfs_discard_update_discardable(block_group, ctl);
spin_unlock(&ctl->tree_lock);
if (ret) {
@@ -2381,15 +2524,44 @@ out:
ASSERT(ret != -EEXIST);
}
+ if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
+ btrfs_discard_check_filter(block_group, filter_bytes);
+ btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
+ }
+
return ret;
}
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size)
{
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
+ if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
+ trim_state = BTRFS_TRIM_STATE_TRIMMED;
+
+ return __btrfs_add_free_space(block_group->fs_info,
+ block_group->free_space_ctl,
+ bytenr, size, trim_state);
+}
+
+/*
+ * This is a subtle distinction because when adding free space back in general,
+ * we want it to be added as untrimmed for async. But in the case where we add
+ * it on loading of a block group, we want to consider it trimmed.
+ */
+int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size)
+{
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+
+ if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
+ trim_state = BTRFS_TRIM_STATE_TRIMMED;
+
return __btrfs_add_free_space(block_group->fs_info,
block_group->free_space_ctl,
- bytenr, size);
+ bytenr, size, trim_state);
}
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
@@ -2464,8 +2636,10 @@ again:
}
spin_unlock(&ctl->tree_lock);
- ret = btrfs_add_free_space(block_group, offset + bytes,
- old_end - (offset + bytes));
+ ret = __btrfs_add_free_space(block_group->fs_info, ctl,
+ offset + bytes,
+ old_end - (offset + bytes),
+ info->trim_state);
WARN_ON(ret);
goto out;
}
@@ -2477,6 +2651,7 @@ again:
goto again;
}
out_lock:
+ btrfs_discard_update_discardable(block_group, ctl);
spin_unlock(&ctl->tree_lock);
out:
return ret;
@@ -2562,8 +2737,22 @@ __btrfs_return_cluster_to_free_space(
bitmap = (entry->bitmap != NULL);
if (!bitmap) {
+ /* Merging treats extents as if they were new */
+ if (!btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]--;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -=
+ entry->bytes;
+ }
+
try_merge_free_space(ctl, entry, false);
steal_from_bitmap(ctl, entry, false);
+
+ /* As we insert directly, update these statistics */
+ if (!btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]++;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] +=
+ entry->bytes;
+ }
}
tree_insert_offset(&ctl->free_space_offset,
entry->offset, &entry->offset_index, bitmap);
@@ -2599,6 +2788,8 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
spin_lock(&ctl->tree_lock);
__btrfs_remove_free_space_cache_locked(ctl);
+ if (ctl->private)
+ btrfs_discard_update_discardable(ctl->private, ctl);
spin_unlock(&ctl->tree_lock);
}
@@ -2620,20 +2811,55 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
cond_resched_lock(&ctl->tree_lock);
}
__btrfs_remove_free_space_cache_locked(ctl);
+ btrfs_discard_update_discardable(block_group, ctl);
spin_unlock(&ctl->tree_lock);
}
+/**
+ * btrfs_is_free_space_trimmed - see if everything is trimmed
+ * @block_group: block_group of interest
+ *
+ * Walk @block_group's free space rb_tree to determine if everything is trimmed.
+ */
+bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *info;
+ struct rb_node *node;
+ bool ret = true;
+
+ spin_lock(&ctl->tree_lock);
+ node = rb_first(&ctl->free_space_offset);
+
+ while (node) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+
+ if (!btrfs_free_space_trimmed(info)) {
+ ret = false;
+ break;
+ }
+
+ node = rb_next(node);
+ }
+
+ spin_unlock(&ctl->tree_lock);
+ return ret;
+}
+
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
struct btrfs_free_space *entry = NULL;
u64 bytes_search = bytes + empty_size;
u64 ret = 0;
u64 align_gap = 0;
u64 align_gap_len = 0;
+ enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
spin_lock(&ctl->tree_lock);
entry = find_free_space(ctl, &offset, &bytes_search,
@@ -2644,12 +2870,20 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
ret = offset;
if (entry->bitmap) {
bitmap_clear_bits(ctl, entry, offset, bytes);
+
+ if (!btrfs_free_space_trimmed(entry))
+ atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
+
if (!entry->bytes)
free_bitmap(ctl, entry);
} else {
unlink_free_space(ctl, entry);
align_gap_len = offset - entry->offset;
align_gap = entry->offset;
+ align_gap_trim_state = entry->trim_state;
+
+ if (!btrfs_free_space_trimmed(entry))
+ atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
entry->offset = offset + bytes;
WARN_ON(entry->bytes < bytes + align_gap_len);
@@ -2661,11 +2895,13 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
link_free_space(ctl, entry);
}
out:
+ btrfs_discard_update_discardable(block_group, ctl);
spin_unlock(&ctl->tree_lock);
if (align_gap_len)
__btrfs_add_free_space(block_group->fs_info, ctl,
- align_gap, align_gap_len);
+ align_gap, align_gap_len,
+ align_gap_trim_state);
return ret;
}
@@ -2707,6 +2943,8 @@ int btrfs_return_cluster_to_free_space(
ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&ctl->tree_lock);
+ btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
+
/* finally drop our ref */
btrfs_put_block_group(block_group);
return ret;
@@ -2750,6 +2988,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
u64 min_start, u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
struct btrfs_free_space *entry = NULL;
struct rb_node *node;
u64 ret = 0;
@@ -2814,7 +3054,12 @@ out:
spin_lock(&ctl->tree_lock);
+ if (!btrfs_free_space_trimmed(entry))
+ atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
+
ctl->free_space -= bytes;
+ if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
if (entry->bytes == 0) {
ctl->free_extents--;
if (entry->bitmap) {
@@ -2822,6 +3067,8 @@ out:
entry->bitmap);
ctl->total_bitmaps--;
ctl->op->recalc_thresholds(ctl);
+ } else if (!btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR]--;
}
kmem_cache_free(btrfs_free_space_cachep, entry);
}
@@ -3148,6 +3395,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
static int do_trimming(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 bytes,
u64 reserved_start, u64 reserved_bytes,
+ enum btrfs_trim_state reserved_trim_state,
struct btrfs_trim_range *trim_entry)
{
struct btrfs_space_info *space_info = block_group->space_info;
@@ -3155,6 +3403,9 @@ static int do_trimming(struct btrfs_block_group *block_group,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int ret;
int update = 0;
+ const u64 end = start + bytes;
+ const u64 reserved_end = reserved_start + reserved_bytes;
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
u64 trimmed = 0;
spin_lock(&space_info->lock);
@@ -3168,11 +3419,20 @@ static int do_trimming(struct btrfs_block_group *block_group,
spin_unlock(&space_info->lock);
ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
- if (!ret)
+ if (!ret) {
*total_trimmed += trimmed;
+ trim_state = BTRFS_TRIM_STATE_TRIMMED;
+ }
mutex_lock(&ctl->cache_writeout_mutex);
- btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+ if (reserved_start < start)
+ __btrfs_add_free_space(fs_info, ctl, reserved_start,
+ start - reserved_start,
+ reserved_trim_state);
+ if (start + bytes < reserved_start + reserved_bytes)
+ __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
+ reserved_trim_state);
+ __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
list_del(&trim_entry->list);
mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3190,16 +3450,24 @@ static int do_trimming(struct btrfs_block_group *block_group,
return ret;
}
+/*
+ * If @async is set, then we will trim 1 region and return.
+ */
static int trim_no_bitmap(struct btrfs_block_group *block_group,
- u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen,
+ bool async)
{
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
struct rb_node *node;
int ret = 0;
u64 extent_start;
u64 extent_bytes;
+ enum btrfs_trim_state extent_trim_state;
u64 bytes;
+ const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
while (start < end) {
struct btrfs_trim_range trim_entry;
@@ -3207,49 +3475,66 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
mutex_lock(&ctl->cache_writeout_mutex);
spin_lock(&ctl->tree_lock);
- if (ctl->free_space < minlen) {
- spin_unlock(&ctl->tree_lock);
- mutex_unlock(&ctl->cache_writeout_mutex);
- break;
- }
+ if (ctl->free_space < minlen)
+ goto out_unlock;
entry = tree_search_offset(ctl, start, 0, 1);
- if (!entry) {
- spin_unlock(&ctl->tree_lock);
- mutex_unlock(&ctl->cache_writeout_mutex);
- break;
- }
+ if (!entry)
+ goto out_unlock;
- /* skip bitmaps */
- while (entry->bitmap) {
+ /* Skip bitmaps and if async, already trimmed entries */
+ while (entry->bitmap ||
+ (async && btrfs_free_space_trimmed(entry))) {
node = rb_next(&entry->offset_index);
- if (!node) {
- spin_unlock(&ctl->tree_lock);
- mutex_unlock(&ctl->cache_writeout_mutex);
- goto out;
- }
+ if (!node)
+ goto out_unlock;
entry = rb_entry(node, struct btrfs_free_space,
offset_index);
}
- if (entry->offset >= end) {
- spin_unlock(&ctl->tree_lock);
- mutex_unlock(&ctl->cache_writeout_mutex);
- break;
- }
+ if (entry->offset >= end)
+ goto out_unlock;
extent_start = entry->offset;
extent_bytes = entry->bytes;
- start = max(start, extent_start);
- bytes = min(extent_start + extent_bytes, end) - start;
- if (bytes < minlen) {
- spin_unlock(&ctl->tree_lock);
- mutex_unlock(&ctl->cache_writeout_mutex);
- goto next;
- }
+ extent_trim_state = entry->trim_state;
+ if (async) {
+ start = entry->offset;
+ bytes = entry->bytes;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto next;
+ }
+ unlink_free_space(ctl, entry);
+ /*
+ * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
+ * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
+ * X when we come back around. So trim it now.
+ */
+ if (max_discard_size &&
+ bytes >= (max_discard_size +
+ BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
+ bytes = max_discard_size;
+ extent_bytes = max_discard_size;
+ entry->offset += max_discard_size;
+ entry->bytes -= max_discard_size;
+ link_free_space(ctl, entry);
+ } else {
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ }
+ } else {
+ start = max(start, extent_start);
+ bytes = min(extent_start + extent_bytes, end) - start;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto next;
+ }
- unlink_free_space(ctl, entry);
- kmem_cache_free(btrfs_free_space_cachep, entry);
+ unlink_free_space(ctl, entry);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ }
spin_unlock(&ctl->tree_lock);
trim_entry.start = extent_start;
@@ -3258,11 +3543,17 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
mutex_unlock(&ctl->cache_writeout_mutex);
ret = do_trimming(block_group, total_trimmed, start, bytes,
- extent_start, extent_bytes, &trim_entry);
- if (ret)
+ extent_start, extent_bytes, extent_trim_state,
+ &trim_entry);
+ if (ret) {
+ block_group->discard_cursor = start + bytes;
break;
+ }
next:
start += bytes;
+ block_group->discard_cursor = start;
+ if (async && *total_trimmed)
+ break;
if (fatal_signal_pending(current)) {
ret = -ERESTARTSYS;
@@ -3271,19 +3562,76 @@ next:
cond_resched();
}
-out:
+
+ return ret;
+
+out_unlock:
+ block_group->discard_cursor = btrfs_block_group_end(block_group);
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
return ret;
}
+/*
+ * If we break out of trimming a bitmap prematurely, we should reset the
+ * trimming bit. In a rather contrieved case, it's possible to race here so
+ * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
+ *
+ * start = start of bitmap
+ * end = near end of bitmap
+ *
+ * Thread 1: Thread 2:
+ * trim_bitmaps(start)
+ * trim_bitmaps(end)
+ * end_trimming_bitmap()
+ * reset_trimming_bitmap()
+ */
+static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
+{
+ struct btrfs_free_space *entry;
+
+ spin_lock(&ctl->tree_lock);
+ entry = tree_search_offset(ctl, offset, 1, 0);
+ if (entry) {
+ if (btrfs_free_space_trimmed(entry)) {
+ ctl->discardable_extents[BTRFS_STAT_CURR] +=
+ entry->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
+ }
+ entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ }
+
+ spin_unlock(&ctl->tree_lock);
+}
+
+static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *entry)
+{
+ if (btrfs_free_space_trimming_bitmap(entry)) {
+ entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
+ ctl->discardable_extents[BTRFS_STAT_CURR] -=
+ entry->bitmap_extents;
+ ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
+ }
+}
+
+/*
+ * If @async is set, then we will trim 1 region and return.
+ */
static int trim_bitmaps(struct btrfs_block_group *block_group,
- u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen,
+ u64 maxlen, bool async)
{
+ struct btrfs_discard_ctl *discard_ctl =
+ &block_group->fs_info->discard_ctl;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
int ret = 0;
int ret2;
u64 bytes;
u64 offset = offset_to_bitmap(ctl, start);
+ const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
while (offset < end) {
bool next_bitmap = false;
@@ -3293,35 +3641,84 @@ static int trim_bitmaps(struct btrfs_block_group *block_group,
spin_lock(&ctl->tree_lock);
if (ctl->free_space < minlen) {
+ block_group->discard_cursor =
+ btrfs_block_group_end(block_group);
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
break;
}
entry = tree_search_offset(ctl, offset, 1, 0);
- if (!entry) {
+ /*
+ * Bitmaps are marked trimmed lossily now to prevent constant
+ * discarding of the same bitmap (the reason why we are bound
+ * by the filters). So, retrim the block group bitmaps when we
+ * are preparing to punt to the unused_bgs list. This uses
+ * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
+ * which is the only discard index which sets minlen to 0.
+ */
+ if (!entry || (async && minlen && start == offset &&
+ btrfs_free_space_trimmed(entry))) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
next_bitmap = true;
goto next;
}
+ /*
+ * Async discard bitmap trimming begins at by setting the start
+ * to be key.objectid and the offset_to_bitmap() aligns to the
+ * start of the bitmap. This lets us know we are fully
+ * scanning the bitmap rather than only some portion of it.
+ */
+ if (start == offset)
+ entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
+
bytes = minlen;
ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
if (ret2 || start >= end) {
+ /*
+ * We lossily consider a bitmap trimmed if we only skip
+ * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
+ */
+ if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
+ end_trimming_bitmap(ctl, entry);
+ else
+ entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
next_bitmap = true;
goto next;
}
+ /*
+ * We already trimmed a region, but are using the locking above
+ * to reset the trim_state.
+ */
+ if (async && *total_trimmed) {
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+ goto out;
+ }
+
bytes = min(bytes, end - start);
- if (bytes < minlen) {
+ if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
goto next;
}
+ /*
+ * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
+ * If X < @minlen, we won't trim X when we come back around.
+ * So trim it now. We differ here from trimming extents as we
+ * don't keep individual state per bit.
+ */
+ if (async &&
+ max_discard_size &&
+ bytes > (max_discard_size + minlen))
+ bytes = max_discard_size;
+
bitmap_clear_bits(ctl, entry, start, bytes);
if (entry->bytes == 0)
free_bitmap(ctl, entry);
@@ -3333,19 +3730,25 @@ static int trim_bitmaps(struct btrfs_block_group *block_group,
mutex_unlock(&ctl->cache_writeout_mutex);
ret = do_trimming(block_group, total_trimmed, start, bytes,
- start, bytes, &trim_entry);
- if (ret)
+ start, bytes, 0, &trim_entry);
+ if (ret) {
+ reset_trimming_bitmap(ctl, offset);
+ block_group->discard_cursor =
+ btrfs_block_group_end(block_group);
break;
+ }
next:
if (next_bitmap) {
offset += BITS_PER_BITMAP * ctl->unit;
+ start = offset;
} else {
start += bytes;
- if (start >= offset + BITS_PER_BITMAP * ctl->unit)
- offset += BITS_PER_BITMAP * ctl->unit;
}
+ block_group->discard_cursor = start;
if (fatal_signal_pending(current)) {
+ if (start != offset)
+ reset_trimming_bitmap(ctl, offset);
ret = -ERESTARTSYS;
break;
}
@@ -3353,6 +3756,10 @@ next:
cond_resched();
}
+ if (offset >= end)
+ block_group->discard_cursor = end;
+
+out:
return ret;
}
@@ -3399,7 +3806,9 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group)
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int ret;
+ u64 rem = 0;
*trimmed = 0;
@@ -3411,16 +3820,66 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
btrfs_get_block_group_trimming(block_group);
spin_unlock(&block_group->lock);
- ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
if (ret)
goto out;
- ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
+ div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
+ /* If we ended in the middle of a bitmap, reset the trimming flag */
+ if (rem)
+ reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
btrfs_put_block_group_trimming(block_group);
return ret;
}
+int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ bool async)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ spin_lock(&block_group->lock);
+ if (block_group->removed) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ btrfs_get_block_group_trimming(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
+ btrfs_put_block_group_trimming(block_group);
+
+ return ret;
+}
+
+int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ u64 maxlen, bool async)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ spin_lock(&block_group->lock);
+ if (block_group->removed) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ btrfs_get_block_group_trimming(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
+ async);
+
+ btrfs_put_block_group_trimming(block_group);
+
+ return ret;
+}
+
/*
* Find the left-most item in the cache tree, and then return the
* smallest inode number in the item.
@@ -3600,6 +4059,7 @@ int test_add_free_space_entry(struct btrfs_block_group *cache,
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
struct btrfs_free_space *info = NULL, *bitmap_info;
void *map = NULL;
+ enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
u64 bytes_added;
int ret;
@@ -3641,7 +4101,8 @@ again:
info = NULL;
}
- bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+ bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
+ trim_state);
bytes -= bytes_added;
offset += bytes_added;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index ba9a23241101..2e0a8077aa74 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -6,6 +6,20 @@
#ifndef BTRFS_FREE_SPACE_CACHE_H
#define BTRFS_FREE_SPACE_CACHE_H
+/*
+ * This is the trim state of an extent or bitmap.
+ *
+ * BTRFS_TRIM_STATE_TRIMMING is special and used to maintain the state of a
+ * bitmap as we may need several trims to fully trim a single bitmap entry.
+ * This is reset should any free space other than trimmed space be added to the
+ * bitmap.
+ */
+enum btrfs_trim_state {
+ BTRFS_TRIM_STATE_UNTRIMMED,
+ BTRFS_TRIM_STATE_TRIMMED,
+ BTRFS_TRIM_STATE_TRIMMING,
+};
+
struct btrfs_free_space {
struct rb_node offset_index;
u64 offset;
@@ -13,8 +27,21 @@ struct btrfs_free_space {
u64 max_extent_size;
unsigned long *bitmap;
struct list_head list;
+ enum btrfs_trim_state trim_state;
+ s32 bitmap_extents;
};
+static inline bool btrfs_free_space_trimmed(struct btrfs_free_space *info)
+{
+ return (info->trim_state == BTRFS_TRIM_STATE_TRIMMED);
+}
+
+static inline bool btrfs_free_space_trimming_bitmap(
+ struct btrfs_free_space *info)
+{
+ return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING);
+}
+
struct btrfs_free_space_ctl {
spinlock_t tree_lock;
struct rb_root free_space_offset;
@@ -24,6 +51,8 @@ struct btrfs_free_space_ctl {
int total_bitmaps;
int unit;
u64 start;
+ s32 discardable_extents[BTRFS_STAT_NR_ENTRIES];
+ s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES];
const struct btrfs_free_space_op *op;
void *private;
struct mutex cache_writeout_mutex;
@@ -83,13 +112,17 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group);
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_free_space_ctl *ctl,
- u64 bytenr, u64 size);
+ u64 bytenr, u64 size,
+ enum btrfs_trim_state trim_state);
int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
+int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
+bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size);
@@ -108,6 +141,12 @@ int btrfs_return_cluster_to_free_space(
struct btrfs_free_cluster *cluster);
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen);
+int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ bool async);
+int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen,
+ u64 maxlen, bool async);
/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 37345fb6191d..d5c9c69d8263 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -107,7 +107,7 @@ again:
if (last != (u64)-1 && last + 1 != key.objectid) {
__btrfs_add_free_space(fs_info, ctl, last + 1,
- key.objectid - last - 1);
+ key.objectid - last - 1, 0);
wake_up(&root->ino_cache_wait);
}
@@ -118,7 +118,7 @@ next:
if (last < root->highest_objectid - 1) {
__btrfs_add_free_space(fs_info, ctl, last + 1,
- root->highest_objectid - last - 1);
+ root->highest_objectid - last - 1, 0);
}
spin_lock(&root->ino_cache_lock);
@@ -175,7 +175,8 @@ static void start_caching(struct btrfs_root *root)
ret = btrfs_find_free_objectid(root, &objectid);
if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
__btrfs_add_free_space(fs_info, ctl, objectid,
- BTRFS_LAST_FREE_OBJECTID - objectid + 1);
+ BTRFS_LAST_FREE_OBJECTID - objectid + 1,
+ 0);
wake_up(&root->ino_cache_wait);
}
@@ -221,7 +222,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
return;
again:
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
- __btrfs_add_free_space(fs_info, pinned, objectid, 1);
+ __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
} else {
down_write(&fs_info->commit_root_sem);
spin_lock(&root->ino_cache_lock);
@@ -234,7 +235,7 @@ again:
start_caching(root);
- __btrfs_add_free_space(fs_info, pinned, objectid, 1);
+ __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
up_write(&fs_info->commit_root_sem);
}
@@ -281,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
spin_unlock(rbroot_lock);
if (count)
__btrfs_add_free_space(root->fs_info, ctl,
- info->offset, count);
+ info->offset, count, 0);
kmem_cache_free(btrfs_free_space_cachep, info);
}
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e3c76645cad7..6d2bb58d277a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -44,7 +44,6 @@
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
-#include "backref.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
@@ -64,7 +63,6 @@ struct btrfs_dio_data {
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
-static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
@@ -1479,10 +1477,10 @@ next_slot:
disk_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf, fi);
/*
- * If extent we got ends before our range starts, skip
- * to next extent
+ * If the extent we got ends before our current offset,
+ * skip to the next extent.
*/
- if (extent_end <= start) {
+ if (extent_end <= cur_offset) {
path->slots[0]++;
goto next_slot;
}
@@ -2128,7 +2126,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
bio_flags);
goto out;
} else if (!skip_sum) {
- ret = btrfs_lookup_bio_sums(inode, bio, NULL);
+ ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL);
if (ret)
goto out;
}
@@ -2394,649 +2392,6 @@ out:
return ret;
}
-/* snapshot-aware defrag */
-struct sa_defrag_extent_backref {
- struct rb_node node;
- struct old_sa_defrag_extent *old;
- u64 root_id;
- u64 inum;
- u64 file_pos;
- u64 extent_offset;
- u64 num_bytes;
- u64 generation;
-};
-
-struct old_sa_defrag_extent {
- struct list_head list;
- struct new_sa_defrag_extent *new;
-
- u64 extent_offset;
- u64 bytenr;
- u64 offset;
- u64 len;
- int count;
-};
-
-struct new_sa_defrag_extent {
- struct rb_root root;
- struct list_head head;
- struct btrfs_path *path;
- struct inode *inode;
- u64 file_pos;
- u64 len;
- u64 bytenr;
- u64 disk_len;
- u8 compress_type;
-};
-
-static int backref_comp(struct sa_defrag_extent_backref *b1,
- struct sa_defrag_extent_backref *b2)
-{
- if (b1->root_id < b2->root_id)
- return -1;
- else if (b1->root_id > b2->root_id)
- return 1;
-
- if (b1->inum < b2->inum)
- return -1;
- else if (b1->inum > b2->inum)
- return 1;
-
- if (b1->file_pos < b2->file_pos)
- return -1;
- else if (b1->file_pos > b2->file_pos)
- return 1;
-
- /*
- * [------------------------------] ===> (a range of space)
- * |<--->| |<---->| =============> (fs/file tree A)
- * |<---------------------------->| ===> (fs/file tree B)
- *
- * A range of space can refer to two file extents in one tree while
- * refer to only one file extent in another tree.
- *
- * So we may process a disk offset more than one time(two extents in A)
- * and locate at the same extent(one extent in B), then insert two same
- * backrefs(both refer to the extent in B).
- */
- return 0;
-}
-
-static void backref_insert(struct rb_root *root,
- struct sa_defrag_extent_backref *backref)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct sa_defrag_extent_backref *entry;
- int ret;
-
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
-
- ret = backref_comp(backref, entry);
- if (ret < 0)
- p = &(*p)->rb_left;
- else
- p = &(*p)->rb_right;
- }
-
- rb_link_node(&backref->node, parent, p);
- rb_insert_color(&backref->node, root);
-}
-
-/*
- * Note the backref might has changed, and in this case we just return 0.
- */
-static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
- void *ctx)
-{
- struct btrfs_file_extent_item *extent;
- struct old_sa_defrag_extent *old = ctx;
- struct new_sa_defrag_extent *new = old->new;
- struct btrfs_path *path = new->path;
- struct btrfs_key key;
- struct btrfs_root *root;
- struct sa_defrag_extent_backref *backref;
- struct extent_buffer *leaf;
- struct inode *inode = new->inode;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int slot;
- int ret;
- u64 extent_offset;
- u64 num_bytes;
-
- if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
- inum == btrfs_ino(BTRFS_I(inode)))
- return 0;
-
- key.objectid = root_id;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(root)) {
- if (PTR_ERR(root) == -ENOENT)
- return 0;
- WARN_ON(1);
- btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
- inum, offset, root_id);
- return PTR_ERR(root);
- }
-
- key.objectid = inum;
- key.type = BTRFS_EXTENT_DATA_KEY;
- if (offset > (u64)-1 << 32)
- key.offset = 0;
- else
- key.offset = offset;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (WARN_ON(ret < 0))
- return ret;
- ret = 0;
-
- while (1) {
- cond_resched();
-
- leaf = path->nodes[0];
- slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- ret = 0;
- goto out;
- }
- continue;
- }
-
- path->slots[0]++;
-
- btrfs_item_key_to_cpu(leaf, &key, slot);
-
- if (key.objectid > inum)
- goto out;
-
- if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
-
- extent = btrfs_item_ptr(leaf, slot,
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
- continue;
-
- /*
- * 'offset' refers to the exact key.offset,
- * NOT the 'offset' field in btrfs_extent_data_ref, ie.
- * (key.offset - extent_offset).
- */
- if (key.offset != offset)
- continue;
-
- extent_offset = btrfs_file_extent_offset(leaf, extent);
- num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
-
- if (extent_offset >= old->extent_offset + old->offset +
- old->len || extent_offset + num_bytes <=
- old->extent_offset + old->offset)
- continue;
- break;
- }
-
- backref = kmalloc(sizeof(*backref), GFP_NOFS);
- if (!backref) {
- ret = -ENOENT;
- goto out;
- }
-
- backref->root_id = root_id;
- backref->inum = inum;
- backref->file_pos = offset;
- backref->num_bytes = num_bytes;
- backref->extent_offset = extent_offset;
- backref->generation = btrfs_file_extent_generation(leaf, extent);
- backref->old = old;
- backref_insert(&new->root, backref);
- old->count++;
-out:
- btrfs_release_path(path);
- WARN_ON(ret);
- return ret;
-}
-
-static noinline bool record_extent_backrefs(struct btrfs_path *path,
- struct new_sa_defrag_extent *new)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
- struct old_sa_defrag_extent *old, *tmp;
- int ret;
-
- new->path = path;
-
- list_for_each_entry_safe(old, tmp, &new->head, list) {
- ret = iterate_inodes_from_logical(old->bytenr +
- old->extent_offset, fs_info,
- path, record_one_backref,
- old, false);
- if (ret < 0 && ret != -ENOENT)
- return false;
-
- /* no backref to be processed for this extent */
- if (!old->count) {
- list_del(&old->list);
- kfree(old);
- }
- }
-
- if (list_empty(&new->head))
- return false;
-
- return true;
-}
-
-static int relink_is_mergable(struct extent_buffer *leaf,
- struct btrfs_file_extent_item *fi,
- struct new_sa_defrag_extent *new)
-{
- if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
- return 0;
-
- if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
- return 0;
-
- if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
- return 0;
-
- if (btrfs_file_extent_encryption(leaf, fi) ||
- btrfs_file_extent_other_encoding(leaf, fi))
- return 0;
-
- return 1;
-}
-
-/*
- * Note the backref might has changed, and in this case we just return 0.
- */
-static noinline int relink_extent_backref(struct btrfs_path *path,
- struct sa_defrag_extent_backref *prev,
- struct sa_defrag_extent_backref *backref)
-{
- struct btrfs_file_extent_item *extent;
- struct btrfs_file_extent_item *item;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_trans_handle *trans;
- struct btrfs_ref ref = { 0 };
- struct btrfs_root *root;
- struct btrfs_key key;
- struct extent_buffer *leaf;
- struct old_sa_defrag_extent *old = backref->old;
- struct new_sa_defrag_extent *new = old->new;
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
- struct inode *inode;
- struct extent_state *cached = NULL;
- int ret = 0;
- u64 start;
- u64 len;
- u64 lock_start;
- u64 lock_end;
- bool merge = false;
- int index;
-
- if (prev && prev->root_id == backref->root_id &&
- prev->inum == backref->inum &&
- prev->file_pos + prev->num_bytes == backref->file_pos)
- merge = true;
-
- /* step 1: get root */
- key.objectid = backref->root_id;
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- if (PTR_ERR(root) == -ENOENT)
- return 0;
- return PTR_ERR(root);
- }
-
- if (btrfs_root_readonly(root)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- return 0;
- }
-
- /* step 2: get inode */
- key.objectid = backref->inum;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
-
- inode = btrfs_iget(fs_info->sb, &key, root);
- if (IS_ERR(inode)) {
- srcu_read_unlock(&fs_info->subvol_srcu, index);
- return 0;
- }
-
- srcu_read_unlock(&fs_info->subvol_srcu, index);
-
- /* step 3: relink backref */
- lock_start = backref->file_pos;
- lock_end = backref->file_pos + backref->num_bytes - 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- &cached);
-
- ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
- if (ordered) {
- btrfs_put_ordered_extent(ordered);
- goto out_unlock;
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_unlock;
- }
-
- key.objectid = backref->inum;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = backref->file_pos;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- goto out_free_path;
- } else if (ret > 0) {
- ret = 0;
- goto out_free_path;
- }
-
- extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_generation(path->nodes[0], extent) !=
- backref->generation)
- goto out_free_path;
-
- btrfs_release_path(path);
-
- start = backref->file_pos;
- if (backref->extent_offset < old->extent_offset + old->offset)
- start += old->extent_offset + old->offset -
- backref->extent_offset;
-
- len = min(backref->extent_offset + backref->num_bytes,
- old->extent_offset + old->offset + old->len);
- len -= max(backref->extent_offset, old->extent_offset + old->offset);
-
- ret = btrfs_drop_extents(trans, root, inode, start,
- start + len, 1);
- if (ret)
- goto out_free_path;
-again:
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
-
- path->leave_spinning = 1;
- if (merge) {
- struct btrfs_file_extent_item *fi;
- u64 extent_len;
- struct btrfs_key found_key;
-
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0)
- goto out_free_path;
-
- path->slots[0]--;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- extent_len = btrfs_file_extent_num_bytes(leaf, fi);
-
- if (extent_len + found_key.offset == start &&
- relink_is_mergable(leaf, fi, new)) {
- btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_len + len);
- btrfs_mark_buffer_dirty(leaf);
- inode_add_bytes(inode, len);
-
- ret = 1;
- goto out_free_path;
- } else {
- merge = false;
- btrfs_release_path(path);
- goto again;
- }
- }
-
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- sizeof(*extent));
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_free_path;
- }
-
- leaf = path->nodes[0];
- item = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
- btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
- btrfs_set_file_extent_num_bytes(leaf, item, len);
- btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
- btrfs_set_file_extent_generation(leaf, item, trans->transid);
- btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_compression(leaf, item, new->compress_type);
- btrfs_set_file_extent_encryption(leaf, item, 0);
- btrfs_set_file_extent_other_encoding(leaf, item, 0);
-
- btrfs_mark_buffer_dirty(leaf);
- inode_add_bytes(inode, len);
- btrfs_release_path(path);
-
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr,
- new->disk_len, 0);
- btrfs_init_data_ref(&ref, backref->root_id, backref->inum,
- new->file_pos); /* start - extent_offset */
- ret = btrfs_inc_extent_ref(trans, &ref);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out_free_path;
- }
-
- ret = 1;
-out_free_path:
- btrfs_release_path(path);
- path->leave_spinning = 0;
- btrfs_end_transaction(trans);
-out_unlock:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- &cached);
- iput(inode);
- return ret;
-}
-
-static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
-{
- struct old_sa_defrag_extent *old, *tmp;
-
- if (!new)
- return;
-
- list_for_each_entry_safe(old, tmp, &new->head, list) {
- kfree(old);
- }
- kfree(new);
-}
-
-static void relink_file_extents(struct new_sa_defrag_extent *new)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
- struct btrfs_path *path;
- struct sa_defrag_extent_backref *backref;
- struct sa_defrag_extent_backref *prev = NULL;
- struct rb_node *node;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return;
-
- if (!record_extent_backrefs(path, new)) {
- btrfs_free_path(path);
- goto out;
- }
- btrfs_release_path(path);
-
- while (1) {
- node = rb_first(&new->root);
- if (!node)
- break;
- rb_erase(node, &new->root);
-
- backref = rb_entry(node, struct sa_defrag_extent_backref, node);
-
- ret = relink_extent_backref(path, prev, backref);
- WARN_ON(ret < 0);
-
- kfree(prev);
-
- if (ret == 1)
- prev = backref;
- else
- prev = NULL;
- cond_resched();
- }
- kfree(prev);
-
- btrfs_free_path(path);
-out:
- free_sa_defrag_extent(new);
-
- atomic_dec(&fs_info->defrag_running);
- wake_up(&fs_info->transaction_wait);
-}
-
-static struct new_sa_defrag_extent *
-record_old_file_extents(struct inode *inode,
- struct btrfs_ordered_extent *ordered)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_path *path;
- struct btrfs_key key;
- struct old_sa_defrag_extent *old;
- struct new_sa_defrag_extent *new;
- int ret;
-
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new)
- return NULL;
-
- new->inode = inode;
- new->file_pos = ordered->file_offset;
- new->len = ordered->len;
- new->bytenr = ordered->start;
- new->disk_len = ordered->disk_len;
- new->compress_type = ordered->compress_type;
- new->root = RB_ROOT;
- INIT_LIST_HEAD(&new->head);
-
- path = btrfs_alloc_path();
- if (!path)
- goto out_kfree;
-
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = new->file_pos;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out_free_path;
- if (ret > 0 && path->slots[0] > 0)
- path->slots[0]--;
-
- /* find out all the old extents for the file range */
- while (1) {
- struct btrfs_file_extent_item *extent;
- struct extent_buffer *l;
- int slot;
- u64 num_bytes;
- u64 offset;
- u64 end;
- u64 disk_bytenr;
- u64 extent_offset;
-
- l = path->nodes[0];
- slot = path->slots[0];
-
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out_free_path;
- else if (ret > 0)
- break;
- continue;
- }
-
- btrfs_item_key_to_cpu(l, &key, slot);
-
- if (key.objectid != btrfs_ino(BTRFS_I(inode)))
- break;
- if (key.type != BTRFS_EXTENT_DATA_KEY)
- break;
- if (key.offset >= new->file_pos + new->len)
- break;
-
- extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
-
- num_bytes = btrfs_file_extent_num_bytes(l, extent);
- if (key.offset + num_bytes < new->file_pos)
- goto next;
-
- disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
- if (!disk_bytenr)
- goto next;
-
- extent_offset = btrfs_file_extent_offset(l, extent);
-
- old = kmalloc(sizeof(*old), GFP_NOFS);
- if (!old)
- goto out_free_path;
-
- offset = max(new->file_pos, key.offset);
- end = min(new->file_pos + new->len, key.offset + num_bytes);
-
- old->bytenr = disk_bytenr;
- old->extent_offset = extent_offset;
- old->offset = offset - key.offset;
- old->len = end - offset;
- old->new = new;
- old->count = 0;
- list_add_tail(&old->list, &new->head);
-next:
- path->slots[0]++;
- cond_resched();
- }
-
- btrfs_free_path(path);
- atomic_inc(&fs_info->defrag_running);
-
- return new;
-
-out_free_path:
- btrfs_free_path(path);
-out_kfree:
- free_sa_defrag_extent(new);
- return NULL;
-}
-
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
u64 start, u64 len)
{
@@ -3064,15 +2419,19 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
struct btrfs_trans_handle *trans = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
- struct new_sa_defrag_extent *new = NULL;
+ u64 start, end;
int compress_type = 0;
int ret = 0;
- u64 logical_len = ordered_extent->len;
+ u64 logical_len = ordered_extent->num_bytes;
bool freespace_inode;
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
bool clear_reserved_extent = true;
+ unsigned int clear_bits;
+
+ start = ordered_extent->file_offset;
+ end = start + ordered_extent->num_bytes - 1;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
@@ -3086,10 +2445,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- btrfs_free_io_failure_record(BTRFS_I(inode),
- ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1);
+ btrfs_free_io_failure_record(BTRFS_I(inode), start, end);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
truncated = true;
@@ -3107,8 +2463,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
* space for NOCOW range.
* As NOCOW won't cause a new delayed ref, just free the space
*/
- btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
- ordered_extent->len);
+ btrfs_qgroup_free_data(inode, NULL, start,
+ ordered_extent->num_bytes);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
@@ -3127,23 +2483,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
}
range_locked = true;
- lock_extent_bits(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset + ordered_extent->len - 1,
- &cached_state);
-
- ret = test_range_bit(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset + ordered_extent->len - 1,
- EXTENT_DEFRAG, 0, cached_state);
- if (ret) {
- u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
- if (0 && last_snapshot >= BTRFS_I(inode)->generation)
- /* the inode is shared */
- new = record_old_file_extents(inode, ordered_extent);
-
- clear_extent_bit(io_tree, ordered_extent->file_offset,
- ordered_extent->file_offset + ordered_extent->len - 1,
- EXTENT_DEFRAG, 0, 0, &cached_state);
- }
+ lock_extent_bits(io_tree, start, end, &cached_state);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
@@ -3161,31 +2501,30 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
BUG_ON(compress_type);
- btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
- ordered_extent->len);
+ btrfs_qgroup_free_data(inode, NULL, start,
+ ordered_extent->num_bytes);
ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
} else {
BUG_ON(root == fs_info->tree_root);
- ret = insert_reserved_file_extent(trans, inode,
- ordered_extent->file_offset,
- ordered_extent->start,
- ordered_extent->disk_len,
+ ret = insert_reserved_file_extent(trans, inode, start,
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes,
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
- ordered_extent->start,
- ordered_extent->disk_len);
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
}
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
- ordered_extent->file_offset, ordered_extent->len,
- trans->transid);
+ ordered_extent->file_offset,
+ ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3205,37 +2544,27 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
}
ret = 0;
out:
- if (range_locked || clear_new_delalloc_bytes) {
- unsigned int clear_bits = 0;
-
- if (range_locked)
- clear_bits |= EXTENT_LOCKED;
- if (clear_new_delalloc_bytes)
- clear_bits |= EXTENT_DELALLOC_NEW;
- clear_extent_bit(&BTRFS_I(inode)->io_tree,
- ordered_extent->file_offset,
- ordered_extent->file_offset +
- ordered_extent->len - 1,
- clear_bits,
- (clear_bits & EXTENT_LOCKED) ? 1 : 0,
- 0, &cached_state);
- }
+ clear_bits = EXTENT_DEFRAG;
+ if (range_locked)
+ clear_bits |= EXTENT_LOCKED;
+ if (clear_new_delalloc_bytes)
+ clear_bits |= EXTENT_DELALLOC_NEW;
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits,
+ (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
+ &cached_state);
if (trans)
btrfs_end_transaction(trans);
if (ret || truncated) {
- u64 start, end;
+ u64 unwritten_start = start;
if (truncated)
- start = ordered_extent->file_offset + logical_len;
- else
- start = ordered_extent->file_offset;
- end = ordered_extent->file_offset + ordered_extent->len - 1;
- clear_extent_uptodate(io_tree, start, end, NULL);
+ unwritten_start += logical_len;
+ clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
/* Drop the cache for the part of the extent we didn't write. */
- btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
+ btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0);
/*
* If the ordered extent had an IOERR or something else went
@@ -3250,29 +2579,28 @@ out:
if ((ret || !logical_len) &&
clear_reserved_extent &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
- !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+ !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+ /*
+ * Discard the range before returning it back to the
+ * free space pool
+ */
+ if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
+ btrfs_discard_extent(fs_info,
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes,
+ NULL);
btrfs_free_reserved_extent(fs_info,
- ordered_extent->start,
- ordered_extent->disk_len, 1);
+ ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes, 1);
+ }
}
-
/*
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
btrfs_remove_ordered_extent(inode, ordered_extent);
- /* for snapshot-aware defrag */
- if (new) {
- if (ret) {
- free_sa_defrag_extent(new);
- atomic_dec(&fs_info->defrag_running);
- } else {
- relink_file_extents(new);
- }
- }
-
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
@@ -4238,18 +3566,30 @@ out:
}
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
- struct inode *dir, u64 objectid,
- const char *name, int name_len)
+ struct inode *dir, struct dentry *dentry)
{
struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
+ const char *name = dentry->d_name.name;
+ int name_len = dentry->d_name.len;
u64 index;
int ret;
+ u64 objectid;
u64 dir_ino = btrfs_ino(BTRFS_I(dir));
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
+ objectid = inode->root->root_key.objectid;
+ } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
+ objectid = inode->location.objectid;
+ } else {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -4271,13 +3611,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid,
- dir_ino, &index, name, name_len);
- if (ret < 0) {
- if (ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
+ /*
+ * This is a placeholder inode for a subvolume we didn't have a
+ * reference to at the time of the snapshot creation. In the meantime
+ * we could have renamed the real subvol link into our snapshot, so
+ * depending on btrfs_del_root_ref to return -ENOENT here is incorret.
+ * Instead simply lookup the dir_index_item for this entry so we can
+ * remove it. Otherwise we know we have a ref to the root and we can
+ * call btrfs_del_root_ref, and it _shouldn't_ fail.
+ */
+ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
di = btrfs_search_dir_index_item(root, path, dir_ino,
name, name_len);
if (IS_ERR_OR_NULL(di)) {
@@ -4292,8 +3635,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
index = key.offset;
+ btrfs_release_path(path);
+ } else {
+ ret = btrfs_del_root_ref(trans, objectid,
+ root->root_key.objectid, dir_ino,
+ &index, name, name_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
}
- btrfs_release_path(path);
ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
if (ret) {
@@ -4487,8 +3838,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
- ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid,
- dentry->d_name.name, dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, dir, dentry);
if (ret) {
err = ret;
btrfs_abort_transaction(trans, ret);
@@ -4583,10 +3933,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
return PTR_ERR(trans);
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- err = btrfs_unlink_subvol(trans, dir,
- BTRFS_I(inode)->location.objectid,
- dentry->d_name.name,
- dentry->d_name.len);
+ err = btrfs_unlink_subvol(trans, dir, dentry);
goto out;
}
@@ -5157,7 +4504,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
cur_offset = hole_start;
while (1) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
- block_end - cur_offset, 0);
+ block_end - cur_offset);
if (IS_ERR(em)) {
err = PTR_ERR(em);
em = NULL;
@@ -5841,7 +5188,11 @@ static struct inode *new_simple_dir(struct super_block *s,
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
- inode->i_op = &btrfs_dir_ro_inode_operations;
+ /*
+ * We only need lookup, the rest is read-only and there's no inode
+ * associated with the dentry
+ */
+ inode->i_op = &simple_dir_inode_operations;
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
@@ -6932,18 +6283,27 @@ static noinline int uncompress_inline(struct btrfs_path *path,
return ret;
}
-/*
- * a bit scary, this does extent mapping from logical file offset to the disk.
- * the ugly parts come from merging extents from the disk with the in-ram
- * representation. This gets more complex because of the data=ordered code,
- * where the in-ram extents might be locked pending data=ordered completion.
+/**
+ * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
+ * @inode: file to search in
+ * @page: page to read extent data into if the extent is inline
+ * @pg_offset: offset into @page to copy to
+ * @start: file offset
+ * @len: length of range starting at @start
+ *
+ * This returns the first &struct extent_map which overlaps with the given
+ * range, reading it from the B-tree and caching it if necessary. Note that
+ * there may be more extents which overlap the given range after the returned
+ * extent_map.
+ *
+ * If @page is not NULL and the extent is inline, this also reads the extent
+ * data directly into the page and marks the extent up to date in the io_tree.
*
- * This also copies inline extents directly into the page.
+ * Return: ERR_PTR on error, non-NULL extent_map on success.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ struct page *page, size_t pg_offset,
+ u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
@@ -6960,7 +6320,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_io_tree *io_tree = &inode->io_tree;
- const bool new_inline = !page || create;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -7083,8 +6442,7 @@ next:
goto insert;
}
- btrfs_extent_item_to_extent_map(inode, path, item,
- new_inline, em);
+ btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -7096,7 +6454,7 @@ next:
size_t extent_offset;
size_t copy_size;
- if (new_inline)
+ if (!page)
goto out;
size = btrfs_file_extent_ram_bytes(leaf, item);
@@ -7179,7 +6537,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 delalloc_end;
int err = 0;
- em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ em = btrfs_get_extent(inode, NULL, 0, start, len);
if (IS_ERR(em))
return em;
/*
@@ -7804,7 +7162,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto err;
}
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto unlock_err;
@@ -8356,8 +7714,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
* contention.
*/
if (dip->logical_offset == file_offset) {
- ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio,
- file_offset);
+ ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, file_offset,
+ NULL);
if (ret)
return ret;
}
@@ -8870,7 +8228,8 @@ again:
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
page_end - start + 1);
if (ordered) {
- end = min(page_end, ordered->file_offset + ordered->len - 1);
+ end = min(page_end,
+ ordered->file_offset + ordered->num_bytes - 1);
/*
* IO on this page will never be started, so we need
* to account for any ordered extents now
@@ -9071,7 +8430,6 @@ again:
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
- ret2 = 0;
/* page is wholly or partially inside EOF */
if (page_start + PAGE_SIZE > size)
@@ -9095,12 +8453,10 @@ again:
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
- if (!ret2) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- sb_end_pagefault(inode->i_sb);
- extent_changeset_free(data_reserved);
- return VM_FAULT_LOCKED;
- }
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ sb_end_pagefault(inode->i_sb);
+ extent_changeset_free(data_reserved);
+ return VM_FAULT_LOCKED;
out_unlock:
unlock_page(page);
@@ -9398,7 +8754,7 @@ void btrfs_destroy_inode(struct inode *inode)
else {
btrfs_err(fs_info,
"found ordered extent %llu %llu on inode cleanup",
- ordered->file_offset, ordered->len);
+ ordered->file_offset, ordered->num_bytes);
btrfs_remove_ordered_extent(inode, ordered);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
@@ -9536,7 +8892,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
- u64 root_objectid;
int ret;
bool root_log_pinned = false;
bool dest_log_pinned = false;
@@ -9642,10 +8997,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
- root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
- ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else { /* src is an inode */
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
@@ -9661,10 +9013,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
- root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
- ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
} else { /* dest is an inode */
ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
@@ -9862,7 +9211,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
- u64 root_objectid;
int ret;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
@@ -9970,10 +9318,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode), 1);
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
- root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
- ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
@@ -9992,10 +9337,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_inode->i_ctime = current_time(new_inode);
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- root_objectid = BTRFS_I(new_inode)->location.objectid;
- ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
@@ -10831,7 +10173,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct btrfs_block_group *bg;
u64 len = isize - start;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out;
@@ -10999,11 +10341,6 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
};
-static const struct inode_operations btrfs_dir_ro_inode_operations = {
- .lookup = btrfs_lookup,
- .permission = btrfs_permission,
- .update_time = btrfs_update_time,
-};
static const struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 18e328ce4b54..4f4b13830b25 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1128,7 +1128,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
/* get the big lock and read metadata off disk */
lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
@@ -3243,6 +3243,7 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
+ const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
int ret;
/*
@@ -3250,7 +3251,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
- ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
+ ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
return ret;
@@ -4252,7 +4253,19 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
- if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
+ /*
+ * Copy scrub args to user space even if btrfs_scrub_dev() returned an
+ * error. This is important as it allows user space to know how much
+ * progress scrub has done. For example, if scrub is canceled we get
+ * -ECANCELED from btrfs_scrub_dev() and return that error back to user
+ * space. Later user space can inspect the progress from the structure
+ * btrfs_ioctl_scrub_args and resume scrub from where it left off
+ * previously (btrfs-progs does this).
+ * If we fail to copy the btrfs_ioctl_scrub_args structure to user space
+ * then return -EFAULT to signal the structure was not copied or it may
+ * be corrupt and unreliable due to a partial copy.
+ */
+ if (copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
if (!(sa->flags & BTRFS_SCRUB_READONLY))
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index fb09bc2f8e4d..ecb9fb6a6fe0 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -20,9 +20,9 @@ static struct kmem_cache *btrfs_ordered_extent_cache;
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
- if (entry->file_offset + entry->len < entry->file_offset)
+ if (entry->file_offset + entry->num_bytes < entry->file_offset)
return (u64)-1;
- return entry->file_offset + entry->len;
+ return entry->file_offset + entry->num_bytes;
}
/* returns NULL if the insertion worked, or it returns the node it did find
@@ -52,14 +52,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
return NULL;
}
-static void ordered_data_tree_panic(struct inode *inode, int errno,
- u64 offset)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- btrfs_panic(fs_info, errno,
- "Inconsistency in ordered tree at offset %llu", offset);
-}
-
/*
* look for a given offset in the tree, and if it can't be found return the
* first lesser offset
@@ -120,7 +112,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
if (file_offset < entry->file_offset ||
- entry->file_offset + entry->len <= file_offset)
+ entry->file_offset + entry->num_bytes <= file_offset)
return 0;
return 1;
}
@@ -129,7 +121,7 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
u64 len)
{
if (file_offset + len <= entry->file_offset ||
- entry->file_offset + entry->len <= file_offset)
+ entry->file_offset + entry->num_bytes <= file_offset)
return 0;
return 1;
}
@@ -161,19 +153,14 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
}
/* allocate and add a new ordered_extent into the per-inode tree.
- * file_offset is the logical offset in the file
- *
- * start is the disk block number of an extent already reserved in the
- * extent allocation tree
- *
- * len is the length of the extent
*
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len,
- int type, int dio, int compress_type)
+ u64 disk_bytenr, u64 num_bytes,
+ u64 disk_num_bytes, int type, int dio,
+ int compress_type)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -187,10 +174,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
return -ENOMEM;
entry->file_offset = file_offset;
- entry->start = start;
- entry->len = len;
- entry->disk_len = disk_len;
- entry->bytes_left = len;
+ entry->disk_bytenr = disk_bytenr;
+ entry->num_bytes = num_bytes;
+ entry->disk_num_bytes = disk_num_bytes;
+ entry->bytes_left = num_bytes;
entry->inode = igrab(inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
@@ -198,7 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
set_bit(type, &entry->flags);
if (dio) {
- percpu_counter_add_batch(&fs_info->dio_bytes, len,
+ percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes,
fs_info->delalloc_batch);
set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
}
@@ -219,7 +206,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
if (node)
- ordered_data_tree_panic(inode, -EEXIST, file_offset);
+ btrfs_panic(fs_info, -EEXIST,
+ "inconsistency in ordered tree at offset %llu",
+ file_offset);
spin_unlock_irq(&tree->lock);
spin_lock(&root->ordered_extent_lock);
@@ -247,27 +236,30 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
}
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type)
+ u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
+ int type)
{
- return __btrfs_add_ordered_extent(inode, file_offset, start, len,
- disk_len, type, 0,
+ return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
+ num_bytes, disk_num_bytes, type, 0,
BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type)
+ u64 disk_bytenr, u64 num_bytes,
+ u64 disk_num_bytes, int type)
{
- return __btrfs_add_ordered_extent(inode, file_offset, start, len,
- disk_len, type, 1,
+ return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
+ num_bytes, disk_num_bytes, type, 1,
BTRFS_COMPRESS_NONE);
}
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len,
- int type, int compress_type)
+ u64 disk_bytenr, u64 num_bytes,
+ u64 disk_num_bytes, int type,
+ int compress_type)
{
- return __btrfs_add_ordered_extent(inode, file_offset, start, len,
- disk_len, type, 0,
+ return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
+ num_bytes, disk_num_bytes, type, 0,
compress_type);
}
@@ -328,8 +320,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
}
dec_start = max(*file_offset, entry->file_offset);
- dec_end = min(*file_offset + io_size, entry->file_offset +
- entry->len);
+ dec_end = min(*file_offset + io_size,
+ entry->file_offset + entry->num_bytes);
*file_offset = dec_end;
if (dec_start > dec_end) {
btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
@@ -471,10 +463,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
if (root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
+ btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
+ false);
if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len,
+ percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
tree = &btrfs_inode->ordered_tree;
@@ -534,8 +527,8 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
root_extent_list);
- if (range_end <= ordered->start ||
- ordered->start + ordered->disk_len <= range_start) {
+ if (range_end <= ordered->disk_bytenr ||
+ ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
list_move_tail(&ordered->root_extent_list, &skipped);
cond_resched_lock(&root->ordered_extent_lock);
continue;
@@ -619,7 +612,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
int wait)
{
u64 start = entry->file_offset;
- u64 end = start + entry->len - 1;
+ u64 end = start + entry->num_bytes - 1;
trace_btrfs_ordered_extent_start(inode, entry);
@@ -680,7 +673,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- if (ordered->file_offset + ordered->len <= start) {
+ if (ordered->file_offset + ordered->num_bytes <= start) {
btrfs_put_ordered_extent(ordered);
break;
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4eb0319a86d7..3beb4da4ab41 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -67,14 +67,13 @@ struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
- /* disk byte number */
- u64 start;
-
- /* ram length of the extent in bytes */
- u64 len;
-
- /* extent length on disk */
- u64 disk_len;
+ /*
+ * These fields directly correspond to the same fields in
+ * btrfs_file_extent_item.
+ */
+ u64 disk_bytenr;
+ u64 num_bytes;
+ u64 disk_num_bytes;
/* number of bytes that still need writing */
u64 bytes_left;
@@ -161,12 +160,15 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
u64 *file_offset, u64 io_size,
int uptodate);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type);
+ u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
+ int type);
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type);
+ u64 disk_bytenr, u64 num_bytes,
+ u64 disk_num_bytes, int type);
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len,
- int type, int compress_type);
+ u64 disk_bytenr, u64 num_bytes,
+ u64 disk_num_bytes, int type,
+ int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 873b6b694107..61f44e78e3c9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -317,7 +317,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
print_uuid_item(l, btrfs_item_ptr_offset(l, i),
btrfs_item_size_nr(l, i));
break;
- };
+ }
}
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d4282e12f2a6..98d9a50352d6 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1243,7 +1243,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
@@ -1259,9 +1258,8 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
return -ENOMEM;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
member = find_qgroup_rb(fs_info, src);
@@ -1307,7 +1305,6 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
@@ -1320,9 +1317,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
if (!tmp)
return -ENOMEM;
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -1387,11 +1383,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
+ quota_root = fs_info->quota_root;
qgroup = find_qgroup_rb(fs_info, qgroupid);
if (qgroup) {
ret = -EEXIST;
@@ -1416,15 +1412,13 @@ out:
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup_list *list;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -1465,7 +1459,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
int ret = 0;
/* Sometimes we would want to clear the limit on this qgroup.
@@ -1475,9 +1468,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -2423,8 +2415,12 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 nr_old_roots = 0;
int ret = 0;
+ /*
+ * If quotas get disabled meanwhile, the resouces need to be freed and
+ * we can't just exit here.
+ */
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
- return 0;
+ goto out_free;
if (new_roots) {
if (!maybe_fs_roots(new_roots))
@@ -2578,10 +2574,9 @@ cleanup:
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root = fs_info->quota_root;
int ret = 0;
- if (!quota_root)
+ if (!fs_info->quota_root)
return ret;
spin_lock(&fs_info->qgroup_lock);
@@ -2875,7 +2870,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
enum btrfs_qgroup_rsv_type type)
{
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
@@ -2894,8 +2888,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
enforce = false;
spin_lock(&fs_info->qgroup_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root)
+ if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -2962,7 +2955,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2980,8 +2972,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
}
spin_lock(&fs_info->qgroup_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root)
+ if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -3681,7 +3672,6 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
int num_bytes)
{
- struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -3689,7 +3679,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
if (num_bytes == 0)
return;
- if (!quota_root)
+ if (!fs_info->quota_root)
return;
spin_lock(&fs_info->qgroup_lock);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c58245797f30..995d4b8b1cfd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -517,6 +517,34 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
return 1;
}
+static bool reloc_root_is_dead(struct btrfs_root *root)
+{
+ /*
+ * Pair with set_bit/clear_bit in clean_dirty_subvols and
+ * btrfs_update_reloc_root. We need to see the updated bit before
+ * trying to access reloc_root
+ */
+ smp_rmb();
+ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
+ return true;
+ return false;
+}
+
+/*
+ * Check if this subvolume tree has valid reloc tree.
+ *
+ * Reloc tree after swap is considered dead, thus not considered as valid.
+ * This is enough for most callers, as they don't distinguish dead reloc root
+ * from no reloc root. But should_ignore_root() below is a special case.
+ */
+static bool have_reloc_root(struct btrfs_root *root)
+{
+ if (reloc_root_is_dead(root))
+ return false;
+ if (!root->reloc_root)
+ return false;
+ return true;
+}
static int should_ignore_root(struct btrfs_root *root)
{
@@ -525,6 +553,10 @@ static int should_ignore_root(struct btrfs_root *root)
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
+ /* This root has been merged with its reloc tree, we can ignore it */
+ if (reloc_root_is_dead(root))
+ return 1;
+
reloc_root = root->reloc_root;
if (!reloc_root)
return 0;
@@ -1439,7 +1471,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
* The subvolume has reloc tree but the swap is finished, no need to
* create/update the dead reloc tree
*/
- if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
+ if (reloc_root_is_dead(root))
return 0;
if (root->reloc_root) {
@@ -1478,8 +1510,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root_item *root_item;
int ret;
- if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
- !root->reloc_root)
+ if (!have_reloc_root(root))
goto out;
reloc_root = root->reloc_root;
@@ -1489,6 +1520,11 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
if (fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ /*
+ * Mark the tree as dead before we change reloc_root so
+ * have_reloc_root will not touch it from now on.
+ */
+ smp_wmb();
__del_reloc_root(reloc_root);
}
@@ -2201,6 +2237,11 @@ static int clean_dirty_subvols(struct reloc_control *rc)
if (ret2 < 0 && !ret)
ret = ret2;
}
+ /*
+ * Need barrier to ensure clear_bit() only happens after
+ * root->reloc_root = NULL. Pairs with have_reloc_root.
+ */
+ smp_wmb();
clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
btrfs_put_fs_root(root);
} else {
@@ -4291,6 +4332,15 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
block_group->start, buf);
}
+static const char *stage_to_string(int stage)
+{
+ if (stage == MOVE_DATA_EXTENTS)
+ return "move data extents";
+ if (stage == UPDATE_DATA_PTRS)
+ return "update data pointers";
+ return "unknown";
+}
+
/*
* function to relocate all extents in a block group.
*/
@@ -4365,12 +4415,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
rc->block_group->length);
while (1) {
+ int finishes_stage;
+
mutex_lock(&fs_info->cleaner_mutex);
ret = relocate_block_group(rc);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0)
err = ret;
+ finishes_stage = rc->stage;
/*
* We may have gotten ENOSPC after we already dirtied some
* extents. If writeout happens while we're relocating a
@@ -4396,8 +4449,8 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
if (rc->extents_found == 0)
break;
- btrfs_info(fs_info, "found %llu extents", rc->extents_found);
-
+ btrfs_info(fs_info, "found %llu extents, stage: %s",
+ rc->extents_found, stage_to_string(finishes_stage));
}
WARN_ON(rc->block_group->pinned > 0);
@@ -4615,7 +4668,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
LIST_HEAD(list);
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
- BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+ BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
@@ -4639,7 +4692,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
* disk_len vs real len like with real inodes since it's all
* disk length.
*/
- new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
+ new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
sums->bytenr = new_bytenr;
btrfs_add_ordered_sum(ordered, sums);
@@ -4718,7 +4771,7 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
struct btrfs_root *root = pending->root;
struct reloc_control *rc = root->fs_info->reloc_ctl;
- if (!root->reloc_root || !rc)
+ if (!rc || !have_reloc_root(root))
return;
if (!rc->merge_reloc_tree)
@@ -4752,7 +4805,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct reloc_control *rc = root->fs_info->reloc_ctl;
int ret;
- if (!root->reloc_root || !rc)
+ if (!rc || !have_reloc_root(root))
return 0;
rc = root->fs_info->reloc_ctl;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 3b17b647d002..612411c74550 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -376,11 +376,13 @@ again:
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
-
- WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
- WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
ptr = (unsigned long)(ref + 1);
- WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+ if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
+ (btrfs_root_ref_name_len(leaf, ref) != name_len) ||
+ memcmp_extent_buffer(leaf, name, ptr, name_len)) {
+ err = -ENOENT;
+ goto out;
+ }
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 21de630b0730..61b37c56a7fb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -8,6 +8,7 @@
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
+#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
@@ -3577,17 +3578,27 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* This can easily boost the amount of SYSTEM chunks if cleaner
* thread can't be triggered fast enough, and use up all space
* of btrfs_super_block::sys_chunk_array
+ *
+ * While for dev replace, we need to try our best to mark block
+ * group RO, to prevent race between:
+ * - Write duplication
+ * Contains latest data
+ * - Scrub copy
+ * Contains data from commit tree
+ *
+ * If target block group is not marked RO, nocow writes can
+ * be overwritten by scrub copy, causing data corruption.
+ * So for dev-replace, it's not allowed to continue if a block
+ * group is not RO.
*/
- ret = btrfs_inc_block_group_ro(cache, false);
- scrub_pause_off(fs_info);
-
+ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
if (ret == 0) {
ro_set = 1;
- } else if (ret == -ENOSPC) {
+ } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
/*
* btrfs_inc_block_group_ro return -ENOSPC when it
* failed in creating new chunk for metadata.
- * It is not a problem for scrub/replace, because
+ * It is not a problem for scrub, because
* metadata are always cowed, and our scrub paused
* commit_transactions.
*/
@@ -3596,9 +3607,22 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
btrfs_warn(fs_info,
"failed setting block group ro: %d", ret);
btrfs_put_block_group(cache);
+ scrub_pause_off(fs_info);
break;
}
+ /*
+ * Now the target block is marked RO, wait for nocow writes to
+ * finish before dev-replace.
+ * COW is fine, as COW never overwrites extents in commit tree.
+ */
+ if (sctx->is_dev_replace) {
+ btrfs_wait_nocow_writers(cache);
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
+ cache->length);
+ }
+
+ scrub_pause_off(fs_info);
down_write(&dev_replace->rwsem);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
@@ -3659,7 +3683,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache->removed && !cache->ro && cache->reserved == 0 &&
cache->used == 0) {
spin_unlock(&cache->lock);
- btrfs_mark_bg_unused(cache);
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ cache);
+ else
+ btrfs_mark_bg_unused(cache);
} else {
spin_unlock(&cache->lock);
}
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index f09aa6ee9113..537bc310a673 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -161,8 +161,7 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
static int can_overcommit(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush,
- bool system_chunk)
+ enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
@@ -173,7 +172,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info,
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
return 0;
- if (system_chunk)
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
profile = btrfs_system_alloc_profile(fs_info);
else
profile = btrfs_metadata_alloc_profile(fs_info);
@@ -227,8 +226,7 @@ again:
/* Check and see if our ticket can be satisified now. */
if ((used + ticket->bytes <= space_info->total_bytes) ||
- can_overcommit(fs_info, space_info, ticket->bytes, flush,
- false)) {
+ can_overcommit(fs_info, space_info, ticket->bytes, flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
space_info,
ticket->bytes);
@@ -626,8 +624,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- bool system_chunk)
+ struct btrfs_space_info *space_info)
{
struct reserve_ticket *ticket;
u64 used;
@@ -643,13 +640,12 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
if (can_overcommit(fs_info, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+ BTRFS_RESERVE_FLUSH_ALL))
return 0;
used = btrfs_space_info_used(space_info, true);
- if (can_overcommit(fs_info, space_info, SZ_1M,
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+ if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
@@ -665,7 +661,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
- u64 used, bool system_chunk)
+ u64 used)
{
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
@@ -673,8 +669,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
return 0;
- if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
- system_chunk))
+ if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
return 0;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
@@ -765,8 +760,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
- false);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
if (!to_reclaim) {
space_info->flush = 0;
spin_unlock(&space_info->lock);
@@ -785,8 +779,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
return;
}
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
- space_info,
- false);
+ space_info);
if (last_tickets_id == space_info->tickets_id) {
flush_state++;
} else {
@@ -858,8 +851,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
int flush_state;
spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
- false);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
if (!to_reclaim) {
spin_unlock(&space_info->lock);
return;
@@ -990,8 +982,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush,
- bool system_chunk)
+ enum btrfs_reserve_flush_enum flush)
{
struct reserve_ticket ticket;
u64 used;
@@ -1013,8 +1004,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
*/
if (!pending_tickets &&
((used + orig_bytes <= space_info->total_bytes) ||
- can_overcommit(fs_info, space_info, orig_bytes, flush,
- system_chunk))) {
+ can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
ret = 0;
@@ -1054,8 +1044,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
- need_do_async_reclaim(fs_info, space_info,
- used, system_chunk) &&
+ need_do_async_reclaim(fs_info, space_info, used) &&
!work_busy(&fs_info->async_reclaim_work)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
@@ -1092,10 +1081,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
- bool system_chunk = (root == fs_info->chunk_root);
ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
- orig_bytes, flush, system_chunk);
+ orig_bytes, flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f452a94abdc3..a906315efd19 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -46,6 +46,7 @@
#include "sysfs.h"
#include "tests/btrfs-tests.h"
#include "block-group.h"
+#include "discard.h"
#include "qgroup.h"
#define CREATE_TRACE_POINTS
@@ -146,6 +147,8 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
if (sb_rdonly(sb))
return;
+ btrfs_discard_stop(fs_info);
+
/* btrfs handle error by forcing the filesystem readonly */
sb->s_flags |= SB_RDONLY;
btrfs_info(fs_info, "forced readonly");
@@ -313,6 +316,7 @@ enum {
Opt_datasum, Opt_nodatasum,
Opt_defrag, Opt_nodefrag,
Opt_discard, Opt_nodiscard,
+ Opt_discard_mode,
Opt_nologreplay,
Opt_norecovery,
Opt_ratio,
@@ -375,6 +379,7 @@ static const match_table_t tokens = {
{Opt_defrag, "autodefrag"},
{Opt_nodefrag, "noautodefrag"},
{Opt_discard, "discard"},
+ {Opt_discard_mode, "discard=%s"},
{Opt_nodiscard, "nodiscard"},
{Opt_nologreplay, "nologreplay"},
{Opt_norecovery, "norecovery"},
@@ -695,12 +700,26 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
info->metadata_ratio);
break;
case Opt_discard:
- btrfs_set_and_info(info, DISCARD,
- "turning on discard");
+ case Opt_discard_mode:
+ if (token == Opt_discard ||
+ strcmp(args[0].from, "sync") == 0) {
+ btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
+ btrfs_set_and_info(info, DISCARD_SYNC,
+ "turning on sync discard");
+ } else if (strcmp(args[0].from, "async") == 0) {
+ btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
+ btrfs_set_and_info(info, DISCARD_ASYNC,
+ "turning on async discard");
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
break;
case Opt_nodiscard:
- btrfs_clear_and_info(info, DISCARD,
+ btrfs_clear_and_info(info, DISCARD_SYNC,
"turning off discard");
+ btrfs_clear_and_info(info, DISCARD_ASYNC,
+ "turning off async discard");
break;
case Opt_space_cache:
case Opt_space_cache_version:
@@ -1322,8 +1341,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",nologreplay");
if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
- if (btrfs_test_opt(info, DISCARD))
+ if (btrfs_test_opt(info, DISCARD_SYNC))
seq_puts(seq, ",discard");
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ seq_puts(seq, ",discard=async");
if (!(info->sb->s_flags & SB_POSIXACL))
seq_puts(seq, ",noacl");
if (btrfs_test_opt(info, SPACE_CACHE))
@@ -1713,6 +1734,14 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
btrfs_cleanup_defrag_inodes(fs_info);
}
+ /* If we toggled discard async */
+ if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
+ btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_resume(fs_info);
+ else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
+ !btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_cleanup(fs_info);
+
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}
@@ -1760,6 +1789,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
*/
cancel_work_sync(&fs_info->async_reclaim_work);
+ btrfs_discard_cleanup(fs_info);
+
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
/* avoid complains from lockdep et al. */
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5ebbe8a5ee76..7436422194da 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -12,6 +12,7 @@
#include <crypto/hash.h>
#include "ctree.h"
+#include "discard.h"
#include "disk-io.h"
#include "transaction.h"
#include "sysfs.h"
@@ -339,11 +340,177 @@ static const struct attribute_group btrfs_static_feature_attr_group = {
#ifdef CONFIG_BTRFS_DEBUG
/*
+ * Discard statistics and tunables
+ */
+#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent)
+
+static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%lld\n",
+ atomic64_read(&fs_info->discard_ctl.discardable_bytes));
+}
+BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
+
+static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n",
+ atomic_read(&fs_info->discard_ctl.discardable_extents));
+}
+BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
+
+static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%lld\n",
+ fs_info->discard_ctl.discard_bitmap_bytes);
+}
+BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
+
+static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%lld\n",
+ atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
+}
+BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
+
+static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%lld\n",
+ fs_info->discard_ctl.discard_extent_bytes);
+}
+BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
+
+static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.iops_limit));
+}
+
+static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ u32 iops_limit;
+ int ret;
+
+ ret = kstrtou32(buf, 10, &iops_limit);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(discard_ctl->iops_limit, iops_limit);
+
+ return len;
+}
+BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show,
+ btrfs_discard_iops_limit_store);
+
+static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.kbps_limit));
+}
+
+static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ u32 kbps_limit;
+ int ret;
+
+ ret = kstrtou32(buf, 10, &kbps_limit);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit);
+
+ return len;
+}
+BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show,
+ btrfs_discard_kbps_limit_store);
+
+static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ READ_ONCE(fs_info->discard_ctl.max_discard_size));
+}
+
+static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ u64 max_discard_size;
+ int ret;
+
+ ret = kstrtou64(buf, 10, &max_discard_size);
+ if (ret)
+ return -EINVAL;
+
+ WRITE_ONCE(discard_ctl->max_discard_size, max_discard_size);
+
+ return len;
+}
+BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
+ btrfs_discard_max_discard_size_store);
+
+static const struct attribute *discard_debug_attrs[] = {
+ BTRFS_ATTR_PTR(discard, discardable_bytes),
+ BTRFS_ATTR_PTR(discard, discardable_extents),
+ BTRFS_ATTR_PTR(discard, discard_bitmap_bytes),
+ BTRFS_ATTR_PTR(discard, discard_bytes_saved),
+ BTRFS_ATTR_PTR(discard, discard_extent_bytes),
+ BTRFS_ATTR_PTR(discard, iops_limit),
+ BTRFS_ATTR_PTR(discard, kbps_limit),
+ BTRFS_ATTR_PTR(discard, max_discard_size),
+ NULL,
+};
+
+/*
* Runtime debugging exported via sysfs
*
* /sys/fs/btrfs/debug - applies to module or all filesystems
* /sys/fs/btrfs/UUID - applies only to the given filesystem
*/
+static const struct attribute *btrfs_debug_mount_attrs[] = {
+ NULL,
+};
+
static struct attribute *btrfs_debug_feature_attrs[] = {
NULL
};
@@ -734,10 +901,10 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
{
- if (fs_devs->device_dir_kobj) {
- kobject_del(fs_devs->device_dir_kobj);
- kobject_put(fs_devs->device_dir_kobj);
- fs_devs->device_dir_kobj = NULL;
+ if (fs_devs->devices_kobj) {
+ kobject_del(fs_devs->devices_kobj);
+ kobject_put(fs_devs->devices_kobj);
+ fs_devs->devices_kobj = NULL;
}
if (fs_devs->fsid_kobj.state_initialized) {
@@ -771,6 +938,19 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
+#ifdef CONFIG_BTRFS_DEBUG
+ if (fs_info->discard_debug_kobj) {
+ sysfs_remove_files(fs_info->discard_debug_kobj,
+ discard_debug_attrs);
+ kobject_del(fs_info->discard_debug_kobj);
+ kobject_put(fs_info->discard_debug_kobj);
+ }
+ if (fs_info->debug_kobj) {
+ sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
+ kobject_del(fs_info->debug_kobj);
+ kobject_put(fs_info->debug_kobj);
+ }
+#endif
addrm_unknown_feature_attrs(fs_info, false);
sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
@@ -969,46 +1149,120 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct hd_struct *disk;
struct kobject *disk_kobj;
- if (!fs_devices->device_dir_kobj)
+ if (!fs_devices->devices_kobj)
return -EINVAL;
- if (one_device && one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ if (one_device) {
+ if (one_device->bdev) {
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+ sysfs_remove_link(fs_devices->devices_kobj,
+ disk_kobj->name);
+ }
- sysfs_remove_link(fs_devices->device_dir_kobj,
- disk_kobj->name);
- }
+ kobject_del(&one_device->devid_kobj);
+ kobject_put(&one_device->devid_kobj);
+
+ wait_for_completion(&one_device->kobj_unregister);
- if (one_device)
return 0;
+ }
- list_for_each_entry(one_device,
- &fs_devices->devices, dev_list) {
- if (!one_device->bdev)
- continue;
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ list_for_each_entry(one_device, &fs_devices->devices, dev_list) {
+
+ if (one_device->bdev) {
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+ sysfs_remove_link(fs_devices->devices_kobj,
+ disk_kobj->name);
+ }
+ kobject_del(&one_device->devid_kobj);
+ kobject_put(&one_device->devid_kobj);
- sysfs_remove_link(fs_devices->device_dir_kobj,
- disk_kobj->name);
+ wait_for_completion(&one_device->kobj_unregister);
}
return 0;
}
-int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
+static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
{
- if (!fs_devs->device_dir_kobj)
- fs_devs->device_dir_kobj = kobject_create_and_add("devices",
- &fs_devs->fsid_kobj);
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
- if (!fs_devs->device_dir_kobj)
- return -ENOMEM;
+ val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- return 0;
+ return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
+
+static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show);
+
+static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
+
+static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ int val;
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
+
+static struct attribute *devid_attrs[] = {
+ BTRFS_ATTR_PTR(devid, in_fs_metadata),
+ BTRFS_ATTR_PTR(devid, missing),
+ BTRFS_ATTR_PTR(devid, replace_target),
+ BTRFS_ATTR_PTR(devid, writeable),
+ NULL
+};
+ATTRIBUTE_GROUPS(devid);
+
+static void btrfs_release_devid_kobj(struct kobject *kobj)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ memset(&device->devid_kobj, 0, sizeof(struct kobject));
+ complete(&device->kobj_unregister);
}
+static struct kobj_type devid_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = devid_groups,
+ .release = btrfs_release_devid_kobj,
+};
+
int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
@@ -1016,22 +1270,31 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *dev;
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
- struct hd_struct *disk;
- struct kobject *disk_kobj;
-
- if (!dev->bdev)
- continue;
if (one_device && one_device != dev)
continue;
- disk = dev->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ if (dev->bdev) {
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ disk = dev->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_devices->device_dir_kobj,
- disk_kobj, disk_kobj->name);
- if (error)
+ error = sysfs_create_link(fs_devices->devices_kobj,
+ disk_kobj, disk_kobj->name);
+ if (error)
+ break;
+ }
+
+ init_completion(&dev->kobj_unregister);
+ error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype,
+ fs_devices->devices_kobj, "%llu",
+ dev->devid);
+ if (error) {
+ kobject_put(&dev->devid_kobj);
break;
+ }
}
return error;
@@ -1063,27 +1326,49 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
"sysfs: failed to create fsid for sprout");
}
+void btrfs_sysfs_update_devid(struct btrfs_device *device)
+{
+ char tmp[24];
+
+ snprintf(tmp, sizeof(tmp), "%llu", device->devid);
+
+ if (kobject_rename(&device->devid_kobj, tmp))
+ btrfs_warn(device->fs_devices->fs_info,
+ "sysfs: failed to update devid for %llu",
+ device->devid);
+}
+
/* /sys/fs/btrfs/ entry */
static struct kset *btrfs_kset;
/*
+ * Creates:
+ * /sys/fs/btrfs/UUID
+ *
* Can be called by the device discovery thread.
- * And parent can be specified for seed device
*/
-int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
- struct kobject *parent)
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
{
int error;
init_completion(&fs_devs->kobj_unregister);
fs_devs->fsid_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_devs->fsid_kobj,
- &btrfs_ktype, parent, "%pU", fs_devs->fsid);
+ error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
+ "%pU", fs_devs->fsid);
if (error) {
kobject_put(&fs_devs->fsid_kobj);
return error;
}
+ fs_devs->devices_kobj = kobject_create_and_add("devices",
+ &fs_devs->fsid_kobj);
+ if (!fs_devs->devices_kobj) {
+ btrfs_err(fs_devs->fs_info,
+ "failed to init sysfs device interface");
+ kobject_put(&fs_devs->fsid_kobj);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -1111,8 +1396,26 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
goto failure;
#ifdef CONFIG_BTRFS_DEBUG
- error = sysfs_create_group(fsid_kobj,
- &btrfs_debug_feature_attr_group);
+ fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj);
+ if (!fs_info->debug_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
+ if (error)
+ goto failure;
+
+ /* Discard directory */
+ fs_info->discard_debug_kobj = kobject_create_and_add("discard",
+ fs_info->debug_kobj);
+ if (!fs_info->discard_debug_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->discard_debug_kobj,
+ discard_debug_attrs);
if (error)
goto failure;
#endif
@@ -1209,6 +1512,9 @@ void __cold btrfs_exit_sysfs(void)
sysfs_unmerge_group(&btrfs_kset->kobj,
&btrfs_static_feature_attr_group);
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+#ifdef CONFIG_BTRFS_DEBUG
+ sysfs_remove_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
+#endif
kset_unregister(btrfs_kset);
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index e10c3adfc30f..c68582add92e 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -18,9 +18,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
-int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
- struct kobject *parent);
-int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
const u8 *fsid);
@@ -36,5 +34,6 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache);
int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info);
void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
+void btrfs_sysfs_update_devid(struct btrfs_device *device);
#endif
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index a7aca4141788..c12b91ff5f56 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -86,6 +86,27 @@ static void btrfs_destroy_test_fs(void)
unregister_filesystem(&test_type);
}
+struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_device *dev;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ INIT_LIST_HEAD(&dev->dev_list);
+ list_add(&dev->dev_list, &fs_info->fs_devices->devices);
+
+ return dev;
+}
+
+static void btrfs_free_dummy_device(struct btrfs_device *dev)
+{
+ extent_io_tree_release(&dev->alloc_state);
+ kfree(dev);
+}
+
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
{
struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
@@ -132,12 +153,14 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_LIST_HEAD(&fs_info->fs_devices->devices);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
+ extent_map_tree_init(&fs_info->mapping_tree);
fs_info->pinned_extents = &fs_info->freed_extents[0];
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
@@ -150,6 +173,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
struct radix_tree_iter iter;
void **slot;
+ struct btrfs_device *dev, *tmp;
if (!fs_info)
return;
@@ -180,6 +204,11 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->buffer_lock);
+ btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
+ dev_list) {
+ btrfs_free_dummy_device(dev);
+ }
btrfs_free_qgroup_config(fs_info);
btrfs_free_fs_roots(fs_info);
cleanup_srcu_struct(&fs_info->subvol_srcu);
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 9e52527357d8..7a2d7ffbe30e 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -46,6 +46,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt
void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
+struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info);
#else
static inline int btrfs_run_sanity_tests(void)
{
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 4a7f796c9900..57379e96ccc9 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -6,6 +6,9 @@
#include <linux/types.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../block-group.h"
static void free_extent_map_tree(struct extent_map_tree *em_tree)
{
@@ -437,11 +440,153 @@ static int test_case_4(struct btrfs_fs_info *fs_info,
return ret;
}
+struct rmap_test_vector {
+ u64 raid_type;
+ u64 physical_start;
+ u64 data_stripe_size;
+ u64 num_data_stripes;
+ u64 num_stripes;
+ /* Assume we won't have more than 5 physical stripes */
+ u64 data_stripe_phys_start[5];
+ bool expected_mapped_addr;
+ /* Physical to logical addresses */
+ u64 mapped_logical[5];
+};
+
+static int test_rmap_block(struct btrfs_fs_info *fs_info,
+ struct rmap_test_vector *test)
+{
+ struct extent_map *em;
+ struct map_lookup *map = NULL;
+ u64 *logical = NULL;
+ int i, out_ndaddrs, out_stripe_len;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL);
+ if (!map) {
+ kfree(em);
+ test_std_err(TEST_ALLOC_EXTENT_MAP);
+ return -ENOMEM;
+ }
+
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
+ /* Start at 4GiB logical address */
+ em->start = SZ_4G;
+ em->len = test->data_stripe_size * test->num_data_stripes;
+ em->block_len = em->len;
+ em->orig_block_len = test->data_stripe_size;
+ em->map_lookup = map;
+
+ map->num_stripes = test->num_stripes;
+ map->stripe_len = BTRFS_STRIPE_LEN;
+ map->type = test->raid_type;
+
+ for (i = 0; i < map->num_stripes; i++) {
+ struct btrfs_device *dev = btrfs_alloc_dummy_device(fs_info);
+
+ if (IS_ERR(dev)) {
+ test_err("cannot allocate device");
+ ret = PTR_ERR(dev);
+ goto out;
+ }
+ map->stripes[i].dev = dev;
+ map->stripes[i].physical = test->data_stripe_phys_start[i];
+ }
+
+ write_lock(&fs_info->mapping_tree.lock);
+ ret = add_extent_mapping(&fs_info->mapping_tree, em, 0);
+ write_unlock(&fs_info->mapping_tree.lock);
+ if (ret) {
+ test_err("error adding block group mapping to mapping tree");
+ goto out_free;
+ }
+
+ ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+ &logical, &out_ndaddrs, &out_stripe_len);
+ if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
+ test_err("didn't rmap anything but expected %d",
+ test->expected_mapped_addr);
+ goto out;
+ }
+
+ if (out_stripe_len != BTRFS_STRIPE_LEN) {
+ test_err("calculated stripe length doesn't match");
+ goto out;
+ }
+
+ if (out_ndaddrs != test->expected_mapped_addr) {
+ for (i = 0; i < out_ndaddrs; i++)
+ test_msg("mapped %llu", logical[i]);
+ test_err("unexpected number of mapped addresses: %d", out_ndaddrs);
+ goto out;
+ }
+
+ for (i = 0; i < out_ndaddrs; i++) {
+ if (logical[i] != test->mapped_logical[i]) {
+ test_err("unexpected logical address mapped");
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ write_lock(&fs_info->mapping_tree.lock);
+ remove_extent_mapping(&fs_info->mapping_tree, em);
+ write_unlock(&fs_info->mapping_tree.lock);
+ /* For us */
+ free_extent_map(em);
+out_free:
+ /* For the tree */
+ free_extent_map(em);
+ kfree(logical);
+ return ret;
+}
+
int btrfs_test_extent_map(void)
{
struct btrfs_fs_info *fs_info = NULL;
struct extent_map_tree *em_tree;
- int ret = 0;
+ int ret = 0, i;
+ struct rmap_test_vector rmap_tests[] = {
+ {
+ /*
+ * Test a chunk with 2 data stripes one of which
+ * interesects the physical address of the super block
+ * is correctly recognised.
+ */
+ .raid_type = BTRFS_BLOCK_GROUP_RAID1,
+ .physical_start = SZ_64M - SZ_4M,
+ .data_stripe_size = SZ_256M,
+ .num_data_stripes = 2,
+ .num_stripes = 2,
+ .data_stripe_phys_start =
+ {SZ_64M - SZ_4M, SZ_64M - SZ_4M + SZ_256M},
+ .expected_mapped_addr = true,
+ .mapped_logical= {SZ_4G + SZ_4M}
+ },
+ {
+ /*
+ * Test that out-of-range physical addresses are
+ * ignored
+ */
+
+ /* SINGLE chunk type */
+ .raid_type = 0,
+ .physical_start = SZ_4G,
+ .data_stripe_size = SZ_256M,
+ .num_data_stripes = 1,
+ .num_stripes = 1,
+ .data_stripe_phys_start = {SZ_256M},
+ .expected_mapped_addr = false,
+ .mapped_logical = {0}
+ }
+ };
test_msg("running extent_map tests");
@@ -474,6 +619,13 @@ int btrfs_test_extent_map(void)
goto out;
ret = test_case_4(fs_info, em_tree);
+ test_msg("running rmap tests");
+ for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) {
+ ret = test_rmap_block(fs_info, &rmap_tests[i]);
+ if (ret)
+ goto out;
+ }
+
out:
kfree(em_tree);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 09ecf7dc7b08..24a8c714f56c 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -263,7 +263,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
/* First with no extents */
BTRFS_I(inode)->root = root;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize);
if (IS_ERR(em)) {
em = NULL;
test_err("got an error when we shouldn't have");
@@ -283,7 +283,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
*/
setup_file_extents(root, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -333,7 +333,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -356,7 +356,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Regular extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -384,7 +384,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* The next 3 are split extents */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -413,7 +413,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -435,7 +435,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -469,7 +469,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Prealloc extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -498,7 +498,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* The next 3 are a half written prealloc extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -528,7 +528,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -561,7 +561,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -596,7 +596,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Now for the compressed extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -630,7 +630,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* Split compressed extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -665,7 +665,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -727,8 +727,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
free_extent_map(em);
/* A hole between regular extents but no hole extent */
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6,
- sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -755,7 +754,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -788,7 +787,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
offset = em->start + em->len;
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -872,7 +871,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
insert_inode_item_key(root);
insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize,
sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
@@ -894,8 +893,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
}
free_extent_map(em);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize,
- 2 * sectorsize, 0);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cfc08ef9b876..33dcc88b428a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -147,13 +147,14 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
}
}
-static noinline void switch_commit_roots(struct btrfs_transaction *trans)
+static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
{
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
down_write(&fs_info->commit_root_sem);
- list_for_each_entry_safe(root, tmp, &trans->switch_commits,
+ list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
dirty_list) {
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
@@ -165,16 +166,17 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
}
/* We can free old roots now. */
- spin_lock(&trans->dropped_roots_lock);
- while (!list_empty(&trans->dropped_roots)) {
- root = list_first_entry(&trans->dropped_roots,
+ spin_lock(&cur_trans->dropped_roots_lock);
+ while (!list_empty(&cur_trans->dropped_roots)) {
+ root = list_first_entry(&cur_trans->dropped_roots,
struct btrfs_root, root_list);
list_del_init(&root->root_list);
- spin_unlock(&trans->dropped_roots_lock);
+ spin_unlock(&cur_trans->dropped_roots_lock);
+ btrfs_free_log(trans, root);
btrfs_drop_and_free_fs_root(fs_info, root);
- spin_lock(&trans->dropped_roots_lock);
+ spin_lock(&cur_trans->dropped_roots_lock);
}
- spin_unlock(&trans->dropped_roots_lock);
+ spin_unlock(&cur_trans->dropped_roots_lock);
up_write(&fs_info->commit_root_sem);
}
@@ -1421,7 +1423,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
ret = commit_cowonly_roots(trans);
if (ret)
goto out;
- switch_commit_roots(trans->transaction);
+ switch_commit_roots(trans);
ret = btrfs_write_and_wait_transaction(trans);
if (ret)
btrfs_handle_fs_error(fs_info, ret,
@@ -2013,6 +2015,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ASSERT(refcount_read(&trans->use_count) == 1);
+ /*
+ * Some places just start a transaction to commit it. We need to make
+ * sure that if this commit fails that the abort code actually marks the
+ * transaction as failed, so set trans->dirty to make the abort code do
+ * the right thing.
+ */
+ trans->dirty = true;
+
/* Stop the commit early if ->aborted is set */
if (unlikely(READ_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
@@ -2301,7 +2311,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
- switch_commit_roots(cur_trans);
+ switch_commit_roots(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 97f3520b8d98..a92f8a6dd192 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -373,6 +373,104 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
return 0;
}
+/* Inode item error output has the same format as dir_item_err() */
+#define inode_item_err(eb, slot, fmt, ...) \
+ dir_item_err(eb, slot, fmt, __VA_ARGS__)
+
+static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
+{
+ struct btrfs_key item_key;
+ bool is_inode_item;
+
+ btrfs_item_key_to_cpu(leaf, &item_key, slot);
+ is_inode_item = (item_key.type == BTRFS_INODE_ITEM_KEY);
+
+ /* For XATTR_ITEM, location key should be all 0 */
+ if (item_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (key->type != 0 || key->objectid != 0 || key->offset != 0)
+ return -EUCLEAN;
+ return 0;
+ }
+
+ if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
+ key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
+ key->objectid != BTRFS_FREE_INO_OBJECTID) {
+ if (is_inode_item) {
+ generic_err(leaf, slot,
+ "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
+ key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
+ BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_LAST_FREE_OBJECTID,
+ BTRFS_FREE_INO_OBJECTID);
+ } else {
+ dir_item_err(leaf, slot,
+"invalid location key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
+ key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
+ BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_LAST_FREE_OBJECTID,
+ BTRFS_FREE_INO_OBJECTID);
+ }
+ return -EUCLEAN;
+ }
+ if (key->offset != 0) {
+ if (is_inode_item)
+ inode_item_err(leaf, slot,
+ "invalid key offset: has %llu expect 0",
+ key->offset);
+ else
+ dir_item_err(leaf, slot,
+ "invalid location key offset:has %llu expect 0",
+ key->offset);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
+{
+ struct btrfs_key item_key;
+ bool is_root_item;
+
+ btrfs_item_key_to_cpu(leaf, &item_key, slot);
+ is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY);
+
+ /* No such tree id */
+ if (key->objectid == 0) {
+ if (is_root_item)
+ generic_err(leaf, slot, "invalid root id 0");
+ else
+ dir_item_err(leaf, slot,
+ "invalid location key root id 0");
+ return -EUCLEAN;
+ }
+
+ /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
+ if (!is_fstree(key->objectid) && !is_root_item) {
+ dir_item_err(leaf, slot,
+ "invalid location key objectid, have %llu expect [%llu, %llu]",
+ key->objectid, BTRFS_FIRST_FREE_OBJECTID,
+ BTRFS_LAST_FREE_OBJECTID);
+ return -EUCLEAN;
+ }
+
+ /*
+ * ROOT_ITEM with non-zero offset means this is a snapshot, created at
+ * @offset transid.
+ * Furthermore, for location key in DIR_ITEM, its offset is always -1.
+ *
+ * So here we only check offset for reloc tree whose key->offset must
+ * be a valid tree.
+ */
+ if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) {
+ generic_err(leaf, slot, "invalid root id 0 for reloc tree");
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
static int check_dir_item(struct extent_buffer *leaf,
struct btrfs_key *key, struct btrfs_key *prev_key,
int slot)
@@ -386,12 +484,14 @@ static int check_dir_item(struct extent_buffer *leaf,
return -EUCLEAN;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
while (cur < item_size) {
+ struct btrfs_key location_key;
u32 name_len;
u32 data_len;
u32 max_name_len;
u32 total_size;
u32 name_hash;
u8 dir_type;
+ int ret;
/* header itself should not cross item boundary */
if (cur + sizeof(*di) > item_size) {
@@ -401,6 +501,25 @@ static int check_dir_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
+ /* Location key check */
+ btrfs_dir_item_key_to_cpu(leaf, di, &location_key);
+ if (location_key.type == BTRFS_ROOT_ITEM_KEY) {
+ ret = check_root_key(leaf, &location_key, slot);
+ if (ret < 0)
+ return ret;
+ } else if (location_key.type == BTRFS_INODE_ITEM_KEY ||
+ location_key.type == 0) {
+ ret = check_inode_key(leaf, &location_key, slot);
+ if (ret < 0)
+ return ret;
+ } else {
+ dir_item_err(leaf, slot,
+ "invalid location key type, have %u, expect %u or %u",
+ location_key.type, BTRFS_ROOT_ITEM_KEY,
+ BTRFS_INODE_ITEM_KEY);
+ return -EUCLEAN;
+ }
+
/* dir type check */
dir_type = btrfs_dir_type(leaf, di);
if (dir_type >= BTRFS_FT_MAX) {
@@ -738,6 +857,44 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
return 0;
}
+/*
+ * Enhanced version of chunk item checker.
+ *
+ * The common btrfs_check_chunk_valid() doesn't check item size since it needs
+ * to work on super block sys_chunk_array which doesn't have full item ptr.
+ */
+static int check_leaf_chunk_item(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_key *key, int slot)
+{
+ int num_stripes;
+
+ if (btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) {
+ chunk_err(leaf, chunk, key->offset,
+ "invalid chunk item size: have %u expect [%zu, %u)",
+ btrfs_item_size_nr(leaf, slot),
+ sizeof(struct btrfs_chunk),
+ BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ return -EUCLEAN;
+ }
+
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ /* Let btrfs_check_chunk_valid() handle this error type */
+ if (num_stripes == 0)
+ goto out;
+
+ if (btrfs_chunk_item_size(num_stripes) !=
+ btrfs_item_size_nr(leaf, slot)) {
+ chunk_err(leaf, chunk, key->offset,
+ "invalid chunk item size: have %u expect %lu",
+ btrfs_item_size_nr(leaf, slot),
+ btrfs_chunk_item_size(num_stripes));
+ return -EUCLEAN;
+ }
+out:
+ return btrfs_check_chunk_valid(leaf, chunk, key->offset);
+}
+
__printf(3, 4)
__cold
static void dev_item_err(const struct extent_buffer *eb, int slot,
@@ -801,7 +958,7 @@ static int check_dev_item(struct extent_buffer *leaf,
}
/* Inode item error output has the same format as dir_item_err() */
-#define inode_item_err(fs_info, eb, slot, fmt, ...) \
+#define inode_item_err(eb, slot, fmt, ...) \
dir_item_err(eb, slot, fmt, __VA_ARGS__)
static int check_inode_item(struct extent_buffer *leaf,
@@ -812,30 +969,17 @@ static int check_inode_item(struct extent_buffer *leaf,
u64 super_gen = btrfs_super_generation(fs_info->super_copy);
u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
u32 mode;
+ int ret;
+
+ ret = check_inode_key(leaf, key, slot);
+ if (ret < 0)
+ return ret;
- if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
- key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
- key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
- key->objectid != BTRFS_FREE_INO_OBJECTID) {
- generic_err(leaf, slot,
- "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
- key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
- BTRFS_FIRST_FREE_OBJECTID,
- BTRFS_LAST_FREE_OBJECTID,
- BTRFS_FREE_INO_OBJECTID);
- return -EUCLEAN;
- }
- if (key->offset != 0) {
- inode_item_err(fs_info, leaf, slot,
- "invalid key offset: has %llu expect 0",
- key->offset);
- return -EUCLEAN;
- }
iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
/* Here we use super block generation + 1 to handle log tree */
if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) {
- inode_item_err(fs_info, leaf, slot,
+ inode_item_err(leaf, slot,
"invalid inode generation: has %llu expect (0, %llu]",
btrfs_inode_generation(leaf, iitem),
super_gen + 1);
@@ -843,7 +987,7 @@ static int check_inode_item(struct extent_buffer *leaf,
}
/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
- inode_item_err(fs_info, leaf, slot,
+ inode_item_err(leaf, slot,
"invalid inode generation: has %llu expect [0, %llu]",
btrfs_inode_transid(leaf, iitem), super_gen + 1);
return -EUCLEAN;
@@ -856,7 +1000,7 @@ static int check_inode_item(struct extent_buffer *leaf,
*/
mode = btrfs_inode_mode(leaf, iitem);
if (mode & ~valid_mask) {
- inode_item_err(fs_info, leaf, slot,
+ inode_item_err(leaf, slot,
"unknown mode bit detected: 0x%x",
mode & ~valid_mask);
return -EUCLEAN;
@@ -869,20 +1013,20 @@ static int check_inode_item(struct extent_buffer *leaf,
*/
if (!has_single_bit_set(mode & S_IFMT)) {
if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) {
- inode_item_err(fs_info, leaf, slot,
+ inode_item_err(leaf, slot,
"invalid mode: has 0%o expect valid S_IF* bit(s)",
mode & S_IFMT);
return -EUCLEAN;
}
}
if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) {
- inode_item_err(fs_info, leaf, slot,
+ inode_item_err(leaf, slot,
"invalid nlink: has %u expect no more than 1 for dir",
btrfs_inode_nlink(leaf, iitem));
return -EUCLEAN;
}
if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) {
- inode_item_err(fs_info, leaf, slot,
+ inode_item_err(leaf, slot,
"unknown flags detected: 0x%llx",
btrfs_inode_flags(leaf, iitem) &
~BTRFS_INODE_FLAG_MASK);
@@ -898,22 +1042,11 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
struct btrfs_root_item ri;
const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
BTRFS_ROOT_SUBVOL_DEAD;
+ int ret;
- /* No such tree id */
- if (key->objectid == 0) {
- generic_err(leaf, slot, "invalid root id 0");
- return -EUCLEAN;
- }
-
- /*
- * Some older kernel may create ROOT_ITEM with non-zero offset, so here
- * we only check offset for reloc tree whose key->offset must be a
- * valid tree.
- */
- if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) {
- generic_err(leaf, slot, "invalid root id 0 for reloc tree");
- return -EUCLEAN;
- }
+ ret = check_root_key(leaf, key, slot);
+ if (ret < 0)
+ return ret;
if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
generic_err(leaf, slot,
@@ -1302,8 +1435,8 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
return 0;
}
-#define inode_ref_err(fs_info, eb, slot, fmt, args...) \
- inode_item_err(fs_info, eb, slot, fmt, ##args)
+#define inode_ref_err(eb, slot, fmt, args...) \
+ inode_item_err(eb, slot, fmt, ##args)
static int check_inode_ref(struct extent_buffer *leaf,
struct btrfs_key *key, struct btrfs_key *prev_key,
int slot)
@@ -1316,7 +1449,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
return -EUCLEAN;
/* namelen can't be 0, so item_size == sizeof() is also invalid */
if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) {
- inode_ref_err(fs_info, leaf, slot,
+ inode_ref_err(leaf, slot,
"invalid item size, have %u expect (%zu, %u)",
btrfs_item_size_nr(leaf, slot),
sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
@@ -1329,7 +1462,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
u16 namelen;
if (ptr + sizeof(iref) > end) {
- inode_ref_err(fs_info, leaf, slot,
+ inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
ptr, end, sizeof(iref));
return -EUCLEAN;
@@ -1338,7 +1471,7 @@ static int check_inode_ref(struct extent_buffer *leaf,
iref = (struct btrfs_inode_ref *)ptr;
namelen = btrfs_inode_ref_name_len(leaf, iref);
if (ptr + sizeof(*iref) + namelen > end) {
- inode_ref_err(fs_info, leaf, slot,
+ inode_ref_err(leaf, slot,
"inode ref overflow, ptr %lu end %lu namelen %u",
ptr, end, namelen);
return -EUCLEAN;
@@ -1384,7 +1517,7 @@ static int check_leaf_item(struct extent_buffer *leaf,
break;
case BTRFS_CHUNK_ITEM_KEY:
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
- ret = btrfs_check_chunk_valid(leaf, chunk, key->offset);
+ ret = check_leaf_chunk_item(leaf, chunk, key, slot);
break;
case BTRFS_DEV_ITEM_KEY:
ret = check_dev_item(leaf, key, slot);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d3f115909ff0..7dd7552f53a4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2674,14 +2674,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
u32 blocksize;
int ret = 0;
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
while (*level > 0) {
struct btrfs_key first_key;
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
cur = path->nodes[*level];
WARN_ON(btrfs_header_level(cur) != *level);
@@ -2732,9 +2727,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root_owner !=
BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(
- fs_info, bytenr,
- blocksize);
+ ret = btrfs_pin_reserved_extent(fs_info,
+ bytenr, blocksize);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2749,7 +2743,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return ret;
}
- WARN_ON(*level <= 0);
if (path->nodes[*level-1])
free_extent_buffer(path->nodes[*level-1]);
path->nodes[*level-1] = next;
@@ -2757,9 +2750,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level] = 0;
cond_resched();
}
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
cond_resched();
@@ -2815,8 +2805,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
}
WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(
- fs_info,
+ ret = btrfs_pin_reserved_extent(fs_info,
path->nodes[*level]->start,
path->nodes[*level]->len);
if (ret)
@@ -2896,10 +2885,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
clear_extent_buffer_dirty(next);
}
- WARN_ON(log->root_key.objectid !=
- BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_and_pin_reserved_extent(fs_info,
- next->start, next->len);
+ ret = btrfs_pin_reserved_extent(fs_info, next->start,
+ next->len);
if (ret)
goto out;
}
@@ -3935,7 +3922,7 @@ static int log_csums(struct btrfs_trans_handle *trans,
static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *dst_path,
- struct btrfs_path *src_path, u64 *last_extent,
+ struct btrfs_path *src_path,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
@@ -3946,7 +3933,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item *extent;
struct btrfs_inode_item *inode_item;
struct extent_buffer *src = src_path->nodes[0];
- struct btrfs_key first_key, last_key, key;
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -3954,9 +3940,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int i;
struct list_head ordered_sums;
int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
- bool has_extents = false;
- bool need_find_last_extent = true;
- bool done = false;
INIT_LIST_HEAD(&ordered_sums);
@@ -3965,8 +3948,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
if (!ins_data)
return -ENOMEM;
- first_key.objectid = (u64)-1;
-
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
@@ -3987,9 +3968,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_offset = btrfs_item_ptr_offset(src, start_slot + i);
- if (i == nr - 1)
- last_key = ins_keys[i];
-
if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
inode_item = btrfs_item_ptr(dst_path->nodes[0],
dst_path->slots[0],
@@ -4003,20 +3981,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_offset, ins_sizes[i]);
}
- /*
- * We set need_find_last_extent here in case we know we were
- * processing other items and then walk into the first extent in
- * the inode. If we don't hit an extent then nothing changes,
- * we'll do the last search the next time around.
- */
- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
- has_extents = true;
- if (first_key.objectid == (u64)-1)
- first_key = ins_keys[i];
- } else {
- need_find_last_extent = false;
- }
-
/* take a reference on file data extents so that truncates
* or deletes of this inode don't have to relog the inode
* again
@@ -4082,167 +4046,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
kfree(sums);
}
- if (!has_extents)
- return ret;
-
- if (need_find_last_extent && *last_extent == first_key.offset) {
- /*
- * We don't have any leafs between our current one and the one
- * we processed before that can have file extent items for our
- * inode (and have a generation number smaller than our current
- * transaction id).
- */
- need_find_last_extent = false;
- }
-
- /*
- * Because we use btrfs_search_forward we could skip leaves that were
- * not modified and then assume *last_extent is valid when it really
- * isn't. So back up to the previous leaf and read the end of the last
- * extent before we go and fill in holes.
- */
- if (need_find_last_extent) {
- u64 len;
-
- ret = btrfs_prev_leaf(inode->root, src_path);
- if (ret < 0)
- return ret;
- if (ret)
- goto fill_holes;
- if (src_path->slots[0])
- src_path->slots[0]--;
- src = src_path->nodes[0];
- btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
- if (key.objectid != btrfs_ino(inode) ||
- key.type != BTRFS_EXTENT_DATA_KEY)
- goto fill_holes;
- extent = btrfs_item_ptr(src, src_path->slots[0],
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(src, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(src, extent);
- *last_extent = ALIGN(key.offset + len,
- fs_info->sectorsize);
- } else {
- len = btrfs_file_extent_num_bytes(src, extent);
- *last_extent = key.offset + len;
- }
- }
-fill_holes:
- /* So we did prev_leaf, now we need to move to the next leaf, but a few
- * things could have happened
- *
- * 1) A merge could have happened, so we could currently be on a leaf
- * that holds what we were copying in the first place.
- * 2) A split could have happened, and now not all of the items we want
- * are on the same leaf.
- *
- * So we need to adjust how we search for holes, we need to drop the
- * path and re-search for the first extent key we found, and then walk
- * forward until we hit the last one we copied.
- */
- if (need_find_last_extent) {
- /* btrfs_prev_leaf could return 1 without releasing the path */
- btrfs_release_path(src_path);
- ret = btrfs_search_slot(NULL, inode->root, &first_key,
- src_path, 0, 0);
- if (ret < 0)
- return ret;
- ASSERT(ret == 0);
- src = src_path->nodes[0];
- i = src_path->slots[0];
- } else {
- i = start_slot;
- }
-
- /*
- * Ok so here we need to go through and fill in any holes we may have
- * to make sure that holes are punched for those areas in case they had
- * extents previously.
- */
- while (!done) {
- u64 offset, len;
- u64 extent_end;
-
- if (i >= btrfs_header_nritems(src_path->nodes[0])) {
- ret = btrfs_next_leaf(inode->root, src_path);
- if (ret < 0)
- return ret;
- ASSERT(ret == 0);
- src = src_path->nodes[0];
- i = 0;
- need_find_last_extent = true;
- }
-
- btrfs_item_key_to_cpu(src, &key, i);
- if (!btrfs_comp_cpu_keys(&key, &last_key))
- done = true;
- if (key.objectid != btrfs_ino(inode) ||
- key.type != BTRFS_EXTENT_DATA_KEY) {
- i++;
- continue;
- }
- extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(src, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(src, extent);
- extent_end = ALIGN(key.offset + len,
- fs_info->sectorsize);
- } else {
- len = btrfs_file_extent_num_bytes(src, extent);
- extent_end = key.offset + len;
- }
- i++;
-
- if (*last_extent == key.offset) {
- *last_extent = extent_end;
- continue;
- }
- offset = *last_extent;
- len = key.offset - *last_extent;
- ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
- offset, 0, 0, len, 0, len, 0, 0, 0);
- if (ret)
- break;
- *last_extent = extent_end;
- }
-
- /*
- * Check if there is a hole between the last extent found in our leaf
- * and the first extent in the next leaf. If there is one, we need to
- * log an explicit hole so that at replay time we can punch the hole.
- */
- if (ret == 0 &&
- key.objectid == btrfs_ino(inode) &&
- key.type == BTRFS_EXTENT_DATA_KEY &&
- i == btrfs_header_nritems(src_path->nodes[0])) {
- ret = btrfs_next_leaf(inode->root, src_path);
- need_find_last_extent = true;
- if (ret > 0) {
- ret = 0;
- } else if (ret == 0) {
- btrfs_item_key_to_cpu(src_path->nodes[0], &key,
- src_path->slots[0]);
- if (key.objectid == btrfs_ino(inode) &&
- key.type == BTRFS_EXTENT_DATA_KEY &&
- *last_extent < key.offset) {
- const u64 len = key.offset - *last_extent;
-
- ret = btrfs_insert_file_extent(trans, log,
- btrfs_ino(inode),
- *last_extent, 0,
- 0, len, 0, len,
- 0, 0, 0);
- *last_extent += len;
- }
- }
- }
- /*
- * Need to let the callers know we dropped the path so they should
- * re-search.
- */
- if (!ret && need_find_last_extent)
- ret = 1;
return ret;
}
@@ -4407,7 +4210,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
const u64 i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
struct btrfs_path *dst_path = NULL;
- u64 last_extent = (u64)-1;
+ bool dropped_extents = false;
int ins_nr = 0;
int start_slot;
int ret;
@@ -4429,8 +4232,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
if (slot >= btrfs_header_nritems(leaf)) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, start_slot,
- ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0);
if (ret < 0)
goto out;
ins_nr = 0;
@@ -4454,8 +4256,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
path->slots[0]++;
continue;
}
- if (last_extent == (u64)-1) {
- last_extent = key.offset;
+ if (!dropped_extents) {
/*
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
@@ -4469,6 +4270,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
} while (ret == -EAGAIN);
if (ret)
goto out;
+ dropped_extents = true;
}
if (ins_nr == 0)
start_slot = slot;
@@ -4483,7 +4285,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
}
}
if (ins_nr > 0) {
- ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ret = copy_items(trans, inode, dst_path, path,
start_slot, ins_nr, 1, 0);
if (ret > 0)
ret = 0;
@@ -4670,13 +4472,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
if (slot >= nritems) {
if (ins_nr > 0) {
- u64 last_extent = 0;
-
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, start_slot,
- ins_nr, 1, 0);
- /* can't be 1, extent items aren't processed */
- ASSERT(ret <= 0);
+ start_slot, ins_nr, 1, 0);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -4700,13 +4497,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
cond_resched();
}
if (ins_nr > 0) {
- u64 last_extent = 0;
-
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, start_slot,
- ins_nr, 1, 0);
- /* can't be 1, extent items aren't processed */
- ASSERT(ret <= 0);
+ start_slot, ins_nr, 1, 0);
if (ret < 0)
return ret;
}
@@ -4715,100 +4507,119 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
}
/*
- * If the no holes feature is enabled we need to make sure any hole between the
- * last extent and the i_size of our inode is explicitly marked in the log. This
- * is to make sure that doing something like:
- *
- * 1) create file with 128Kb of data
- * 2) truncate file to 64Kb
- * 3) truncate file to 256Kb
- * 4) fsync file
- * 5) <crash/power failure>
- * 6) mount fs and trigger log replay
- *
- * Will give us a file with a size of 256Kb, the first 64Kb of data match what
- * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
- * file correspond to a hole. The presence of explicit holes in a log tree is
- * what guarantees that log replay will remove/adjust file extent items in the
- * fs/subvol tree.
- *
- * Here we do not need to care about holes between extents, that is already done
- * by copy_items(). We also only need to do this in the full sync path, where we
- * lookup for extents from the fs/subvol tree only. In the fast path case, we
- * lookup the list of modified extent maps and if any represents a hole, we
- * insert a corresponding extent representing a hole in the log tree.
+ * When using the NO_HOLES feature if we punched a hole that causes the
+ * deletion of entire leafs or all the extent items of the first leaf (the one
+ * that contains the inode item and references) we may end up not processing
+ * any extents, because there are no leafs with a generation matching the
+ * current transaction that have extent items for our inode. So we need to find
+ * if any holes exist and then log them. We also need to log holes after any
+ * truncate operation that changes the inode's size.
*/
-static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode,
- struct btrfs_path *path)
+static int btrfs_log_holes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
struct btrfs_key key;
- u64 hole_start;
- u64 hole_size;
- struct extent_buffer *leaf;
- struct btrfs_root *log = root->log_root;
const u64 ino = btrfs_ino(inode);
const u64 i_size = i_size_read(&inode->vfs_inode);
+ u64 prev_extent_end = 0;
+ int ret;
- if (!btrfs_fs_incompat(fs_info, NO_HOLES))
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
return 0;
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = (u64)-1;
+ key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- ASSERT(ret != 0);
if (ret < 0)
return ret;
- ASSERT(path->slots[0] > 0);
- path->slots[0]--;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
- /* inode does not have any extents */
- hole_start = 0;
- hole_size = i_size;
- } else {
+ while (true) {
struct btrfs_file_extent_item *extent;
+ struct extent_buffer *leaf = path->nodes[0];
u64 len;
- /*
- * If there's an extent beyond i_size, an explicit hole was
- * already inserted by copy_items().
- */
- if (key.offset >= i_size)
- return 0;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ /* We have a hole, log it. */
+ if (prev_extent_end < key.offset) {
+ const u64 hole_len = key.offset - prev_extent_end;
+
+ /*
+ * Release the path to avoid deadlocks with other code
+ * paths that search the root while holding locks on
+ * leafs from the log root.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_insert_file_extent(trans, root->log_root,
+ ino, prev_extent_end, 0,
+ 0, hole_len, 0, hole_len,
+ 0, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Search for the same key again in the root. Since it's
+ * an extent item and we are holding the inode lock, the
+ * key must still exist. If it doesn't just emit warning
+ * and return an error to fall back to a transaction
+ * commit.
+ */
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ if (WARN_ON(ret > 0))
+ return -ENOENT;
+ leaf = path->nodes[0];
+ }
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
-
if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE)
- return 0;
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_ram_bytes(leaf, extent);
+ prev_extent_end = ALIGN(key.offset + len,
+ fs_info->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ prev_extent_end = key.offset + len;
+ }
- len = btrfs_file_extent_num_bytes(leaf, extent);
- /* Last extent goes beyond i_size, no need to log a hole. */
- if (key.offset + len > i_size)
- return 0;
- hole_start = key.offset + len;
- hole_size = i_size - hole_start;
+ path->slots[0]++;
+ cond_resched();
}
- btrfs_release_path(path);
- /* Last extent ends at i_size. */
- if (hole_size == 0)
- return 0;
+ if (prev_extent_end < i_size) {
+ u64 hole_len;
- hole_size = ALIGN(hole_size, fs_info->sectorsize);
- ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
- hole_size, 0, hole_size, 0, 0, 0);
- return ret;
+ btrfs_release_path(path);
+ hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
+ ret = btrfs_insert_file_extent(trans, root->log_root,
+ ino, prev_extent_end, 0, 0,
+ hole_len, 0, hole_len,
+ 0, 0, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
}
/*
@@ -5012,6 +4823,50 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
continue;
}
/*
+ * If the inode was already logged skip it - otherwise we can
+ * hit an infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the
+ * following inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ */
+ spin_lock(&BTRFS_I(inode)->lock);
+ /*
+ * Check the inode's logged_trans only instead of
+ * btrfs_inode_in_log(). This is because the last_log_commit of
+ * the inode is not updated when we only log that it exists and
+ * and it has the full sync bit set (see btrfs_log_inode()).
+ */
+ if (BTRFS_I(inode)->logged_trans == trans->transid) {
+ spin_unlock(&BTRFS_I(inode)->lock);
+ btrfs_add_delayed_iput(inode);
+ continue;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+ /*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
* are safe against concurrent renames of the other inode as
@@ -5110,7 +4965,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
- u64 last_extent = 0;
int err = 0;
int ret;
int nritems;
@@ -5288,7 +5142,7 @@ again:
ins_start_slot = path->slots[0];
}
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, ins_start_slot,
+ ins_start_slot,
ins_nr, inode_only,
logged_isize);
if (ret < 0) {
@@ -5311,17 +5165,13 @@ again:
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, ins_start_slot,
+ ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
ins_nr = 0;
- if (ret) {
- btrfs_release_path(path);
- continue;
- }
goto next_slot;
}
@@ -5334,18 +5184,13 @@ again:
goto next_slot;
}
- ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
- if (ret) {
- ins_nr = 0;
- btrfs_release_path(path);
- continue;
- }
ins_nr = 1;
ins_start_slot = path->slots[0];
next_slot:
@@ -5359,13 +5204,12 @@ next_slot:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, ins_start_slot,
+ ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
- ret = 0;
ins_nr = 0;
}
btrfs_release_path(path);
@@ -5380,14 +5224,13 @@ next_key:
}
}
if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
- ret = 0;
ins_nr = 0;
}
@@ -5400,7 +5243,7 @@ next_key:
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_trailing_hole(trans, root, inode, path);
+ err = btrfs_log_holes(trans, root, inode, path);
if (err)
goto out_unlock;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a6d3f08bfff3..9cfc668f91f4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -30,6 +30,7 @@
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
+#include "discard.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
@@ -66,6 +67,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 2,
.devs_increment = 3,
.ncopies = 3,
+ .nparity = 0,
.raid_name = "raid1c3",
.bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
.mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
@@ -78,6 +80,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.tolerated_failures = 3,
.devs_increment = 4,
.ncopies = 4,
+ .nparity = 0,
.raid_name = "raid1c4",
.bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
.mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
@@ -438,39 +441,6 @@ static noinline struct btrfs_fs_devices *find_fsid(
ASSERT(fsid);
- if (metadata_fsid) {
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by first scanning
- * a device which didn't have its fsid/metadata_uuid changed
- * at all and the CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change &&
- memcmp(metadata_fsid, fs_devices->fsid,
- BTRFS_FSID_SIZE) == 0 &&
- memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) == 0) {
- return fs_devices;
- }
- }
- /*
- * Handle scanned device having completed its fsid change but
- * belonging to a fs_devices that was created by a device that
- * has an outdated pair of fsid/metadata_uuid and
- * CHANGING_FSID_V2 flag set.
- */
- list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
- if (fs_devices->fsid_change &&
- memcmp(fs_devices->metadata_uuid,
- fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
- memcmp(metadata_fsid, fs_devices->metadata_uuid,
- BTRFS_FSID_SIZE) == 0) {
- return fs_devices;
- }
- }
- }
-
/* Handle non-split brain cases */
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
if (metadata_fsid) {
@@ -486,6 +456,47 @@ static noinline struct btrfs_fs_devices *find_fsid(
return NULL;
}
+static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
+ struct btrfs_super_block *disk_super)
+{
+
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Handle scanned device having completed its fsid change but
+ * belonging to a fs_devices that was created by first scanning
+ * a device which didn't have its fsid/metadata_uuid changed
+ * at all and the CHANGING_FSID_V2 flag set.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (fs_devices->fsid_change &&
+ memcmp(disk_super->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0) {
+ return fs_devices;
+ }
+ }
+ /*
+ * Handle scanned device having completed its fsid change but
+ * belonging to a fs_devices that was created by a device that
+ * has an outdated pair of fsid/metadata_uuid and
+ * CHANGING_FSID_V2 flag set.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (fs_devices->fsid_change &&
+ memcmp(fs_devices->metadata_uuid,
+ fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
+ memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0) {
+ return fs_devices;
+ }
+ }
+
+ return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
+}
+
+
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
int flush, struct block_device **bdev,
@@ -669,7 +680,9 @@ error_brelse:
/*
* Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
- * being created with a disk that has already completed its fsid change.
+ * being created with a disk that has already completed its fsid change. Such
+ * disk can belong to an fs which has its FSID changed or to one which doesn't.
+ * Handle both cases here.
*/
static struct btrfs_fs_devices *find_fsid_inprogress(
struct btrfs_super_block *disk_super)
@@ -685,7 +698,7 @@ static struct btrfs_fs_devices *find_fsid_inprogress(
}
}
- return NULL;
+ return find_fsid(disk_super->fsid, NULL);
}
@@ -697,17 +710,54 @@ static struct btrfs_fs_devices *find_fsid_changed(
/*
* Handles the case where scanned device is part of an fs that had
* multiple successful changes of FSID but curently device didn't
- * observe it. Meaning our fsid will be different than theirs.
+ * observe it. Meaning our fsid will be different than theirs. We need
+ * to handle two subcases :
+ * 1 - The fs still continues to have different METADATA/FSID uuids.
+ * 2 - The fs is switched back to its original FSID (METADATA/FSID
+ * are equal).
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ /* Changed UUIDs */
if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) != 0 &&
memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, disk_super->fsid,
- BTRFS_FSID_SIZE) != 0) {
+ BTRFS_FSID_SIZE) != 0)
+ return fs_devices;
+
+ /* Unchanged UUIDs */
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, disk_super->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ }
+
+ return NULL;
+}
+
+static struct btrfs_fs_devices *find_fsid_reverted_metadata(
+ struct btrfs_super_block *disk_super)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Handle the case where the scanned device is part of an fs whose last
+ * metadata UUID change reverted it to the original FSID. At the same
+ * time * fs_devices was first created by another constitutent device
+ * which didn't fully observe the operation. This results in an
+ * btrfs_fs_devices created with metadata/fsid different AND
+ * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
+ * fs_devices equal to the FSID of the disk.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) != 0 &&
+ memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ fs_devices->fsid_change)
return fs_devices;
- }
}
return NULL;
@@ -734,24 +784,16 @@ static noinline struct btrfs_device *device_list_add(const char *path,
BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
if (fsid_change_in_progress) {
- if (!has_metadata_uuid) {
- /*
- * When we have an image which has CHANGING_FSID_V2 set
- * it might belong to either a filesystem which has
- * disks with completed fsid change or it might belong
- * to fs with no UUID changes in effect, handle both.
- */
+ if (!has_metadata_uuid)
fs_devices = find_fsid_inprogress(disk_super);
- if (!fs_devices)
- fs_devices = find_fsid(disk_super->fsid, NULL);
- } else {
+ else
fs_devices = find_fsid_changed(disk_super);
- }
} else if (has_metadata_uuid) {
- fs_devices = find_fsid(disk_super->fsid,
- disk_super->metadata_uuid);
+ fs_devices = find_fsid_with_metadata_uuid(disk_super);
} else {
- fs_devices = find_fsid(disk_super->fsid, NULL);
+ fs_devices = find_fsid_reverted_metadata(disk_super);
+ if (!fs_devices)
+ fs_devices = find_fsid(disk_super->fsid, NULL);
}
@@ -781,12 +823,18 @@ static noinline struct btrfs_device *device_list_add(const char *path,
* a device which had the CHANGING_FSID_V2 flag then replace the
* metadata_uuid/fsid values of the fs_devices.
*/
- if (has_metadata_uuid && fs_devices->fsid_change &&
+ if (fs_devices->fsid_change &&
found_transid > fs_devices->latest_generation) {
memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
- memcpy(fs_devices->metadata_uuid,
- disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
+ if (has_metadata_uuid)
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->metadata_uuid,
+ BTRFS_FSID_SIZE);
+ else
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->fsid, BTRFS_FSID_SIZE);
fs_devices->fsid_change = false;
}
@@ -1064,11 +1112,6 @@ static void btrfs_close_bdev(struct btrfs_device *device)
static void btrfs_close_one_device(struct btrfs_device *device)
{
struct btrfs_fs_devices *fs_devices = device->fs_devices;
- struct btrfs_device *new_device;
- struct rcu_string *name;
-
- if (device->bdev)
- fs_devices->open_devices--;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
@@ -1080,23 +1123,22 @@ static void btrfs_close_one_device(struct btrfs_device *device)
fs_devices->missing_devices--;
btrfs_close_bdev(device);
-
- new_device = btrfs_alloc_device(NULL, &device->devid,
- device->uuid);
- BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
- /* Safe because we are under uuid_mutex */
- if (device->name) {
- name = rcu_string_strdup(device->name->str, GFP_NOFS);
- BUG_ON(!name); /* -ENOMEM */
- rcu_assign_pointer(new_device->name, name);
+ if (device->bdev) {
+ fs_devices->open_devices--;
+ device->bdev = NULL;
}
+ clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- list_replace_rcu(&device->dev_list, &new_device->dev_list);
- new_device->fs_devices = device->fs_devices;
+ device->fs_info = NULL;
+ atomic_set(&device->dev_stats_ccnt, 0);
+ extent_io_tree_release(&device->alloc_state);
- synchronize_rcu();
- btrfs_free_device(device);
+ /* Verify the device is back in a pristine state */
+ ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
+ ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
+ ASSERT(list_empty(&device->dev_alloc_list));
+ ASSERT(list_empty(&device->post_commit_list));
+ ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
@@ -2130,7 +2172,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
- WARN_ON(!tgtdev);
mutex_lock(&fs_devices->device_list_mutex);
btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
@@ -2875,6 +2916,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
+ struct btrfs_block_group *block_group;
int ret;
/*
@@ -2898,6 +2940,12 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (ret)
return ret;
+ block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
+ if (!block_group)
+ return -ENOENT;
+ btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+ btrfs_put_block_group(block_group);
+
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@@ -3881,7 +3929,11 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
}
}
- num_devices = btrfs_num_devices(fs_info);
+ /*
+ * rw_devices will not change at the moment, device add/delete/replace
+ * are excluded by EXCL_OP
+ */
+ num_devices = fs_info->fs_devices->rw_devices;
/*
* SINGLE profile on-disk has no profile bit, but in-memory we have a
@@ -6107,75 +6159,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
}
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len)
-{
- struct extent_map *em;
- struct map_lookup *map;
- u64 *buf;
- u64 bytenr;
- u64 length;
- u64 stripe_nr;
- u64 rmap_len;
- int i, j, nr = 0;
-
- em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
- if (IS_ERR(em))
- return -EIO;
-
- map = em->map_lookup;
- length = em->len;
- rmap_len = map->stripe_len;
-
- if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- length = div_u64(length, map->num_stripes / map->sub_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- length = div_u64(length, map->num_stripes);
- else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- length = div_u64(length, nr_data_stripes(map));
- rmap_len = map->stripe_len * nr_data_stripes(map);
- }
-
- buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
- BUG_ON(!buf); /* -ENOMEM */
-
- for (i = 0; i < map->num_stripes; i++) {
- if (map->stripes[i].physical > physical ||
- map->stripes[i].physical + length <= physical)
- continue;
-
- stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
-
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- stripe_nr = stripe_nr * map->num_stripes + i;
- stripe_nr = div_u64(stripe_nr, map->sub_stripes);
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_nr = stripe_nr * map->num_stripes + i;
- } /* else if RAID[56], multiply by nr_data_stripes().
- * Alternatively, just use rmap_len below instead of
- * map->stripe_len */
-
- bytenr = chunk_start + stripe_nr * rmap_len;
- WARN_ON(nr >= map->num_stripes);
- for (j = 0; j < nr; j++) {
- if (buf[j] == bytenr)
- break;
- }
- if (j == nr) {
- WARN_ON(nr >= map->num_stripes);
- buf[nr++] = bytenr;
- }
- }
-
- *logical = buf;
- *naddrs = nr;
- *stripe_len = rmap_len;
-
- free_extent_map(em);
- return 0;
-}
-
static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
{
bio->bi_private = bbio->private;
@@ -6476,19 +6459,14 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
{
int index = btrfs_bg_flags_to_raid_index(type);
int ncopies = btrfs_raid_array[index].ncopies;
+ const int nparity = btrfs_raid_array[index].nparity;
int data_stripes;
- switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
- case BTRFS_BLOCK_GROUP_RAID5:
- data_stripes = num_stripes - 1;
- break;
- case BTRFS_BLOCK_GROUP_RAID6:
- data_stripes = num_stripes - 2;
- break;
- default:
+ if (nparity)
+ data_stripes = num_stripes - nparity;
+ else
data_stripes = num_stripes / ncopies;
- break;
- }
+
return div_u64(chunk_len, data_stripes);
}
@@ -7327,6 +7305,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
else
btrfs_dev_stat_set(dev, i, 0);
}
+ btrfs_info(fs_info, "device stats zeroed by %s (%d)",
+ current->comm, task_pid_nr(current));
} else {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
if (stats->nr_items > i)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index fc1b564b9cfe..409f4816fb89 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -120,8 +120,6 @@ struct btrfs_device {
/* per-device scrub information */
struct scrub_ctx *scrub_ctx;
- struct btrfs_work work;
-
/* readahead state */
atomic_t reada_in_flight;
u64 reada_next;
@@ -138,6 +136,10 @@ struct btrfs_device {
atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
struct extent_io_tree alloc_state;
+
+ struct completion kobj_unregister;
+ /* For sysfs/FSID/devinfo/devid/ */
+ struct kobject devid_kobj;
};
/*
@@ -168,7 +170,7 @@ btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
write_seqcount_end(&dev->data_seqcount); \
preempt_enable(); \
}
-#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
#define BTRFS_DEVICE_GETSET_FUNCS(name) \
static inline u64 \
btrfs_device_get_##name(const struct btrfs_device *dev) \
@@ -255,7 +257,7 @@ struct btrfs_fs_devices {
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
struct kobject fsid_kobj;
- struct kobject *device_dir_kobj;
+ struct kobject *devices_kobj;
struct completion kobj_unregister;
};
@@ -417,8 +419,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_bio **bbio_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
-int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index a6c90a003c12..05615a1099db 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -20,9 +20,13 @@
#include <linux/refcount.h>
#include "compression.h"
+/* workspace buffer size for s390 zlib hardware support */
+#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE)
+
struct workspace {
z_stream strm;
char *buf;
+ unsigned int buf_size;
struct list_head list;
int level;
};
@@ -61,7 +65,21 @@ struct list_head *zlib_alloc_workspace(unsigned int level)
zlib_inflate_workspacesize());
workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
workspace->level = level;
- workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ workspace->buf = NULL;
+ /*
+ * In case of s390 zlib hardware support, allocate lager workspace
+ * buffer. If allocator fails, fall back to a single page buffer.
+ */
+ if (zlib_deflate_dfltcc_enabled()) {
+ workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE,
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN | GFP_NOIO);
+ workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE;
+ }
+ if (!workspace->buf) {
+ workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ workspace->buf_size = PAGE_SIZE;
+ }
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -85,6 +103,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
struct page *in_page = NULL;
struct page *out_page = NULL;
unsigned long bytes_left;
+ unsigned int in_buf_pages;
unsigned long len = *total_out;
unsigned long nr_dest_pages = *out_pages;
const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
@@ -102,9 +121,6 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
-
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
ret = -ENOMEM;
@@ -114,12 +130,51 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
pages[0] = out_page;
nr_pages = 1;
- workspace->strm.next_in = data_in;
+ workspace->strm.next_in = workspace->buf;
+ workspace->strm.avail_in = 0;
workspace->strm.next_out = cpage_out;
workspace->strm.avail_out = PAGE_SIZE;
- workspace->strm.avail_in = min(len, PAGE_SIZE);
while (workspace->strm.total_in < len) {
+ /*
+ * Get next input pages and copy the contents to
+ * the workspace buffer if required.
+ */
+ if (workspace->strm.avail_in == 0) {
+ bytes_left = len - workspace->strm.total_in;
+ in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
+ workspace->buf_size / PAGE_SIZE);
+ if (in_buf_pages > 1) {
+ int i;
+
+ for (i = 0; i < in_buf_pages; i++) {
+ if (in_page) {
+ kunmap(in_page);
+ put_page(in_page);
+ }
+ in_page = find_get_page(mapping,
+ start >> PAGE_SHIFT);
+ data_in = kmap(in_page);
+ memcpy(workspace->buf + i * PAGE_SIZE,
+ data_in, PAGE_SIZE);
+ start += PAGE_SIZE;
+ }
+ workspace->strm.next_in = workspace->buf;
+ } else {
+ if (in_page) {
+ kunmap(in_page);
+ put_page(in_page);
+ }
+ in_page = find_get_page(mapping,
+ start >> PAGE_SHIFT);
+ data_in = kmap(in_page);
+ start += PAGE_SIZE;
+ workspace->strm.next_in = data_in;
+ }
+ workspace->strm.avail_in = min(bytes_left,
+ (unsigned long) workspace->buf_size);
+ }
+
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
if (ret != Z_OK) {
pr_debug("BTRFS: deflate in loop returned %d\n",
@@ -161,33 +216,43 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
/* we're all done */
if (workspace->strm.total_in >= len)
break;
-
- /* we've read in a full page, get a new one */
- if (workspace->strm.avail_in == 0) {
- if (workspace->strm.total_out > max_out)
- break;
-
- bytes_left = len - workspace->strm.total_in;
- kunmap(in_page);
- put_page(in_page);
-
- start += PAGE_SIZE;
- in_page = find_get_page(mapping,
- start >> PAGE_SHIFT);
- data_in = kmap(in_page);
- workspace->strm.avail_in = min(bytes_left,
- PAGE_SIZE);
- workspace->strm.next_in = data_in;
- }
+ if (workspace->strm.total_out > max_out)
+ break;
}
workspace->strm.avail_in = 0;
- ret = zlib_deflate(&workspace->strm, Z_FINISH);
- zlib_deflateEnd(&workspace->strm);
-
- if (ret != Z_STREAM_END) {
- ret = -EIO;
- goto out;
+ /*
+ * Call deflate with Z_FINISH flush parameter providing more output
+ * space but no more input data, until it returns with Z_STREAM_END.
+ */
+ while (ret != Z_STREAM_END) {
+ ret = zlib_deflate(&workspace->strm, Z_FINISH);
+ if (ret == Z_STREAM_END)
+ break;
+ if (ret != Z_OK && ret != Z_BUF_ERROR) {
+ zlib_deflateEnd(&workspace->strm);
+ ret = -EIO;
+ goto out;
+ } else if (workspace->strm.avail_out == 0) {
+ /* get another page for the stream end */
+ kunmap(out_page);
+ if (nr_pages == nr_dest_pages) {
+ out_page = NULL;
+ ret = -E2BIG;
+ goto out;
+ }
+ out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (out_page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ cpage_out = kmap(out_page);
+ pages[nr_pages] = out_page;
+ nr_pages++;
+ workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.next_out = cpage_out;
+ }
}
+ zlib_deflateEnd(&workspace->strm);
if (workspace->strm.total_out >= workspace->strm.total_in) {
ret = -E2BIG;
@@ -231,7 +296,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
workspace->strm.total_out = 0;
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = workspace->buf_size;
/* If it's deflate, and it's got no preset dictionary, then
we can tell zlib to skip the adler32 check. */
@@ -270,7 +335,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = workspace->buf_size;
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
@@ -320,7 +385,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
workspace->strm.total_in = 0;
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = workspace->buf_size;
workspace->strm.total_out = 0;
/* If it's deflate, and it's got no preset dictionary, then
we can tell zlib to skip the adler32 check. */
@@ -364,7 +429,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
buf_offset = 0;
bytes = min(PAGE_SIZE - pg_offset,
- PAGE_SIZE - buf_offset);
+ PAGE_SIZE - (buf_offset % PAGE_SIZE));
bytes = min(bytes, bytes_left);
kaddr = kmap_atomic(dest_page);
@@ -375,7 +440,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
bytes_left -= bytes;
next:
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_out = workspace->buf_size;
}
if (ret != Z_STREAM_END && bytes_left != 0)
diff --git a/fs/buffer.c b/fs/buffer.c
index d8c7242426bb..b8d28370cfd7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1433,7 +1433,7 @@ static bool has_bh_in_lru(int cpu, void *dummy)
void invalidate_bh_lrus(void)
{
- on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
+ on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
@@ -3031,11 +3031,9 @@ static void end_bio_bh_io_sync(struct bio *bio)
* errors, this only handles the "we need to be able to
* do IO at the final sector" case.
*/
-void guard_bio_eod(int op, struct bio *bio)
+void guard_bio_eod(struct bio *bio)
{
sector_t maxsector;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
- unsigned truncated_bytes;
struct hd_struct *part;
rcu_read_lock();
@@ -3061,28 +3059,7 @@ void guard_bio_eod(int op, struct bio *bio)
if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
return;
- /* Uhhuh. We've got a bio that straddles the device size! */
- truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
-
- /*
- * The bio contains more than one segment which spans EOD, just return
- * and let IO layer turn it into an EIO
- */
- if (truncated_bytes > bvec->bv_len)
- return;
-
- /* Truncate the bio.. */
- bio->bi_iter.bi_size -= truncated_bytes;
- bvec->bv_len -= truncated_bytes;
-
- /* ..and clear the end of the buffer for reads */
- if (op == REQ_OP_READ) {
- struct bio_vec bv;
-
- mp_bvec_last_segment(bvec, &bv);
- zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
- truncated_bytes);
- }
+ bio_truncate(bio, maxsector << 9);
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
@@ -3118,15 +3095,15 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
- /* Take care of bh's that straddle the end of the device */
- guard_bio_eod(op, bio);
-
if (buffer_meta(bh))
op_flags |= REQ_META;
if (buffer_prio(bh))
op_flags |= REQ_PRIO;
bio_set_op_attrs(bio, op, op_flags);
+ /* Take care of bh's that straddle the end of the device */
+ guard_bio_eod(bio);
+
if (wbc) {
wbc_init_bio(wbc, bio);
wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 374db1bd57d1..145d46ba25ae 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -708,8 +708,10 @@ void ceph_mdsc_release_request(struct kref *kref)
/* avoid calling iput_final() in mds dispatch threads */
ceph_async_iput(req->r_inode);
}
- if (req->r_parent)
+ if (req->r_parent) {
ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ ceph_async_iput(req->r_parent);
+ }
ceph_async_iput(req->r_target_inode);
if (req->r_dentry)
dput(req->r_dentry);
@@ -2676,8 +2678,10 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
- if (req->r_parent)
+ if (req->r_parent) {
ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ ihold(req->r_parent);
+ }
if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 00dfe17871ac..c5e6eff5a381 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -352,7 +352,7 @@ static struct kobject *cdev_get(struct cdev *p)
if (owner && !try_module_get(owner))
return NULL;
- kobj = kobject_get(&p->kobj);
+ kobj = kobject_get_unless_zero(&p->kobj);
if (!kobj)
module_put(owner);
return kobj;
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 41957b82d796..606f26d862dc 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -120,17 +120,17 @@ cifs_build_devname(char *nodename, const char *prepath)
/**
- * cifs_compose_mount_options - creates mount options for refferral
+ * cifs_compose_mount_options - creates mount options for referral
* @sb_mountdata: parent/root DFS mount options (template)
* @fullpath: full path in UNC format
- * @ref: server's referral
+ * @ref: optional server's referral
* @devname: optional pointer for saving device name
*
* creates mount options for submount based on template options sb_mountdata
* and replacing unc,ip,prefixpath options with ones we've got form ref_unc.
*
* Returns: pointer to new mount options or ERR_PTR.
- * Caller is responcible for freeing retunrned value if it is not error.
+ * Caller is responsible for freeing returned value if it is not error.
*/
char *cifs_compose_mount_options(const char *sb_mountdata,
const char *fullpath,
@@ -150,18 +150,27 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
if (sb_mountdata == NULL)
return ERR_PTR(-EINVAL);
- if (strlen(fullpath) - ref->path_consumed) {
- prepath = fullpath + ref->path_consumed;
- /* skip initial delimiter */
- if (*prepath == '/' || *prepath == '\\')
- prepath++;
- }
+ if (ref) {
+ if (strlen(fullpath) - ref->path_consumed) {
+ prepath = fullpath + ref->path_consumed;
+ /* skip initial delimiter */
+ if (*prepath == '/' || *prepath == '\\')
+ prepath++;
+ }
- name = cifs_build_devname(ref->node_name, prepath);
- if (IS_ERR(name)) {
- rc = PTR_ERR(name);
- name = NULL;
- goto compose_mount_options_err;
+ name = cifs_build_devname(ref->node_name, prepath);
+ if (IS_ERR(name)) {
+ rc = PTR_ERR(name);
+ name = NULL;
+ goto compose_mount_options_err;
+ }
+ } else {
+ name = cifs_build_devname((char *)fullpath, NULL);
+ if (IS_ERR(name)) {
+ rc = PTR_ERR(name);
+ name = NULL;
+ goto compose_mount_options_err;
+ }
}
rc = dns_resolve_server_name_to_ip(name, &srvIP);
@@ -225,6 +234,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
if (devname)
*devname = name;
+ else
+ kfree(name);
/*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
/*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
@@ -241,23 +252,23 @@ compose_mount_options_err:
}
/**
- * cifs_dfs_do_refmount - mounts specified path using provided refferal
+ * cifs_dfs_do_mount - mounts specified path using DFS full path
+ *
+ * Always pass down @fullpath to smb3_do_mount() so we can use the root server
+ * to perform failover in case we failed to connect to the first target in the
+ * referral.
+ *
* @cifs_sb: parent/root superblock
* @fullpath: full path in UNC format
- * @ref: server's referral
*/
-static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
- struct cifs_sb_info *cifs_sb,
- const char *fullpath, const struct dfs_info3_param *ref)
+static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt,
+ struct cifs_sb_info *cifs_sb,
+ const char *fullpath)
{
struct vfsmount *mnt;
char *mountdata;
char *devname;
- /*
- * Always pass down the DFS full path to smb3_do_mount() so we
- * can use it later for failover.
- */
devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
if (!devname)
return ERR_PTR(-ENOMEM);
@@ -266,7 +277,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
/* strip first '\' from fullpath */
mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
- fullpath + 1, ref, NULL);
+ fullpath + 1, NULL, NULL);
if (IS_ERR(mountdata)) {
kfree(devname);
return (struct vfsmount *)mountdata;
@@ -278,28 +289,16 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
return mnt;
}
-static void dump_referral(const struct dfs_info3_param *ref)
-{
- cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
- cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
- cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n",
- ref->flags, ref->server_type);
- cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n",
- ref->ref_flag, ref->path_consumed);
-}
-
/*
* Create a vfsmount that we can automount
*/
static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
{
- struct dfs_info3_param referral = {0};
struct cifs_sb_info *cifs_sb;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
char *full_path, *root_path;
unsigned int xid;
- int len;
int rc;
struct vfsmount *mnt;
@@ -357,7 +356,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
if (!rc) {
rc = dfs_cache_find(xid, ses, cifs_sb->local_nls,
cifs_remap(cifs_sb), full_path + 1,
- &referral, NULL);
+ NULL, NULL);
}
free_xid(xid);
@@ -366,26 +365,16 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
mnt = ERR_PTR(rc);
goto free_root_path;
}
-
- dump_referral(&referral);
-
- len = strlen(referral.node_name);
- if (len < 2) {
- cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
- __func__, referral.node_name);
- mnt = ERR_PTR(-EINVAL);
- goto free_dfs_ref;
- }
/*
- * cifs_mount() will retry every available node server in case
- * of failures.
+ * OK - we were able to get and cache a referral for @full_path.
+ *
+ * Now, pass it down to cifs_mount() and it will retry every available
+ * node server in case of failures - no need to do it here.
*/
- mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, &referral);
- cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__,
- referral.node_name, mnt);
+ mnt = cifs_dfs_do_mount(mntpt, cifs_sb, full_path);
+ cifs_dbg(FYI, "%s: cifs_dfs_do_mount:%s , mnt:%p\n", __func__,
+ full_path + 1, mnt);
-free_dfs_ref:
- free_dfs_info_param(&referral);
free_root_path:
kfree(root_path);
free_full_path:
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 96ae72b556ac..fb41e51dd574 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -802,6 +802,26 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
return;
}
+unsigned int setup_authusers_ACE(struct cifs_ace *pntace)
+{
+ int i;
+ unsigned int ace_size = 20;
+
+ pntace->type = ACCESS_ALLOWED_ACE_TYPE;
+ pntace->flags = 0x0;
+ pntace->access_req = cpu_to_le32(GENERIC_ALL);
+ pntace->sid.num_subauth = 1;
+ pntace->sid.revision = 1;
+ for (i = 0; i < NUM_AUTHS; i++)
+ pntace->sid.authority[i] = sid_authusers.authority[i];
+
+ pntace->sid.sub_auth[0] = sid_authusers.sub_auth[0];
+
+ /* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */
+ pntace->size = cpu_to_le16(ace_size);
+ return ace_size;
+}
+
/*
* Fill in the special SID based on the mode. See
* http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b59dc7478130..096a4c18fbd0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -149,6 +149,9 @@ extern ssize_t cifs_file_copychunk_range(unsigned int xid,
size_t len, unsigned int flags);
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+extern void cifs_setsize(struct inode *inode, loff_t offset);
+extern int cifs_truncate_page(struct address_space *mapping, loff_t from);
+
#ifdef CONFIG_CIFS_NFSD_EXPORT
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 40705e862451..239338d57086 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1588,6 +1588,7 @@ struct mid_q_entry {
mid_callback_t *callback; /* call completion callback */
mid_handle_t *handle; /* call handle mid callback */
void *callback_data; /* general purpose pointer for callback */
+ struct task_struct *creator;
void *resp_buf; /* pointer to received SMB header */
unsigned int resp_buf_size;
int mid_state; /* wish this were enum but can not pass to wait_event */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 9c229408a251..948bf3474db1 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -213,6 +213,7 @@ extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *,
const struct cifs_fid *, u32 *);
extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
const char *, int);
+extern unsigned int setup_authusers_ACE(struct cifs_ace *pace);
extern unsigned int setup_special_mode_ACE(struct cifs_ace *pace, __u64 nmode);
extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
@@ -596,6 +597,9 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
+int smb2_parse_query_directory(struct cifs_tcon *tcon, struct kvec *rsp_iov,
+ int resp_buftype,
+ struct cifs_search_info *srch_inf);
#ifdef CONFIG_CIFS_DFS_UPCALL
static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index cc86a67225d1..a481296f417f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4619,7 +4619,7 @@ findFirstRetry:
psrch_inf->unicode = false;
psrch_inf->ntwrk_buf_start = (char *)pSMBr;
- psrch_inf->smallBuf = 0;
+ psrch_inf->smallBuf = false;
psrch_inf->srch_entries_start =
(char *) &pSMBr->hdr.Protocol +
le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4753,7 +4753,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon,
cifs_buf_release(psrch_inf->ntwrk_buf_start);
psrch_inf->srch_entries_start = response_data;
psrch_inf->ntwrk_buf_start = (char *)pSMB;
- psrch_inf->smallBuf = 0;
+ psrch_inf->smallBuf = false;
if (parms->EndofSearch)
psrch_inf->endOfSearch = true;
else
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 05ea0e2b7e0e..0aa3623ae0e1 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3709,8 +3709,10 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
{
struct cifs_sb_info *old = CIFS_SB(sb);
struct cifs_sb_info *new = mnt_data->cifs_sb;
- bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
- bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
+ bool old_set = (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) &&
+ old->prepath;
+ bool new_set = (new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) &&
+ new->prepath;
if (old_set && new_set && !strcmp(new->prepath, old->prepath))
return 1;
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 2faa05860a48..9a384d1e27b4 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -5,8 +5,6 @@
* Copyright (c) 2018-2019 Paulo Alcantara <palcantara@suse.de>
*/
-#include <linux/rcupdate.h>
-#include <linux/rculist.h>
#include <linux/jhash.h>
#include <linux/ktime.h>
#include <linux/slab.h>
@@ -22,67 +20,68 @@
#include "dfs_cache.h"
-#define DFS_CACHE_HTABLE_SIZE 32
-#define DFS_CACHE_MAX_ENTRIES 64
+#define CACHE_HTABLE_SIZE 32
+#define CACHE_MAX_ENTRIES 64
#define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \
DFSREF_STORAGE_SERVER))
-struct dfs_cache_tgt {
- char *t_name;
- struct list_head t_list;
+struct cache_dfs_tgt {
+ char *name;
+ struct list_head list;
};
-struct dfs_cache_entry {
- struct hlist_node ce_hlist;
- const char *ce_path;
- int ce_ttl;
- int ce_srvtype;
- int ce_flags;
- struct timespec64 ce_etime;
- int ce_path_consumed;
- int ce_numtgts;
- struct list_head ce_tlist;
- struct dfs_cache_tgt *ce_tgthint;
- struct rcu_head ce_rcu;
+struct cache_entry {
+ struct hlist_node hlist;
+ const char *path;
+ int ttl;
+ int srvtype;
+ int flags;
+ struct timespec64 etime;
+ int path_consumed;
+ int numtgts;
+ struct list_head tlist;
+ struct cache_dfs_tgt *tgthint;
};
-static struct kmem_cache *dfs_cache_slab __read_mostly;
-
-struct dfs_cache_vol_info {
- char *vi_fullpath;
- struct smb_vol vi_vol;
- char *vi_mntdata;
- struct list_head vi_list;
+struct vol_info {
+ char *fullpath;
+ spinlock_t smb_vol_lock;
+ struct smb_vol smb_vol;
+ char *mntdata;
+ struct list_head list;
+ struct list_head rlist;
+ struct kref refcnt;
};
-struct dfs_cache {
- struct mutex dc_lock;
- struct nls_table *dc_nlsc;
- struct list_head dc_vol_list;
- int dc_ttl;
- struct delayed_work dc_refresh;
-};
+static struct kmem_cache *cache_slab __read_mostly;
+static struct workqueue_struct *dfscache_wq __read_mostly;
-static struct dfs_cache dfs_cache;
+static int cache_ttl;
+static DEFINE_SPINLOCK(cache_ttl_lock);
+
+static struct nls_table *cache_nlsc;
/*
* Number of entries in the cache
*/
-static size_t dfs_cache_count;
+static atomic_t cache_count;
+
+static struct hlist_head cache_htable[CACHE_HTABLE_SIZE];
+static DECLARE_RWSEM(htable_rw_lock);
-static DEFINE_MUTEX(dfs_cache_list_lock);
-static struct hlist_head dfs_cache_htable[DFS_CACHE_HTABLE_SIZE];
+static LIST_HEAD(vol_list);
+static DEFINE_SPINLOCK(vol_list_lock);
static void refresh_cache_worker(struct work_struct *work);
-static inline bool is_path_valid(const char *path)
-{
- return path && (strchr(path + 1, '\\') || strchr(path + 1, '/'));
-}
+static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker);
-static inline int get_normalized_path(const char *path, char **npath)
+static int get_normalized_path(const char *path, char **npath)
{
+ if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/'))
+ return -EINVAL;
+
if (*path == '\\') {
*npath = (char *)path;
} else {
@@ -100,57 +99,48 @@ static inline void free_normalized_path(const char *path, char *npath)
kfree(npath);
}
-static inline bool cache_entry_expired(const struct dfs_cache_entry *ce)
+static inline bool cache_entry_expired(const struct cache_entry *ce)
{
struct timespec64 ts;
ktime_get_coarse_real_ts64(&ts);
- return timespec64_compare(&ts, &ce->ce_etime) >= 0;
+ return timespec64_compare(&ts, &ce->etime) >= 0;
}
-static inline void free_tgts(struct dfs_cache_entry *ce)
+static inline void free_tgts(struct cache_entry *ce)
{
- struct dfs_cache_tgt *t, *n;
+ struct cache_dfs_tgt *t, *n;
- list_for_each_entry_safe(t, n, &ce->ce_tlist, t_list) {
- list_del(&t->t_list);
- kfree(t->t_name);
+ list_for_each_entry_safe(t, n, &ce->tlist, list) {
+ list_del(&t->list);
+ kfree(t->name);
kfree(t);
}
}
-static void free_cache_entry(struct rcu_head *rcu)
+static inline void flush_cache_ent(struct cache_entry *ce)
{
- struct dfs_cache_entry *ce = container_of(rcu, struct dfs_cache_entry,
- ce_rcu);
- kmem_cache_free(dfs_cache_slab, ce);
-}
-
-static inline void flush_cache_ent(struct dfs_cache_entry *ce)
-{
- if (hlist_unhashed(&ce->ce_hlist))
- return;
-
- hlist_del_init_rcu(&ce->ce_hlist);
- kfree_const(ce->ce_path);
+ hlist_del_init(&ce->hlist);
+ kfree(ce->path);
free_tgts(ce);
- dfs_cache_count--;
- call_rcu(&ce->ce_rcu, free_cache_entry);
+ atomic_dec(&cache_count);
+ kmem_cache_free(cache_slab, ce);
}
static void flush_cache_ents(void)
{
int i;
- rcu_read_lock();
- for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) {
- struct hlist_head *l = &dfs_cache_htable[i];
- struct dfs_cache_entry *ce;
+ for (i = 0; i < CACHE_HTABLE_SIZE; i++) {
+ struct hlist_head *l = &cache_htable[i];
+ struct hlist_node *n;
+ struct cache_entry *ce;
- hlist_for_each_entry_rcu(ce, l, ce_hlist)
- flush_cache_ent(ce);
+ hlist_for_each_entry_safe(ce, n, l, hlist) {
+ if (!hlist_unhashed(&ce->hlist))
+ flush_cache_ent(ce);
+ }
}
- rcu_read_unlock();
}
/*
@@ -158,36 +148,39 @@ static void flush_cache_ents(void)
*/
static int dfscache_proc_show(struct seq_file *m, void *v)
{
- int bucket;
- struct dfs_cache_entry *ce;
- struct dfs_cache_tgt *t;
+ int i;
+ struct cache_entry *ce;
+ struct cache_dfs_tgt *t;
seq_puts(m, "DFS cache\n---------\n");
- mutex_lock(&dfs_cache_list_lock);
-
- rcu_read_lock();
- hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) {
- seq_printf(m,
- "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
- "interlink=%s,path_consumed=%d,expired=%s\n",
- ce->ce_path,
- ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link",
- ce->ce_ttl, ce->ce_etime.tv_nsec,
- IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no",
- ce->ce_path_consumed,
- cache_entry_expired(ce) ? "yes" : "no");
-
- list_for_each_entry(t, &ce->ce_tlist, t_list) {
- seq_printf(m, " %s%s\n",
- t->t_name,
- ce->ce_tgthint == t ? " (target hint)" : "");
+ down_read(&htable_rw_lock);
+ for (i = 0; i < CACHE_HTABLE_SIZE; i++) {
+ struct hlist_head *l = &cache_htable[i];
+
+ hlist_for_each_entry(ce, l, hlist) {
+ if (hlist_unhashed(&ce->hlist))
+ continue;
+
+ seq_printf(m,
+ "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
+ "interlink=%s,path_consumed=%d,expired=%s\n",
+ ce->path,
+ ce->srvtype == DFS_TYPE_ROOT ? "root" : "link",
+ ce->ttl, ce->etime.tv_nsec,
+ IS_INTERLINK_SET(ce->flags) ? "yes" : "no",
+ ce->path_consumed,
+ cache_entry_expired(ce) ? "yes" : "no");
+
+ list_for_each_entry(t, &ce->tlist, list) {
+ seq_printf(m, " %s%s\n",
+ t->name,
+ ce->tgthint == t ? " (target hint)" : "");
+ }
}
-
}
- rcu_read_unlock();
+ up_read(&htable_rw_lock);
- mutex_unlock(&dfs_cache_list_lock);
return 0;
}
@@ -205,9 +198,10 @@ static ssize_t dfscache_proc_write(struct file *file, const char __user *buffer,
return -EINVAL;
cifs_dbg(FYI, "clearing dfs cache");
- mutex_lock(&dfs_cache_list_lock);
+
+ down_write(&htable_rw_lock);
flush_cache_ents();
- mutex_unlock(&dfs_cache_list_lock);
+ up_write(&htable_rw_lock);
return count;
}
@@ -226,25 +220,25 @@ const struct file_operations dfscache_proc_fops = {
};
#ifdef CONFIG_CIFS_DEBUG2
-static inline void dump_tgts(const struct dfs_cache_entry *ce)
+static inline void dump_tgts(const struct cache_entry *ce)
{
- struct dfs_cache_tgt *t;
+ struct cache_dfs_tgt *t;
cifs_dbg(FYI, "target list:\n");
- list_for_each_entry(t, &ce->ce_tlist, t_list) {
- cifs_dbg(FYI, " %s%s\n", t->t_name,
- ce->ce_tgthint == t ? " (target hint)" : "");
+ list_for_each_entry(t, &ce->tlist, list) {
+ cifs_dbg(FYI, " %s%s\n", t->name,
+ ce->tgthint == t ? " (target hint)" : "");
}
}
-static inline void dump_ce(const struct dfs_cache_entry *ce)
+static inline void dump_ce(const struct cache_entry *ce)
{
cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
- "interlink=%s,path_consumed=%d,expired=%s\n", ce->ce_path,
- ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ce_ttl,
- ce->ce_etime.tv_nsec,
- IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no",
- ce->ce_path_consumed,
+ "interlink=%s,path_consumed=%d,expired=%s\n", ce->path,
+ ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl,
+ ce->etime.tv_nsec,
+ IS_INTERLINK_SET(ce->flags) ? "yes" : "no",
+ ce->path_consumed,
cache_entry_expired(ce) ? "yes" : "no");
dump_tgts(ce);
}
@@ -284,25 +278,34 @@ static inline void dump_refs(const struct dfs_info3_param *refs, int numrefs)
*/
int dfs_cache_init(void)
{
+ int rc;
int i;
- dfs_cache_slab = kmem_cache_create("cifs_dfs_cache",
- sizeof(struct dfs_cache_entry), 0,
- SLAB_HWCACHE_ALIGN, NULL);
- if (!dfs_cache_slab)
+ dfscache_wq = alloc_workqueue("cifs-dfscache",
+ WQ_FREEZABLE | WQ_MEM_RECLAIM, 1);
+ if (!dfscache_wq)
return -ENOMEM;
- for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++)
- INIT_HLIST_HEAD(&dfs_cache_htable[i]);
+ cache_slab = kmem_cache_create("cifs_dfs_cache",
+ sizeof(struct cache_entry), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!cache_slab) {
+ rc = -ENOMEM;
+ goto out_destroy_wq;
+ }
+
+ for (i = 0; i < CACHE_HTABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&cache_htable[i]);
- INIT_LIST_HEAD(&dfs_cache.dc_vol_list);
- mutex_init(&dfs_cache.dc_lock);
- INIT_DELAYED_WORK(&dfs_cache.dc_refresh, refresh_cache_worker);
- dfs_cache.dc_ttl = -1;
- dfs_cache.dc_nlsc = load_nls_default();
+ atomic_set(&cache_count, 0);
+ cache_nlsc = load_nls_default();
cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__);
return 0;
+
+out_destroy_wq:
+ destroy_workqueue(dfscache_wq);
+ return rc;
}
static inline unsigned int cache_entry_hash(const void *data, int size)
@@ -310,7 +313,7 @@ static inline unsigned int cache_entry_hash(const void *data, int size)
unsigned int h;
h = jhash(data, size, 0);
- return h & (DFS_CACHE_HTABLE_SIZE - 1);
+ return h & (CACHE_HTABLE_SIZE - 1);
}
/* Check whether second path component of @path is SYSVOL or NETLOGON */
@@ -325,11 +328,11 @@ static inline bool is_sysvol_or_netlogon(const char *path)
}
/* Return target hint of a DFS cache entry */
-static inline char *get_tgt_name(const struct dfs_cache_entry *ce)
+static inline char *get_tgt_name(const struct cache_entry *ce)
{
- struct dfs_cache_tgt *t = ce->ce_tgthint;
+ struct cache_dfs_tgt *t = ce->tgthint;
- return t ? t->t_name : ERR_PTR(-ENOENT);
+ return t ? t->name : ERR_PTR(-ENOENT);
}
/* Return expire time out of a new entry's TTL */
@@ -346,19 +349,19 @@ static inline struct timespec64 get_expire_time(int ttl)
}
/* Allocate a new DFS target */
-static inline struct dfs_cache_tgt *alloc_tgt(const char *name)
+static struct cache_dfs_tgt *alloc_target(const char *name)
{
- struct dfs_cache_tgt *t;
+ struct cache_dfs_tgt *t;
- t = kmalloc(sizeof(*t), GFP_KERNEL);
+ t = kmalloc(sizeof(*t), GFP_ATOMIC);
if (!t)
return ERR_PTR(-ENOMEM);
- t->t_name = kstrndup(name, strlen(name), GFP_KERNEL);
- if (!t->t_name) {
+ t->name = kstrndup(name, strlen(name), GFP_ATOMIC);
+ if (!t->name) {
kfree(t);
return ERR_PTR(-ENOMEM);
}
- INIT_LIST_HEAD(&t->t_list);
+ INIT_LIST_HEAD(&t->list);
return t;
}
@@ -367,180 +370,184 @@ static inline struct dfs_cache_tgt *alloc_tgt(const char *name)
* target hint.
*/
static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs,
- struct dfs_cache_entry *ce, const char *tgthint)
+ struct cache_entry *ce, const char *tgthint)
{
int i;
- ce->ce_ttl = refs[0].ttl;
- ce->ce_etime = get_expire_time(ce->ce_ttl);
- ce->ce_srvtype = refs[0].server_type;
- ce->ce_flags = refs[0].ref_flag;
- ce->ce_path_consumed = refs[0].path_consumed;
+ ce->ttl = refs[0].ttl;
+ ce->etime = get_expire_time(ce->ttl);
+ ce->srvtype = refs[0].server_type;
+ ce->flags = refs[0].ref_flag;
+ ce->path_consumed = refs[0].path_consumed;
for (i = 0; i < numrefs; i++) {
- struct dfs_cache_tgt *t;
+ struct cache_dfs_tgt *t;
- t = alloc_tgt(refs[i].node_name);
+ t = alloc_target(refs[i].node_name);
if (IS_ERR(t)) {
free_tgts(ce);
return PTR_ERR(t);
}
- if (tgthint && !strcasecmp(t->t_name, tgthint)) {
- list_add(&t->t_list, &ce->ce_tlist);
+ if (tgthint && !strcasecmp(t->name, tgthint)) {
+ list_add(&t->list, &ce->tlist);
tgthint = NULL;
} else {
- list_add_tail(&t->t_list, &ce->ce_tlist);
+ list_add_tail(&t->list, &ce->tlist);
}
- ce->ce_numtgts++;
+ ce->numtgts++;
}
- ce->ce_tgthint = list_first_entry_or_null(&ce->ce_tlist,
- struct dfs_cache_tgt, t_list);
+ ce->tgthint = list_first_entry_or_null(&ce->tlist,
+ struct cache_dfs_tgt, list);
return 0;
}
/* Allocate a new cache entry */
-static struct dfs_cache_entry *
-alloc_cache_entry(const char *path, const struct dfs_info3_param *refs,
- int numrefs)
+static struct cache_entry *alloc_cache_entry(const char *path,
+ const struct dfs_info3_param *refs,
+ int numrefs)
{
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
int rc;
- ce = kmem_cache_zalloc(dfs_cache_slab, GFP_KERNEL);
+ ce = kmem_cache_zalloc(cache_slab, GFP_KERNEL);
if (!ce)
return ERR_PTR(-ENOMEM);
- ce->ce_path = kstrdup_const(path, GFP_KERNEL);
- if (!ce->ce_path) {
- kmem_cache_free(dfs_cache_slab, ce);
+ ce->path = kstrndup(path, strlen(path), GFP_KERNEL);
+ if (!ce->path) {
+ kmem_cache_free(cache_slab, ce);
return ERR_PTR(-ENOMEM);
}
- INIT_HLIST_NODE(&ce->ce_hlist);
- INIT_LIST_HEAD(&ce->ce_tlist);
+ INIT_HLIST_NODE(&ce->hlist);
+ INIT_LIST_HEAD(&ce->tlist);
rc = copy_ref_data(refs, numrefs, ce, NULL);
if (rc) {
- kfree_const(ce->ce_path);
- kmem_cache_free(dfs_cache_slab, ce);
+ kfree(ce->path);
+ kmem_cache_free(cache_slab, ce);
ce = ERR_PTR(rc);
}
return ce;
}
+/* Must be called with htable_rw_lock held */
static void remove_oldest_entry(void)
{
- int bucket;
- struct dfs_cache_entry *ce;
- struct dfs_cache_entry *to_del = NULL;
-
- rcu_read_lock();
- hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) {
- if (!to_del || timespec64_compare(&ce->ce_etime,
- &to_del->ce_etime) < 0)
- to_del = ce;
+ int i;
+ struct cache_entry *ce;
+ struct cache_entry *to_del = NULL;
+
+ for (i = 0; i < CACHE_HTABLE_SIZE; i++) {
+ struct hlist_head *l = &cache_htable[i];
+
+ hlist_for_each_entry(ce, l, hlist) {
+ if (hlist_unhashed(&ce->hlist))
+ continue;
+ if (!to_del || timespec64_compare(&ce->etime,
+ &to_del->etime) < 0)
+ to_del = ce;
+ }
}
+
if (!to_del) {
cifs_dbg(FYI, "%s: no entry to remove", __func__);
- goto out;
+ return;
}
+
cifs_dbg(FYI, "%s: removing entry", __func__);
dump_ce(to_del);
flush_cache_ent(to_del);
-out:
- rcu_read_unlock();
}
/* Add a new DFS cache entry */
-static inline struct dfs_cache_entry *
-add_cache_entry(unsigned int hash, const char *path,
- const struct dfs_info3_param *refs, int numrefs)
+static int add_cache_entry(const char *path, unsigned int hash,
+ struct dfs_info3_param *refs, int numrefs)
{
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
ce = alloc_cache_entry(path, refs, numrefs);
if (IS_ERR(ce))
- return ce;
+ return PTR_ERR(ce);
- hlist_add_head_rcu(&ce->ce_hlist, &dfs_cache_htable[hash]);
-
- mutex_lock(&dfs_cache.dc_lock);
- if (dfs_cache.dc_ttl < 0) {
- dfs_cache.dc_ttl = ce->ce_ttl;
- queue_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh,
- dfs_cache.dc_ttl * HZ);
+ spin_lock(&cache_ttl_lock);
+ if (!cache_ttl) {
+ cache_ttl = ce->ttl;
+ queue_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ);
} else {
- dfs_cache.dc_ttl = min_t(int, dfs_cache.dc_ttl, ce->ce_ttl);
- mod_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh,
- dfs_cache.dc_ttl * HZ);
+ cache_ttl = min_t(int, cache_ttl, ce->ttl);
+ mod_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ);
}
- mutex_unlock(&dfs_cache.dc_lock);
+ spin_unlock(&cache_ttl_lock);
- return ce;
+ down_write(&htable_rw_lock);
+ hlist_add_head(&ce->hlist, &cache_htable[hash]);
+ dump_ce(ce);
+ up_write(&htable_rw_lock);
+
+ return 0;
}
-static struct dfs_cache_entry *__find_cache_entry(unsigned int hash,
- const char *path)
+/*
+ * Find a DFS cache entry in hash table and optionally check prefix path against
+ * @path.
+ * Use whole path components in the match.
+ * Must be called with htable_rw_lock held.
+ *
+ * Return ERR_PTR(-ENOENT) if the entry is not found.
+ */
+static struct cache_entry *lookup_cache_entry(const char *path,
+ unsigned int *hash)
{
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
+ unsigned int h;
bool found = false;
- rcu_read_lock();
- hlist_for_each_entry_rcu(ce, &dfs_cache_htable[hash], ce_hlist) {
- if (!strcasecmp(path, ce->ce_path)) {
-#ifdef CONFIG_CIFS_DEBUG2
- char *name = get_tgt_name(ce);
+ h = cache_entry_hash(path, strlen(path));
- if (IS_ERR(name)) {
- rcu_read_unlock();
- return ERR_CAST(name);
- }
- cifs_dbg(FYI, "%s: cache hit\n", __func__);
- cifs_dbg(FYI, "%s: target hint: %s\n", __func__, name);
-#endif
+ hlist_for_each_entry(ce, &cache_htable[h], hlist) {
+ if (!strcasecmp(path, ce->path)) {
found = true;
+ dump_ce(ce);
break;
}
}
- rcu_read_unlock();
- return found ? ce : ERR_PTR(-ENOENT);
-}
-/*
- * Find a DFS cache entry in hash table and optionally check prefix path against
- * @path.
- * Use whole path components in the match.
- * Return ERR_PTR(-ENOENT) if the entry is not found.
- */
-static inline struct dfs_cache_entry *find_cache_entry(const char *path,
- unsigned int *hash)
-{
- *hash = cache_entry_hash(path, strlen(path));
- return __find_cache_entry(*hash, path);
+ if (!found)
+ ce = ERR_PTR(-ENOENT);
+ if (hash)
+ *hash = h;
+
+ return ce;
}
-static inline void destroy_slab_cache(void)
+static void __vol_release(struct vol_info *vi)
{
- rcu_barrier();
- kmem_cache_destroy(dfs_cache_slab);
+ kfree(vi->fullpath);
+ kfree(vi->mntdata);
+ cifs_cleanup_volume_info_contents(&vi->smb_vol);
+ kfree(vi);
}
-static inline void free_vol(struct dfs_cache_vol_info *vi)
+static void vol_release(struct kref *kref)
{
- list_del(&vi->vi_list);
- kfree(vi->vi_fullpath);
- kfree(vi->vi_mntdata);
- cifs_cleanup_volume_info_contents(&vi->vi_vol);
- kfree(vi);
+ struct vol_info *vi = container_of(kref, struct vol_info, refcnt);
+
+ spin_lock(&vol_list_lock);
+ list_del(&vi->list);
+ spin_unlock(&vol_list_lock);
+ __vol_release(vi);
}
static inline void free_vol_list(void)
{
- struct dfs_cache_vol_info *vi, *nvi;
+ struct vol_info *vi, *nvi;
- list_for_each_entry_safe(vi, nvi, &dfs_cache.dc_vol_list, vi_list)
- free_vol(vi);
+ list_for_each_entry_safe(vi, nvi, &vol_list, list) {
+ list_del_init(&vi->list);
+ __vol_release(vi);
+ }
}
/**
@@ -548,83 +555,78 @@ static inline void free_vol_list(void)
*/
void dfs_cache_destroy(void)
{
- cancel_delayed_work_sync(&dfs_cache.dc_refresh);
- unload_nls(dfs_cache.dc_nlsc);
+ cancel_delayed_work_sync(&refresh_task);
+ unload_nls(cache_nlsc);
free_vol_list();
- mutex_destroy(&dfs_cache.dc_lock);
-
flush_cache_ents();
- destroy_slab_cache();
- mutex_destroy(&dfs_cache_list_lock);
+ kmem_cache_destroy(cache_slab);
+ destroy_workqueue(dfscache_wq);
cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__);
}
-static inline struct dfs_cache_entry *
-__update_cache_entry(const char *path, const struct dfs_info3_param *refs,
- int numrefs)
+/* Must be called with htable_rw_lock held */
+static int __update_cache_entry(const char *path,
+ const struct dfs_info3_param *refs,
+ int numrefs)
{
int rc;
- unsigned int h;
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
char *s, *th = NULL;
- ce = find_cache_entry(path, &h);
+ ce = lookup_cache_entry(path, NULL);
if (IS_ERR(ce))
- return ce;
+ return PTR_ERR(ce);
- if (ce->ce_tgthint) {
- s = ce->ce_tgthint->t_name;
- th = kstrndup(s, strlen(s), GFP_KERNEL);
+ if (ce->tgthint) {
+ s = ce->tgthint->name;
+ th = kstrndup(s, strlen(s), GFP_ATOMIC);
if (!th)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
free_tgts(ce);
- ce->ce_numtgts = 0;
+ ce->numtgts = 0;
rc = copy_ref_data(refs, numrefs, ce, th);
- kfree(th);
- if (rc)
- ce = ERR_PTR(rc);
+ kfree(th);
- return ce;
+ return rc;
}
-/* Update an expired cache entry by getting a new DFS referral from server */
-static struct dfs_cache_entry *
-update_cache_entry(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_codepage, int remap,
- const char *path, struct dfs_cache_entry *ce)
+static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path, struct dfs_info3_param **refs,
+ int *numrefs)
{
- int rc;
- struct dfs_info3_param *refs = NULL;
- int numrefs = 0;
+ cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path);
- cifs_dbg(FYI, "%s: update expired cache entry\n", __func__);
- /*
- * Check if caller provided enough parameters to update an expired
- * entry.
- */
if (!ses || !ses->server || !ses->server->ops->get_dfs_refer)
- return ERR_PTR(-ETIME);
+ return -EOPNOTSUPP;
if (unlikely(!nls_codepage))
- return ERR_PTR(-ETIME);
+ return -EINVAL;
- cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__, path);
+ *refs = NULL;
+ *numrefs = 0;
- rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs, &numrefs,
- nls_codepage, remap);
- if (rc)
- ce = ERR_PTR(rc);
- else
- ce = __update_cache_entry(path, refs, numrefs);
+ return ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs,
+ nls_codepage, remap);
+}
- dump_refs(refs, numrefs);
- free_dfs_info_array(refs, numrefs);
+/* Update an expired cache entry by getting a new DFS referral from server */
+static int update_cache_entry(const char *path,
+ const struct dfs_info3_param *refs,
+ int numrefs)
+{
- return ce;
+ int rc;
+
+ down_write(&htable_rw_lock);
+ rc = __update_cache_entry(path, refs, numrefs);
+ up_write(&htable_rw_lock);
+
+ return rc;
}
/*
@@ -636,95 +638,86 @@ update_cache_entry(const unsigned int xid, struct cifs_ses *ses,
* For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to
* handle them properly.
*/
-static struct dfs_cache_entry *
-do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_codepage, int remap,
- const char *path, bool noreq)
+static int __dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *nls_codepage, int remap,
+ const char *path, bool noreq)
{
int rc;
- unsigned int h;
- struct dfs_cache_entry *ce;
- struct dfs_info3_param *nrefs;
- int numnrefs;
+ unsigned int hash;
+ struct cache_entry *ce;
+ struct dfs_info3_param *refs = NULL;
+ int numrefs = 0;
+ bool newent = false;
cifs_dbg(FYI, "%s: search path: %s\n", __func__, path);
- ce = find_cache_entry(path, &h);
- if (IS_ERR(ce)) {
- cifs_dbg(FYI, "%s: cache miss\n", __func__);
- /*
- * If @noreq is set, no requests will be sent to the server for
- * either updating or getting a new DFS referral.
- */
- if (noreq)
- return ce;
- /*
- * No cache entry was found, so check for valid parameters that
- * will be required to get a new DFS referral and then create a
- * new cache entry.
- */
- if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) {
- ce = ERR_PTR(-EOPNOTSUPP);
- return ce;
- }
- if (unlikely(!nls_codepage)) {
- ce = ERR_PTR(-EINVAL);
- return ce;
- }
+ down_read(&htable_rw_lock);
- nrefs = NULL;
- numnrefs = 0;
+ ce = lookup_cache_entry(path, &hash);
- cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__,
- path);
+ /*
+ * If @noreq is set, no requests will be sent to the server. Just return
+ * the cache entry.
+ */
+ if (noreq) {
+ up_read(&htable_rw_lock);
+ return PTR_ERR_OR_ZERO(ce);
+ }
- rc = ses->server->ops->get_dfs_refer(xid, ses, path, &nrefs,
- &numnrefs, nls_codepage,
- remap);
- if (rc) {
- ce = ERR_PTR(rc);
- return ce;
+ if (!IS_ERR(ce)) {
+ if (!cache_entry_expired(ce)) {
+ dump_ce(ce);
+ up_read(&htable_rw_lock);
+ return 0;
}
+ } else {
+ newent = true;
+ }
- dump_refs(nrefs, numnrefs);
+ up_read(&htable_rw_lock);
- cifs_dbg(FYI, "%s: new cache entry\n", __func__);
+ /*
+ * No entry was found.
+ *
+ * Request a new DFS referral in order to create a new cache entry, or
+ * updating an existing one.
+ */
+ rc = get_dfs_referral(xid, ses, nls_codepage, remap, path,
+ &refs, &numrefs);
+ if (rc)
+ return rc;
- if (dfs_cache_count >= DFS_CACHE_MAX_ENTRIES) {
- cifs_dbg(FYI, "%s: reached max cache size (%d)",
- __func__, DFS_CACHE_MAX_ENTRIES);
- remove_oldest_entry();
- }
- ce = add_cache_entry(h, path, nrefs, numnrefs);
- free_dfs_info_array(nrefs, numnrefs);
+ dump_refs(refs, numrefs);
- if (IS_ERR(ce))
- return ce;
+ if (!newent) {
+ rc = update_cache_entry(path, refs, numrefs);
+ goto out_free_refs;
+ }
- dfs_cache_count++;
+ if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) {
+ cifs_dbg(FYI, "%s: reached max cache size (%d)", __func__,
+ CACHE_MAX_ENTRIES);
+ down_write(&htable_rw_lock);
+ remove_oldest_entry();
+ up_write(&htable_rw_lock);
}
- dump_ce(ce);
+ rc = add_cache_entry(path, hash, refs, numrefs);
+ if (!rc)
+ atomic_inc(&cache_count);
- /* Just return the found cache entry in case @noreq is set */
- if (noreq)
- return ce;
-
- if (cache_entry_expired(ce)) {
- cifs_dbg(FYI, "%s: expired cache entry\n", __func__);
- ce = update_cache_entry(xid, ses, nls_codepage, remap, path,
- ce);
- if (IS_ERR(ce)) {
- cifs_dbg(FYI, "%s: failed to update expired entry\n",
- __func__);
- }
- }
- return ce;
+out_free_refs:
+ free_dfs_info_array(refs, numrefs);
+ return rc;
}
-/* Set up a new DFS referral from a given cache entry */
-static int setup_ref(const char *path, const struct dfs_cache_entry *ce,
- struct dfs_info3_param *ref, const char *tgt)
+/*
+ * Set up a DFS referral from a given cache entry.
+ *
+ * Must be called with htable_rw_lock held.
+ */
+static int setup_referral(const char *path, struct cache_entry *ce,
+ struct dfs_info3_param *ref, const char *target)
{
int rc;
@@ -732,21 +725,20 @@ static int setup_ref(const char *path, const struct dfs_cache_entry *ce,
memset(ref, 0, sizeof(*ref));
- ref->path_name = kstrndup(path, strlen(path), GFP_KERNEL);
+ ref->path_name = kstrndup(path, strlen(path), GFP_ATOMIC);
if (!ref->path_name)
return -ENOMEM;
- ref->path_consumed = ce->ce_path_consumed;
-
- ref->node_name = kstrndup(tgt, strlen(tgt), GFP_KERNEL);
+ ref->node_name = kstrndup(target, strlen(target), GFP_ATOMIC);
if (!ref->node_name) {
rc = -ENOMEM;
goto err_free_path;
}
- ref->ttl = ce->ce_ttl;
- ref->server_type = ce->ce_srvtype;
- ref->ref_flag = ce->ce_flags;
+ ref->path_consumed = ce->path_consumed;
+ ref->ttl = ce->ttl;
+ ref->server_type = ce->srvtype;
+ ref->ref_flag = ce->flags;
return 0;
@@ -757,38 +749,37 @@ err_free_path:
}
/* Return target list of a DFS cache entry */
-static int get_tgt_list(const struct dfs_cache_entry *ce,
- struct dfs_cache_tgt_list *tl)
+static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl)
{
int rc;
struct list_head *head = &tl->tl_list;
- struct dfs_cache_tgt *t;
+ struct cache_dfs_tgt *t;
struct dfs_cache_tgt_iterator *it, *nit;
memset(tl, 0, sizeof(*tl));
INIT_LIST_HEAD(head);
- list_for_each_entry(t, &ce->ce_tlist, t_list) {
- it = kzalloc(sizeof(*it), GFP_KERNEL);
+ list_for_each_entry(t, &ce->tlist, list) {
+ it = kzalloc(sizeof(*it), GFP_ATOMIC);
if (!it) {
rc = -ENOMEM;
goto err_free_it;
}
- it->it_name = kstrndup(t->t_name, strlen(t->t_name),
- GFP_KERNEL);
+ it->it_name = kstrndup(t->name, strlen(t->name), GFP_ATOMIC);
if (!it->it_name) {
kfree(it);
rc = -ENOMEM;
goto err_free_it;
}
- if (ce->ce_tgthint == t)
+ if (ce->tgthint == t)
list_add(&it->it_list, head);
else
list_add_tail(&it->it_list, head);
}
- tl->tl_numtgts = ce->ce_numtgts;
+
+ tl->tl_numtgts = ce->numtgts;
return 0;
@@ -829,28 +820,35 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
{
int rc;
char *npath;
- struct dfs_cache_entry *ce;
-
- if (unlikely(!is_path_valid(path)))
- return -EINVAL;
+ struct cache_entry *ce;
rc = get_normalized_path(path, &npath);
if (rc)
return rc;
- mutex_lock(&dfs_cache_list_lock);
- ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
- if (!IS_ERR(ce)) {
- if (ref)
- rc = setup_ref(path, ce, ref, get_tgt_name(ce));
- else
- rc = 0;
- if (!rc && tgt_list)
- rc = get_tgt_list(ce, tgt_list);
- } else {
+ rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+ if (rc)
+ goto out_free_path;
+
+ down_read(&htable_rw_lock);
+
+ ce = lookup_cache_entry(npath, NULL);
+ if (IS_ERR(ce)) {
+ up_read(&htable_rw_lock);
rc = PTR_ERR(ce);
+ goto out_free_path;
}
- mutex_unlock(&dfs_cache_list_lock);
+
+ if (ref)
+ rc = setup_referral(path, ce, ref, get_tgt_name(ce));
+ else
+ rc = 0;
+ if (!rc && tgt_list)
+ rc = get_targets(ce, tgt_list);
+
+ up_read(&htable_rw_lock);
+
+out_free_path:
free_normalized_path(path, npath);
return rc;
}
@@ -876,31 +874,33 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
{
int rc;
char *npath;
- struct dfs_cache_entry *ce;
-
- if (unlikely(!is_path_valid(path)))
- return -EINVAL;
+ struct cache_entry *ce;
rc = get_normalized_path(path, &npath);
if (rc)
return rc;
- mutex_lock(&dfs_cache_list_lock);
- ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true);
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+
+ down_read(&htable_rw_lock);
+
+ ce = lookup_cache_entry(npath, NULL);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
- goto out;
+ goto out_unlock;
}
if (ref)
- rc = setup_ref(path, ce, ref, get_tgt_name(ce));
+ rc = setup_referral(path, ce, ref, get_tgt_name(ce));
else
rc = 0;
if (!rc && tgt_list)
- rc = get_tgt_list(ce, tgt_list);
-out:
- mutex_unlock(&dfs_cache_list_lock);
+ rc = get_targets(ce, tgt_list);
+
+out_unlock:
+ up_read(&htable_rw_lock);
free_normalized_path(path, npath);
+
return rc;
}
@@ -929,44 +929,46 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
{
int rc;
char *npath;
- struct dfs_cache_entry *ce;
- struct dfs_cache_tgt *t;
-
- if (unlikely(!is_path_valid(path)))
- return -EINVAL;
+ struct cache_entry *ce;
+ struct cache_dfs_tgt *t;
rc = get_normalized_path(path, &npath);
if (rc)
return rc;
- cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+ cifs_dbg(FYI, "%s: update target hint - path: %s\n", __func__, npath);
- mutex_lock(&dfs_cache_list_lock);
- ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+ rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+ if (rc)
+ goto out_free_path;
+
+ down_write(&htable_rw_lock);
+
+ ce = lookup_cache_entry(npath, NULL);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
- goto out;
+ goto out_unlock;
}
- rc = 0;
-
- t = ce->ce_tgthint;
+ t = ce->tgthint;
- if (likely(!strcasecmp(it->it_name, t->t_name)))
- goto out;
+ if (likely(!strcasecmp(it->it_name, t->name)))
+ goto out_unlock;
- list_for_each_entry(t, &ce->ce_tlist, t_list) {
- if (!strcasecmp(t->t_name, it->it_name)) {
- ce->ce_tgthint = t;
+ list_for_each_entry(t, &ce->tlist, list) {
+ if (!strcasecmp(t->name, it->it_name)) {
+ ce->tgthint = t;
cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
it->it_name);
break;
}
}
-out:
- mutex_unlock(&dfs_cache_list_lock);
+out_unlock:
+ up_write(&htable_rw_lock);
+out_free_path:
free_normalized_path(path, npath);
+
return rc;
}
@@ -989,10 +991,10 @@ int dfs_cache_noreq_update_tgthint(const char *path,
{
int rc;
char *npath;
- struct dfs_cache_entry *ce;
- struct dfs_cache_tgt *t;
+ struct cache_entry *ce;
+ struct cache_dfs_tgt *t;
- if (unlikely(!is_path_valid(path)) || !it)
+ if (!it)
return -EINVAL;
rc = get_normalized_path(path, &npath);
@@ -1001,33 +1003,33 @@ int dfs_cache_noreq_update_tgthint(const char *path,
cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
- mutex_lock(&dfs_cache_list_lock);
+ down_write(&htable_rw_lock);
- ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true);
+ ce = lookup_cache_entry(npath, NULL);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
- goto out;
+ goto out_unlock;
}
rc = 0;
+ t = ce->tgthint;
- t = ce->ce_tgthint;
+ if (unlikely(!strcasecmp(it->it_name, t->name)))
+ goto out_unlock;
- if (unlikely(!strcasecmp(it->it_name, t->t_name)))
- goto out;
-
- list_for_each_entry(t, &ce->ce_tlist, t_list) {
- if (!strcasecmp(t->t_name, it->it_name)) {
- ce->ce_tgthint = t;
+ list_for_each_entry(t, &ce->tlist, list) {
+ if (!strcasecmp(t->name, it->it_name)) {
+ ce->tgthint = t;
cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
it->it_name);
break;
}
}
-out:
- mutex_unlock(&dfs_cache_list_lock);
+out_unlock:
+ up_write(&htable_rw_lock);
free_normalized_path(path, npath);
+
return rc;
}
@@ -1047,13 +1049,10 @@ int dfs_cache_get_tgt_referral(const char *path,
{
int rc;
char *npath;
- struct dfs_cache_entry *ce;
- unsigned int h;
+ struct cache_entry *ce;
if (!it || !ref)
return -EINVAL;
- if (unlikely(!is_path_valid(path)))
- return -EINVAL;
rc = get_normalized_path(path, &npath);
if (rc)
@@ -1061,21 +1060,22 @@ int dfs_cache_get_tgt_referral(const char *path,
cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
- mutex_lock(&dfs_cache_list_lock);
+ down_read(&htable_rw_lock);
- ce = find_cache_entry(npath, &h);
+ ce = lookup_cache_entry(npath, NULL);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
- goto out;
+ goto out_unlock;
}
cifs_dbg(FYI, "%s: target name: %s\n", __func__, it->it_name);
- rc = setup_ref(path, ce, ref, it->it_name);
+ rc = setup_referral(path, ce, ref, it->it_name);
-out:
- mutex_unlock(&dfs_cache_list_lock);
+out_unlock:
+ up_read(&htable_rw_lock);
free_normalized_path(path, npath);
+
return rc;
}
@@ -1085,7 +1085,7 @@ static int dup_vol(struct smb_vol *vol, struct smb_vol *new)
if (vol->username) {
new->username = kstrndup(vol->username, strlen(vol->username),
- GFP_KERNEL);
+ GFP_KERNEL);
if (!new->username)
return -ENOMEM;
}
@@ -1103,7 +1103,7 @@ static int dup_vol(struct smb_vol *vol, struct smb_vol *new)
}
if (vol->domainname) {
new->domainname = kstrndup(vol->domainname,
- strlen(vol->domainname), GFP_KERNEL);
+ strlen(vol->domainname), GFP_KERNEL);
if (!new->domainname)
goto err_free_unc;
}
@@ -1150,7 +1150,7 @@ err_free_username:
int dfs_cache_add_vol(char *mntdata, struct smb_vol *vol, const char *fullpath)
{
int rc;
- struct dfs_cache_vol_info *vi;
+ struct vol_info *vi;
if (!vol || !fullpath || !mntdata)
return -EINVAL;
@@ -1161,38 +1161,41 @@ int dfs_cache_add_vol(char *mntdata, struct smb_vol *vol, const char *fullpath)
if (!vi)
return -ENOMEM;
- vi->vi_fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
- if (!vi->vi_fullpath) {
+ vi->fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
+ if (!vi->fullpath) {
rc = -ENOMEM;
goto err_free_vi;
}
- rc = dup_vol(vol, &vi->vi_vol);
+ rc = dup_vol(vol, &vi->smb_vol);
if (rc)
goto err_free_fullpath;
- vi->vi_mntdata = mntdata;
+ vi->mntdata = mntdata;
+ spin_lock_init(&vi->smb_vol_lock);
+ kref_init(&vi->refcnt);
+
+ spin_lock(&vol_list_lock);
+ list_add_tail(&vi->list, &vol_list);
+ spin_unlock(&vol_list_lock);
- mutex_lock(&dfs_cache.dc_lock);
- list_add_tail(&vi->vi_list, &dfs_cache.dc_vol_list);
- mutex_unlock(&dfs_cache.dc_lock);
return 0;
err_free_fullpath:
- kfree(vi->vi_fullpath);
+ kfree(vi->fullpath);
err_free_vi:
kfree(vi);
return rc;
}
-static inline struct dfs_cache_vol_info *find_vol(const char *fullpath)
+/* Must be called with vol_list_lock held */
+static struct vol_info *find_vol(const char *fullpath)
{
- struct dfs_cache_vol_info *vi;
+ struct vol_info *vi;
- list_for_each_entry(vi, &dfs_cache.dc_vol_list, vi_list) {
- cifs_dbg(FYI, "%s: vi->vi_fullpath: %s\n", __func__,
- vi->vi_fullpath);
- if (!strcasecmp(vi->vi_fullpath, fullpath))
+ list_for_each_entry(vi, &vol_list, list) {
+ cifs_dbg(FYI, "%s: vi->fullpath: %s\n", __func__, vi->fullpath);
+ if (!strcasecmp(vi->fullpath, fullpath))
return vi;
}
return ERR_PTR(-ENOENT);
@@ -1208,30 +1211,31 @@ static inline struct dfs_cache_vol_info *find_vol(const char *fullpath)
*/
int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server)
{
- int rc;
- struct dfs_cache_vol_info *vi;
+ struct vol_info *vi;
if (!fullpath || !server)
return -EINVAL;
cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
- mutex_lock(&dfs_cache.dc_lock);
-
+ spin_lock(&vol_list_lock);
vi = find_vol(fullpath);
if (IS_ERR(vi)) {
- rc = PTR_ERR(vi);
- goto out;
+ spin_unlock(&vol_list_lock);
+ return PTR_ERR(vi);
}
+ kref_get(&vi->refcnt);
+ spin_unlock(&vol_list_lock);
cifs_dbg(FYI, "%s: updating volume info\n", __func__);
- memcpy(&vi->vi_vol.dstaddr, &server->dstaddr,
- sizeof(vi->vi_vol.dstaddr));
- rc = 0;
+ spin_lock(&vi->smb_vol_lock);
+ memcpy(&vi->smb_vol.dstaddr, &server->dstaddr,
+ sizeof(vi->smb_vol.dstaddr));
+ spin_unlock(&vi->smb_vol_lock);
-out:
- mutex_unlock(&dfs_cache.dc_lock);
- return rc;
+ kref_put(&vi->refcnt, vol_release);
+
+ return 0;
}
/**
@@ -1241,18 +1245,18 @@ out:
*/
void dfs_cache_del_vol(const char *fullpath)
{
- struct dfs_cache_vol_info *vi;
+ struct vol_info *vi;
if (!fullpath || !*fullpath)
return;
cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
- mutex_lock(&dfs_cache.dc_lock);
+ spin_lock(&vol_list_lock);
vi = find_vol(fullpath);
- if (!IS_ERR(vi))
- free_vol(vi);
- mutex_unlock(&dfs_cache.dc_lock);
+ spin_unlock(&vol_list_lock);
+
+ kref_put(&vi->refcnt, vol_release);
}
/* Get all tcons that are within a DFS namespace and can be refreshed */
@@ -1280,7 +1284,7 @@ static void get_tcons(struct TCP_Server_Info *server, struct list_head *head)
spin_unlock(&cifs_tcp_ses_lock);
}
-static inline bool is_dfs_link(const char *path)
+static bool is_dfs_link(const char *path)
{
char *s;
@@ -1290,7 +1294,7 @@ static inline bool is_dfs_link(const char *path)
return !!strchr(s + 1, '\\');
}
-static inline char *get_dfs_root(const char *path)
+static char *get_dfs_root(const char *path)
{
char *s, *npath;
@@ -1309,31 +1313,67 @@ static inline char *get_dfs_root(const char *path)
return npath;
}
+static inline void put_tcp_server(struct TCP_Server_Info *server)
+{
+ cifs_put_tcp_session(server, 0);
+}
+
+static struct TCP_Server_Info *get_tcp_server(struct smb_vol *vol)
+{
+ struct TCP_Server_Info *server;
+
+ server = cifs_find_tcp_session(vol);
+ if (IS_ERR_OR_NULL(server))
+ return NULL;
+
+ spin_lock(&GlobalMid_Lock);
+ if (server->tcpStatus != CifsGood) {
+ spin_unlock(&GlobalMid_Lock);
+ put_tcp_server(server);
+ return NULL;
+ }
+ spin_unlock(&GlobalMid_Lock);
+
+ return server;
+}
+
/* Find root SMB session out of a DFS link path */
-static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
- struct cifs_tcon *tcon, const char *path)
+static struct cifs_ses *find_root_ses(struct vol_info *vi,
+ struct cifs_tcon *tcon,
+ const char *path)
{
char *rpath;
int rc;
+ struct cache_entry *ce;
struct dfs_info3_param ref = {0};
char *mdata = NULL, *devname = NULL;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
- struct smb_vol vol;
+ struct smb_vol vol = {NULL};
rpath = get_dfs_root(path);
if (IS_ERR(rpath))
return ERR_CAST(rpath);
- memset(&vol, 0, sizeof(vol));
+ down_read(&htable_rw_lock);
+
+ ce = lookup_cache_entry(rpath, NULL);
+ if (IS_ERR(ce)) {
+ up_read(&htable_rw_lock);
+ ses = ERR_CAST(ce);
+ goto out;
+ }
- rc = dfs_cache_noreq_find(rpath, &ref, NULL);
+ rc = setup_referral(path, ce, &ref, get_tgt_name(ce));
if (rc) {
+ up_read(&htable_rw_lock);
ses = ERR_PTR(rc);
goto out;
}
- mdata = cifs_compose_mount_options(vi->vi_mntdata, rpath, &ref,
+ up_read(&htable_rw_lock);
+
+ mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref,
&devname);
free_dfs_info_param(&ref);
@@ -1351,13 +1391,8 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
goto out;
}
- server = cifs_find_tcp_session(&vol);
- if (IS_ERR_OR_NULL(server)) {
- ses = ERR_PTR(-EHOSTDOWN);
- goto out;
- }
- if (server->tcpStatus != CifsGood) {
- cifs_put_tcp_session(server, 0);
+ server = get_tcp_server(&vol);
+ if (!server) {
ses = ERR_PTR(-EHOSTDOWN);
goto out;
}
@@ -1373,17 +1408,15 @@ out:
}
/* Refresh DFS cache entry from a given tcon */
-static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi,
- struct cifs_tcon *tcon)
+static int refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon)
{
int rc = 0;
unsigned int xid;
char *path, *npath;
- unsigned int h;
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
+ struct cifs_ses *root_ses = NULL, *ses;
struct dfs_info3_param *refs = NULL;
int numrefs = 0;
- struct cifs_ses *root_ses = NULL, *ses;
xid = get_xid();
@@ -1391,19 +1424,23 @@ static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi,
rc = get_normalized_path(path, &npath);
if (rc)
- goto out;
+ goto out_free_xid;
- mutex_lock(&dfs_cache_list_lock);
- ce = find_cache_entry(npath, &h);
- mutex_unlock(&dfs_cache_list_lock);
+ down_read(&htable_rw_lock);
+ ce = lookup_cache_entry(npath, NULL);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
- goto out;
+ up_read(&htable_rw_lock);
+ goto out_free_path;
}
- if (!cache_entry_expired(ce))
- goto out;
+ if (!cache_entry_expired(ce)) {
+ up_read(&htable_rw_lock);
+ goto out_free_path;
+ }
+
+ up_read(&htable_rw_lock);
/* If it's a DFS Link, then use root SMB session for refreshing it */
if (is_dfs_link(npath)) {
@@ -1411,35 +1448,29 @@ static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi,
if (IS_ERR(ses)) {
rc = PTR_ERR(ses);
root_ses = NULL;
- goto out;
+ goto out_free_path;
}
} else {
ses = tcon->ses;
}
- if (unlikely(!ses->server->ops->get_dfs_refer)) {
- rc = -EOPNOTSUPP;
- } else {
- rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs,
- &numrefs, dc->dc_nlsc,
- tcon->remap);
- if (!rc) {
- mutex_lock(&dfs_cache_list_lock);
- ce = __update_cache_entry(npath, refs, numrefs);
- mutex_unlock(&dfs_cache_list_lock);
- dump_refs(refs, numrefs);
- free_dfs_info_array(refs, numrefs);
- if (IS_ERR(ce))
- rc = PTR_ERR(ce);
- }
+ rc = get_dfs_referral(xid, ses, cache_nlsc, tcon->remap, npath, &refs,
+ &numrefs);
+ if (!rc) {
+ dump_refs(refs, numrefs);
+ rc = update_cache_entry(npath, refs, numrefs);
+ free_dfs_info_array(refs, numrefs);
}
-out:
if (root_ses)
cifs_put_smb_ses(root_ses);
- free_xid(xid);
+out_free_path:
free_normalized_path(path, npath);
+
+out_free_xid:
+ free_xid(xid);
+ return rc;
}
/*
@@ -1448,30 +1479,61 @@ out:
*/
static void refresh_cache_worker(struct work_struct *work)
{
- struct dfs_cache *dc = container_of(work, struct dfs_cache,
- dc_refresh.work);
- struct dfs_cache_vol_info *vi;
+ struct vol_info *vi, *nvi;
struct TCP_Server_Info *server;
- LIST_HEAD(list);
+ LIST_HEAD(vols);
+ LIST_HEAD(tcons);
struct cifs_tcon *tcon, *ntcon;
+ int rc;
- mutex_lock(&dc->dc_lock);
-
- list_for_each_entry(vi, &dc->dc_vol_list, vi_list) {
- server = cifs_find_tcp_session(&vi->vi_vol);
- if (IS_ERR_OR_NULL(server))
+ /*
+ * Find SMB volumes that are eligible (server->tcpStatus == CifsGood)
+ * for refreshing.
+ */
+ spin_lock(&vol_list_lock);
+ list_for_each_entry(vi, &vol_list, list) {
+ server = get_tcp_server(&vi->smb_vol);
+ if (!server)
continue;
- if (server->tcpStatus != CifsGood)
- goto next;
- get_tcons(server, &list);
- list_for_each_entry_safe(tcon, ntcon, &list, ulist) {
- do_refresh_tcon(dc, vi, tcon);
+
+ kref_get(&vi->refcnt);
+ list_add_tail(&vi->rlist, &vols);
+ put_tcp_server(server);
+ }
+ spin_unlock(&vol_list_lock);
+
+ /* Walk through all TCONs and refresh any expired cache entry */
+ list_for_each_entry_safe(vi, nvi, &vols, rlist) {
+ spin_lock(&vi->smb_vol_lock);
+ server = get_tcp_server(&vi->smb_vol);
+ spin_unlock(&vi->smb_vol_lock);
+
+ if (!server)
+ goto next_vol;
+
+ get_tcons(server, &tcons);
+ rc = 0;
+
+ list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) {
+ /*
+ * Skip tcp server if any of its tcons failed to refresh
+ * (possibily due to reconnects).
+ */
+ if (!rc)
+ rc = refresh_tcon(vi, tcon);
+
list_del_init(&tcon->ulist);
cifs_put_tcon(tcon);
}
-next:
- cifs_put_tcp_session(server, 0);
+
+ put_tcp_server(server);
+
+next_vol:
+ list_del_init(&vi->rlist);
+ kref_put(&vi->refcnt, vol_release);
}
- queue_delayed_work(cifsiod_wq, &dc->dc_refresh, dc->dc_ttl * HZ);
- mutex_unlock(&dc->dc_lock);
+
+ spin_lock(&cache_ttl_lock);
+ queue_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ);
+ spin_unlock(&cache_ttl_lock);
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 043288b5c728..a4e8f7d445ac 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2921,7 +2921,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
"direct_writev couldn't get user pages "
"(rc=%zd) iter type %d iov_offset %zd "
"count %zd\n",
- result, from->type,
+ result, iov_iter_type(from),
from->iov_offset, from->count);
dump_stack();
@@ -3132,7 +3132,7 @@ static ssize_t __cifs_writev(
* In this case, fall back to non-direct write function.
* this could be improved by getting pages directly in ITER_KVEC
*/
- if (direct && from->type & ITER_KVEC) {
+ if (direct && iov_iter_is_kvec(from)) {
cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n");
direct = false;
}
@@ -3652,7 +3652,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
"couldn't get user pages (rc=%zd)"
" iter type %d"
" iov_offset %zd count %zd\n",
- result, direct_iov.type,
+ result, iov_iter_type(&direct_iov),
direct_iov.iov_offset,
direct_iov.count);
dump_stack();
@@ -3863,7 +3863,7 @@ static ssize_t __cifs_readv(
* fall back to data copy read path
* this could be improved by getting pages directly in ITER_KVEC
*/
- if (direct && to->type & ITER_KVEC) {
+ if (direct && iov_iter_is_kvec(to)) {
cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n");
direct = false;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index ca76a9287456..9b547f7f5f5d 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2228,7 +2228,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
return -ENOTSUPP;
}
-static int cifs_truncate_page(struct address_space *mapping, loff_t from)
+int cifs_truncate_page(struct address_space *mapping, loff_t from)
{
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (PAGE_SIZE - 1);
@@ -2245,7 +2245,7 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
return rc;
}
-static void cifs_setsize(struct inode *inode, loff_t offset)
+void cifs_setsize(struct inode *inode, loff_t offset)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 0516fc482d43..0511aaf451d4 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -743,7 +743,7 @@ __smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid,
{
struct close_cancelled_open *cancelled;
- cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
+ cancelled = kzalloc(sizeof(*cancelled), GFP_ATOMIC);
if (!cancelled)
return -ENOMEM;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 6250370c1170..6787fce26f20 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -12,6 +12,7 @@
#include <linux/uuid.h>
#include <linux/sort.h>
#include <crypto/aead.h>
+#include "cifsfs.h"
#include "cifsglob.h"
#include "smb2pdu.h"
#include "smb2proto.h"
@@ -804,7 +805,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
sizeof(struct smb2_file_all_info),
&rsp_iov[1], sizeof(struct smb2_file_all_info),
(char *)&tcon->crfid.file_all_info))
- tcon->crfid.file_all_info_is_valid = 1;
+ tcon->crfid.file_all_info_is_valid = true;
oshr_exit:
mutex_unlock(&tcon->crfid.fid_mutex);
@@ -1523,7 +1524,9 @@ smb2_ioctl_query_info(const unsigned int xid,
COMPOUND_FID, COMPOUND_FID,
qi.info_type, true, buffer,
qi.output_buffer_length,
- CIFSMaxBufSize);
+ CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE);
}
} else if (qi.flags == PASSTHRU_SET_INFO) {
/* Can eventually relax perm check since server enforces too */
@@ -2053,14 +2056,33 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_search_info *srch_inf)
{
__le16 *utf16_path;
- int rc;
- __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct smb_rqst rqst[2];
+ struct kvec rsp_iov[2];
+ int resp_buftype[2];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec qd_iov[SMB2_QUERY_DIRECTORY_IOV_SIZE];
+ int rc, flags = 0;
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
+ struct smb2_query_directory_rsp *qd_rsp = NULL;
+ struct smb2_create_rsp *op_rsp = NULL;
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(rqst, 0, sizeof(rqst));
+ resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(rsp_iov, 0, sizeof(rsp_iov));
+
+ /* Open */
+ memset(&open_iov, 0, sizeof(open_iov));
+ rqst[0].rq_iov = open_iov;
+ rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
+
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
oparms.disposition = FILE_OPEN;
@@ -2071,22 +2093,75 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL);
- kfree(utf16_path);
- if (rc) {
- cifs_dbg(FYI, "open dir failed rc=%d\n", rc);
- return rc;
- }
+ rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path);
+ if (rc)
+ goto qdf_free;
+ smb2_set_next_command(tcon, &rqst[0]);
+ /* Query directory */
srch_inf->entries_in_buffer = 0;
srch_inf->index_of_last_entry = 2;
- rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
- fid->volatile_fid, 0, srch_inf);
- if (rc) {
- cifs_dbg(FYI, "query directory failed rc=%d\n", rc);
+ memset(&qd_iov, 0, sizeof(qd_iov));
+ rqst[1].rq_iov = qd_iov;
+ rqst[1].rq_nvec = SMB2_QUERY_DIRECTORY_IOV_SIZE;
+
+ rc = SMB2_query_directory_init(xid, tcon, &rqst[1],
+ COMPOUND_FID, COMPOUND_FID,
+ 0, srch_inf->info_level);
+ if (rc)
+ goto qdf_free;
+
+ smb2_set_related(&rqst[1]);
+
+ rc = compound_send_recv(xid, tcon->ses, flags, 2, rqst,
+ resp_buftype, rsp_iov);
+
+ /* If the open failed there is nothing to do */
+ op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
+ if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) {
+ cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc);
+ goto qdf_free;
+ }
+ fid->persistent_fid = op_rsp->PersistentFileId;
+ fid->volatile_fid = op_rsp->VolatileFileId;
+
+ /* Anything else than ENODATA means a genuine error */
+ if (rc && rc != -ENODATA) {
SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+ cifs_dbg(FYI, "query_dir_first: query directory failed rc=%d\n", rc);
+ trace_smb3_query_dir_err(xid, fid->persistent_fid,
+ tcon->tid, tcon->ses->Suid, 0, 0, rc);
+ goto qdf_free;
+ }
+
+ qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base;
+ if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ trace_smb3_query_dir_done(xid, fid->persistent_fid,
+ tcon->tid, tcon->ses->Suid, 0, 0);
+ srch_inf->endOfSearch = true;
+ rc = 0;
+ goto qdf_free;
+ }
+
+ rc = smb2_parse_query_directory(tcon, &rsp_iov[1], resp_buftype[1],
+ srch_inf);
+ if (rc) {
+ trace_smb3_query_dir_err(xid, fid->persistent_fid, tcon->tid,
+ tcon->ses->Suid, 0, 0, rc);
+ goto qdf_free;
}
+ resp_buftype[1] = CIFS_NO_BUFFER;
+
+ trace_smb3_query_dir_done(xid, fid->persistent_fid, tcon->tid,
+ tcon->ses->Suid, 0, srch_inf->entries_in_buffer);
+
+ qdf_free:
+ kfree(utf16_path);
+ SMB2_open_free(&rqst[0]);
+ SMB2_query_directory_free(&rqst[1]);
+ free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
return rc;
}
@@ -2697,7 +2772,10 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl_init(tcon, &rqst[1], fid.persistent_fid,
fid.volatile_fid, FSCTL_GET_REPARSE_POINT,
- true /* is_fctl */, NULL, 0, CIFSMaxBufSize);
+ true /* is_fctl */, NULL, 0,
+ CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE);
if (rc)
goto querty_exit;
@@ -3095,28 +3173,32 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
}
/*
+ * Extending the file
+ */
+ if ((keep_size == false) && i_size_read(inode) < off + len) {
+ if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0)
+ smb2_set_sparse(xid, tcon, cfile, inode, false);
+
+ eof = cpu_to_le64(off + len);
+ rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, cfile->pid, &eof);
+ if (rc == 0) {
+ cifsi->server_eof = off + len;
+ cifs_setsize(inode, off + len);
+ cifs_truncate_page(inode->i_mapping, inode->i_size);
+ truncate_setsize(inode, off + len);
+ }
+ goto out;
+ }
+
+ /*
* Files are non-sparse by default so falloc may be a no-op
- * Must check if file sparse. If not sparse, and not extending
- * then no need to do anything since file already allocated
+ * Must check if file sparse. If not sparse, and since we are not
+ * extending then no need to do anything since file already allocated
*/
if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) {
- if (keep_size == true)
- rc = 0;
- /* check if extending file */
- else if (i_size_read(inode) >= off + len)
- /* not extending file and already not sparse */
- rc = 0;
- /* BB: in future add else clause to extend file */
- else
- rc = -EOPNOTSUPP;
- if (rc)
- trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
- tcon->tid, tcon->ses->Suid, off, len, rc);
- else
- trace_smb3_falloc_done(xid, cfile->fid.persistent_fid,
- tcon->tid, tcon->ses->Suid, off, len);
- free_xid(xid);
- return rc;
+ rc = 0;
+ goto out;
}
if ((keep_size == true) || (i_size_read(inode) >= off + len)) {
@@ -3130,25 +3212,14 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
*/
if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) {
rc = -EOPNOTSUPP;
- trace_smb3_falloc_err(xid, cfile->fid.persistent_fid,
- tcon->tid, tcon->ses->Suid, off, len, rc);
- free_xid(xid);
- return rc;
- }
-
- smb2_set_sparse(xid, tcon, cfile, inode, false);
- rc = 0;
- } else {
- smb2_set_sparse(xid, tcon, cfile, inode, false);
- rc = 0;
- if (i_size_read(inode) < off + len) {
- eof = cpu_to_le64(off + len);
- rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid,
- &eof);
+ goto out;
}
}
+ smb2_set_sparse(xid, tcon, cfile, inode, false);
+ rc = 0;
+
+out:
if (rc)
trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid,
tcon->ses->Suid, off, len, rc);
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 9434f6dd8df3..7edba3e6d5e6 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2199,13 +2199,14 @@ create_sd_buf(umode_t mode, unsigned int *len)
struct cifs_ace *pace;
unsigned int sdlen, acelen;
- *len = roundup(sizeof(struct crt_sd_ctxt) + sizeof(struct cifs_ace), 8);
+ *len = roundup(sizeof(struct crt_sd_ctxt) + sizeof(struct cifs_ace) * 2,
+ 8);
buf = kzalloc(*len, GFP_KERNEL);
if (buf == NULL)
return buf;
sdlen = sizeof(struct smb3_sd) + sizeof(struct smb3_acl) +
- sizeof(struct cifs_ace);
+ 2 * sizeof(struct cifs_ace);
buf->ccontext.DataOffset = cpu_to_le16(offsetof
(struct crt_sd_ctxt, sd));
@@ -2232,8 +2233,12 @@ create_sd_buf(umode_t mode, unsigned int *len)
/* create one ACE to hold the mode embedded in reserved special SID */
pace = (struct cifs_ace *)(sizeof(struct crt_sd_ctxt) + (char *)buf);
acelen = setup_special_mode_ACE(pace, (__u64)mode);
+ /* and one more ACE to allow access for authenticated users */
+ pace = (struct cifs_ace *)(acelen + (sizeof(struct crt_sd_ctxt) +
+ (char *)buf));
+ acelen += setup_authusers_ACE(pace);
buf->acl.AclSize = cpu_to_le16(sizeof(struct cifs_acl) + acelen);
- buf->acl.AceCount = cpu_to_le16(1);
+ buf->acl.AceCount = cpu_to_le16(2);
return buf;
}
@@ -4296,56 +4301,38 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
/*
* Readdir/FindFirst
*/
-int
-SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, int index,
- struct cifs_search_info *srch_inf)
+int SMB2_query_directory_init(const unsigned int xid,
+ struct cifs_tcon *tcon, struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid,
+ int index, int info_level)
{
- struct smb_rqst rqst;
+ struct TCP_Server_Info *server = tcon->ses->server;
struct smb2_query_directory_req *req;
- struct smb2_query_directory_rsp *rsp = NULL;
- struct kvec iov[2];
- struct kvec rsp_iov;
- int rc = 0;
- int len;
- int resp_buftype = CIFS_NO_BUFFER;
unsigned char *bufptr;
- struct TCP_Server_Info *server;
- struct cifs_ses *ses = tcon->ses;
__le16 asteriks = cpu_to_le16('*');
- char *end_of_smb;
- unsigned int output_size = CIFSMaxBufSize;
- size_t info_buf_size;
- int flags = 0;
+ unsigned int output_size = CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE;
unsigned int total_len;
-
- if (ses && (ses->server))
- server = ses->server;
- else
- return -EIO;
+ struct kvec *iov = rqst->rq_iov;
+ int len, rc;
rc = smb2_plain_req_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req,
&total_len);
if (rc)
return rc;
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
- switch (srch_inf->info_level) {
+ switch (info_level) {
case SMB_FIND_FILE_DIRECTORY_INFO:
req->FileInformationClass = FILE_DIRECTORY_INFORMATION;
- info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1;
break;
case SMB_FIND_FILE_ID_FULL_DIR_INFO:
req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION;
- info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
break;
default:
cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
- srch_inf->info_level);
- rc = -EINVAL;
- goto qdir_exit;
+ info_level);
+ return -EINVAL;
}
req->FileIndex = cpu_to_le32(index);
@@ -4374,40 +4361,50 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
iov[1].iov_base = (char *)(req->Buffer);
iov[1].iov_len = len;
- memset(&rqst, 0, sizeof(struct smb_rqst));
- rqst.rq_iov = iov;
- rqst.rq_nvec = 2;
-
trace_smb3_query_dir_enter(xid, persistent_fid, tcon->tid,
tcon->ses->Suid, index, output_size);
- rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
- cifs_small_buf_release(req);
- rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
+ return 0;
+}
- if (rc) {
- if (rc == -ENODATA &&
- rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
- trace_smb3_query_dir_done(xid, persistent_fid,
- tcon->tid, tcon->ses->Suid, index, 0);
- srch_inf->endOfSearch = true;
- rc = 0;
- } else {
- trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
- tcon->ses->Suid, index, 0, rc);
- cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
- }
- goto qdir_exit;
+void SMB2_query_directory_free(struct smb_rqst *rqst)
+{
+ if (rqst && rqst->rq_iov) {
+ cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+ }
+}
+
+int
+smb2_parse_query_directory(struct cifs_tcon *tcon,
+ struct kvec *rsp_iov,
+ int resp_buftype,
+ struct cifs_search_info *srch_inf)
+{
+ struct smb2_query_directory_rsp *rsp;
+ size_t info_buf_size;
+ char *end_of_smb;
+ int rc;
+
+ rsp = (struct smb2_query_directory_rsp *)rsp_iov->iov_base;
+
+ switch (srch_inf->info_level) {
+ case SMB_FIND_FILE_DIRECTORY_INFO:
+ info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1;
+ break;
+ case SMB_FIND_FILE_ID_FULL_DIR_INFO:
+ info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1;
+ break;
+ default:
+ cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
+ srch_inf->info_level);
+ return -EINVAL;
}
rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
- le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
+ le32_to_cpu(rsp->OutputBufferLength), rsp_iov,
info_buf_size);
- if (rc) {
- trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
- tcon->ses->Suid, index, 0, rc);
- goto qdir_exit;
- }
+ if (rc)
+ return rc;
srch_inf->unicode = true;
@@ -4420,7 +4417,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
srch_inf->ntwrk_buf_start = (char *)rsp;
srch_inf->srch_entries_start = srch_inf->last_entry =
(char *)rsp + le16_to_cpu(rsp->OutputBufferOffset);
- end_of_smb = rsp_iov.iov_len + (char *)rsp;
+ end_of_smb = rsp_iov->iov_len + (char *)rsp;
srch_inf->entries_in_buffer =
num_entries(srch_inf->srch_entries_start, end_of_smb,
&srch_inf->last_entry, info_buf_size);
@@ -4435,11 +4432,72 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
else
cifs_tcon_dbg(VFS, "illegal search buffer type\n");
+ return 0;
+}
+
+int
+SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, int index,
+ struct cifs_search_info *srch_inf)
+{
+ struct smb_rqst rqst;
+ struct kvec iov[SMB2_QUERY_DIRECTORY_IOV_SIZE];
+ struct smb2_query_directory_rsp *rsp = NULL;
+ int resp_buftype = CIFS_NO_BUFFER;
+ struct kvec rsp_iov;
+ int rc = 0;
+ struct cifs_ses *ses = tcon->ses;
+ int flags = 0;
+
+ if (!ses || !(ses->server))
+ return -EIO;
+
+ if (smb3_encryption_required(tcon))
+ flags |= CIFS_TRANSFORM_REQ;
+
+ memset(&rqst, 0, sizeof(struct smb_rqst));
+ memset(&iov, 0, sizeof(iov));
+ rqst.rq_iov = iov;
+ rqst.rq_nvec = SMB2_QUERY_DIRECTORY_IOV_SIZE;
+
+ rc = SMB2_query_directory_init(xid, tcon, &rqst, persistent_fid,
+ volatile_fid, index,
+ srch_inf->info_level);
+ if (rc)
+ goto qdir_exit;
+
+ rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
+ rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
+
+ if (rc) {
+ if (rc == -ENODATA &&
+ rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+ trace_smb3_query_dir_done(xid, persistent_fid,
+ tcon->tid, tcon->ses->Suid, index, 0);
+ srch_inf->endOfSearch = true;
+ rc = 0;
+ } else {
+ trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, 0, rc);
+ cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
+ }
+ goto qdir_exit;
+ }
+
+ rc = smb2_parse_query_directory(tcon, &rsp_iov, resp_buftype,
+ srch_inf);
+ if (rc) {
+ trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid,
+ tcon->ses->Suid, index, 0, rc);
+ goto qdir_exit;
+ }
+ resp_buftype = CIFS_NO_BUFFER;
+
trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid,
tcon->ses->Suid, index, srch_inf->entries_in_buffer);
- return rc;
qdir_exit:
+ SMB2_query_directory_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
return rc;
}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 7b1c379fdf7a..4c43dbd1e089 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1282,6 +1282,8 @@ struct smb2_echo_rsp {
#define SMB2_INDEX_SPECIFIED 0x04
#define SMB2_REOPEN 0x10
+#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2
+
struct smb2_query_directory_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 33 */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 27d29f2eb6c8..6c678e00046f 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -197,6 +197,11 @@ extern int SMB2_echo(struct TCP_Server_Info *server);
extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, int index,
struct cifs_search_info *srch_inf);
+extern int SMB2_query_directory_init(unsigned int xid, struct cifs_tcon *tcon,
+ struct smb_rqst *rqst,
+ u64 persistent_fid, u64 volatile_fid,
+ int index, int info_level);
+extern void SMB2_query_directory_free(struct smb_rqst *rqst);
extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 pid,
__le64 *eof);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 387c88704c52..fe6acfce3390 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -685,6 +685,8 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
* The default is for the mid to be synchronous, so the
* default callback just wakes up the current task.
*/
+ get_task_struct(current);
+ temp->creator = current;
temp->callback = cifs_wake_up_task;
temp->callback_data = current;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 3d2e11f85cba..cb3ee916f527 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -76,6 +76,8 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
* The default is for the mid to be synchronous, so the
* default callback just wakes up the current task.
*/
+ get_task_struct(current);
+ temp->creator = current;
temp->callback = cifs_wake_up_task;
temp->callback_data = current;
@@ -158,6 +160,7 @@ static void _cifs_mid_q_entry_release(struct kref *refcount)
}
}
#endif
+ put_task_struct(midEntry->creator);
mempool_free(midEntry, cifs_mid_poolp);
}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index db4ba8f6077e..b8299173ea7e 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -32,7 +32,8 @@
#include "cifs_unicode.h"
#define MAX_EA_VALUE_SIZE CIFSMaxBufSize
-#define CIFS_XATTR_CIFS_ACL "system.cifs_acl"
+#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" /* DACL only */
+#define CIFS_XATTR_CIFS_NTSD "system.cifs_ntsd" /* owner plus DACL */
#define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */
#define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */
/*
@@ -40,12 +41,62 @@
* confusing users and using the 20+ year old term 'cifs' when it is no longer
* secure, replaced by SMB2 (then even more highly secure SMB3) many years ago
*/
-#define SMB3_XATTR_CIFS_ACL "system.smb3_acl"
+#define SMB3_XATTR_CIFS_ACL "system.smb3_acl" /* DACL only */
+#define SMB3_XATTR_CIFS_NTSD "system.smb3_ntsd" /* owner plus DACL */
#define SMB3_XATTR_ATTRIB "smb3.dosattrib" /* full name: user.smb3.dosattrib */
#define SMB3_XATTR_CREATETIME "smb3.creationtime" /* user.smb3.creationtime */
/* BB need to add server (Samba e.g) support for security and trusted prefix */
-enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT };
+enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT,
+ XATTR_CIFS_NTSD };
+
+static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon,
+ struct inode *inode, char *full_path,
+ const void *value, size_t size)
+{
+ ssize_t rc = -EOPNOTSUPP;
+ __u32 *pattrib = (__u32 *)value;
+ __u32 attrib;
+ FILE_BASIC_INFO info_buf;
+
+ if ((value == NULL) || (size != sizeof(__u32)))
+ return -ERANGE;
+
+ memset(&info_buf, 0, sizeof(info_buf));
+ attrib = *pattrib;
+ info_buf.Attributes = cpu_to_le32(attrib);
+ if (pTcon->ses->server->ops->set_file_info)
+ rc = pTcon->ses->server->ops->set_file_info(inode, full_path,
+ &info_buf, xid);
+ if (rc == 0)
+ CIFS_I(inode)->cifsAttrs = attrib;
+
+ return rc;
+}
+
+static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon,
+ struct inode *inode, char *full_path,
+ const void *value, size_t size)
+{
+ ssize_t rc = -EOPNOTSUPP;
+ __u64 *pcreation_time = (__u64 *)value;
+ __u64 creation_time;
+ FILE_BASIC_INFO info_buf;
+
+ if ((value == NULL) || (size != sizeof(__u64)))
+ return -ERANGE;
+
+ memset(&info_buf, 0, sizeof(info_buf));
+ creation_time = *pcreation_time;
+ info_buf.CreationTime = cpu_to_le64(creation_time);
+ if (pTcon->ses->server->ops->set_file_info)
+ rc = pTcon->ses->server->ops->set_file_info(inode, full_path,
+ &info_buf, xid);
+ if (rc == 0)
+ CIFS_I(inode)->createtime = creation_time;
+
+ return rc;
+}
static int cifs_xattr_set(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
@@ -86,6 +137,23 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
switch (handler->flags) {
case XATTR_USER:
+ cifs_dbg(FYI, "%s:setting user xattr %s\n", __func__, name);
+ if ((strcmp(name, CIFS_XATTR_ATTRIB) == 0) ||
+ (strcmp(name, SMB3_XATTR_ATTRIB) == 0)) {
+ rc = cifs_attrib_set(xid, pTcon, inode, full_path,
+ value, size);
+ if (rc == 0) /* force revalidate of the inode */
+ CIFS_I(inode)->time = 0;
+ break;
+ } else if ((strcmp(name, CIFS_XATTR_CREATETIME) == 0) ||
+ (strcmp(name, SMB3_XATTR_CREATETIME) == 0)) {
+ rc = cifs_creation_time_set(xid, pTcon, inode,
+ full_path, value, size);
+ if (rc == 0) /* force revalidate of the inode */
+ CIFS_I(inode)->time = 0;
+ break;
+ }
+
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto out;
@@ -95,7 +163,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
cifs_sb->local_nls, cifs_sb);
break;
- case XATTR_CIFS_ACL: {
+ case XATTR_CIFS_ACL:
+ case XATTR_CIFS_NTSD: {
struct cifs_ntsd *pacl;
if (!value)
@@ -106,12 +175,25 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
} else {
memcpy(pacl, value, size);
if (value &&
- pTcon->ses->server->ops->set_acl)
- rc = pTcon->ses->server->ops->set_acl(pacl,
- size, inode,
- full_path, CIFS_ACL_DACL);
- else
+ pTcon->ses->server->ops->set_acl) {
+ rc = 0;
+ if (handler->flags == XATTR_CIFS_NTSD) {
+ /* set owner and DACL */
+ rc = pTcon->ses->server->ops->set_acl(
+ pacl, size, inode,
+ full_path,
+ CIFS_ACL_OWNER);
+ }
+ if (rc == 0) {
+ /* set DACL */
+ rc = pTcon->ses->server->ops->set_acl(
+ pacl, size, inode,
+ full_path,
+ CIFS_ACL_DACL);
+ }
+ } else {
rc = -EOPNOTSUPP;
+ }
if (rc == 0) /* force revalidate of the inode */
CIFS_I(inode)->time = 0;
kfree(pacl);
@@ -179,7 +261,7 @@ static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode,
void *value, size_t size)
{
ssize_t rc;
- __u64 * pcreatetime;
+ __u64 *pcreatetime;
rc = cifs_revalidate_dentry_attr(dentry);
if (rc)
@@ -244,7 +326,9 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
full_path, name, value, size, cifs_sb);
break;
- case XATTR_CIFS_ACL: {
+ case XATTR_CIFS_ACL:
+ case XATTR_CIFS_NTSD: {
+ /* the whole ntsd is fetched regardless */
u32 acllen;
struct cifs_ntsd *pacl;
@@ -382,6 +466,26 @@ static const struct xattr_handler smb3_acl_xattr_handler = {
.set = cifs_xattr_set,
};
+static const struct xattr_handler cifs_cifs_ntsd_xattr_handler = {
+ .name = CIFS_XATTR_CIFS_NTSD,
+ .flags = XATTR_CIFS_NTSD,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
+/*
+ * Although this is just an alias for the above, need to move away from
+ * confusing users and using the 20 year old term 'cifs' when it is no
+ * longer secure and was replaced by SMB2/SMB3 a long time ago, and
+ * SMB3 and later are highly secure.
+ */
+static const struct xattr_handler smb3_ntsd_xattr_handler = {
+ .name = SMB3_XATTR_CIFS_NTSD,
+ .flags = XATTR_CIFS_NTSD,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
static const struct xattr_handler cifs_posix_acl_access_xattr_handler = {
.name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = XATTR_ACL_ACCESS,
@@ -401,6 +505,8 @@ const struct xattr_handler *cifs_xattr_handlers[] = {
&cifs_os2_xattr_handler,
&cifs_cifs_acl_xattr_handler,
&smb3_acl_xattr_handler, /* alias for above since avoiding "cifs" */
+ &cifs_cifs_ntsd_xattr_handler,
+ &smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */
&cifs_posix_acl_access_xattr_handler,
&cifs_posix_acl_default_xattr_handler,
NULL
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
deleted file mode 100644
index 358ea2ecf36b..000000000000
--- a/fs/compat_ioctl.c
+++ /dev/null
@@ -1,261 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
- *
- * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
- * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
- * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz)
- *
- * These routines maintain argument size conversion between 32bit and 64bit
- * ioctls.
- */
-
-#include <linux/types.h>
-#include <linux/compat.h>
-#include <linux/kernel.h>
-#include <linux/capability.h>
-#include <linux/compiler.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/ioctl.h>
-#include <linux/if.h>
-#include <linux/raid/md_u.h>
-#include <linux/falloc.h>
-#include <linux/file.h>
-#include <linux/ppp-ioctl.h>
-#include <linux/if_pppox.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>
-#include <linux/blkdev.h>
-#include <linux/serial.h>
-#include <linux/ctype.h>
-#include <linux/syscalls.h>
-#include <linux/gfp.h>
-#include <linux/cec.h>
-
-#include "internal.h"
-
-#ifdef CONFIG_BLOCK
-#include <linux/cdrom.h>
-#include <linux/fd.h>
-#include <scsi/scsi.h>
-#include <scsi/scsi_ioctl.h>
-#include <scsi/sg.h>
-#endif
-
-#include <linux/uaccess.h>
-#include <linux/watchdog.h>
-
-#include <linux/hiddev.h>
-
-
-#include <linux/sort.h>
-
-/*
- * simple reversible transform to make our table more evenly
- * distributed after sorting.
- */
-#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
-
-#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd),
-static unsigned int ioctl_pointer[] = {
-#ifdef CONFIG_BLOCK
-/* Big S */
-COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
-COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK)
-COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK)
-COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY)
-COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER)
-COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
-COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
-COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
-#endif
-#ifdef CONFIG_BLOCK
-/* SG stuff */
-COMPATIBLE_IOCTL(SG_IO)
-COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
-COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
-COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
-COMPATIBLE_IOCTL(SG_EMULATED_HOST)
-COMPATIBLE_IOCTL(SG_GET_TRANSFORM)
-COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE)
-COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE)
-COMPATIBLE_IOCTL(SG_GET_SCSI_ID)
-COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA)
-COMPATIBLE_IOCTL(SG_GET_LOW_DMA)
-COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID)
-COMPATIBLE_IOCTL(SG_GET_PACK_ID)
-COMPATIBLE_IOCTL(SG_GET_NUM_WAITING)
-COMPATIBLE_IOCTL(SG_SET_DEBUG)
-COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE)
-COMPATIBLE_IOCTL(SG_GET_COMMAND_Q)
-COMPATIBLE_IOCTL(SG_SET_COMMAND_Q)
-COMPATIBLE_IOCTL(SG_GET_VERSION_NUM)
-COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN)
-COMPATIBLE_IOCTL(SG_SCSI_RESET)
-COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
-COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
-COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
-#endif
-};
-
-/*
- * Convert common ioctl arguments based on their command number
- *
- * Please do not add any code in here. Instead, implement
- * a compat_ioctl operation in the place that handleѕ the
- * ioctl for the native case.
- */
-static long do_ioctl_trans(unsigned int cmd,
- unsigned long arg, struct file *file)
-{
- return -ENOIOCTLCMD;
-}
-
-static int compat_ioctl_check_table(unsigned int xcmd)
-{
-#ifdef CONFIG_BLOCK
- int i;
- const int max = ARRAY_SIZE(ioctl_pointer) - 1;
-
- BUILD_BUG_ON(max >= (1 << 16));
-
- /* guess initial offset into table, assuming a
- normalized distribution */
- i = ((xcmd >> 16) * max) >> 16;
-
- /* do linear search up first, until greater or equal */
- while (ioctl_pointer[i] < xcmd && i < max)
- i++;
-
- /* then do linear search down */
- while (ioctl_pointer[i] > xcmd && i > 0)
- i--;
-
- return ioctl_pointer[i] == xcmd;
-#else
- return 0;
-#endif
-}
-
-COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
- compat_ulong_t, arg32)
-{
- unsigned long arg = arg32;
- struct fd f = fdget(fd);
- int error = -EBADF;
- if (!f.file)
- goto out;
-
- /* RED-PEN how should LSM module know it's handling 32bit? */
- error = security_file_ioctl(f.file, cmd, arg);
- if (error)
- goto out_fput;
-
- switch (cmd) {
- /* these are never seen by ->ioctl(), no argument or int argument */
- case FIOCLEX:
- case FIONCLEX:
- case FIFREEZE:
- case FITHAW:
- case FICLONE:
- goto do_ioctl;
- /* these are never seen by ->ioctl(), pointer argument */
- case FIONBIO:
- case FIOASYNC:
- case FIOQSIZE:
- case FS_IOC_FIEMAP:
- case FIGETBSZ:
- case FICLONERANGE:
- case FIDEDUPERANGE:
- goto found_handler;
- /*
- * The next group is the stuff handled inside file_ioctl().
- * For regular files these never reach ->ioctl(); for
- * devices, sockets, etc. they do and one (FIONREAD) is
- * even accepted in some cases. In all those cases
- * argument has the same type, so we can handle these
- * here, shunting them towards do_vfs_ioctl().
- * ->compat_ioctl() will never see any of those.
- */
- /* pointer argument, never actually handled by ->ioctl() */
- case FIBMAP:
- goto found_handler;
- /* handled by some ->ioctl(); always a pointer to int */
- case FIONREAD:
- goto found_handler;
- /* these get messy on amd64 due to alignment differences */
-#if defined(CONFIG_X86_64)
- case FS_IOC_RESVSP_32:
- case FS_IOC_RESVSP64_32:
- error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg));
- goto out_fput;
- case FS_IOC_UNRESVSP_32:
- case FS_IOC_UNRESVSP64_32:
- error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE,
- compat_ptr(arg));
- goto out_fput;
- case FS_IOC_ZERO_RANGE_32:
- error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE,
- compat_ptr(arg));
- goto out_fput;
-#else
- case FS_IOC_RESVSP:
- case FS_IOC_RESVSP64:
- case FS_IOC_UNRESVSP:
- case FS_IOC_UNRESVSP64:
- case FS_IOC_ZERO_RANGE:
- goto found_handler;
-#endif
-
- default:
- if (f.file->f_op->compat_ioctl) {
- error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
- if (error != -ENOIOCTLCMD)
- goto out_fput;
- }
-
- if (!f.file->f_op->unlocked_ioctl)
- goto do_ioctl;
- break;
- }
-
- if (compat_ioctl_check_table(XFORM(cmd)))
- goto found_handler;
-
- error = do_ioctl_trans(cmd, arg, f.file);
- if (error == -ENOIOCTLCMD)
- error = -ENOTTY;
-
- goto out_fput;
-
- found_handler:
- arg = (unsigned long)compat_ptr(arg);
- do_ioctl:
- error = do_vfs_ioctl(f.file, fd, cmd, arg);
- out_fput:
- fdput(f);
- out:
- return error;
-}
-
-static int __init init_sys32_ioctl_cmp(const void *p, const void *q)
-{
- unsigned int a, b;
- a = *(unsigned int *)p;
- b = *(unsigned int *)q;
- if (a > b)
- return 1;
- if (a < b)
- return -1;
- return 0;
-}
-
-static int __init init_sys32_ioctl(void)
-{
- sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer),
- init_sys32_ioctl_cmp, NULL);
- return 0;
-}
-__initcall(init_sys32_ioctl);
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index ff5a1746cbae..8046d7c7a3e9 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -2,13 +2,8 @@
config FS_ENCRYPTION
bool "FS Encryption (Per-file encryption)"
select CRYPTO
- select CRYPTO_AES
- select CRYPTO_CBC
- select CRYPTO_ECB
- select CRYPTO_XTS
- select CRYPTO_CTS
- select CRYPTO_SHA512
- select CRYPTO_HMAC
+ select CRYPTO_HASH
+ select CRYPTO_SKCIPHER
select KEYS
help
Enable encryption of files and directories. This
@@ -16,3 +11,16 @@ config FS_ENCRYPTION
efficient since it avoids caching the encrypted and
decrypted pages in the page cache. Currently Ext4,
F2FS and UBIFS make use of this feature.
+
+# Filesystems supporting encryption must select this if FS_ENCRYPTION. This
+# allows the algorithms to be built as modules when all the filesystems are.
+config FS_ENCRYPTION_ALGS
+ tristate
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_CTS
+ select CRYPTO_ECB
+ select CRYPTO_HMAC
+ select CRYPTO_SHA256
+ select CRYPTO_SHA512
+ select CRYPTO_XTS
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 1f4b8a277060..4fa18fff9c4e 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -41,53 +41,101 @@ void fscrypt_decrypt_bio(struct bio *bio)
}
EXPORT_SYMBOL(fscrypt_decrypt_bio);
+/**
+ * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file
+ * @inode: the file's inode
+ * @lblk: the first file logical block to zero out
+ * @pblk: the first filesystem physical block to zero out
+ * @len: number of blocks to zero out
+ *
+ * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write
+ * ciphertext blocks which decrypt to the all-zeroes block. The blocks must be
+ * both logically and physically contiguous. It's also assumed that the
+ * filesystem only uses a single block device, ->s_bdev.
+ *
+ * Note that since each block uses a different IV, this involves writing a
+ * different ciphertext to each block; we can't simply reuse the same one.
+ *
+ * Return: 0 on success; -errno on failure.
+ */
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
- sector_t pblk, unsigned int len)
+ sector_t pblk, unsigned int len)
{
const unsigned int blockbits = inode->i_blkbits;
const unsigned int blocksize = 1 << blockbits;
- struct page *ciphertext_page;
+ const unsigned int blocks_per_page_bits = PAGE_SHIFT - blockbits;
+ const unsigned int blocks_per_page = 1 << blocks_per_page_bits;
+ struct page *pages[16]; /* write up to 16 pages at a time */
+ unsigned int nr_pages;
+ unsigned int i;
+ unsigned int offset;
struct bio *bio;
- int ret, err = 0;
+ int ret, err;
- ciphertext_page = fscrypt_alloc_bounce_page(GFP_NOWAIT);
- if (!ciphertext_page)
- return -ENOMEM;
+ if (len == 0)
+ return 0;
- while (len--) {
- err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk,
- ZERO_PAGE(0), ciphertext_page,
- blocksize, 0, GFP_NOFS);
- if (err)
- goto errout;
+ BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES);
+ nr_pages = min_t(unsigned int, ARRAY_SIZE(pages),
+ (len + blocks_per_page - 1) >> blocks_per_page_bits);
- bio = bio_alloc(GFP_NOWAIT, 1);
- if (!bio) {
- err = -ENOMEM;
- goto errout;
- }
+ /*
+ * We need at least one page for ciphertext. Allocate the first one
+ * from a mempool, with __GFP_DIRECT_RECLAIM set so that it can't fail.
+ *
+ * Any additional page allocations are allowed to fail, as they only
+ * help performance, and waiting on the mempool for them could deadlock.
+ */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS :
+ GFP_NOWAIT | __GFP_NOWARN);
+ if (!pages[i])
+ break;
+ }
+ nr_pages = i;
+ if (WARN_ON(nr_pages <= 0))
+ return -EINVAL;
+
+ /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
+ bio = bio_alloc(GFP_NOFS, nr_pages);
+
+ do {
bio_set_dev(bio, inode->i_sb->s_bdev);
bio->bi_iter.bi_sector = pblk << (blockbits - 9);
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- ret = bio_add_page(bio, ciphertext_page, blocksize, 0);
- if (WARN_ON(ret != blocksize)) {
- /* should never happen! */
- bio_put(bio);
- err = -EIO;
- goto errout;
- }
+
+ i = 0;
+ offset = 0;
+ do {
+ err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk,
+ ZERO_PAGE(0), pages[i],
+ blocksize, offset, GFP_NOFS);
+ if (err)
+ goto out;
+ lblk++;
+ pblk++;
+ len--;
+ offset += blocksize;
+ if (offset == PAGE_SIZE || len == 0) {
+ ret = bio_add_page(bio, pages[i++], offset, 0);
+ if (WARN_ON(ret != offset)) {
+ err = -EIO;
+ goto out;
+ }
+ offset = 0;
+ }
+ } while (i != nr_pages && len != 0);
+
err = submit_bio_wait(bio);
- if (err == 0 && bio->bi_status)
- err = -EIO;
- bio_put(bio);
if (err)
- goto errout;
- lblk++;
- pblk++;
- }
+ goto out;
+ bio_reset(bio);
+ } while (len != 0);
err = 0;
-errout:
- fscrypt_free_bounce_page(ciphertext_page);
+out:
+ bio_put(bio);
+ for (i = 0; i < nr_pages; i++)
+ fscrypt_free_bounce_page(pages[i]);
return err;
}
EXPORT_SYMBOL(fscrypt_zeroout_range);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 3719efa546c6..1ecaac7ee3cb 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -25,8 +25,6 @@
#include <linux/module.h>
#include <linux/scatterlist.h>
#include <linux/ratelimit.h>
-#include <linux/dcache.h>
-#include <linux/namei.h>
#include <crypto/skcipher.h>
#include "fscrypt_private.h"
@@ -140,7 +138,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
* multiple of the filesystem's block size.
* @offs: Byte offset within @page of the first block to encrypt. Must be
* a multiple of the filesystem's block size.
- * @gfp_flags: Memory allocation flags
+ * @gfp_flags: Memory allocation flags. See details below.
*
* A new bounce page is allocated, and the specified block(s) are encrypted into
* it. In the bounce page, the ciphertext block(s) will be located at the same
@@ -150,6 +148,11 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
*
* This is for use by the filesystem's ->writepages() method.
*
+ * The bounce page allocation is mempool-backed, so it will always succeed when
+ * @gfp_flags includes __GFP_DIRECT_RECLAIM, e.g. when it's GFP_NOFS. However,
+ * only the first page of each bio can be allocated this way. To prevent
+ * deadlocks, for any additional pages a mask like GFP_NOWAIT must be used.
+ *
* Return: the new encrypted bounce page on success; an ERR_PTR() on failure
*/
struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
@@ -286,54 +289,6 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
}
EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
-/*
- * Validate dentries in encrypted directories to make sure we aren't potentially
- * caching stale dentries after a key has been added.
- */
-static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
- struct dentry *dir;
- int err;
- int valid;
-
- /*
- * Plaintext names are always valid, since fscrypt doesn't support
- * reverting to ciphertext names without evicting the directory's inode
- * -- which implies eviction of the dentries in the directory.
- */
- if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
- return 1;
-
- /*
- * Ciphertext name; valid if the directory's key is still unavailable.
- *
- * Although fscrypt forbids rename() on ciphertext names, we still must
- * use dget_parent() here rather than use ->d_parent directly. That's
- * because a corrupted fs image may contain directory hard links, which
- * the VFS handles by moving the directory's dentry tree in the dcache
- * each time ->lookup() finds the directory and it already has a dentry
- * elsewhere. Thus ->d_parent can be changing, and we must safely grab
- * a reference to some ->d_parent to prevent it from being freed.
- */
-
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- dir = dget_parent(dentry);
- err = fscrypt_get_encryption_info(d_inode(dir));
- valid = !fscrypt_has_encryption_key(d_inode(dir));
- dput(dir);
-
- if (err < 0)
- return err;
-
- return valid;
-}
-
-const struct dentry_operations fscrypt_d_ops = {
- .d_revalidate = fscrypt_d_revalidate,
-};
-
/**
* fscrypt_initialize() - allocate major buffers for fs encryption.
* @cop_flags: fscrypt operations flags
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 3da3707c10e3..4c212442a8f7 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -11,10 +11,87 @@
* This has not yet undergone a rigorous security audit.
*/
+#include <linux/namei.h>
#include <linux/scatterlist.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
#include <crypto/skcipher.h>
#include "fscrypt_private.h"
+/**
+ * struct fscrypt_nokey_name - identifier for directory entry when key is absent
+ *
+ * When userspace lists an encrypted directory without access to the key, the
+ * filesystem must present a unique "no-key name" for each filename that allows
+ * it to find the directory entry again if requested. Naively, that would just
+ * mean using the ciphertext filenames. However, since the ciphertext filenames
+ * can contain illegal characters ('\0' and '/'), they must be encoded in some
+ * way. We use base64. But that can cause names to exceed NAME_MAX (255
+ * bytes), so we also need to use a strong hash to abbreviate long names.
+ *
+ * The filesystem may also need another kind of hash, the "dirhash", to quickly
+ * find the directory entry. Since filesystems normally compute the dirhash
+ * over the on-disk filename (i.e. the ciphertext), it's not computable from
+ * no-key names that abbreviate the ciphertext using the strong hash to fit in
+ * NAME_MAX. It's also not computable if it's a keyed hash taken over the
+ * plaintext (but it may still be available in the on-disk directory entry);
+ * casefolded directories use this type of dirhash. At least in these cases,
+ * each no-key name must include the name's dirhash too.
+ *
+ * To meet all these requirements, we base64-encode the following
+ * variable-length structure. It contains the dirhash, or 0's if the filesystem
+ * didn't provide one; up to 149 bytes of the ciphertext name; and for
+ * ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes.
+ *
+ * This ensures that each no-key name contains everything needed to find the
+ * directory entry again, contains only legal characters, doesn't exceed
+ * NAME_MAX, is unambiguous unless there's a SHA-256 collision, and that we only
+ * take the performance hit of SHA-256 on very long filenames (which are rare).
+ */
+struct fscrypt_nokey_name {
+ u32 dirhash[2];
+ u8 bytes[149];
+ u8 sha256[SHA256_DIGEST_SIZE];
+}; /* 189 bytes => 252 bytes base64-encoded, which is <= NAME_MAX (255) */
+
+/*
+ * Decoded size of max-size nokey name, i.e. a name that was abbreviated using
+ * the strong hash and thus includes the 'sha256' field. This isn't simply
+ * sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included.
+ */
+#define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256)
+
+static struct crypto_shash *sha256_hash_tfm;
+
+static int fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
+{
+ struct crypto_shash *tfm = READ_ONCE(sha256_hash_tfm);
+
+ if (unlikely(!tfm)) {
+ struct crypto_shash *prev_tfm;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ fscrypt_err(NULL,
+ "Error allocating SHA-256 transform: %ld",
+ PTR_ERR(tfm));
+ return PTR_ERR(tfm);
+ }
+ prev_tfm = cmpxchg(&sha256_hash_tfm, NULL, tfm);
+ if (prev_tfm) {
+ crypto_free_shash(tfm);
+ tfm = prev_tfm;
+ }
+ }
+ {
+ SHASH_DESC_ON_STACK(desc, tfm);
+
+ desc->tfm = tfm;
+
+ return crypto_shash_digest(desc, data, data_len, result);
+ }
+}
+
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
if (str->len == 1 && str->name[0] == '.')
@@ -27,19 +104,19 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
}
/**
- * fname_encrypt() - encrypt a filename
+ * fscrypt_fname_encrypt() - encrypt a filename
*
* The output buffer must be at least as large as the input buffer.
* Any extra space is filled with NUL padding before encryption.
*
* Return: 0 on success, -errno on failure
*/
-int fname_encrypt(struct inode *inode, const struct qstr *iname,
- u8 *out, unsigned int olen)
+int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
+ u8 *out, unsigned int olen)
{
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
- struct fscrypt_info *ci = inode->i_crypt_info;
+ const struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_ctfm;
union fscrypt_iv iv;
struct scatterlist sg;
@@ -85,14 +162,14 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
*
* Return: 0 on success, -errno on failure
*/
-static int fname_decrypt(struct inode *inode,
- const struct fscrypt_str *iname,
- struct fscrypt_str *oname)
+static int fname_decrypt(const struct inode *inode,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
{
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
- struct fscrypt_info *ci = inode->i_crypt_info;
+ const struct fscrypt_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_ctfm;
union fscrypt_iv iv;
int res;
@@ -206,9 +283,7 @@ int fscrypt_fname_alloc_buffer(const struct inode *inode,
u32 max_encrypted_len,
struct fscrypt_str *crypto_str)
{
- const u32 max_encoded_len =
- max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE),
- 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)));
+ const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX);
u32 max_presented_len;
max_presented_len = max(max_encoded_len, max_encrypted_len);
@@ -241,19 +316,21 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer);
*
* The caller must have allocated sufficient memory for the @oname string.
*
- * If the key is available, we'll decrypt the disk name; otherwise, we'll encode
- * it for presentation. Short names are directly base64-encoded, while long
- * names are encoded in fscrypt_digested_name format.
+ * If the key is available, we'll decrypt the disk name. Otherwise, we'll
+ * encode it for presentation in fscrypt_nokey_name format.
+ * See struct fscrypt_nokey_name for details.
*
* Return: 0 on success, -errno on failure
*/
-int fscrypt_fname_disk_to_usr(struct inode *inode,
- u32 hash, u32 minor_hash,
- const struct fscrypt_str *iname,
- struct fscrypt_str *oname)
+int fscrypt_fname_disk_to_usr(const struct inode *inode,
+ u32 hash, u32 minor_hash,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
{
const struct qstr qname = FSTR_TO_QSTR(iname);
- struct fscrypt_digested_name digested_name;
+ struct fscrypt_nokey_name nokey_name;
+ u32 size; /* size of the unencoded no-key name */
+ int err;
if (fscrypt_is_dot_dotdot(&qname)) {
oname->name[0] = '.';
@@ -268,24 +345,37 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
if (fscrypt_has_encryption_key(inode))
return fname_decrypt(inode, iname, oname);
- if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) {
- oname->len = base64_encode(iname->name, iname->len,
- oname->name);
- return 0;
- }
+ /*
+ * Sanity check that struct fscrypt_nokey_name doesn't have padding
+ * between fields and that its encoded size never exceeds NAME_MAX.
+ */
+ BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, dirhash) !=
+ offsetof(struct fscrypt_nokey_name, bytes));
+ BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) !=
+ offsetof(struct fscrypt_nokey_name, sha256));
+ BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX);
+
if (hash) {
- digested_name.hash = hash;
- digested_name.minor_hash = minor_hash;
+ nokey_name.dirhash[0] = hash;
+ nokey_name.dirhash[1] = minor_hash;
+ } else {
+ nokey_name.dirhash[0] = 0;
+ nokey_name.dirhash[1] = 0;
+ }
+ if (iname->len <= sizeof(nokey_name.bytes)) {
+ memcpy(nokey_name.bytes, iname->name, iname->len);
+ size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]);
} else {
- digested_name.hash = 0;
- digested_name.minor_hash = 0;
+ memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes));
+ /* Compute strong hash of remaining part of name. */
+ err = fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)],
+ iname->len - sizeof(nokey_name.bytes),
+ nokey_name.sha256);
+ if (err)
+ return err;
+ size = FSCRYPT_NOKEY_NAME_MAX;
}
- memcpy(digested_name.digest,
- FSCRYPT_FNAME_DIGEST(iname->name, iname->len),
- FSCRYPT_FNAME_DIGEST_SIZE);
- oname->name[0] = '_';
- oname->len = 1 + base64_encode((const u8 *)&digested_name,
- sizeof(digested_name), oname->name + 1);
+ oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name);
return 0;
}
EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
@@ -306,8 +396,7 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
* get the disk_name.
*
* Else, for keyless @lookup operations, @iname is the presented ciphertext, so
- * we decode it to get either the ciphertext disk_name (for short names) or the
- * fscrypt_digested_name (for long names). Non-@lookup operations will be
+ * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be
* impossible in this case, so we fail them with ENOKEY.
*
* If successful, fscrypt_free_filename() must be called later to clean up.
@@ -317,8 +406,8 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
int lookup, struct fscrypt_name *fname)
{
+ struct fscrypt_nokey_name *nokey_name;
int ret;
- int digested;
memset(fname, 0, sizeof(struct fscrypt_name));
fname->usr_fname = iname;
@@ -342,8 +431,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
if (!fname->crypto_buf.name)
return -ENOMEM;
- ret = fname_encrypt(dir, iname, fname->crypto_buf.name,
- fname->crypto_buf.len);
+ ret = fscrypt_fname_encrypt(dir, iname, fname->crypto_buf.name,
+ fname->crypto_buf.len);
if (ret)
goto errout;
fname->disk_name.name = fname->crypto_buf.name;
@@ -358,40 +447,31 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
* We don't have the key and we are doing a lookup; decode the
* user-supplied name
*/
- if (iname->name[0] == '_') {
- if (iname->len !=
- 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)))
- return -ENOENT;
- digested = 1;
- } else {
- if (iname->len >
- BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE))
- return -ENOENT;
- digested = 0;
- }
- fname->crypto_buf.name =
- kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE,
- sizeof(struct fscrypt_digested_name)),
- GFP_KERNEL);
+ if (iname->len > BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX))
+ return -ENOENT;
+
+ fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL);
if (fname->crypto_buf.name == NULL)
return -ENOMEM;
- ret = base64_decode(iname->name + digested, iname->len - digested,
- fname->crypto_buf.name);
- if (ret < 0) {
+ ret = base64_decode(iname->name, iname->len, fname->crypto_buf.name);
+ if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) ||
+ (ret > offsetof(struct fscrypt_nokey_name, sha256) &&
+ ret != FSCRYPT_NOKEY_NAME_MAX)) {
ret = -ENOENT;
goto errout;
}
fname->crypto_buf.len = ret;
- if (digested) {
- const struct fscrypt_digested_name *n =
- (const void *)fname->crypto_buf.name;
- fname->hash = n->hash;
- fname->minor_hash = n->minor_hash;
- } else {
- fname->disk_name.name = fname->crypto_buf.name;
- fname->disk_name.len = fname->crypto_buf.len;
+
+ nokey_name = (void *)fname->crypto_buf.name;
+ fname->hash = nokey_name->dirhash[0];
+ fname->minor_hash = nokey_name->dirhash[1];
+ if (ret != FSCRYPT_NOKEY_NAME_MAX) {
+ /* The full ciphertext filename is available. */
+ fname->disk_name.name = nokey_name->bytes;
+ fname->disk_name.len =
+ ret - offsetof(struct fscrypt_nokey_name, bytes);
}
return 0;
@@ -400,3 +480,109 @@ errout:
return ret;
}
EXPORT_SYMBOL(fscrypt_setup_filename);
+
+/**
+ * fscrypt_match_name() - test whether the given name matches a directory entry
+ * @fname: the name being searched for
+ * @de_name: the name from the directory entry
+ * @de_name_len: the length of @de_name in bytes
+ *
+ * Normally @fname->disk_name will be set, and in that case we simply compare
+ * that to the name stored in the directory entry. The only exception is that
+ * if we don't have the key for an encrypted directory and the name we're
+ * looking for is very long, then we won't have the full disk_name and instead
+ * we'll need to match against a fscrypt_nokey_name that includes a strong hash.
+ *
+ * Return: %true if the name matches, otherwise %false.
+ */
+bool fscrypt_match_name(const struct fscrypt_name *fname,
+ const u8 *de_name, u32 de_name_len)
+{
+ const struct fscrypt_nokey_name *nokey_name =
+ (const void *)fname->crypto_buf.name;
+ u8 sha256[SHA256_DIGEST_SIZE];
+
+ if (likely(fname->disk_name.name)) {
+ if (de_name_len != fname->disk_name.len)
+ return false;
+ return !memcmp(de_name, fname->disk_name.name, de_name_len);
+ }
+ if (de_name_len <= sizeof(nokey_name->bytes))
+ return false;
+ if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes)))
+ return false;
+ if (fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)],
+ de_name_len - sizeof(nokey_name->bytes), sha256))
+ return false;
+ return !memcmp(sha256, nokey_name->sha256, sizeof(sha256));
+}
+EXPORT_SYMBOL_GPL(fscrypt_match_name);
+
+/**
+ * fscrypt_fname_siphash() - calculate the SipHash of a filename
+ * @dir: the parent directory
+ * @name: the filename to calculate the SipHash of
+ *
+ * Given a plaintext filename @name and a directory @dir which uses SipHash as
+ * its dirhash method and has had its fscrypt key set up, this function
+ * calculates the SipHash of that name using the directory's secret dirhash key.
+ *
+ * Return: the SipHash of @name using the hash key of @dir
+ */
+u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
+{
+ const struct fscrypt_info *ci = dir->i_crypt_info;
+
+ WARN_ON(!ci->ci_dirhash_key_initialized);
+
+ return siphash(name->name, name->len, &ci->ci_dirhash_key);
+}
+EXPORT_SYMBOL_GPL(fscrypt_fname_siphash);
+
+/*
+ * Validate dentries in encrypted directories to make sure we aren't potentially
+ * caching stale dentries after a key has been added.
+ */
+static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct dentry *dir;
+ int err;
+ int valid;
+
+ /*
+ * Plaintext names are always valid, since fscrypt doesn't support
+ * reverting to ciphertext names without evicting the directory's inode
+ * -- which implies eviction of the dentries in the directory.
+ */
+ if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
+ return 1;
+
+ /*
+ * Ciphertext name; valid if the directory's key is still unavailable.
+ *
+ * Although fscrypt forbids rename() on ciphertext names, we still must
+ * use dget_parent() here rather than use ->d_parent directly. That's
+ * because a corrupted fs image may contain directory hard links, which
+ * the VFS handles by moving the directory's dentry tree in the dcache
+ * each time ->lookup() finds the directory and it already has a dentry
+ * elsewhere. Thus ->d_parent can be changing, and we must safely grab
+ * a reference to some ->d_parent to prevent it from being freed.
+ */
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ dir = dget_parent(dentry);
+ err = fscrypt_get_encryption_info(d_inode(dir));
+ valid = !fscrypt_has_encryption_key(d_inode(dir));
+ dput(dir);
+
+ if (err < 0)
+ return err;
+
+ return valid;
+}
+
+const struct dentry_operations fscrypt_d_ops = {
+ .d_revalidate = fscrypt_d_revalidate,
+};
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 130b50e5a011..9aae851409e5 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -12,6 +12,7 @@
#define _FSCRYPT_PRIVATE_H
#include <linux/fscrypt.h>
+#include <linux/siphash.h>
#include <crypto/hash.h>
#define CONST_STRLEN(str) (sizeof(str) - 1)
@@ -136,12 +137,6 @@ fscrypt_policy_flags(const union fscrypt_policy *policy)
BUG();
}
-static inline bool
-fscrypt_is_direct_key_policy(const union fscrypt_policy *policy)
-{
- return fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAG_DIRECT_KEY;
-}
-
/**
* For encrypted symlinks, the ciphertext length is stored at the beginning
* of the string in little-endian format.
@@ -194,6 +189,14 @@ struct fscrypt_info {
*/
struct fscrypt_direct_key *ci_direct_key;
+ /*
+ * This inode's hash key for filenames. This is a 128-bit SipHash-2-4
+ * key. This is only set for directories that use a keyed dirhash over
+ * the plaintext filenames -- currently just casefolded directories.
+ */
+ siphash_key_t ci_dirhash_key;
+ bool ci_dirhash_key_initialized;
+
/* The encryption policy used by this inode */
union fscrypt_policy ci_policy;
@@ -206,24 +209,6 @@ typedef enum {
FS_ENCRYPT,
} fscrypt_direction_t;
-static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
- u32 filenames_mode)
-{
- if (contents_mode == FSCRYPT_MODE_AES_128_CBC &&
- filenames_mode == FSCRYPT_MODE_AES_128_CTS)
- return true;
-
- if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
- filenames_mode == FSCRYPT_MODE_AES_256_CTS)
- return true;
-
- if (contents_mode == FSCRYPT_MODE_ADIANTUM &&
- filenames_mode == FSCRYPT_MODE_ADIANTUM)
- return true;
-
- return false;
-}
-
/* crypto.c */
extern struct kmem_cache *fscrypt_info_cachep;
extern int fscrypt_initialize(unsigned int cop_flags);
@@ -233,7 +218,6 @@ extern int fscrypt_crypt_block(const struct inode *inode,
unsigned int len, unsigned int offs,
gfp_t gfp_flags);
extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
-extern const struct dentry_operations fscrypt_d_ops;
extern void __printf(3, 4) __cold
fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);
@@ -260,11 +244,13 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
const struct fscrypt_info *ci);
/* fname.c */
-extern int fname_encrypt(struct inode *inode, const struct qstr *iname,
- u8 *out, unsigned int olen);
+extern int fscrypt_fname_encrypt(const struct inode *inode,
+ const struct qstr *iname,
+ u8 *out, unsigned int olen);
extern bool fscrypt_fname_encrypted_size(const struct inode *inode,
u32 orig_len, u32 max_len,
u32 *encrypted_len_ret);
+extern const struct dentry_operations fscrypt_d_ops;
/* hkdf.c */
@@ -283,11 +269,12 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
* output doesn't reveal another.
*/
#define HKDF_CONTEXT_KEY_IDENTIFIER 1
-#define HKDF_CONTEXT_PER_FILE_KEY 2
+#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2
#define HKDF_CONTEXT_DIRECT_KEY 3
#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4
+#define HKDF_CONTEXT_DIRHASH_KEY 5
-extern int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context,
+extern int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
const u8 *info, unsigned int infolen,
u8 *okm, unsigned int okmlen);
@@ -448,18 +435,17 @@ struct fscrypt_mode {
int logged_impl_name;
};
-static inline bool
-fscrypt_mode_supports_direct_key(const struct fscrypt_mode *mode)
-{
- return mode->ivsize >= offsetofend(union fscrypt_iv, nonce);
-}
+extern struct fscrypt_mode fscrypt_modes[];
extern struct crypto_skcipher *
fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
const struct inode *inode);
-extern int fscrypt_set_derived_key(struct fscrypt_info *ci,
- const u8 *derived_key);
+extern int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci,
+ const u8 *raw_key);
+
+extern int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk);
/* keysetup_v1.c */
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index f21873e1b467..efb95bd19a89 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -112,7 +112,7 @@ out:
* adds to its application-specific info strings to guarantee that it doesn't
* accidentally repeat an info string when using HKDF for different purposes.)
*/
-int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context,
+int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
const u8 *info, unsigned int infolen,
u8 *okm, unsigned int okmlen)
{
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index bb3b7fcfdd48..5ef861742921 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -5,6 +5,8 @@
* Encryption hooks for higher-level filesystem operations.
*/
+#include <linux/key.h>
+
#include "fscrypt_private.h"
/**
@@ -122,6 +124,48 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
+/**
+ * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS
+ * @inode: the inode on which flags are being changed
+ * @oldflags: the old flags
+ * @flags: the new flags
+ *
+ * The caller should be holding i_rwsem for write.
+ *
+ * Return: 0 on success; -errno if the flags change isn't allowed or if
+ * another error occurs.
+ */
+int fscrypt_prepare_setflags(struct inode *inode,
+ unsigned int oldflags, unsigned int flags)
+{
+ struct fscrypt_info *ci;
+ struct fscrypt_master_key *mk;
+ int err;
+
+ /*
+ * When the CASEFOLD flag is set on an encrypted directory, we must
+ * derive the secret key needed for the dirhash. This is only possible
+ * if the directory uses a v2 encryption policy.
+ */
+ if (IS_ENCRYPTED(inode) && (flags & ~oldflags & FS_CASEFOLD_FL)) {
+ err = fscrypt_require_key(inode);
+ if (err)
+ return err;
+ ci = inode->i_crypt_info;
+ if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
+ return -EINVAL;
+ mk = ci->ci_master_key->payload.data[0];
+ down_read(&mk->mk_secret_sem);
+ if (is_master_key_secret_present(&mk->mk_secret))
+ err = fscrypt_derive_dirhash_key(ci, mk);
+ else
+ err = -ENOKEY;
+ up_read(&mk->mk_secret_sem);
+ return err;
+ }
+ return 0;
+}
+
int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
unsigned int max_len,
struct fscrypt_str *disk_link)
@@ -188,7 +232,8 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
ciphertext_len = disk_link->len - sizeof(*sd);
sd->len = cpu_to_le16(ciphertext_len);
- err = fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len);
+ err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path,
+ ciphertext_len);
if (err)
goto err_free_sd;
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 40cca351273f..ab41b25d4fa1 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -465,6 +465,109 @@ out_unlock:
return err;
}
+static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep)
+{
+ const struct fscrypt_provisioning_key_payload *payload = prep->data;
+
+ if (prep->datalen < sizeof(*payload) + FSCRYPT_MIN_KEY_SIZE ||
+ prep->datalen > sizeof(*payload) + FSCRYPT_MAX_KEY_SIZE)
+ return -EINVAL;
+
+ if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
+ payload->type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER)
+ return -EINVAL;
+
+ if (payload->__reserved)
+ return -EINVAL;
+
+ prep->payload.data[0] = kmemdup(payload, prep->datalen, GFP_KERNEL);
+ if (!prep->payload.data[0])
+ return -ENOMEM;
+
+ prep->quotalen = prep->datalen;
+ return 0;
+}
+
+static void fscrypt_provisioning_key_free_preparse(
+ struct key_preparsed_payload *prep)
+{
+ kzfree(prep->payload.data[0]);
+}
+
+static void fscrypt_provisioning_key_describe(const struct key *key,
+ struct seq_file *m)
+{
+ seq_puts(m, key->description);
+ if (key_is_positive(key)) {
+ const struct fscrypt_provisioning_key_payload *payload =
+ key->payload.data[0];
+
+ seq_printf(m, ": %u [%u]", key->datalen, payload->type);
+ }
+}
+
+static void fscrypt_provisioning_key_destroy(struct key *key)
+{
+ kzfree(key->payload.data[0]);
+}
+
+static struct key_type key_type_fscrypt_provisioning = {
+ .name = "fscrypt-provisioning",
+ .preparse = fscrypt_provisioning_key_preparse,
+ .free_preparse = fscrypt_provisioning_key_free_preparse,
+ .instantiate = generic_key_instantiate,
+ .describe = fscrypt_provisioning_key_describe,
+ .destroy = fscrypt_provisioning_key_destroy,
+};
+
+/*
+ * Retrieve the raw key from the Linux keyring key specified by 'key_id', and
+ * store it into 'secret'.
+ *
+ * The key must be of type "fscrypt-provisioning" and must have the field
+ * fscrypt_provisioning_key_payload::type set to 'type', indicating that it's
+ * only usable with fscrypt with the particular KDF version identified by
+ * 'type'. We don't use the "logon" key type because there's no way to
+ * completely restrict the use of such keys; they can be used by any kernel API
+ * that accepts "logon" keys and doesn't require a specific service prefix.
+ *
+ * The ability to specify the key via Linux keyring key is intended for cases
+ * where userspace needs to re-add keys after the filesystem is unmounted and
+ * re-mounted. Most users should just provide the raw key directly instead.
+ */
+static int get_keyring_key(u32 key_id, u32 type,
+ struct fscrypt_master_key_secret *secret)
+{
+ key_ref_t ref;
+ struct key *key;
+ const struct fscrypt_provisioning_key_payload *payload;
+ int err;
+
+ ref = lookup_user_key(key_id, 0, KEY_NEED_SEARCH);
+ if (IS_ERR(ref))
+ return PTR_ERR(ref);
+ key = key_ref_to_ptr(ref);
+
+ if (key->type != &key_type_fscrypt_provisioning)
+ goto bad_key;
+ payload = key->payload.data[0];
+
+ /* Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. */
+ if (payload->type != type)
+ goto bad_key;
+
+ secret->size = key->datalen - sizeof(*payload);
+ memcpy(secret->raw, payload->raw, secret->size);
+ err = 0;
+ goto out_put;
+
+bad_key:
+ err = -EKEYREJECTED;
+out_put:
+ key_ref_put(ref);
+ return err;
+}
+
/*
* Add a master encryption key to the filesystem, causing all files which were
* encrypted with it to appear "unlocked" (decrypted) when accessed.
@@ -503,18 +606,25 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg)
if (!valid_key_spec(&arg.key_spec))
return -EINVAL;
- if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE ||
- arg.raw_size > FSCRYPT_MAX_KEY_SIZE)
- return -EINVAL;
-
if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved)))
return -EINVAL;
memset(&secret, 0, sizeof(secret));
- secret.size = arg.raw_size;
- err = -EFAULT;
- if (copy_from_user(secret.raw, uarg->raw, secret.size))
- goto out_wipe_secret;
+ if (arg.key_id) {
+ if (arg.raw_size != 0)
+ return -EINVAL;
+ err = get_keyring_key(arg.key_id, arg.key_spec.type, &secret);
+ if (err)
+ goto out_wipe_secret;
+ } else {
+ if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE ||
+ arg.raw_size > FSCRYPT_MAX_KEY_SIZE)
+ return -EINVAL;
+ secret.size = arg.raw_size;
+ err = -EFAULT;
+ if (copy_from_user(secret.raw, uarg->raw, secret.size))
+ goto out_wipe_secret;
+ }
switch (arg.key_spec.type) {
case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
@@ -666,9 +776,6 @@ static int check_for_busy_inodes(struct super_block *sb,
struct list_head *pos;
size_t busy_count = 0;
unsigned long ino;
- struct dentry *dentry;
- char _path[256];
- char *path = NULL;
spin_lock(&mk->mk_decrypted_inodes_lock);
@@ -687,22 +794,14 @@ static int check_for_busy_inodes(struct super_block *sb,
struct fscrypt_info,
ci_master_key_link)->ci_inode;
ino = inode->i_ino;
- dentry = d_find_alias(inode);
}
spin_unlock(&mk->mk_decrypted_inodes_lock);
- if (dentry) {
- path = dentry_path(dentry, _path, sizeof(_path));
- dput(dentry);
- }
- if (IS_ERR_OR_NULL(path))
- path = "(unknown)";
-
fscrypt_warn(NULL,
- "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu (%s)",
+ "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu",
sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec),
master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u,
- ino, path);
+ ino);
return -EBUSY;
}
@@ -978,8 +1077,14 @@ int __init fscrypt_init_keyring(void)
if (err)
goto err_unregister_fscrypt;
+ err = register_key_type(&key_type_fscrypt_provisioning);
+ if (err)
+ goto err_unregister_fscrypt_user;
+
return 0;
+err_unregister_fscrypt_user:
+ unregister_key_type(&key_type_fscrypt_user);
err_unregister_fscrypt:
unregister_key_type(&key_type_fscrypt);
return err;
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index f577bb6613f9..65cb09fa6ead 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -13,7 +13,7 @@
#include "fscrypt_private.h"
-static struct fscrypt_mode available_modes[] = {
+struct fscrypt_mode fscrypt_modes[] = {
[FSCRYPT_MODE_AES_256_XTS] = {
.friendly_name = "AES-256-XTS",
.cipher_str = "xts(aes)",
@@ -51,10 +51,10 @@ select_encryption_mode(const union fscrypt_policy *policy,
const struct inode *inode)
{
if (S_ISREG(inode->i_mode))
- return &available_modes[fscrypt_policy_contents_mode(policy)];
+ return &fscrypt_modes[fscrypt_policy_contents_mode(policy)];
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- return &available_modes[fscrypt_policy_fnames_mode(policy)];
+ return &fscrypt_modes[fscrypt_policy_fnames_mode(policy)];
WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
inode->i_ino, (inode->i_mode & S_IFMT));
@@ -89,8 +89,11 @@ struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode,
* first time a mode is used.
*/
pr_info("fscrypt: %s using implementation \"%s\"\n",
- mode->friendly_name,
- crypto_skcipher_alg(tfm)->base.cra_driver_name);
+ mode->friendly_name, crypto_skcipher_driver_name(tfm));
+ }
+ if (WARN_ON(crypto_skcipher_ivsize(tfm) != mode->ivsize)) {
+ err = -EINVAL;
+ goto err_free_tfm;
}
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
@@ -104,12 +107,12 @@ err_free_tfm:
return ERR_PTR(err);
}
-/* Given the per-file key, set up the file's crypto transform object */
-int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key)
+/* Given a per-file encryption key, set up the file's crypto transform object */
+int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key)
{
struct crypto_skcipher *tfm;
- tfm = fscrypt_allocate_skcipher(ci->ci_mode, derived_key, ci->ci_inode);
+ tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode);
if (IS_ERR(tfm))
return PTR_ERR(tfm);
@@ -118,15 +121,15 @@ int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key)
return 0;
}
-static int setup_per_mode_key(struct fscrypt_info *ci,
- struct fscrypt_master_key *mk,
- struct crypto_skcipher **tfms,
- u8 hkdf_context, bool include_fs_uuid)
+static int setup_per_mode_enc_key(struct fscrypt_info *ci,
+ struct fscrypt_master_key *mk,
+ struct crypto_skcipher **tfms,
+ u8 hkdf_context, bool include_fs_uuid)
{
const struct inode *inode = ci->ci_inode;
const struct super_block *sb = inode->i_sb;
struct fscrypt_mode *mode = ci->ci_mode;
- u8 mode_num = mode - available_modes;
+ const u8 mode_num = mode - fscrypt_modes;
struct crypto_skcipher *tfm, *prev_tfm;
u8 mode_key[FSCRYPT_MAX_KEY_SIZE];
u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)];
@@ -171,29 +174,37 @@ done:
return 0;
}
+int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk)
+{
+ int err;
+
+ err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY,
+ ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE,
+ (u8 *)&ci->ci_dirhash_key,
+ sizeof(ci->ci_dirhash_key));
+ if (err)
+ return err;
+ ci->ci_dirhash_key_initialized = true;
+ return 0;
+}
+
static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
struct fscrypt_master_key *mk)
{
- u8 derived_key[FSCRYPT_MAX_KEY_SIZE];
int err;
if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
/*
- * DIRECT_KEY: instead of deriving per-file keys, the per-file
- * nonce will be included in all the IVs. But unlike v1
- * policies, for v2 policies in this case we don't encrypt with
- * the master key directly but rather derive a per-mode key.
- * This ensures that the master key is consistently used only
- * for HKDF, avoiding key reuse issues.
+ * DIRECT_KEY: instead of deriving per-file encryption keys, the
+ * per-file nonce will be included in all the IVs. But unlike
+ * v1 policies, for v2 policies in this case we don't encrypt
+ * with the master key directly but rather derive a per-mode
+ * encryption key. This ensures that the master key is
+ * consistently used only for HKDF, avoiding key reuse issues.
*/
- if (!fscrypt_mode_supports_direct_key(ci->ci_mode)) {
- fscrypt_warn(ci->ci_inode,
- "Direct key flag not allowed with %s",
- ci->ci_mode->friendly_name);
- return -EINVAL;
- }
- return setup_per_mode_key(ci, mk, mk->mk_direct_tfms,
- HKDF_CONTEXT_DIRECT_KEY, false);
+ err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_tfms,
+ HKDF_CONTEXT_DIRECT_KEY, false);
} else if (ci->ci_policy.v2.flags &
FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
/*
@@ -202,21 +213,34 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* the IVs. This format is optimized for use with inline
* encryption hardware compliant with the UFS or eMMC standards.
*/
- return setup_per_mode_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms,
- HKDF_CONTEXT_IV_INO_LBLK_64_KEY,
- true);
+ err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms,
+ HKDF_CONTEXT_IV_INO_LBLK_64_KEY,
+ true);
+ } else {
+ u8 derived_key[FSCRYPT_MAX_KEY_SIZE];
+
+ err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
+ HKDF_CONTEXT_PER_FILE_ENC_KEY,
+ ci->ci_nonce,
+ FS_KEY_DERIVATION_NONCE_SIZE,
+ derived_key, ci->ci_mode->keysize);
+ if (err)
+ return err;
+
+ err = fscrypt_set_per_file_enc_key(ci, derived_key);
+ memzero_explicit(derived_key, ci->ci_mode->keysize);
}
-
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
- HKDF_CONTEXT_PER_FILE_KEY,
- ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE,
- derived_key, ci->ci_mode->keysize);
if (err)
return err;
- err = fscrypt_set_derived_key(ci, derived_key);
- memzero_explicit(derived_key, ci->ci_mode->keysize);
- return err;
+ /* Derive a secret dirhash key for directories that need it. */
+ if (S_ISDIR(ci->ci_inode->i_mode) && IS_CASEFOLDED(ci->ci_inode)) {
+ err = fscrypt_derive_dirhash_key(ci, mk);
+ if (err)
+ return err;
+ }
+
+ return 0;
}
/*
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 5298ef22aa85..801b48c0cd7f 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -9,7 +9,7 @@
* This file implements compatibility functions for the original encryption
* policy version ("v1"), including:
*
- * - Deriving per-file keys using the AES-128-ECB based KDF
+ * - Deriving per-file encryption keys using the AES-128-ECB based KDF
* (rather than the new method of using HKDF-SHA512)
*
* - Retrieving fscrypt master keys from process-subscribed keyrings
@@ -253,23 +253,8 @@ err_free_dk:
static int setup_v1_file_key_direct(struct fscrypt_info *ci,
const u8 *raw_master_key)
{
- const struct fscrypt_mode *mode = ci->ci_mode;
struct fscrypt_direct_key *dk;
- if (!fscrypt_mode_supports_direct_key(mode)) {
- fscrypt_warn(ci->ci_inode,
- "Direct key mode not allowed with %s",
- mode->friendly_name);
- return -EINVAL;
- }
-
- if (ci->ci_policy.v1.contents_encryption_mode !=
- ci->ci_policy.v1.filenames_encryption_mode) {
- fscrypt_warn(ci->ci_inode,
- "Direct key mode not allowed with different contents and filenames modes");
- return -EINVAL;
- }
-
dk = fscrypt_get_direct_key(ci, raw_master_key);
if (IS_ERR(dk))
return PTR_ERR(dk);
@@ -298,7 +283,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci,
if (err)
goto out;
- err = fscrypt_set_derived_key(ci, derived_key);
+ err = fscrypt_set_per_file_enc_key(ci, derived_key);
out:
kzfree(derived_key);
return err;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 96f528071bed..cf2a9d26ef7d 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -29,6 +29,43 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
}
+static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
+{
+ if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
+ filenames_mode == FSCRYPT_MODE_AES_256_CTS)
+ return true;
+
+ if (contents_mode == FSCRYPT_MODE_AES_128_CBC &&
+ filenames_mode == FSCRYPT_MODE_AES_128_CTS)
+ return true;
+
+ if (contents_mode == FSCRYPT_MODE_ADIANTUM &&
+ filenames_mode == FSCRYPT_MODE_ADIANTUM)
+ return true;
+
+ return false;
+}
+
+static bool supported_direct_key_modes(const struct inode *inode,
+ u32 contents_mode, u32 filenames_mode)
+{
+ const struct fscrypt_mode *mode;
+
+ if (contents_mode != filenames_mode) {
+ fscrypt_warn(inode,
+ "Direct key flag not allowed with different contents and filenames modes");
+ return false;
+ }
+ mode = &fscrypt_modes[contents_mode];
+
+ if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) {
+ fscrypt_warn(inode, "Direct key flag not allowed with %s",
+ mode->friendly_name);
+ return false;
+ }
+ return true;
+}
+
static bool supported_iv_ino_lblk_64_policy(
const struct fscrypt_policy_v2 *policy,
const struct inode *inode)
@@ -63,13 +100,82 @@ static bool supported_iv_ino_lblk_64_policy(
return true;
}
+static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy,
+ const struct inode *inode)
+{
+ if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ policy->filenames_encryption_mode)) {
+ fscrypt_warn(inode,
+ "Unsupported encryption modes (contents %d, filenames %d)",
+ policy->contents_encryption_mode,
+ policy->filenames_encryption_mode);
+ return false;
+ }
+
+ if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK |
+ FSCRYPT_POLICY_FLAG_DIRECT_KEY)) {
+ fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)",
+ policy->flags);
+ return false;
+ }
+
+ if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
+ !supported_direct_key_modes(inode, policy->contents_encryption_mode,
+ policy->filenames_encryption_mode))
+ return false;
+
+ if (IS_CASEFOLDED(inode)) {
+ /* With v1, there's no way to derive dirhash keys. */
+ fscrypt_warn(inode,
+ "v1 policies can't be used on casefolded directories");
+ return false;
+ }
+
+ return true;
+}
+
+static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
+ const struct inode *inode)
+{
+ if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
+ policy->filenames_encryption_mode)) {
+ fscrypt_warn(inode,
+ "Unsupported encryption modes (contents %d, filenames %d)",
+ policy->contents_encryption_mode,
+ policy->filenames_encryption_mode);
+ return false;
+ }
+
+ if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
+ fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)",
+ policy->flags);
+ return false;
+ }
+
+ if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
+ !supported_direct_key_modes(inode, policy->contents_encryption_mode,
+ policy->filenames_encryption_mode))
+ return false;
+
+ if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) &&
+ !supported_iv_ino_lblk_64_policy(policy, inode))
+ return false;
+
+ if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
+ fscrypt_warn(inode, "Reserved bits set in encryption policy");
+ return false;
+ }
+
+ return true;
+}
+
/**
* fscrypt_supported_policy - check whether an encryption policy is supported
*
* Given an encryption policy, check whether all its encryption modes and other
- * settings are supported by this kernel. (But we don't currently don't check
- * for crypto API support here, so attempting to use an algorithm not configured
- * into the crypto API will still fail later.)
+ * settings are supported by this kernel on the given inode. (But we don't
+ * currently don't check for crypto API support here, so attempting to use an
+ * algorithm not configured into the crypto API will still fail later.)
*
* Return: %true if supported, else %false
*/
@@ -77,60 +183,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
const struct inode *inode)
{
switch (policy_u->version) {
- case FSCRYPT_POLICY_V1: {
- const struct fscrypt_policy_v1 *policy = &policy_u->v1;
-
- if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
- policy->filenames_encryption_mode)) {
- fscrypt_warn(inode,
- "Unsupported encryption modes (contents %d, filenames %d)",
- policy->contents_encryption_mode,
- policy->filenames_encryption_mode);
- return false;
- }
-
- if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK |
- FSCRYPT_POLICY_FLAG_DIRECT_KEY)) {
- fscrypt_warn(inode,
- "Unsupported encryption flags (0x%02x)",
- policy->flags);
- return false;
- }
-
- return true;
- }
- case FSCRYPT_POLICY_V2: {
- const struct fscrypt_policy_v2 *policy = &policy_u->v2;
-
- if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode,
- policy->filenames_encryption_mode)) {
- fscrypt_warn(inode,
- "Unsupported encryption modes (contents %d, filenames %d)",
- policy->contents_encryption_mode,
- policy->filenames_encryption_mode);
- return false;
- }
-
- if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
- fscrypt_warn(inode,
- "Unsupported encryption flags (0x%02x)",
- policy->flags);
- return false;
- }
-
- if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) &&
- !supported_iv_ino_lblk_64_policy(policy, inode))
- return false;
-
- if (memchr_inv(policy->__reserved, 0,
- sizeof(policy->__reserved))) {
- fscrypt_warn(inode,
- "Reserved bits set in encryption policy");
- return false;
- }
-
- return true;
- }
+ case FSCRYPT_POLICY_V1:
+ return fscrypt_supported_v1_policy(&policy_u->v1, inode);
+ case FSCRYPT_POLICY_V2:
+ return fscrypt_supported_v2_policy(&policy_u->v2, inode);
}
return false;
}
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index dede25247b81..634b09d18b77 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -142,18 +142,21 @@ EXPORT_SYMBOL_GPL(debugfs_file_put);
* We also need to exclude any file that has ways to write or alter it as root
* can bypass the permissions check.
*/
-static bool debugfs_is_locked_down(struct inode *inode,
- struct file *filp,
- const struct file_operations *real_fops)
+static int debugfs_locked_down(struct inode *inode,
+ struct file *filp,
+ const struct file_operations *real_fops)
{
if ((inode->i_mode & 07777) == 0444 &&
!(filp->f_mode & FMODE_WRITE) &&
!real_fops->unlocked_ioctl &&
!real_fops->compat_ioctl &&
!real_fops->mmap)
- return false;
+ return 0;
- return security_locked_down(LOCKDOWN_DEBUGFS);
+ if (security_locked_down(LOCKDOWN_DEBUGFS))
+ return -EPERM;
+
+ return 0;
}
static int open_proxy_open(struct inode *inode, struct file *filp)
@@ -168,7 +171,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
real_fops = debugfs_real_fops(filp);
- r = debugfs_is_locked_down(inode, filp, real_fops);
+ r = debugfs_locked_down(inode, filp, real_fops);
if (r)
goto out;
@@ -298,7 +301,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
real_fops = debugfs_real_fops(filp);
- r = debugfs_is_locked_down(inode, filp, real_fops);
+ r = debugfs_locked_down(inode, filp, real_fops);
if (r)
goto out;
@@ -496,10 +499,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
* be returned.
*/
struct dentry *debugfs_create_u32(const char *name, umode_t mode,
@@ -581,10 +584,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
* be returned.
*/
struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
@@ -846,10 +849,10 @@ static const struct file_operations fops_bool_wo = {
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
* be returned.
*/
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
@@ -899,10 +902,10 @@ static const struct file_operations fops_blob = {
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
* be returned.
*/
struct dentry *debugfs_create_blob(const char *name, umode_t mode,
@@ -1091,10 +1094,10 @@ static const struct file_operations fops_regset32 = {
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
+ * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
* be returned.
*/
struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
@@ -1158,4 +1161,3 @@ struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name,
&debugfs_devm_entry_ops);
}
EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile);
-
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index f4d8df5e4714..dc6cffc4feba 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -423,7 +423,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
@@ -502,7 +502,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
@@ -534,7 +534,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_size);
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
+ * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
* returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
@@ -627,7 +627,7 @@ EXPORT_SYMBOL(debugfs_create_automount);
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the symbolic
* link is to be removed (no automatic cleanup happens if your module is
- * unloaded, you are responsible here.) If an error occurs, %ERR_PTR(-ERROR)
+ * unloaded, you are responsible here.) If an error occurs, ERR_PTR(-ERROR)
* will be returned.
*
* If debugfs is not enabled in the kernel, the value -%ENODEV will be
@@ -906,4 +906,3 @@ static int __init debugfs_init(void)
return retval;
}
core_initcall(debugfs_init);
-
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0ec4f270139f..00b4d15bb811 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -39,6 +39,8 @@
#include <linux/atomic.h>
#include <linux/prefetch.h>
+#include "internal.h"
+
/*
* How many user pages to map in one call to get_user_pages(). This determines
* the size of a structure in the slab cache
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3951d39b9b75..cdfaf4f0e11a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1035,7 +1035,7 @@ static void sctp_connect_to_sock(struct connection *con)
int result;
int addr_len;
struct socket *sock;
- struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
+ struct __kernel_sock_timeval tv = { .tv_sec = 5, .tv_usec = 0 };
if (con->nodeid == 0) {
log_print("attempt to connect sock 0 foiled");
@@ -1087,12 +1087,12 @@ static void sctp_connect_to_sock(struct connection *con)
* since O_NONBLOCK argument in connect() function does not work here,
* then, we should restore the default value of this attribute.
*/
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv,
sizeof(tv));
result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
0);
memset(&tv, 0, sizeof(tv));
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_OLD, (char *)&tv,
+ kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv,
sizeof(tv));
if (result == -EINPROGRESS)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f91db24bbf3b..db1ef144c63a 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1586,7 +1586,7 @@ ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm,
}
crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
if (*key_size == 0)
- *key_size = crypto_skcipher_default_keysize(*key_tfm);
+ *key_size = crypto_skcipher_max_keysize(*key_tfm);
get_random_bytes(dummy_key, *key_size);
rc = crypto_skcipher_setkey(*key_tfm, dummy_key, *key_size);
if (rc) {
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 216fbe6a4837..7d326aa0308e 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -2204,9 +2204,9 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
if (mount_crypt_stat->global_default_cipher_key_size == 0) {
printk(KERN_WARNING "No key size specified at mount; "
"defaulting to [%d]\n",
- crypto_skcipher_default_keysize(tfm));
+ crypto_skcipher_max_keysize(tfm));
mount_crypt_stat->global_default_cipher_key_size =
- crypto_skcipher_default_keysize(tfm);
+ crypto_skcipher_max_keysize(tfm);
}
if (crypt_stat->key_size == 0)
crypt_stat->key_size =
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 2890a67a1ded..5779a15c2cd6 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -306,24 +306,22 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
}
src = kmap_atomic(*rq->in);
- if (!rq->out[0]) {
- dst = NULL;
- } else {
+ if (rq->out[0]) {
dst = kmap_atomic(rq->out[0]);
memcpy(dst + rq->pageofs_out, src, righthalf);
+ kunmap_atomic(dst);
}
- if (rq->out[1] == *rq->in) {
- memmove(src, src + righthalf, rq->pageofs_out);
- } else if (nrpages_out == 2) {
- if (dst)
- kunmap_atomic(dst);
+ if (nrpages_out == 2) {
DBG_BUGON(!rq->out[1]);
- dst = kmap_atomic(rq->out[1]);
- memcpy(dst, src + righthalf, rq->pageofs_out);
+ if (rq->out[1] == *rq->in) {
+ memmove(src, src + righthalf, rq->pageofs_out);
+ } else {
+ dst = kmap_atomic(rq->out[1]);
+ memcpy(dst, src + righthalf, rq->pageofs_out);
+ kunmap_atomic(dst);
+ }
}
- if (dst)
- kunmap_atomic(dst);
kunmap_atomic(src);
return 0;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 1ed5beff7d11..c4c6dcdc89ad 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -401,9 +401,9 @@ static inline void *erofs_get_pcpubuf(unsigned int pagenr)
#ifdef CONFIG_EROFS_FS_ZIP
int erofs_workgroup_put(struct erofs_workgroup *grp);
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index, bool *tag);
+ pgoff_t index);
int erofs_register_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp, bool tag);
+ struct erofs_workgroup *grp);
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 1e8e1450d5b0..fddc5059c930 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -59,7 +59,7 @@ repeat:
}
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
- pgoff_t index, bool *tag)
+ pgoff_t index)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_workgroup *grp;
@@ -68,9 +68,6 @@ repeat:
rcu_read_lock();
grp = radix_tree_lookup(&sbi->workstn_tree, index);
if (grp) {
- *tag = xa_pointer_tag(grp);
- grp = xa_untag_pointer(grp);
-
if (erofs_workgroup_get(grp)) {
/* prefer to relax rcu read side */
rcu_read_unlock();
@@ -84,8 +81,7 @@ repeat:
}
int erofs_register_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp,
- bool tag)
+ struct erofs_workgroup *grp)
{
struct erofs_sb_info *sbi;
int err;
@@ -103,8 +99,6 @@ int erofs_register_workgroup(struct super_block *sb,
sbi = EROFS_SB(sb);
xa_lock(&sbi->workstn_tree);
- grp = xa_tag_pointer(grp, tag);
-
/*
* Bump up reference count before making this workgroup
* visible to other users in order to avoid potential UAF
@@ -175,8 +169,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
* however in order to avoid some race conditions, add a
* DBG_BUGON to observe this in advance.
*/
- DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree,
- grp->index)) != grp);
+ DBG_BUGON(radix_tree_delete(&sbi->workstn_tree, grp->index) != grp);
/*
* If managed cache is on, last refcount should indicate
@@ -201,7 +194,7 @@ repeat:
batch, first_index, PAGEVEC_SIZE);
for (i = 0; i < found; ++i) {
- struct erofs_workgroup *grp = xa_untag_pointer(batch[i]);
+ struct erofs_workgroup *grp = batch[i];
first_index = grp->index + 1;
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 3585b84d2f20..50966f1c676e 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -46,18 +46,19 @@ extern const struct xattr_handler erofs_xattr_security_handler;
static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
{
-static const struct xattr_handler *xattr_handler_map[] = {
- [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
+ static const struct xattr_handler *xattr_handler_map[] = {
+ [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
- [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] =
- &posix_acl_default_xattr_handler,
+ [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] =
+ &posix_acl_access_xattr_handler,
+ [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &posix_acl_default_xattr_handler,
#endif
- [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler,
+ [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler,
#ifdef CONFIG_EROFS_FS_SECURITY
- [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler,
+ [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler,
#endif
-};
+ };
return idx && idx < ARRAY_SIZE(xattr_handler_map) ?
xattr_handler_map[idx] : NULL;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index ca99425a4536..80e47f07d946 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -345,9 +345,8 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
struct z_erofs_pcluster *pcl;
struct z_erofs_collection *cl;
unsigned int length;
- bool tag;
- grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag);
+ grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
if (!grp)
return -ENOENT;
@@ -438,7 +437,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
*/
mutex_trylock(&cl->lock);
- err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0);
+ err = erofs_register_workgroup(inode->i_sb, &pcl->obj);
if (err) {
mutex_unlock(&cl->lock);
kmem_cache_free(pcluster_cachep, pcl);
@@ -1149,21 +1148,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
qtail[JQ_BYPASS] = &pcl->next;
}
-static bool postsubmit_is_all_bypassed(struct z_erofs_decompressqueue *q[],
- unsigned int nr_bios, bool force_fg)
-{
- /*
- * although background is preferred, no one is pending for submission.
- * don't issue workqueue for decompression but drop it directly instead.
- */
- if (force_fg || nr_bios)
- return false;
-
- kvfree(q[JQ_SUBMIT]);
- return true;
-}
-
-static bool z_erofs_submit_queue(struct super_block *sb,
+static void z_erofs_submit_queue(struct super_block *sb,
z_erofs_next_pcluster_t owned_head,
struct list_head *pagepool,
struct z_erofs_decompressqueue *fgq,
@@ -1172,19 +1157,12 @@ static bool z_erofs_submit_queue(struct super_block *sb,
struct erofs_sb_info *const sbi = EROFS_SB(sb);
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
- struct bio *bio;
void *bi_private;
/* since bio will be NULL, no need to initialize last_index */
pgoff_t uninitialized_var(last_index);
- bool force_submit = false;
- unsigned int nr_bios;
-
- if (owned_head == Z_EROFS_PCLUSTER_TAIL)
- return false;
+ unsigned int nr_bios = 0;
+ struct bio *bio = NULL;
- force_submit = false;
- bio = NULL;
- nr_bios = 0;
bi_private = jobqueueset_init(sb, q, fgq, force_fg);
qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
@@ -1194,11 +1172,9 @@ static bool z_erofs_submit_queue(struct super_block *sb,
do {
struct z_erofs_pcluster *pcl;
- unsigned int clusterpages;
- pgoff_t first_index;
- struct page *page;
- unsigned int i = 0, bypass = 0;
- int err;
+ pgoff_t cur, end;
+ unsigned int i = 0;
+ bool bypass = true;
/* no possible 'owned_head' equals the following */
DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
@@ -1206,55 +1182,50 @@ static bool z_erofs_submit_queue(struct super_block *sb,
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
- clusterpages = BIT(pcl->clusterbits);
+ cur = pcl->obj.index;
+ end = cur + BIT(pcl->clusterbits);
/* close the main owned chain at first */
owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
Z_EROFS_PCLUSTER_TAIL_CLOSED);
- first_index = pcl->obj.index;
- force_submit |= (first_index != last_index + 1);
+ do {
+ struct page *page;
+ int err;
-repeat:
- page = pickup_page_for_submission(pcl, i, pagepool,
- MNGD_MAPPING(sbi),
- GFP_NOFS);
- if (!page) {
- force_submit = true;
- ++bypass;
- goto skippage;
- }
+ page = pickup_page_for_submission(pcl, i++, pagepool,
+ MNGD_MAPPING(sbi),
+ GFP_NOFS);
+ if (!page)
+ continue;
- if (bio && force_submit) {
+ if (bio && cur != last_index + 1) {
submit_bio_retry:
- submit_bio(bio);
- bio = NULL;
- }
-
- if (!bio) {
- bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+ submit_bio(bio);
+ bio = NULL;
+ }
- bio->bi_end_io = z_erofs_decompressqueue_endio;
- bio_set_dev(bio, sb->s_bdev);
- bio->bi_iter.bi_sector = (sector_t)(first_index + i) <<
- LOG_SECTORS_PER_BLOCK;
- bio->bi_private = bi_private;
- bio->bi_opf = REQ_OP_READ;
+ if (!bio) {
+ bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
- ++nr_bios;
- }
+ bio->bi_end_io = z_erofs_decompressqueue_endio;
+ bio_set_dev(bio, sb->s_bdev);
+ bio->bi_iter.bi_sector = (sector_t)cur <<
+ LOG_SECTORS_PER_BLOCK;
+ bio->bi_private = bi_private;
+ bio->bi_opf = REQ_OP_READ;
+ ++nr_bios;
+ }
- err = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (err < PAGE_SIZE)
- goto submit_bio_retry;
+ err = bio_add_page(bio, page, PAGE_SIZE, 0);
+ if (err < PAGE_SIZE)
+ goto submit_bio_retry;
- force_submit = false;
- last_index = first_index + i;
-skippage:
- if (++i < clusterpages)
- goto repeat;
+ last_index = cur;
+ bypass = false;
+ } while (++cur < end);
- if (bypass < clusterpages)
+ if (!bypass)
qtail[JQ_SUBMIT] = &pcl->next;
else
move_to_bypass_jobqueue(pcl, qtail, owned_head);
@@ -1263,11 +1234,15 @@ skippage:
if (bio)
submit_bio(bio);
- if (postsubmit_is_all_bypassed(q, nr_bios, *force_fg))
- return true;
-
+ /*
+ * although background is preferred, no one is pending for submission.
+ * don't issue workqueue for decompression but drop it directly instead.
+ */
+ if (!*force_fg && !nr_bios) {
+ kvfree(q[JQ_SUBMIT]);
+ return;
+ }
z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
- return true;
}
static void z_erofs_runqueue(struct super_block *sb,
@@ -1276,9 +1251,9 @@ static void z_erofs_runqueue(struct super_block *sb,
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (!z_erofs_submit_queue(sb, clt->owned_head,
- pagepool, io, &force_fg))
+ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
+ z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg);
/* handle bypass queue (no i/o pclusters) immediately */
z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 67a395039268..b041b66002db 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -354,12 +354,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
return container_of(p, struct ep_pqueue, pt)->epi;
}
-/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
-static inline int ep_op_has_event(int op)
-{
- return op != EPOLL_CTL_DEL;
-}
-
/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
@@ -2074,27 +2068,28 @@ SYSCALL_DEFINE1(epoll_create, int, size)
return do_epoll_create(0);
}
-/*
- * The following function implements the controller interface for
- * the eventpoll file that enables the insertion/removal/change of
- * file descriptors inside the interest set.
- */
-SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
- struct epoll_event __user *, event)
+static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
+ bool nonblock)
+{
+ if (!nonblock) {
+ mutex_lock_nested(mutex, depth);
+ return 0;
+ }
+ if (mutex_trylock(mutex))
+ return 0;
+ return -EAGAIN;
+}
+
+int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+ bool nonblock)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
- struct epoll_event epds;
struct eventpoll *tep = NULL;
- error = -EFAULT;
- if (ep_op_has_event(op) &&
- copy_from_user(&epds, event, sizeof(struct epoll_event)))
- goto error_return;
-
error = -EBADF;
f = fdget(epfd);
if (!f.file)
@@ -2112,7 +2107,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
- ep_take_care_of_epollwakeup(&epds);
+ ep_take_care_of_epollwakeup(epds);
/*
* We have to check that the file structure underneath the file descriptor
@@ -2128,11 +2123,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
- if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
+ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
- (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+ (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
@@ -2157,13 +2152,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
- mutex_lock_nested(&ep->mtx, 0);
+ error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
+ if (error)
+ goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
is_file_epoll(tf.file)) {
- full_check = 1;
mutex_unlock(&ep->mtx);
- mutex_lock(&epmutex);
+ error = epoll_mutex_lock(&epmutex, 0, nonblock);
+ if (error)
+ goto error_tgt_fput;
+ full_check = 1;
if (is_file_epoll(tf.file)) {
error = -ELOOP;
if (ep_loop_check(ep, tf.file) != 0) {
@@ -2173,10 +2172,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
} else
list_add(&tf.file->f_tfile_llink,
&tfile_check_list);
- mutex_lock_nested(&ep->mtx, 0);
+ error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
+ if (error) {
+out_del:
+ list_del(&tf.file->f_tfile_llink);
+ goto error_tgt_fput;
+ }
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
- mutex_lock_nested(&tep->mtx, 1);
+ error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
+ if (error) {
+ mutex_unlock(&ep->mtx);
+ goto out_del;
+ }
}
}
}
@@ -2192,8 +2200,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
- epds.events |= EPOLLERR | EPOLLHUP;
- error = ep_insert(ep, &epds, tf.file, fd, full_check);
+ epds->events |= EPOLLERR | EPOLLHUP;
+ error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
if (full_check)
@@ -2208,8 +2216,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
case EPOLL_CTL_MOD:
if (epi) {
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
- epds.events |= EPOLLERR | EPOLLHUP;
- error = ep_modify(ep, epi, &epds);
+ epds->events |= EPOLLERR | EPOLLHUP;
+ error = ep_modify(ep, epi, epds);
}
} else
error = -ENOENT;
@@ -2232,6 +2240,23 @@ error_return:
}
/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set.
+ */
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
+ struct epoll_event __user *, event)
+{
+ struct epoll_event epds;
+
+ if (ep_op_has_event(op) &&
+ copy_from_user(&epds, event, sizeof(struct epoll_event)))
+ return -EFAULT;
+
+ return do_epoll_ctl(epfd, op, fd, &epds, false);
+}
+
+/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
diff --git a/fs/exec.c b/fs/exec.c
index 74d88dab98dd..db17be51b112 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -272,7 +272,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
goto err;
mm->stack_vm = mm->total_vm = 1;
- arch_bprm_mm_init(mm, vma);
up_write(&mm->mmap_sem);
bprm->p = vma->vm_end - sizeof(void *);
return 0;
@@ -761,6 +760,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
goto out_unlock;
BUG_ON(prev != vma);
+ if (unlikely(vm_flags & VM_EXEC)) {
+ pr_warn_once("process '%pD4' started with executable stack\n",
+ bprm->file);
+ }
+
/* Move stack pages down in memory. */
if (stack_shift) {
ret = shift_arg_pages(vma, stack_shift);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index bcffe25da2f0..4a4ab683250d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1073,9 +1073,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
if (EXT2_BLOCKS_PER_GROUP(sb) == 0)
goto cantfind_ext2;
- sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
- le32_to_cpu(es->s_first_data_block) - 1)
- / EXT2_BLOCKS_PER_GROUP(sb)) + 1;
+ sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
+ le32_to_cpu(es->s_first_data_block) - 1)
+ / EXT2_BLOCKS_PER_GROUP(sb)) + 1;
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
EXT2_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc_array (db_count,
@@ -1138,6 +1138,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
ext2_count_dirs(sb), GFP_KERNEL);
}
if (err) {
+ ret = err;
ext2_msg(sb, KERN_ERR, "error: insufficient memory");
goto failed_mount3;
}
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index ef42ab040905..2a592e38cdfe 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -4,12 +4,7 @@
# kernels after the removal of ext3 driver.
config EXT3_FS
tristate "The Extended 3 (ext3) filesystem"
- # These must match EXT4_FS selects...
select EXT4_FS
- select JBD2
- select CRC16
- select CRYPTO
- select CRYPTO_CRC32C
help
This config option is here only for backward compatibility. ext3
filesystem is now handled by the ext4 driver.
@@ -33,12 +28,12 @@ config EXT3_FS_SECURITY
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
- # Please update EXT3_FS selects when changing these
select JBD2
select CRC16
select CRYPTO
select CRYPTO_CRC32C
select FS_IOMAP
+ select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
help
This is the next generation of the ext3 filesystem.
@@ -108,7 +103,7 @@ config EXT4_DEBUG
echo 1 > /sys/module/ext4/parameters/mballoc_debug
config EXT4_KUNIT_TESTS
- bool "KUnit tests for ext4"
+ tristate "KUnit tests for ext4"
select EXT4_FS
depends on KUNIT
help
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 840b91d040f1..4ccb3c9189d8 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -13,5 +13,6 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
-ext4-$(CONFIG_EXT4_KUNIT_TESTS) += inode-test.o
+ext4-inode-test-objs += inode-test.o
+obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0b202e00d93f..5f993a411251 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
if (buffer_verified(bh))
goto verified;
if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
- desc, bh))) {
+ desc, bh) ||
+ ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
ext4_mark_group_bitmap_corrupted(sb, block_group,
@@ -505,7 +506,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
if (!desc)
return -EFSCORRUPTED;
wait_on_buffer(bh);
+ ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
if (!buffer_uptodate(bh)) {
+ ext4_set_errno(sb, EIO);
ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, (unsigned long long) bh->b_blocknr);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9f00fc0bf21d..1f340743c9a8 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -120,7 +120,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
- if (err && err != -ENOKEY)
+ if (err)
return err;
}
@@ -462,7 +462,6 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
new_fn->name_len = ent_name->len;
new_fn->file_type = dirent->file_type;
memcpy(new_fn->name, ent_name->name, ent_name->len);
- new_fn->name[ent_name->len] = 0;
while (*p) {
parent = *p;
@@ -672,9 +671,11 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
struct qstr qstr = {.name = str, .len = len };
- struct inode *inode = dentry->d_parent->d_inode;
+ const struct dentry *parent = READ_ONCE(dentry->d_parent);
+ const struct inode *inode = READ_ONCE(parent->d_inode);
- if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) {
+ if (!inode || !IS_CASEFOLDED(inode) ||
+ !EXT4_SB(inode->i_sb)->s_encoding) {
if (len != name->len)
return -1;
return memcmp(str, name->name, len);
@@ -687,10 +688,11 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
const struct unicode_map *um = sbi->s_encoding;
+ const struct inode *inode = READ_ONCE(dentry->d_inode);
unsigned char *norm;
int len, ret = 0;
- if (!IS_CASEFOLDED(dentry->d_inode) || !um)
+ if (!inode || !IS_CASEFOLDED(inode) || !um)
return 0;
norm = kmalloc(PATH_MAX, GFP_ATOMIC);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f8578caba40d..9a2ee2428ecc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1052,8 +1052,6 @@ struct ext4_inode_info {
/* allocation reservation info for delalloc */
/* In case of bigalloc, this refer to clusters rather than blocks */
unsigned int i_reserved_data_blocks;
- ext4_lblk_t i_da_metadata_calc_last_lblock;
- int i_da_metadata_calc_len;
/* pending cluster reservations for bigalloc file systems */
struct ext4_pending_tree i_pending_tree;
@@ -1343,7 +1341,8 @@ struct ext4_super_block {
__u8 s_lastcheck_hi;
__u8 s_first_error_time_hi;
__u8 s_last_error_time_hi;
- __u8 s_pad[2];
+ __u8 s_first_error_errcode;
+ __u8 s_last_error_errcode;
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
__le32 s_reserved[95]; /* Padding to the end of the block */
@@ -1556,6 +1555,9 @@ struct ext4_sb_info {
/* Barrier between changing inodes' journal flags and writepages ops. */
struct percpu_rw_semaphore s_journal_flag_rwsem;
struct dax_device *s_daxdev;
+#ifdef CONFIG_EXT4_DEBUG
+ unsigned long s_simulate_fail;
+#endif
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1575,6 +1577,66 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
}
/*
+ * Simulate_fail codes
+ */
+#define EXT4_SIM_BBITMAP_EIO 1
+#define EXT4_SIM_BBITMAP_CRC 2
+#define EXT4_SIM_IBITMAP_EIO 3
+#define EXT4_SIM_IBITMAP_CRC 4
+#define EXT4_SIM_INODE_EIO 5
+#define EXT4_SIM_INODE_CRC 6
+#define EXT4_SIM_DIRBLOCK_EIO 7
+#define EXT4_SIM_DIRBLOCK_CRC 8
+
+static inline bool ext4_simulate_fail(struct super_block *sb,
+ unsigned long code)
+{
+#ifdef CONFIG_EXT4_DEBUG
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (unlikely(sbi->s_simulate_fail == code)) {
+ sbi->s_simulate_fail = 0;
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline void ext4_simulate_fail_bh(struct super_block *sb,
+ struct buffer_head *bh,
+ unsigned long code)
+{
+ if (!IS_ERR(bh) && ext4_simulate_fail(sb, code))
+ clear_buffer_uptodate(bh);
+}
+
+/*
+ * Error number codes for s_{first,last}_error_errno
+ *
+ * Linux errno numbers are architecture specific, so we need to translate
+ * them into something which is architecture independent. We don't define
+ * codes for all errno's; just the ones which are most likely to be the cause
+ * of an ext4_error() call.
+ */
+#define EXT4_ERR_UNKNOWN 1
+#define EXT4_ERR_EIO 2
+#define EXT4_ERR_ENOMEM 3
+#define EXT4_ERR_EFSBADCRC 4
+#define EXT4_ERR_EFSCORRUPTED 5
+#define EXT4_ERR_ENOSPC 6
+#define EXT4_ERR_ENOKEY 7
+#define EXT4_ERR_EROFS 8
+#define EXT4_ERR_EFBIG 9
+#define EXT4_ERR_EEXIST 10
+#define EXT4_ERR_ERANGE 11
+#define EXT4_ERR_EOVERFLOW 12
+#define EXT4_ERR_EBUSY 13
+#define EXT4_ERR_ENOTDIR 14
+#define EXT4_ERR_ENOTEMPTY 15
+#define EXT4_ERR_ESHUTDOWN 16
+#define EXT4_ERR_EFAULT 17
+
+/*
* Inode dynamic state flags
*/
enum {
@@ -2628,7 +2690,6 @@ extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
@@ -2679,8 +2740,6 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
-extern void *ext4_kvmalloc(size_t size, gfp_t flags);
-extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -2688,6 +2747,7 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno,
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
ext4_group_t block_group,
unsigned int flags);
+extern void ext4_set_errno(struct super_block *sb, int err);
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
@@ -3254,7 +3314,6 @@ struct ext4_extent;
#define EXT_MAX_BLOCKS 0xffffffff
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
-extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
@@ -3271,14 +3330,9 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern int ext4_ext_calc_metadata_amount(struct inode *inode,
- ext4_lblk_t lblocks);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
-extern int ext4_can_extents_be_merged(struct inode *inode,
- struct ext4_extent *ex1,
- struct ext4_extent *ex2);
extern int ext4_ext_insert_extent(handle_t *, struct inode *,
struct ext4_ext_path **,
struct ext4_extent *, int);
@@ -3294,8 +3348,6 @@ extern int ext4_get_es_cache(struct inode *inode,
struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
-extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
-extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
struct inode *inode2, ext4_lblk_t lblk1,
ext4_lblk_t lblk2, ext4_lblk_t count,
@@ -3390,6 +3442,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
extern const struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 98bd0e9ee7df..1c216fcc202a 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -267,10 +267,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
0xffff);
}
-#define ext4_ext_dirty(handle, inode, path) \
- __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
- struct inode *inode, struct ext4_ext_path *path);
-
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d3b8cdea5df7..1f53d64e42a5 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,6 +7,28 @@
#include <trace/events/ext4.h>
+int ext4_inode_journal_mode(struct inode *inode)
+{
+ if (EXT4_JOURNAL(inode) == NULL)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ /* We do not support data journalling with delayed allocation */
+ if (!S_ISREG(inode->i_mode) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
+ test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+ !test_opt(inode->i_sb, DELALLOC))) {
+ /* We do not support data journalling for encrypted data */
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ }
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ BUG();
+}
+
/* Just increment the non-pointer handle value */
static handle_t *ext4_get_nojournal(void)
{
@@ -58,6 +80,7 @@ static int ext4_journal_check_start(struct super_block *sb)
* take the FS itself readonly cleanly.
*/
if (journal && is_journal_aborted(journal)) {
+ ext4_set_errno(sb, -journal->j_errno);
ext4_abort(sb, "Detected aborted journal");
return -EROFS;
}
@@ -249,6 +272,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
if (err) {
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
+ ext4_set_errno(inode->i_sb, -err);
__ext4_abort(inode->i_sb, where, line,
"error %d when attempting revoke", err);
}
@@ -320,6 +344,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
es = EXT4_SB(inode->i_sb)->s_es;
es->s_last_error_block =
cpu_to_le64(bh->b_blocknr);
+ ext4_set_errno(inode->i_sb, EIO);
ext4_error_inode(inode, where, line,
bh->b_blocknr,
"IO error syncing itable block");
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a6b9b66dbfad..7ea4f6fa173b 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -463,27 +463,7 @@ int ext4_force_commit(struct super_block *sb);
#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */
#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */
-static inline int ext4_inode_journal_mode(struct inode *inode)
-{
- if (EXT4_JOURNAL(inode) == NULL)
- return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
- /* We do not support data journalling with delayed allocation */
- if (!S_ISREG(inode->i_mode) ||
- ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
- test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
- (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))) {
- /* We do not support data journalling for encrypted data */
- if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
- return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
- return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
- }
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
- return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
- BUG();
-}
+int ext4_inode_journal_mode(struct inode *inode);
static inline int ext4_should_journal_data(struct inode *inode)
{
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0e8708b77da6..954013d6076b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -161,8 +161,9 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
* - ENOMEM
* - EIO
*/
-int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
- struct inode *inode, struct ext4_ext_path *path)
+static int __ext4_ext_dirty(const char *where, unsigned int line,
+ handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
{
int err;
@@ -179,6 +180,9 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
return err;
}
+#define ext4_ext_dirty(handle, inode, path) \
+ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+
static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t block)
@@ -309,53 +313,6 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
}
-/*
- * Calculate the number of metadata blocks needed
- * to allocate @blocks
- * Worse case is one block per extent
- */
-int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- int idxs;
-
- idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
- / sizeof(struct ext4_extent_idx));
-
- /*
- * If the new delayed allocation block is contiguous with the
- * previous da block, it can share index blocks with the
- * previous block, so we only need to allocate a new index
- * block every idxs leaf blocks. At ldxs**2 blocks, we need
- * an additional index block, and at ldxs**3 blocks, yet
- * another index blocks.
- */
- if (ei->i_da_metadata_calc_len &&
- ei->i_da_metadata_calc_last_lblock+1 == lblock) {
- int num = 0;
-
- if ((ei->i_da_metadata_calc_len % idxs) == 0)
- num++;
- if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
- num++;
- if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
- num++;
- ei->i_da_metadata_calc_len = 0;
- } else
- ei->i_da_metadata_calc_len++;
- ei->i_da_metadata_calc_last_lblock++;
- return num;
- }
-
- /*
- * In the worst case we need a new set of index blocks at
- * every level of the inode's extent tree.
- */
- ei->i_da_metadata_calc_len = 1;
- ei->i_da_metadata_calc_last_lblock = lblock;
- return ext_depth(inode) + 1;
-}
-
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
@@ -492,6 +449,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
return 0;
corrupted:
+ ext4_set_errno(inode->i_sb, -err);
ext4_error_inode(inode, function, line, 0,
"pblk %llu bad header/extent: %s - magic %x, "
"entries %u, max %u(%u), depth %u(%u)",
@@ -510,6 +468,30 @@ int ext4_ext_check_inode(struct inode *inode)
return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}
+static void ext4_cache_extents(struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+ ext4_lblk_t prev = 0;
+ int i;
+
+ for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+ unsigned int status = EXTENT_STATUS_WRITTEN;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+ int len = ext4_ext_get_actual_len(ex);
+
+ if (prev && (prev != lblk))
+ ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
+ EXTENT_STATUS_HOLE);
+
+ if (ext4_ext_is_unwritten(ex))
+ status = EXTENT_STATUS_UNWRITTEN;
+ ext4_es_cache_extent(inode, lblk, len,
+ ext4_ext_pblock(ex), status);
+ prev = lblk + len;
+ }
+}
+
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
struct inode *inode, ext4_fsblk_t pblk, int depth,
@@ -544,26 +526,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
*/
if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
struct ext4_extent_header *eh = ext_block_hdr(bh);
- struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
- ext4_lblk_t prev = 0;
- int i;
-
- for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
- unsigned int status = EXTENT_STATUS_WRITTEN;
- ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
- int len = ext4_ext_get_actual_len(ex);
-
- if (prev && (prev != lblk))
- ext4_es_cache_extent(inode, prev,
- lblk - prev, ~0,
- EXTENT_STATUS_HOLE);
-
- if (ext4_ext_is_unwritten(ex))
- status = EXTENT_STATUS_UNWRITTEN;
- ext4_es_cache_extent(inode, lblk, len,
- ext4_ext_pblock(ex), status);
- prev = lblk + len;
- }
+ ext4_cache_extents(inode, eh);
}
return bh;
errout:
@@ -649,8 +612,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
ext_debug("path:");
for (k = 0; k <= l; k++, path++) {
if (path->p_idx) {
- ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
- ext4_idx_pblock(path->p_idx));
+ ext_debug(" %d->%llu",
+ le32_to_cpu(path->p_idx->ei_block),
+ ext4_idx_pblock(path->p_idx));
} else if (path->p_ext) {
ext_debug(" %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
@@ -731,11 +695,12 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path)
if (!path)
return;
depth = path->p_depth;
- for (i = 0; i <= depth; i++, path++)
+ for (i = 0; i <= depth; i++, path++) {
if (path->p_bh) {
brelse(path->p_bh);
path->p_bh = NULL;
}
+ }
}
/*
@@ -777,8 +742,8 @@ ext4_ext_binsearch_idx(struct inode *inode,
chix = ix = EXT_FIRST_INDEX(eh);
for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
- if (k != 0 &&
- le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
+ if (k != 0 && le32_to_cpu(ix->ei_block) <=
+ le32_to_cpu(ix[-1].ei_block)) {
printk(KERN_DEBUG "k=%d, ix=0x%p, "
"first=0x%p\n", k,
ix, EXT_FIRST_INDEX(eh));
@@ -911,6 +876,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
path[0].p_bh = NULL;
i = depth;
+ if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
+ ext4_cache_extents(inode, eh);
/* walk through the tree */
while (i) {
ext_debug("depth %d: num %d, max %d\n",
@@ -1632,17 +1599,16 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
return EXT_MAX_BLOCKS;
while (depth >= 0) {
+ struct ext4_ext_path *p = &path[depth];
+
if (depth == path->p_depth) {
/* leaf */
- if (path[depth].p_ext &&
- path[depth].p_ext !=
- EXT_LAST_EXTENT(path[depth].p_hdr))
- return le32_to_cpu(path[depth].p_ext[1].ee_block);
+ if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
+ return le32_to_cpu(p->p_ext[1].ee_block);
} else {
/* index */
- if (path[depth].p_idx !=
- EXT_LAST_INDEX(path[depth].p_hdr))
- return le32_to_cpu(path[depth].p_idx[1].ei_block);
+ if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
+ return le32_to_cpu(p->p_idx[1].ei_block);
}
depth--;
}
@@ -1742,9 +1708,9 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
return err;
}
-int
-ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
- struct ext4_extent *ex2)
+static int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2)
{
unsigned short ext1_ee_len, ext2_ee_len;
@@ -1758,11 +1724,6 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
le32_to_cpu(ex2->ee_block))
return 0;
- /*
- * To allow future support for preallocated extents to be added
- * as an RO_COMPAT feature, refuse to merge to extents if
- * this can result in the top bit of ee_len being set.
- */
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
return 0;
@@ -1870,13 +1831,14 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
}
/*
- * This function tries to merge the @ex extent to neighbours in the tree.
- * return 1 if merge left else 0.
+ * This function tries to merge the @ex extent to neighbours in the tree, then
+ * tries to collapse the extent tree into the inode.
*/
static void ext4_ext_try_to_merge(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *ex) {
+ struct ext4_extent *ex)
+{
struct ext4_extent_header *eh;
unsigned int depth;
int merge_done = 0;
@@ -3718,9 +3680,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
- if (IS_ENCRYPTED(inode))
- max_zeroout = 0;
-
/*
* five cases:
* 1. split the extent into three extents.
@@ -4706,6 +4665,10 @@ retry:
return ret > 0 ? ret2 : ret;
}
+static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+
+static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+
static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
@@ -4723,9 +4686,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
trace_ext4_zero_range(inode, offset, len, mode);
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
/* Call ext4_force_commit to flush all data in case of data=journal. */
if (ext4_should_journal_data(inode)) {
ret = ext4_force_commit(inode->i_sb);
@@ -4765,7 +4725,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > i_size_read(inode) ||
+ (offset + len > inode->i_size ||
offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
ret = inode_newsize_ok(inode, new_size);
@@ -4849,7 +4809,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* Mark that we allocate beyond EOF so the subsequent truncate
* can proceed even if the new size is the same as i_size.
*/
- if ((offset + len) > i_size_read(inode))
+ if (offset + len > inode->i_size)
ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
ext4_mark_inode_dirty(handle, inode);
@@ -4890,14 +4850,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
* range since we would need to re-encrypt blocks with a
* different IV or XTS tweak (which are based on the logical
* block number).
- *
- * XXX It's not clear why zero range isn't working, but we'll
- * leave it disabled for encrypted inodes for now. This is a
- * bug we should fix....
*/
if (IS_ENCRYPTED(inode) &&
- (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
- FALLOC_FL_ZERO_RANGE)))
+ (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
/* Return error if mode is not supported */
@@ -4941,7 +4896,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > i_size_read(inode) ||
+ (offset + len > inode->i_size ||
offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
ret = inode_newsize_ok(inode, new_size);
@@ -5268,7 +5223,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
{
int depth, err = 0;
struct ext4_extent *ex_start, *ex_last;
- bool update = 0;
+ bool update = false;
depth = path->p_depth;
while (depth >= 0) {
@@ -5284,7 +5239,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
goto out;
if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
- update = 1;
+ update = true;
while (ex_start <= ex_last) {
if (SHIFT == SHIFT_LEFT) {
@@ -5472,7 +5427,7 @@ out:
* This implements the fallocate's collapse range functionality for ext4
* Returns: 0 and non-zero on error.
*/
-int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
{
struct super_block *sb = inode->i_sb;
ext4_lblk_t punch_start, punch_stop;
@@ -5489,12 +5444,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
- /* Collapse range works only on fs block size aligned offsets. */
- if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
- len & (EXT4_CLUSTER_SIZE(sb) - 1))
- return -EINVAL;
-
- if (!S_ISREG(inode->i_mode))
+ /* Collapse range works only on fs cluster size aligned regions. */
+ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
trace_ext4_collapse_range(inode, offset, len);
@@ -5514,7 +5465,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
- if (offset + len >= i_size_read(inode)) {
+ if (offset + len >= inode->i_size) {
ret = -EINVAL;
goto out_mutex;
}
@@ -5592,7 +5543,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
goto out_stop;
}
- new_size = i_size_read(inode) - len;
+ new_size = inode->i_size - len;
i_size_write(inode, new_size);
EXT4_I(inode)->i_disksize = new_size;
@@ -5620,7 +5571,7 @@ out_mutex:
* by len bytes.
* Returns 0 on success, error otherwise.
*/
-int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
{
struct super_block *sb = inode->i_sb;
handle_t *handle;
@@ -5639,14 +5590,10 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
- /* Insert range works only on fs block size aligned offsets. */
- if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
- len & (EXT4_CLUSTER_SIZE(sb) - 1))
+ /* Insert range works only on fs cluster size aligned regions. */
+ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
- if (!S_ISREG(inode->i_mode))
- return -EOPNOTSUPP;
-
trace_ext4_insert_range(inode, offset, len);
offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -5666,14 +5613,14 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- /* Check for wrap through zero */
- if (inode->i_size + len > inode->i_sb->s_maxbytes) {
+ /* Check whether the maximum file size would be exceeded */
+ if (len > inode->i_sb->s_maxbytes - inode->i_size) {
ret = -EFBIG;
goto out_mutex;
}
- /* Offset should be less than i_size */
- if (offset >= i_size_read(inode)) {
+ /* Offset must be less than i_size */
+ if (offset >= inode->i_size) {
ret = -EINVAL;
goto out_mutex;
}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 825313c59752..4ec30a798260 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -209,6 +209,12 @@ static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
return es->es_pblk & ~ES_MASK;
}
+static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es)
+{
+ ext4_fsblk_t pblock = ext4_es_pblock(es);
+ return pblock == ~ES_MASK ? 0 : pblock;
+}
+
static inline void ext4_es_store_pblock(struct extent_status *es,
ext4_fsblk_t pb)
{
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6a7293a5cda2..5f225881176b 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -88,9 +88,10 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
- if (!inode_trylock_shared(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
return -EAGAIN;
+ } else {
inode_lock_shared(inode);
}
/*
@@ -165,19 +166,25 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
* threads are at work on the same unwritten block, they must be synchronized
* or one thread will zero the other's data, causing corruption.
*/
-static int
-ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
+static bool
+ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
struct super_block *sb = inode->i_sb;
- int blockmask = sb->s_blocksize - 1;
-
- if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
- return 0;
+ unsigned long blockmask = sb->s_blocksize - 1;
if ((pos | iov_iter_alignment(from)) & blockmask)
- return 1;
+ return true;
- return 0;
+ return false;
+}
+
+static bool
+ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
+{
+ if (offset + len > i_size_read(inode) ||
+ offset + len > EXT4_I(inode)->i_disksize)
+ return true;
+ return false;
}
/* Is IO overwriting allocated and initialized blocks? */
@@ -203,7 +210,8 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}
-static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
+ struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
@@ -227,11 +235,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
}
+ return iov_iter_count(from);
+}
+
+static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+ ssize_t ret, count;
+
+ count = ext4_generic_write_checks(iocb, from);
+ if (count <= 0)
+ return count;
+
ret = file_modified(iocb->ki_filp);
if (ret)
return ret;
-
- return iov_iter_count(from);
+ return count;
}
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
@@ -363,62 +381,137 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
.end_io = ext4_dio_write_end_io,
};
+/*
+ * The intention here is to start with shared lock acquired then see if any
+ * condition requires an exclusive inode lock. If yes, then we restart the
+ * whole operation by releasing the shared lock and acquiring exclusive lock.
+ *
+ * - For unaligned_io we never take shared lock as it may cause data corruption
+ * when two unaligned IO tries to modify the same block e.g. while zeroing.
+ *
+ * - For extending writes case we don't take the shared lock, since it requires
+ * updating inode i_disksize and/or orphan handling with exclusive lock.
+ *
+ * - shared locking will only be true mostly with overwrites. Otherwise we will
+ * switch to exclusive i_rwsem lock.
+ */
+static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
+ bool *ilock_shared, bool *extend)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ loff_t offset;
+ size_t count;
+ ssize_t ret;
+
+restart:
+ ret = ext4_generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
+ offset = iocb->ki_pos;
+ count = ret;
+ if (ext4_extending_io(inode, offset, count))
+ *extend = true;
+ /*
+ * Determine whether the IO operation will overwrite allocated
+ * and initialized blocks.
+ * We need exclusive i_rwsem for changing security info
+ * in file_modified().
+ */
+ if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
+ !ext4_overwrite_io(inode, offset, count))) {
+ inode_unlock_shared(inode);
+ *ilock_shared = false;
+ inode_lock(inode);
+ goto restart;
+ }
+
+ ret = file_modified(file);
+ if (ret < 0)
+ goto out;
+
+ return count;
+out:
+ if (*ilock_shared)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+ return ret;
+}
+
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t ret;
- size_t count;
- loff_t offset;
handle_t *handle;
struct inode *inode = file_inode(iocb->ki_filp);
- bool extend = false, overwrite = false, unaligned_aio = false;
+ loff_t offset = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
+ const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
+ bool extend = false, unaligned_io = false;
+ bool ilock_shared = true;
+
+ /*
+ * We initially start with shared inode lock unless it is
+ * unaligned IO which needs exclusive lock anyways.
+ */
+ if (ext4_unaligned_io(inode, from, offset)) {
+ unaligned_io = true;
+ ilock_shared = false;
+ }
+ /*
+ * Quick check here without any i_rwsem lock to see if it is extending
+ * IO. A more reliable check is done in ext4_dio_write_checks() with
+ * proper locking in place.
+ */
+ if (offset + count > i_size_read(inode))
+ ilock_shared = false;
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock(inode))
- return -EAGAIN;
+ if (ilock_shared) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ }
} else {
- inode_lock(inode);
+ if (ilock_shared)
+ inode_lock_shared(inode);
+ else
+ inode_lock(inode);
}
+ /* Fallback to buffered I/O if the inode does not support direct I/O. */
if (!ext4_dio_supported(inode)) {
- inode_unlock(inode);
- /*
- * Fallback to buffered I/O if the inode does not support
- * direct I/O.
- */
+ if (ilock_shared)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
return ext4_buffered_write_iter(iocb, from);
}
- ret = ext4_write_checks(iocb, from);
- if (ret <= 0) {
- inode_unlock(inode);
+ ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+ if (ret <= 0)
return ret;
- }
- /*
- * Unaligned asynchronous direct I/O must be serialized among each
- * other as the zeroing of partial blocks of two competing unaligned
- * asynchronous direct I/O writes can result in data corruption.
- */
offset = iocb->ki_pos;
- count = iov_iter_count(from);
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
- !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
- unaligned_aio = true;
- inode_dio_wait(inode);
- }
+ count = ret;
/*
- * Determine whether the I/O will overwrite allocated and initialized
- * blocks. If so, check to see whether it is possible to take the
- * dioread_nolock path.
+ * Unaligned direct IO must be serialized among each other as zeroing
+ * of partial blocks of two competing unaligned IOs can result in data
+ * corruption.
+ *
+ * So we make sure we don't allow any unaligned IO in flight.
+ * For IOs where we need not wait (like unaligned non-AIO DIO),
+ * below inode_dio_wait() may anyway become a no-op, since we start
+ * with exclusive lock.
*/
- if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
- ext4_should_dioread_nolock(inode)) {
- overwrite = true;
- downgrade_write(&inode->i_rwsem);
- }
+ if (unaligned_io)
+ inode_dio_wait(inode);
- if (offset + count > EXT4_I(inode)->i_disksize) {
+ if (extend) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
@@ -431,18 +524,19 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- extend = true;
ext4_journal_stop(handle);
}
- ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
- is_sync_kiocb(iocb) || unaligned_aio || extend);
+ if (ilock_shared)
+ iomap_ops = &ext4_iomap_overwrite_ops;
+ ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
+ is_sync_kiocb(iocb) || unaligned_io || extend);
if (extend)
ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
- if (overwrite)
+ if (ilock_shared)
inode_unlock_shared(inode);
else
inode_unlock(inode);
@@ -487,9 +581,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
bool extend = false;
struct inode *inode = file_inode(iocb->ki_filp);
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
return -EAGAIN;
+ } else {
inode_lock(inode);
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8ca4a23129aa..c66e8f9451a2 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -94,7 +94,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
goto verified;
blk = ext4_inode_bitmap(sb, desc);
if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
- EXT4_INODES_PER_GROUP(sb) / 8)) {
+ EXT4_INODES_PER_GROUP(sb) / 8) ||
+ ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
"inode_bitmap = %llu", block_group, blk);
@@ -192,8 +193,10 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
get_bh(bh);
submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
+ ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
if (!buffer_uptodate(bh)) {
put_bh(bh);
+ ext4_set_errno(sb, EIO);
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
@@ -1223,6 +1226,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
+ ext4_set_errno(sb, -err);
ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
ino, err);
return inode;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 3a4ab70fe9e0..569fc68e8975 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -660,32 +660,6 @@ out:
}
/*
- * Calculate the number of metadata blocks need to reserve
- * to allocate a new block at @lblocks for non extent file based file
- */
-int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
- int blk_bits;
-
- if (lblock < EXT4_NDIR_BLOCKS)
- return 0;
-
- lblock -= EXT4_NDIR_BLOCKS;
-
- if (ei->i_da_metadata_calc_len &&
- (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
- ei->i_da_metadata_calc_len++;
- return 0;
- }
- ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
- ei->i_da_metadata_calc_len = 1;
- blk_bits = order_base_2(lblock);
- return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
-}
-
-/*
* Calculate number of indirect blocks touched by mapping @nrblocks logically
* contiguous blocks
*/
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 2fec62d764fa..fad82d08fca5 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -98,6 +98,7 @@ int ext4_get_max_inline_size(struct inode *inode)
error = ext4_get_inode_loc(inode, &iloc);
if (error) {
+ ext4_set_errno(inode->i_sb, -error);
ext4_error_inode(inode, __func__, __LINE__, 0,
"can't get inode location %lu",
inode->i_ino);
@@ -849,7 +850,7 @@ out:
/*
* Prepare the write for the inline data.
- * If the the data can be written into the inode, we just read
+ * If the data can be written into the inode, we just read
* the page and make it uptodate, and start the journal.
* Otherwise read the page, makes it dirty so that it can be
* handle in writepages(the i_disksize update is left to the
@@ -1761,6 +1762,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
+ ext4_set_errno(dir->i_sb, -err);
EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
err, dir->i_ino);
return true;
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
index bbce1c328d85..d62d802c9c12 100644
--- a/fs/ext4/inode-test.c
+++ b/fs/ext4/inode-test.c
@@ -269,4 +269,6 @@ static struct kunit_suite ext4_inode_test_suite = {
.test_cases = ext4_inode_test_cases,
};
-kunit_test_suite(ext4_inode_test_suite);
+kunit_test_suites(&ext4_inode_test_suite);
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 629a25d999f0..3313168b680f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -48,8 +48,6 @@
#include <trace/events/ext4.h>
-#define MPAGE_DA_EXTENT_TAIL 0x01
-
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
@@ -271,6 +269,7 @@ void ext4_evict_inode(struct inode *inode)
if (inode->i_blocks) {
err = ext4_truncate(inode);
if (err) {
+ ext4_set_errno(inode->i_sb, -err);
ext4_error(inode->i_sb,
"couldn't truncate inode %lu (err %d)",
inode->i_ino, err);
@@ -402,7 +401,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
{
int ret;
- if (IS_ENCRYPTED(inode))
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return fscrypt_zeroout_range(inode, lblk, pblk, len);
ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -2478,10 +2477,12 @@ update_disksize:
EXT4_I(inode)->i_disksize = disksize;
up_write(&EXT4_I(inode)->i_data_sem);
err2 = ext4_mark_inode_dirty(handle, inode);
- if (err2)
+ if (err2) {
+ ext4_set_errno(inode->i_sb, -err2);
ext4_error(inode->i_sb,
"Failed to mark inode %lu dirty",
inode->i_ino);
+ }
if (!err)
err = err2;
}
@@ -3448,6 +3449,22 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return 0;
}
+static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ int ret;
+
+ /*
+ * Even for writes we don't need to allocate blocks, so just pretend
+ * we are reading to save overhead of starting a transaction.
+ */
+ flags &= ~IOMAP_WRITE;
+ ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
+ WARN_ON_ONCE(iomap->type != IOMAP_MAPPED);
+ return ret;
+}
+
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
@@ -3469,6 +3486,11 @@ const struct iomap_ops ext4_iomap_ops = {
.iomap_end = ext4_iomap_end,
};
+const struct iomap_ops ext4_iomap_overwrite_ops = {
+ .iomap_begin = ext4_iomap_overwrite_begin,
+ .iomap_end = ext4_iomap_end,
+};
+
static bool ext4_iomap_is_delalloc(struct inode *inode,
struct ext4_map_blocks *map)
{
@@ -3701,8 +3723,12 @@ static int __ext4_block_zero_page_range(handle_t *handle,
if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
- WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks(
- page, blocksize, bh_offset(bh)));
+ err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+ bh_offset(bh));
+ if (err) {
+ clear_buffer_uptodate(bh);
+ goto unlock;
+ }
}
}
if (ext4_should_journal_data(inode)) {
@@ -3912,9 +3938,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
unsigned int credits;
int ret = 0;
- if (!S_ISREG(inode->i_mode))
- return -EOPNOTSUPP;
-
trace_ext4_punch_hole(inode, offset, length, 0);
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
@@ -4240,6 +4263,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
bh = sb_getblk(sb, block);
if (unlikely(!bh))
return -ENOMEM;
+ if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
+ goto simulate_eio;
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -4338,6 +4363,8 @@ make_io:
blk_finish_plug(&plug);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
+ simulate_eio:
+ ext4_set_errno(inode->i_sb, EIO);
EXT4_ERROR_INODE_BLOCK(inode, block,
"unable to read itable block");
brelse(bh);
@@ -4551,7 +4578,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
sizeof(gen));
}
- if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+ if (!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+ ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) {
+ ext4_set_errno(inode->i_sb, EFSBADCRC);
ext4_error_inode(inode, function, line, 0,
"iget: checksum invalid");
ret = -EFSBADCRC;
@@ -5090,6 +5119,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+ ext4_set_errno(inode->i_sb, EIO);
EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
"IO error syncing inode");
err = -EIO;
@@ -5368,7 +5398,8 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int flags;
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
+ if ((request_mask & STATX_BTIME) &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = ei->i_crtime.tv_sec;
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e8870fff8224..a0ec750018dd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1377,6 +1377,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
+ case EXT4_IOC_FSGETXATTR:
+ case EXT4_IOC_FSSETXATTR:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a3e2767bdf2f..f64838187559 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3895,6 +3895,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
+ ext4_set_errno(sb, -err);
ext4_error(sb, "Error %d reading block bitmap for %u",
err, group);
return 0;
@@ -4063,6 +4064,7 @@ repeat:
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
+ ext4_set_errno(sb, -err);
ext4_error(sb, "Error %d loading buddy information for %u",
err, group);
continue;
@@ -4071,6 +4073,7 @@ repeat:
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
+ ext4_set_errno(sb, -err);
ext4_error(sb, "Error %d reading block bitmap for %u",
err, group);
ext4_mb_unload_buddy(&e4b);
@@ -4325,6 +4328,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
+ ext4_set_errno(sb, -err);
ext4_error(sb, "Error %d loading buddy information for %u",
err, group);
continue;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 2305b4374fd3..1c44b1a32001 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -173,8 +173,10 @@ static int kmmpd(void *data)
* (s_mmp_update_interval * 60) seconds.
*/
if (retval) {
- if ((failed_writes % 60) == 0)
+ if ((failed_writes % 60) == 0) {
+ ext4_set_errno(sb, -retval);
ext4_error(sb, "Error writing to MMP block");
+ }
failed_writes++;
}
@@ -205,6 +207,7 @@ static int kmmpd(void *data)
retval = read_mmp_block(sb, &bh_check, mmp_block);
if (retval) {
+ ext4_set_errno(sb, -retval);
ext4_error(sb, "error reading MMP data: %d",
retval);
goto exit_thread;
@@ -218,6 +221,7 @@ static int kmmpd(void *data)
"Error while updating MMP info. "
"The filesystem seems to have been"
" multiply mounted.");
+ ext4_set_errno(sb, EBUSY);
ext4_error(sb, "abort");
put_bh(bh_check);
retval = -EBUSY;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1cb42d940784..129d2ebae00d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -109,7 +109,10 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
- bh = ext4_bread(NULL, inode, block, 0);
+ if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
+ bh = ERR_PTR(-EIO);
+ else
+ bh = ext4_bread(NULL, inode, block, 0);
if (IS_ERR(bh)) {
__ext4_warning(inode->i_sb, func, line,
"inode #%lu: lblock %lu: comm %s: "
@@ -153,9 +156,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
* caller is sure it should be an index block.
*/
if (is_dx_block && type == INDEX) {
- if (ext4_dx_csum_verify(inode, dirent))
+ if (ext4_dx_csum_verify(inode, dirent) &&
+ !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
set_buffer_verified(bh);
else {
+ ext4_set_errno(inode->i_sb, EFSBADCRC);
ext4_error_inode(inode, func, line, block,
"Directory index failed checksum");
brelse(bh);
@@ -163,9 +168,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
}
}
if (!is_dx_block) {
- if (ext4_dirblock_csum_verify(inode, bh))
+ if (ext4_dirblock_csum_verify(inode, bh) &&
+ !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
set_buffer_verified(bh);
else {
+ ext4_set_errno(inode->i_sb, EFSBADCRC);
ext4_error_inode(inode, func, line, block,
"Directory block failed checksum");
brelse(bh);
@@ -1002,7 +1009,6 @@ static int htree_dirblock_to_tree(struct file *dir_file,
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
-#ifdef CONFIG_FS_ENCRYPTION
/* Check if the directory is encrypted */
if (IS_ENCRYPTED(dir)) {
err = fscrypt_get_encryption_info(dir);
@@ -1017,7 +1023,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
return err;
}
}
-#endif
+
for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size,
@@ -1065,9 +1071,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
}
errout:
brelse(bh);
-#ifdef CONFIG_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fname_crypto_str);
-#endif
return count;
}
@@ -1527,6 +1531,7 @@ restart:
goto next;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
+ ext4_set_errno(sb, EIO);
EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
(unsigned long) block);
brelse(bh);
@@ -1537,6 +1542,7 @@ restart:
!is_dx_internal_node(dir, block,
(struct ext4_dir_entry *)bh->b_data) &&
!ext4_dirblock_csum_verify(dir, bh)) {
+ ext4_set_errno(sb, EFSBADCRC);
EXT4_ERROR_INODE(dir, "checksumming directory "
"block %lu", (unsigned long)block);
brelse(bh);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 24aeedb8fc75..68b39e75446a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -512,17 +512,26 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
gfp_t gfp_flags = GFP_NOFS;
unsigned int enc_bytes = round_up(len, i_blocksize(inode));
+ /*
+ * Since bounce page allocation uses a mempool, we can only use
+ * a waiting mask (i.e. request guaranteed allocation) on the
+ * first page of the bio. Otherwise it can deadlock.
+ */
+ if (io->io_bio)
+ gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
retry_encrypt:
bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes,
0, gfp_flags);
if (IS_ERR(bounce_page)) {
ret = PTR_ERR(bounce_page);
- if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
- if (io->io_bio) {
+ if (ret == -ENOMEM &&
+ (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
+ gfp_flags = GFP_NOFS;
+ if (io->io_bio)
ext4_io_submit(io);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- }
- gfp_flags |= __GFP_NOFAIL;
+ else
+ gfp_flags |= __GFP_NOFAIL;
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry_encrypt;
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index fef7755300c3..c1769afbf799 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -57,6 +57,7 @@ enum bio_post_read_step {
STEP_INITIAL = 0,
STEP_DECRYPT,
STEP_VERITY,
+ STEP_MAX,
};
struct bio_post_read_ctx {
@@ -106,10 +107,22 @@ static void verity_work(struct work_struct *work)
{
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
+ struct bio *bio = ctx->bio;
- fsverity_verify_bio(ctx->bio);
+ /*
+ * fsverity_verify_bio() may call readpages() again, and although verity
+ * will be disabled for that, decryption may still be needed, causing
+ * another bio_post_read_ctx to be allocated. So to guarantee that
+ * mempool_alloc() never deadlocks we must free the current ctx first.
+ * This is safe because verity is the last post-read step.
+ */
+ BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX);
+ mempool_free(ctx, bio_post_read_ctx_pool);
+ bio->bi_private = NULL;
- bio_post_read_processing(ctx);
+ fsverity_verify_bio(bio);
+
+ __read_end_io(bio);
}
static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
@@ -176,12 +189,11 @@ static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
}
-static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode,
- struct bio *bio,
- pgoff_t first_idx)
+static void ext4_set_bio_post_read_ctx(struct bio *bio,
+ const struct inode *inode,
+ pgoff_t first_idx)
{
unsigned int post_read_steps = 0;
- struct bio_post_read_ctx *ctx = NULL;
if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
post_read_steps |= 1 << STEP_DECRYPT;
@@ -190,14 +202,14 @@ static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode,
post_read_steps |= 1 << STEP_VERITY;
if (post_read_steps) {
- ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
- if (!ctx)
- return ERR_PTR(-ENOMEM);
+ /* Due to the mempool, this never fails. */
+ struct bio_post_read_ctx *ctx =
+ mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
+
ctx->bio = bio;
ctx->enabled_steps = post_read_steps;
bio->bi_private = ctx;
}
- return ctx;
}
static inline loff_t ext4_readpage_limit(struct inode *inode)
@@ -358,24 +370,16 @@ int ext4_mpage_readpages(struct address_space *mapping,
bio = NULL;
}
if (bio == NULL) {
- struct bio_post_read_ctx *ctx;
-
/*
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
*/
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
- ctx = get_bio_post_read_ctx(inode, bio, page->index);
- if (IS_ERR(ctx)) {
- bio_put(bio);
- bio = NULL;
- goto set_error_page;
- }
+ ext4_set_bio_post_read_ctx(bio, inode, page->index);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
- bio->bi_private = ctx;
bio_set_op_attrs(bio, REQ_OP_READ,
is_readahead ? REQ_RAHEAD : 0);
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a8c0f2b5b6e1..86a2500ed292 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -824,9 +824,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
if (unlikely(err))
goto errout;
- n_group_desc = ext4_kvmalloc((gdb_num + 1) *
- sizeof(struct buffer_head *),
- GFP_NOFS);
+ n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+ GFP_KERNEL);
if (!n_group_desc) {
err = -ENOMEM;
ext4_warning(sb, "not enough memory for %lu groups",
@@ -900,9 +899,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
gdb_bh = ext4_sb_bread(sb, gdblock, 0);
if (IS_ERR(gdb_bh))
return PTR_ERR(gdb_bh);
- n_group_desc = ext4_kvmalloc((gdb_num + 1) *
- sizeof(struct buffer_head *),
- GFP_NOFS);
+ n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+ GFP_KERNEL);
if (!n_group_desc) {
brelse(gdb_bh);
err = -ENOMEM;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2937a8873fe1..88b213bd32bc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -154,7 +154,7 @@ ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
if (bh == NULL)
return ERR_PTR(-ENOMEM);
- if (buffer_uptodate(bh))
+ if (ext4_buffer_uptodate(bh))
return bh;
ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
wait_on_buffer(bh);
@@ -204,26 +204,6 @@ void ext4_superblock_csum_set(struct super_block *sb)
es->s_checksum = ext4_superblock_csum(sb, es);
}
-void *ext4_kvmalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kmalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags, PAGE_KERNEL);
- return ret;
-}
-
-void *ext4_kvzalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kzalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
- return ret;
-}
-
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg)
{
@@ -367,6 +347,8 @@ static void __save_error_info(struct super_block *sb, const char *func,
ext4_update_tstamp(es, s_last_error_time);
strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
es->s_last_error_line = cpu_to_le32(line);
+ if (es->s_last_error_errcode == 0)
+ es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED;
if (!es->s_first_error_time) {
es->s_first_error_time = es->s_last_error_time;
es->s_first_error_time_hi = es->s_last_error_time_hi;
@@ -375,6 +357,7 @@ static void __save_error_info(struct super_block *sb, const char *func,
es->s_first_error_line = cpu_to_le32(line);
es->s_first_error_ino = es->s_last_error_ino;
es->s_first_error_block = es->s_last_error_block;
+ es->s_first_error_errcode = es->s_last_error_errcode;
}
/*
* Start the daily error reporting function if it hasn't been
@@ -631,6 +614,66 @@ const char *ext4_decode_error(struct super_block *sb, int errno,
return errstr;
}
+void ext4_set_errno(struct super_block *sb, int err)
+{
+ if (err < 0)
+ err = -err;
+
+ switch (err) {
+ case EIO:
+ err = EXT4_ERR_EIO;
+ break;
+ case ENOMEM:
+ err = EXT4_ERR_ENOMEM;
+ break;
+ case EFSBADCRC:
+ err = EXT4_ERR_EFSBADCRC;
+ break;
+ case EFSCORRUPTED:
+ err = EXT4_ERR_EFSCORRUPTED;
+ break;
+ case ENOSPC:
+ err = EXT4_ERR_ENOSPC;
+ break;
+ case ENOKEY:
+ err = EXT4_ERR_ENOKEY;
+ break;
+ case EROFS:
+ err = EXT4_ERR_EROFS;
+ break;
+ case EFBIG:
+ err = EXT4_ERR_EFBIG;
+ break;
+ case EEXIST:
+ err = EXT4_ERR_EEXIST;
+ break;
+ case ERANGE:
+ err = EXT4_ERR_ERANGE;
+ break;
+ case EOVERFLOW:
+ err = EXT4_ERR_EOVERFLOW;
+ break;
+ case EBUSY:
+ err = EXT4_ERR_EBUSY;
+ break;
+ case ENOTDIR:
+ err = EXT4_ERR_ENOTDIR;
+ break;
+ case ENOTEMPTY:
+ err = EXT4_ERR_ENOTEMPTY;
+ break;
+ case ESHUTDOWN:
+ err = EXT4_ERR_ESHUTDOWN;
+ break;
+ case EFAULT:
+ err = EXT4_ERR_EFAULT;
+ break;
+ default:
+ err = EXT4_ERR_UNKNOWN;
+ }
+ EXT4_SB(sb)->s_es->s_last_error_errcode = err;
+}
+
/* __ext4_std_error decodes expected errors from journaling functions
* automatically and invokes the appropriate error response. */
@@ -655,6 +698,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
sb->s_id, function, line, errstr);
}
+ ext4_set_errno(sb, -errno);
save_error_info(sb, function, line);
ext4_handle_error(sb);
}
@@ -982,8 +1026,10 @@ static void ext4_put_super(struct super_block *sb)
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
- if ((err < 0) && !aborted)
+ if ((err < 0) && !aborted) {
+ ext4_set_errno(sb, -err);
ext4_abort(sb, "Couldn't clean up the journal");
+ }
}
ext4_unregister_sysfs(sb);
@@ -1085,8 +1131,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_es_shk_nr = 0;
ei->i_es_shrink_lblk = 0;
ei->i_reserved_data_blocks = 0;
- ei->i_da_metadata_calc_len = 0;
- ei->i_da_metadata_calc_last_lblock = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
@@ -1548,6 +1592,7 @@ static const match_table_t tokens = {
{Opt_auto_da_alloc, "auto_da_alloc"},
{Opt_noauto_da_alloc, "noauto_da_alloc"},
{Opt_dioread_nolock, "dioread_nolock"},
+ {Opt_dioread_lock, "nodioread_nolock"},
{Opt_dioread_lock, "dioread_lock"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
@@ -3720,6 +3765,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sb, NO_UID32);
/* xattr user namespace & acls are now defaulted on */
set_opt(sb, XATTR_USER);
+ set_opt(sb, DIOREAD_NOLOCK);
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
@@ -3887,9 +3933,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#endif
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
- "with data=journal disables delayed "
- "allocation and O_DIRECT support!\n");
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n");
+ clear_opt(sb, DIOREAD_NOLOCK);
if (test_opt2(sb, EXPLICIT_DELALLOC)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"both data=journal and delalloc");
@@ -5540,9 +5585,15 @@ static int ext4_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = (dquot->dq_dqb.dqb_bsoftlimit ?
- dquot->dq_dqb.dqb_bsoftlimit :
- dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+ limit = 0;
+ if (dquot->dq_dqb.dqb_bsoftlimit &&
+ (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit))
+ limit = dquot->dq_dqb.dqb_bsoftlimit;
+ if (dquot->dq_dqb.dqb_bhardlimit &&
+ (!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
+ limit = dquot->dq_dqb.dqb_bhardlimit;
+ limit >>= sb->s_blocksize_bits;
+
if (limit && buf->f_blocks > limit) {
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
@@ -5552,9 +5603,14 @@ static int ext4_statfs_project(struct super_block *sb,
(buf->f_blocks - curblock) : 0;
}
- limit = dquot->dq_dqb.dqb_isoftlimit ?
- dquot->dq_dqb.dqb_isoftlimit :
- dquot->dq_dqb.dqb_ihardlimit;
+ limit = 0;
+ if (dquot->dq_dqb.dqb_isoftlimit &&
+ (!limit || dquot->dq_dqb.dqb_isoftlimit < limit))
+ limit = dquot->dq_dqb.dqb_isoftlimit;
+ if (dquot->dq_dqb.dqb_ihardlimit &&
+ (!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
+ limit = dquot->dq_dqb.dqb_ihardlimit;
+
if (limit && buf->f_files > limit) {
buf->f_files = limit;
buf->f_ffree =
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index eb1efad0e20a..d218ebdafa4a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -29,6 +29,10 @@ typedef enum {
attr_last_error_time,
attr_feature,
attr_pointer_ui,
+ attr_pointer_ul,
+ attr_pointer_u64,
+ attr_pointer_u8,
+ attr_pointer_string,
attr_pointer_atomic,
attr_journal_task,
} attr_id_t;
@@ -46,6 +50,7 @@ struct ext4_attr {
struct attribute attr;
short attr_id;
short attr_ptr;
+ unsigned short attr_size;
union {
int offset;
void *explicit_ptr;
@@ -154,12 +159,35 @@ static struct ext4_attr ext4_attr_##_name = { \
}, \
}
+#define EXT4_ATTR_STRING(_name,_mode,_size,_struct,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_pointer_string, \
+ .attr_size = _size, \
+ .attr_ptr = ptr_##_struct##_offset, \
+ .u = { \
+ .offset = offsetof(struct _struct, _elname),\
+ }, \
+}
+
#define EXT4_RO_ATTR_ES_UI(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname)
+#define EXT4_RO_ATTR_ES_U8(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_u8, ext4_super_block, _elname)
+
+#define EXT4_RO_ATTR_ES_U64(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_u64, ext4_super_block, _elname)
+
+#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \
+ EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname)
+
#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
+#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
+
#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -194,7 +222,20 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+#ifdef CONFIG_EXT4_DEBUG
+EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
+#endif
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
+EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode);
+EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode);
+EXT4_RO_ATTR_ES_UI(first_error_ino, s_first_error_ino);
+EXT4_RO_ATTR_ES_UI(last_error_ino, s_last_error_ino);
+EXT4_RO_ATTR_ES_U64(first_error_block, s_first_error_block);
+EXT4_RO_ATTR_ES_U64(last_error_block, s_last_error_block);
+EXT4_RO_ATTR_ES_UI(first_error_line, s_first_error_line);
+EXT4_RO_ATTR_ES_UI(last_error_line, s_last_error_line);
+EXT4_RO_ATTR_ES_STRING(first_error_func, s_first_error_func, 32);
+EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
@@ -225,9 +266,22 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst),
ATTR_LIST(errors_count),
+ ATTR_LIST(first_error_ino),
+ ATTR_LIST(last_error_ino),
+ ATTR_LIST(first_error_block),
+ ATTR_LIST(last_error_block),
+ ATTR_LIST(first_error_line),
+ ATTR_LIST(last_error_line),
+ ATTR_LIST(first_error_func),
+ ATTR_LIST(last_error_func),
+ ATTR_LIST(first_error_errcode),
+ ATTR_LIST(last_error_errcode),
ATTR_LIST(first_error_time),
ATTR_LIST(last_error_time),
ATTR_LIST(journal_task),
+#ifdef CONFIG_EXT4_DEBUG
+ ATTR_LIST(simulate_fail),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(ext4);
@@ -280,7 +334,7 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
{
- return snprintf(buf, PAGE_SIZE, "%lld",
+ return snprintf(buf, PAGE_SIZE, "%lld\n",
((time64_t)hi << 32) + le32_to_cpu(lo));
}
@@ -318,6 +372,30 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
else
return snprintf(buf, PAGE_SIZE, "%u\n",
*((unsigned int *) ptr));
+ case attr_pointer_ul:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%lu\n",
+ *((unsigned long *) ptr));
+ case attr_pointer_u8:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ *((unsigned char *) ptr));
+ case attr_pointer_u64:
+ if (!ptr)
+ return 0;
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ le64_to_cpup(ptr));
+ else
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ *((unsigned long long *) ptr));
+ case attr_pointer_string:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size,
+ (char *) ptr);
case attr_pointer_atomic:
if (!ptr)
return 0;
@@ -361,6 +439,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
else
*((unsigned int *) ptr) = t;
return len;
+ case attr_pointer_ul:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ *((unsigned long *) ptr) = t;
+ return len;
case attr_inode_readahead:
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index d0d8a9795dd6..dc5ec724d889 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -342,12 +342,55 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
return desc_size;
}
+/*
+ * Prefetch some pages from the file's Merkle tree.
+ *
+ * This is basically a stripped-down version of __do_page_cache_readahead()
+ * which works on pages past i_size.
+ */
+static void ext4_merkle_tree_readahead(struct address_space *mapping,
+ pgoff_t start_index, unsigned long count)
+{
+ LIST_HEAD(pages);
+ unsigned int nr_pages = 0;
+ struct page *page;
+ pgoff_t index;
+ struct blk_plug plug;
+
+ for (index = start_index; index < start_index + count; index++) {
+ page = xa_load(&mapping->i_pages, index);
+ if (!page || xa_is_value(page)) {
+ page = __page_cache_alloc(readahead_gfp_mask(mapping));
+ if (!page)
+ break;
+ page->index = index;
+ list_add(&page->lru, &pages);
+ nr_pages++;
+ }
+ }
+ blk_start_plug(&plug);
+ ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true);
+ blk_finish_plug(&plug);
+}
+
static struct page *ext4_read_merkle_tree_page(struct inode *inode,
- pgoff_t index)
+ pgoff_t index,
+ unsigned long num_ra_pages)
{
+ struct page *page;
+
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
- return read_mapping_page(inode->i_mapping, index, NULL);
+ page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+ if (!page || !PageUptodate(page)) {
+ if (page)
+ put_page(page);
+ else if (num_ra_pages > 1)
+ ext4_merkle_tree_readahead(inode->i_mapping, index,
+ num_ra_pages);
+ page = read_mapping_page(inode->i_mapping, index, NULL);
+ }
+ return page;
}
static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8966a5439a22..8cac7d95c3ad 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1456,7 +1456,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
if (!ce)
return NULL;
- ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
+ ea_data = kvmalloc(value_len, GFP_KERNEL);
if (!ea_data) {
mb_cache_entry_put(ea_inode_cache, ce);
return NULL;
@@ -2879,9 +2879,11 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
if (IS_ERR(bh)) {
error = PTR_ERR(bh);
- if (error == -EIO)
+ if (error == -EIO) {
+ ext4_set_errno(inode->i_sb, EIO);
EXT4_ERROR_INODE(inode, "block %llu read error",
EXT4_I(inode)->i_file_acl);
+ }
bh = NULL;
goto cleanup;
}
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 652fd2e2b23d..f0faada30f30 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -6,6 +6,7 @@ config F2FS_FS
select CRYPTO
select CRYPTO_CRC32
select F2FS_FS_XATTR if FS_ENCRYPTION
+ select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
help
F2FS is based on Log-structured File System (LFS), which supports
versatile "flash-friendly" features. The design has been focused on
@@ -21,7 +22,7 @@ config F2FS_FS
config F2FS_STAT_FS
bool "F2FS Status Information"
- depends on F2FS_FS && DEBUG_FS
+ depends on F2FS_FS
default y
help
/sys/kernel/debug/f2fs/ contains information about all the partitions
@@ -92,3 +93,28 @@ config F2FS_FAULT_INJECTION
Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on.
If unsure, say N.
+
+config F2FS_FS_COMPRESSION
+ bool "F2FS compression feature"
+ depends on F2FS_FS
+ help
+ Enable filesystem-level compression on f2fs regular files,
+ multiple back-end compression algorithms are supported.
+
+config F2FS_FS_LZO
+ bool "LZO compression support"
+ depends on F2FS_FS_COMPRESSION
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
+ default y
+ help
+ Support LZO compress algorithm, if unsure, say Y.
+
+config F2FS_FS_LZ4
+ bool "LZ4 compression support"
+ depends on F2FS_FS_COMPRESSION
+ select LZ4_COMPRESS
+ select LZ4_DECOMPRESS
+ default y
+ help
+ Support LZ4 compress algorithm, if unsure, say Y.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 2aaecc63834f..ee7316b42f69 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -9,3 +9,4 @@ f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
f2fs-$(CONFIG_FS_VERITY) += verity.o
+f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index ffdaba0c55d2..44e84ac5c941 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1509,10 +1509,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_wait_on_all_pages_writeback(sbi);
/*
- * invalidate intermediate page cache borrowed from meta inode
- * which are used for migration of encrypted inode's blocks.
+ * invalidate intermediate page cache borrowed from meta inode which are
+ * used for migration of encrypted or verity inode's blocks.
*/
- if (f2fs_sb_has_encrypt(sbi))
+ if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi))
invalidate_mapping_pages(META_MAPPING(sbi),
MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1);
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
new file mode 100644
index 000000000000..d8a64be90a50
--- /dev/null
+++ b/fs/f2fs/compress.c
@@ -0,0 +1,1176 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * f2fs compress support
+ *
+ * Copyright (c) 2019 Chao Yu <chao@kernel.org>
+ */
+
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/lzo.h>
+#include <linux/lz4.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include <trace/events/f2fs.h>
+
+struct f2fs_compress_ops {
+ int (*init_compress_ctx)(struct compress_ctx *cc);
+ void (*destroy_compress_ctx)(struct compress_ctx *cc);
+ int (*compress_pages)(struct compress_ctx *cc);
+ int (*decompress_pages)(struct decompress_io_ctx *dic);
+};
+
+static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index)
+{
+ return index & (cc->cluster_size - 1);
+}
+
+static pgoff_t cluster_idx(struct compress_ctx *cc, pgoff_t index)
+{
+ return index >> cc->log_cluster_size;
+}
+
+static pgoff_t start_idx_of_cluster(struct compress_ctx *cc)
+{
+ return cc->cluster_idx << cc->log_cluster_size;
+}
+
+bool f2fs_is_compressed_page(struct page *page)
+{
+ if (!PagePrivate(page))
+ return false;
+ if (!page_private(page))
+ return false;
+ if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page))
+ return false;
+ f2fs_bug_on(F2FS_M_SB(page->mapping),
+ *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC);
+ return true;
+}
+
+static void f2fs_set_compressed_page(struct page *page,
+ struct inode *inode, pgoff_t index, void *data, refcount_t *r)
+{
+ SetPagePrivate(page);
+ set_page_private(page, (unsigned long)data);
+
+ /* i_crypto_info and iv index */
+ page->index = index;
+ page->mapping = inode->i_mapping;
+ if (r)
+ refcount_inc(r);
+}
+
+static void f2fs_put_compressed_page(struct page *page)
+{
+ set_page_private(page, (unsigned long)NULL);
+ ClearPagePrivate(page);
+ page->mapping = NULL;
+ unlock_page(page);
+ put_page(page);
+}
+
+static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (!cc->rpages[i])
+ continue;
+ if (unlock)
+ unlock_page(cc->rpages[i]);
+ else
+ put_page(cc->rpages[i]);
+ }
+}
+
+static void f2fs_put_rpages(struct compress_ctx *cc)
+{
+ f2fs_drop_rpages(cc, cc->cluster_size, false);
+}
+
+static void f2fs_unlock_rpages(struct compress_ctx *cc, int len)
+{
+ f2fs_drop_rpages(cc, len, true);
+}
+
+static void f2fs_put_rpages_mapping(struct compress_ctx *cc,
+ struct address_space *mapping,
+ pgoff_t start, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ struct page *page = find_get_page(mapping, start + i);
+
+ put_page(page);
+ put_page(page);
+ }
+}
+
+static void f2fs_put_rpages_wbc(struct compress_ctx *cc,
+ struct writeback_control *wbc, bool redirty, int unlock)
+{
+ unsigned int i;
+
+ for (i = 0; i < cc->cluster_size; i++) {
+ if (!cc->rpages[i])
+ continue;
+ if (redirty)
+ redirty_page_for_writepage(wbc, cc->rpages[i]);
+ f2fs_put_page(cc->rpages[i], unlock);
+ }
+}
+
+struct page *f2fs_compress_control_page(struct page *page)
+{
+ return ((struct compress_io_ctx *)page_private(page))->rpages[0];
+}
+
+int f2fs_init_compress_ctx(struct compress_ctx *cc)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+
+ if (cc->nr_rpages)
+ return 0;
+
+ cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
+ cc->log_cluster_size, GFP_NOFS);
+ return cc->rpages ? 0 : -ENOMEM;
+}
+
+void f2fs_destroy_compress_ctx(struct compress_ctx *cc)
+{
+ kfree(cc->rpages);
+ cc->rpages = NULL;
+ cc->nr_rpages = 0;
+ cc->nr_cpages = 0;
+ cc->cluster_idx = NULL_CLUSTER;
+}
+
+void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page)
+{
+ unsigned int cluster_ofs;
+
+ if (!f2fs_cluster_can_merge_page(cc, page->index))
+ f2fs_bug_on(F2FS_I_SB(cc->inode), 1);
+
+ cluster_ofs = offset_in_cluster(cc, page->index);
+ cc->rpages[cluster_ofs] = page;
+ cc->nr_rpages++;
+ cc->cluster_idx = cluster_idx(cc, page->index);
+}
+
+#ifdef CONFIG_F2FS_FS_LZO
+static int lzo_init_compress_ctx(struct compress_ctx *cc)
+{
+ cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
+ LZO1X_MEM_COMPRESS, GFP_NOFS);
+ if (!cc->private)
+ return -ENOMEM;
+
+ cc->clen = lzo1x_worst_compress(PAGE_SIZE << cc->log_cluster_size);
+ return 0;
+}
+
+static void lzo_destroy_compress_ctx(struct compress_ctx *cc)
+{
+ kvfree(cc->private);
+ cc->private = NULL;
+}
+
+static int lzo_compress_pages(struct compress_ctx *cc)
+{
+ int ret;
+
+ ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
+ &cc->clen, cc->private);
+ if (ret != LZO_E_OK) {
+ printk_ratelimited("%sF2FS-fs (%s): lzo compress failed, ret:%d\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int lzo_decompress_pages(struct decompress_io_ctx *dic)
+{
+ int ret;
+
+ ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen,
+ dic->rbuf, &dic->rlen);
+ if (ret != LZO_E_OK) {
+ printk_ratelimited("%sF2FS-fs (%s): lzo decompress failed, ret:%d\n",
+ KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret);
+ return -EIO;
+ }
+
+ if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) {
+ printk_ratelimited("%sF2FS-fs (%s): lzo invalid rlen:%zu, "
+ "expected:%lu\n", KERN_ERR,
+ F2FS_I_SB(dic->inode)->sb->s_id,
+ dic->rlen,
+ PAGE_SIZE << dic->log_cluster_size);
+ return -EIO;
+ }
+ return 0;
+}
+
+static const struct f2fs_compress_ops f2fs_lzo_ops = {
+ .init_compress_ctx = lzo_init_compress_ctx,
+ .destroy_compress_ctx = lzo_destroy_compress_ctx,
+ .compress_pages = lzo_compress_pages,
+ .decompress_pages = lzo_decompress_pages,
+};
+#endif
+
+#ifdef CONFIG_F2FS_FS_LZ4
+static int lz4_init_compress_ctx(struct compress_ctx *cc)
+{
+ cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
+ LZ4_MEM_COMPRESS, GFP_NOFS);
+ if (!cc->private)
+ return -ENOMEM;
+
+ cc->clen = LZ4_compressBound(PAGE_SIZE << cc->log_cluster_size);
+ return 0;
+}
+
+static void lz4_destroy_compress_ctx(struct compress_ctx *cc)
+{
+ kvfree(cc->private);
+ cc->private = NULL;
+}
+
+static int lz4_compress_pages(struct compress_ctx *cc)
+{
+ int len;
+
+ len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen,
+ cc->clen, cc->private);
+ if (!len) {
+ printk_ratelimited("%sF2FS-fs (%s): lz4 compress failed\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id);
+ return -EIO;
+ }
+ cc->clen = len;
+ return 0;
+}
+
+static int lz4_decompress_pages(struct decompress_io_ctx *dic)
+{
+ int ret;
+
+ ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf,
+ dic->clen, dic->rlen);
+ if (ret < 0) {
+ printk_ratelimited("%sF2FS-fs (%s): lz4 decompress failed, ret:%d\n",
+ KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret);
+ return -EIO;
+ }
+
+ if (ret != PAGE_SIZE << dic->log_cluster_size) {
+ printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, "
+ "expected:%lu\n", KERN_ERR,
+ F2FS_I_SB(dic->inode)->sb->s_id,
+ dic->rlen,
+ PAGE_SIZE << dic->log_cluster_size);
+ return -EIO;
+ }
+ return 0;
+}
+
+static const struct f2fs_compress_ops f2fs_lz4_ops = {
+ .init_compress_ctx = lz4_init_compress_ctx,
+ .destroy_compress_ctx = lz4_destroy_compress_ctx,
+ .compress_pages = lz4_compress_pages,
+ .decompress_pages = lz4_decompress_pages,
+};
+#endif
+
+static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
+#ifdef CONFIG_F2FS_FS_LZO
+ &f2fs_lzo_ops,
+#else
+ NULL,
+#endif
+#ifdef CONFIG_F2FS_FS_LZ4
+ &f2fs_lz4_ops,
+#else
+ NULL,
+#endif
+};
+
+bool f2fs_is_compress_backend_ready(struct inode *inode)
+{
+ if (!f2fs_compressed_file(inode))
+ return true;
+ return f2fs_cops[F2FS_I(inode)->i_compress_algorithm];
+}
+
+static struct page *f2fs_grab_page(void)
+{
+ struct page *page;
+
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ return NULL;
+ lock_page(page);
+ return page;
+}
+
+static int f2fs_compress_pages(struct compress_ctx *cc)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+ struct f2fs_inode_info *fi = F2FS_I(cc->inode);
+ const struct f2fs_compress_ops *cops =
+ f2fs_cops[fi->i_compress_algorithm];
+ unsigned int max_len, nr_cpages;
+ int i, ret;
+
+ trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx,
+ cc->cluster_size, fi->i_compress_algorithm);
+
+ ret = cops->init_compress_ctx(cc);
+ if (ret)
+ goto out;
+
+ max_len = COMPRESS_HEADER_SIZE + cc->clen;
+ cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE);
+
+ cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
+ cc->nr_cpages, GFP_NOFS);
+ if (!cc->cpages) {
+ ret = -ENOMEM;
+ goto destroy_compress_ctx;
+ }
+
+ for (i = 0; i < cc->nr_cpages; i++) {
+ cc->cpages[i] = f2fs_grab_page();
+ if (!cc->cpages[i]) {
+ ret = -ENOMEM;
+ goto out_free_cpages;
+ }
+ }
+
+ cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL_RO);
+ if (!cc->rbuf) {
+ ret = -ENOMEM;
+ goto out_free_cpages;
+ }
+
+ cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL);
+ if (!cc->cbuf) {
+ ret = -ENOMEM;
+ goto out_vunmap_rbuf;
+ }
+
+ ret = cops->compress_pages(cc);
+ if (ret)
+ goto out_vunmap_cbuf;
+
+ max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE;
+
+ if (cc->clen > max_len) {
+ ret = -EAGAIN;
+ goto out_vunmap_cbuf;
+ }
+
+ cc->cbuf->clen = cpu_to_le32(cc->clen);
+ cc->cbuf->chksum = cpu_to_le32(0);
+
+ for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++)
+ cc->cbuf->reserved[i] = cpu_to_le32(0);
+
+ vunmap(cc->cbuf);
+ vunmap(cc->rbuf);
+
+ nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
+
+ for (i = nr_cpages; i < cc->nr_cpages; i++) {
+ f2fs_put_compressed_page(cc->cpages[i]);
+ cc->cpages[i] = NULL;
+ }
+
+ cc->nr_cpages = nr_cpages;
+
+ trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
+ cc->clen, ret);
+ return 0;
+
+out_vunmap_cbuf:
+ vunmap(cc->cbuf);
+out_vunmap_rbuf:
+ vunmap(cc->rbuf);
+out_free_cpages:
+ for (i = 0; i < cc->nr_cpages; i++) {
+ if (cc->cpages[i])
+ f2fs_put_compressed_page(cc->cpages[i]);
+ }
+ kfree(cc->cpages);
+ cc->cpages = NULL;
+destroy_compress_ctx:
+ cops->destroy_compress_ctx(cc);
+out:
+ trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
+ cc->clen, ret);
+ return ret;
+}
+
+void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
+{
+ struct decompress_io_ctx *dic =
+ (struct decompress_io_ctx *)page_private(page);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
+ struct f2fs_inode_info *fi= F2FS_I(dic->inode);
+ const struct f2fs_compress_ops *cops =
+ f2fs_cops[fi->i_compress_algorithm];
+ int ret;
+
+ dec_page_count(sbi, F2FS_RD_DATA);
+
+ if (bio->bi_status || PageError(page))
+ dic->failed = true;
+
+ if (refcount_dec_not_one(&dic->ref))
+ return;
+
+ trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
+ dic->cluster_size, fi->i_compress_algorithm);
+
+ /* submit partial compressed pages */
+ if (dic->failed) {
+ ret = -EIO;
+ goto out_free_dic;
+ }
+
+ dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL);
+ if (!dic->rbuf) {
+ ret = -ENOMEM;
+ goto out_free_dic;
+ }
+
+ dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO);
+ if (!dic->cbuf) {
+ ret = -ENOMEM;
+ goto out_vunmap_rbuf;
+ }
+
+ dic->clen = le32_to_cpu(dic->cbuf->clen);
+ dic->rlen = PAGE_SIZE << dic->log_cluster_size;
+
+ if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) {
+ ret = -EFSCORRUPTED;
+ goto out_vunmap_cbuf;
+ }
+
+ ret = cops->decompress_pages(dic);
+
+out_vunmap_cbuf:
+ vunmap(dic->cbuf);
+out_vunmap_rbuf:
+ vunmap(dic->rbuf);
+out_free_dic:
+ if (!verity)
+ f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
+ ret, false);
+
+ trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
+ dic->clen, ret);
+ if (!verity)
+ f2fs_free_dic(dic);
+}
+
+static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
+{
+ if (cc->cluster_idx == NULL_CLUSTER)
+ return true;
+ return cc->cluster_idx == cluster_idx(cc, index);
+}
+
+bool f2fs_cluster_is_empty(struct compress_ctx *cc)
+{
+ return cc->nr_rpages == 0;
+}
+
+static bool f2fs_cluster_is_full(struct compress_ctx *cc)
+{
+ return cc->cluster_size == cc->nr_rpages;
+}
+
+bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index)
+{
+ if (f2fs_cluster_is_empty(cc))
+ return true;
+ return is_page_in_cluster(cc, index);
+}
+
+static bool __cluster_may_compress(struct compress_ctx *cc)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+ loff_t i_size = i_size_read(cc->inode);
+ unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE);
+ int i;
+
+ for (i = 0; i < cc->cluster_size; i++) {
+ struct page *page = cc->rpages[i];
+
+ f2fs_bug_on(sbi, !page);
+
+ if (unlikely(f2fs_cp_error(sbi)))
+ return false;
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ return false;
+
+ /* beyond EOF */
+ if (page->index >= nr_pages)
+ return false;
+ }
+ return true;
+}
+
+/* return # of compressed block addresses */
+static int f2fs_compressed_blocks(struct compress_ctx *cc)
+{
+ struct dnode_of_data dn;
+ int ret;
+
+ set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+ ret = f2fs_get_dnode_of_data(&dn, start_idx_of_cluster(cc),
+ LOOKUP_NODE);
+ if (ret) {
+ if (ret == -ENOENT)
+ ret = 0;
+ goto fail;
+ }
+
+ if (dn.data_blkaddr == COMPRESS_ADDR) {
+ int i;
+
+ ret = 1;
+ for (i = 1; i < cc->cluster_size; i++) {
+ block_t blkaddr;
+
+ blkaddr = datablock_addr(dn.inode,
+ dn.node_page, dn.ofs_in_node + i);
+ if (blkaddr != NULL_ADDR)
+ ret++;
+ }
+ }
+fail:
+ f2fs_put_dnode(&dn);
+ return ret;
+}
+
+int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
+{
+ struct compress_ctx cc = {
+ .inode = inode,
+ .log_cluster_size = F2FS_I(inode)->i_log_cluster_size,
+ .cluster_size = F2FS_I(inode)->i_cluster_size,
+ .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
+ };
+
+ return f2fs_compressed_blocks(&cc);
+}
+
+static bool cluster_may_compress(struct compress_ctx *cc)
+{
+ if (!f2fs_compressed_file(cc->inode))
+ return false;
+ if (f2fs_is_atomic_file(cc->inode))
+ return false;
+ if (f2fs_is_mmap_file(cc->inode))
+ return false;
+ if (!f2fs_cluster_is_full(cc))
+ return false;
+ return __cluster_may_compress(cc);
+}
+
+static void set_cluster_writeback(struct compress_ctx *cc)
+{
+ int i;
+
+ for (i = 0; i < cc->cluster_size; i++) {
+ if (cc->rpages[i])
+ set_page_writeback(cc->rpages[i]);
+ }
+}
+
+static void set_cluster_dirty(struct compress_ctx *cc)
+{
+ int i;
+
+ for (i = 0; i < cc->cluster_size; i++)
+ if (cc->rpages[i])
+ set_page_dirty(cc->rpages[i]);
+}
+
+static int prepare_compress_overwrite(struct compress_ctx *cc,
+ struct page **pagep, pgoff_t index, void **fsdata)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+ struct address_space *mapping = cc->inode->i_mapping;
+ struct page *page;
+ struct dnode_of_data dn;
+ sector_t last_block_in_bio;
+ unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
+ pgoff_t start_idx = start_idx_of_cluster(cc);
+ int i, ret;
+ bool prealloc;
+
+retry:
+ ret = f2fs_compressed_blocks(cc);
+ if (ret <= 0)
+ return ret;
+
+ /* compressed case */
+ prealloc = (ret < cc->cluster_size);
+
+ ret = f2fs_init_compress_ctx(cc);
+ if (ret)
+ return ret;
+
+ /* keep page reference to avoid page reclaim */
+ for (i = 0; i < cc->cluster_size; i++) {
+ page = f2fs_pagecache_get_page(mapping, start_idx + i,
+ fgp_flag, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ goto unlock_pages;
+ }
+
+ if (PageUptodate(page))
+ unlock_page(page);
+ else
+ f2fs_compress_ctx_add_page(cc, page);
+ }
+
+ if (!f2fs_cluster_is_empty(cc)) {
+ struct bio *bio = NULL;
+
+ ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size,
+ &last_block_in_bio, false);
+ f2fs_destroy_compress_ctx(cc);
+ if (ret)
+ goto release_pages;
+ if (bio)
+ f2fs_submit_bio(sbi, bio, DATA);
+
+ ret = f2fs_init_compress_ctx(cc);
+ if (ret)
+ goto release_pages;
+ }
+
+ for (i = 0; i < cc->cluster_size; i++) {
+ f2fs_bug_on(sbi, cc->rpages[i]);
+
+ page = find_lock_page(mapping, start_idx + i);
+ f2fs_bug_on(sbi, !page);
+
+ f2fs_wait_on_page_writeback(page, DATA, true, true);
+
+ f2fs_compress_ctx_add_page(cc, page);
+ f2fs_put_page(page, 0);
+
+ if (!PageUptodate(page)) {
+ f2fs_unlock_rpages(cc, i + 1);
+ f2fs_put_rpages_mapping(cc, mapping, start_idx,
+ cc->cluster_size);
+ f2fs_destroy_compress_ctx(cc);
+ goto retry;
+ }
+ }
+
+ if (prealloc) {
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+
+ set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+
+ for (i = cc->cluster_size - 1; i > 0; i--) {
+ ret = f2fs_get_block(&dn, start_idx + i);
+ if (ret) {
+ i = cc->cluster_size;
+ break;
+ }
+
+ if (dn.data_blkaddr != NEW_ADDR)
+ break;
+ }
+
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ }
+
+ if (likely(!ret)) {
+ *fsdata = cc->rpages;
+ *pagep = cc->rpages[offset_in_cluster(cc, index)];
+ return cc->cluster_size;
+ }
+
+unlock_pages:
+ f2fs_unlock_rpages(cc, i);
+release_pages:
+ f2fs_put_rpages_mapping(cc, mapping, start_idx, i);
+ f2fs_destroy_compress_ctx(cc);
+ return ret;
+}
+
+int f2fs_prepare_compress_overwrite(struct inode *inode,
+ struct page **pagep, pgoff_t index, void **fsdata)
+{
+ struct compress_ctx cc = {
+ .inode = inode,
+ .log_cluster_size = F2FS_I(inode)->i_log_cluster_size,
+ .cluster_size = F2FS_I(inode)->i_cluster_size,
+ .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
+ .rpages = NULL,
+ .nr_rpages = 0,
+ };
+
+ return prepare_compress_overwrite(&cc, pagep, index, fsdata);
+}
+
+bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
+ pgoff_t index, unsigned copied)
+
+{
+ struct compress_ctx cc = {
+ .log_cluster_size = F2FS_I(inode)->i_log_cluster_size,
+ .cluster_size = F2FS_I(inode)->i_cluster_size,
+ .rpages = fsdata,
+ };
+ bool first_index = (index == cc.rpages[0]->index);
+
+ if (copied)
+ set_cluster_dirty(&cc);
+
+ f2fs_put_rpages_wbc(&cc, NULL, false, 1);
+ f2fs_destroy_compress_ctx(&cc);
+
+ return first_index;
+}
+
+static int f2fs_write_compressed_pages(struct compress_ctx *cc,
+ int *submitted,
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
+{
+ struct inode *inode = cc->inode;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_io_info fio = {
+ .sbi = sbi,
+ .ino = cc->inode->i_ino,
+ .type = DATA,
+ .op = REQ_OP_WRITE,
+ .op_flags = wbc_to_write_flags(wbc),
+ .old_blkaddr = NEW_ADDR,
+ .page = NULL,
+ .encrypted_page = NULL,
+ .compressed_page = NULL,
+ .submitted = false,
+ .need_lock = LOCK_RETRY,
+ .io_type = io_type,
+ .io_wbc = wbc,
+ .encrypted = f2fs_encrypted_file(cc->inode),
+ };
+ struct dnode_of_data dn;
+ struct node_info ni;
+ struct compress_io_ctx *cic;
+ pgoff_t start_idx = start_idx_of_cluster(cc);
+ unsigned int last_index = cc->cluster_size - 1;
+ loff_t psize;
+ int i, err;
+
+ set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+
+ f2fs_lock_op(sbi);
+
+ err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
+ if (err)
+ goto out_unlock_op;
+
+ for (i = 0; i < cc->cluster_size; i++) {
+ if (datablock_addr(dn.inode, dn.node_page,
+ dn.ofs_in_node + i) == NULL_ADDR)
+ goto out_put_dnode;
+ }
+
+ psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT;
+
+ err = f2fs_get_node_info(fio.sbi, dn.nid, &ni);
+ if (err)
+ goto out_put_dnode;
+
+ fio.version = ni.version;
+
+ cic = f2fs_kzalloc(sbi, sizeof(struct compress_io_ctx), GFP_NOFS);
+ if (!cic)
+ goto out_put_dnode;
+
+ cic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
+ cic->inode = inode;
+ refcount_set(&cic->ref, 1);
+ cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
+ cc->log_cluster_size, GFP_NOFS);
+ if (!cic->rpages)
+ goto out_put_cic;
+
+ cic->nr_rpages = cc->cluster_size;
+
+ for (i = 0; i < cc->nr_cpages; i++) {
+ f2fs_set_compressed_page(cc->cpages[i], inode,
+ cc->rpages[i + 1]->index,
+ cic, i ? &cic->ref : NULL);
+ fio.compressed_page = cc->cpages[i];
+ if (fio.encrypted) {
+ fio.page = cc->rpages[i + 1];
+ err = f2fs_encrypt_one_page(&fio);
+ if (err)
+ goto out_destroy_crypt;
+ cc->cpages[i] = fio.encrypted_page;
+ }
+ }
+
+ set_cluster_writeback(cc);
+
+ for (i = 0; i < cc->cluster_size; i++)
+ cic->rpages[i] = cc->rpages[i];
+
+ for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) {
+ block_t blkaddr;
+
+ blkaddr = datablock_addr(dn.inode, dn.node_page,
+ dn.ofs_in_node);
+ fio.page = cic->rpages[i];
+ fio.old_blkaddr = blkaddr;
+
+ /* cluster header */
+ if (i == 0) {
+ if (blkaddr == COMPRESS_ADDR)
+ fio.compr_blocks++;
+ if (__is_valid_data_blkaddr(blkaddr))
+ f2fs_invalidate_blocks(sbi, blkaddr);
+ f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR);
+ goto unlock_continue;
+ }
+
+ if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr))
+ fio.compr_blocks++;
+
+ if (i > cc->nr_cpages) {
+ if (__is_valid_data_blkaddr(blkaddr)) {
+ f2fs_invalidate_blocks(sbi, blkaddr);
+ f2fs_update_data_blkaddr(&dn, NEW_ADDR);
+ }
+ goto unlock_continue;
+ }
+
+ f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR);
+
+ if (fio.encrypted)
+ fio.encrypted_page = cc->cpages[i - 1];
+ else
+ fio.compressed_page = cc->cpages[i - 1];
+
+ cc->cpages[i - 1] = NULL;
+ f2fs_outplace_write_data(&dn, &fio);
+ (*submitted)++;
+unlock_continue:
+ inode_dec_dirty_pages(cc->inode);
+ unlock_page(fio.page);
+ }
+
+ if (fio.compr_blocks)
+ f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false);
+ f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true);
+
+ set_inode_flag(cc->inode, FI_APPEND_WRITE);
+ if (cc->cluster_idx == 0)
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+
+ f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
+
+ down_write(&fi->i_sem);
+ if (fi->last_disk_size < psize)
+ fi->last_disk_size = psize;
+ up_write(&fi->i_sem);
+
+ f2fs_put_rpages(cc);
+ f2fs_destroy_compress_ctx(cc);
+ return 0;
+
+out_destroy_crypt:
+ kfree(cic->rpages);
+
+ for (--i; i >= 0; i--)
+ fscrypt_finalize_bounce_page(&cc->cpages[i]);
+ for (i = 0; i < cc->nr_cpages; i++) {
+ if (!cc->cpages[i])
+ continue;
+ f2fs_put_page(cc->cpages[i], 1);
+ }
+out_put_cic:
+ kfree(cic);
+out_put_dnode:
+ f2fs_put_dnode(&dn);
+out_unlock_op:
+ f2fs_unlock_op(sbi);
+ return -EAGAIN;
+}
+
+void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
+{
+ struct f2fs_sb_info *sbi = bio->bi_private;
+ struct compress_io_ctx *cic =
+ (struct compress_io_ctx *)page_private(page);
+ int i;
+
+ if (unlikely(bio->bi_status))
+ mapping_set_error(cic->inode->i_mapping, -EIO);
+
+ f2fs_put_compressed_page(page);
+
+ dec_page_count(sbi, F2FS_WB_DATA);
+
+ if (refcount_dec_not_one(&cic->ref))
+ return;
+
+ for (i = 0; i < cic->nr_rpages; i++) {
+ WARN_ON(!cic->rpages[i]);
+ clear_cold_data(cic->rpages[i]);
+ end_page_writeback(cic->rpages[i]);
+ }
+
+ kfree(cic->rpages);
+ kfree(cic);
+}
+
+static int f2fs_write_raw_pages(struct compress_ctx *cc,
+ int *submitted,
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
+{
+ struct address_space *mapping = cc->inode->i_mapping;
+ int _submitted, compr_blocks, ret;
+ int i = -1, err = 0;
+
+ compr_blocks = f2fs_compressed_blocks(cc);
+ if (compr_blocks < 0) {
+ err = compr_blocks;
+ goto out_err;
+ }
+
+ for (i = 0; i < cc->cluster_size; i++) {
+ if (!cc->rpages[i])
+ continue;
+retry_write:
+ if (cc->rpages[i]->mapping != mapping) {
+ unlock_page(cc->rpages[i]);
+ continue;
+ }
+
+ BUG_ON(!PageLocked(cc->rpages[i]));
+
+ ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
+ NULL, NULL, wbc, io_type,
+ compr_blocks);
+ if (ret) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(cc->rpages[i]);
+ ret = 0;
+ } else if (ret == -EAGAIN) {
+ ret = 0;
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ lock_page(cc->rpages[i]);
+ clear_page_dirty_for_io(cc->rpages[i]);
+ goto retry_write;
+ }
+ err = ret;
+ goto out_fail;
+ }
+
+ *submitted += _submitted;
+ }
+ return 0;
+
+out_fail:
+ /* TODO: revoke partially updated block addresses */
+ BUG_ON(compr_blocks);
+out_err:
+ for (++i; i < cc->cluster_size; i++) {
+ if (!cc->rpages[i])
+ continue;
+ redirty_page_for_writepage(wbc, cc->rpages[i]);
+ unlock_page(cc->rpages[i]);
+ }
+ return err;
+}
+
+int f2fs_write_multi_pages(struct compress_ctx *cc,
+ int *submitted,
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
+{
+ struct f2fs_inode_info *fi = F2FS_I(cc->inode);
+ const struct f2fs_compress_ops *cops =
+ f2fs_cops[fi->i_compress_algorithm];
+ int err;
+
+ *submitted = 0;
+ if (cluster_may_compress(cc)) {
+ err = f2fs_compress_pages(cc);
+ if (err == -EAGAIN) {
+ goto write;
+ } else if (err) {
+ f2fs_put_rpages_wbc(cc, wbc, true, 1);
+ goto destroy_out;
+ }
+
+ err = f2fs_write_compressed_pages(cc, submitted,
+ wbc, io_type);
+ cops->destroy_compress_ctx(cc);
+ if (!err)
+ return 0;
+ f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN);
+ }
+write:
+ f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted);
+
+ err = f2fs_write_raw_pages(cc, submitted, wbc, io_type);
+ f2fs_put_rpages_wbc(cc, wbc, false, 0);
+destroy_out:
+ f2fs_destroy_compress_ctx(cc);
+ return err;
+}
+
+struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
+ struct decompress_io_ctx *dic;
+ pgoff_t start_idx = start_idx_of_cluster(cc);
+ int i;
+
+ dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_NOFS);
+ if (!dic)
+ return ERR_PTR(-ENOMEM);
+
+ dic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
+ cc->log_cluster_size, GFP_NOFS);
+ if (!dic->rpages) {
+ kfree(dic);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
+ dic->inode = cc->inode;
+ refcount_set(&dic->ref, 1);
+ dic->cluster_idx = cc->cluster_idx;
+ dic->cluster_size = cc->cluster_size;
+ dic->log_cluster_size = cc->log_cluster_size;
+ dic->nr_cpages = cc->nr_cpages;
+ dic->failed = false;
+
+ for (i = 0; i < dic->cluster_size; i++)
+ dic->rpages[i] = cc->rpages[i];
+ dic->nr_rpages = cc->cluster_size;
+
+ dic->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
+ dic->nr_cpages, GFP_NOFS);
+ if (!dic->cpages)
+ goto out_free;
+
+ for (i = 0; i < dic->nr_cpages; i++) {
+ struct page *page;
+
+ page = f2fs_grab_page();
+ if (!page)
+ goto out_free;
+
+ f2fs_set_compressed_page(page, cc->inode,
+ start_idx + i + 1,
+ dic, i ? &dic->ref : NULL);
+ dic->cpages[i] = page;
+ }
+
+ dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) *
+ dic->cluster_size, GFP_NOFS);
+ if (!dic->tpages)
+ goto out_free;
+
+ for (i = 0; i < dic->cluster_size; i++) {
+ if (cc->rpages[i])
+ continue;
+
+ dic->tpages[i] = f2fs_grab_page();
+ if (!dic->tpages[i])
+ goto out_free;
+ }
+
+ for (i = 0; i < dic->cluster_size; i++) {
+ if (dic->tpages[i])
+ continue;
+ dic->tpages[i] = cc->rpages[i];
+ }
+
+ return dic;
+
+out_free:
+ f2fs_free_dic(dic);
+ return ERR_PTR(-ENOMEM);
+}
+
+void f2fs_free_dic(struct decompress_io_ctx *dic)
+{
+ int i;
+
+ if (dic->tpages) {
+ for (i = 0; i < dic->cluster_size; i++) {
+ if (dic->rpages[i])
+ continue;
+ f2fs_put_page(dic->tpages[i], 1);
+ }
+ kfree(dic->tpages);
+ }
+
+ if (dic->cpages) {
+ for (i = 0; i < dic->nr_cpages; i++) {
+ if (!dic->cpages[i])
+ continue;
+ f2fs_put_compressed_page(dic->cpages[i]);
+ }
+ kfree(dic->cpages);
+ }
+
+ kfree(dic->rpages);
+ kfree(dic);
+}
+
+void f2fs_decompress_end_io(struct page **rpages,
+ unsigned int cluster_size, bool err, bool verity)
+{
+ int i;
+
+ for (i = 0; i < cluster_size; i++) {
+ struct page *rpage = rpages[i];
+
+ if (!rpage)
+ continue;
+
+ if (err || PageError(rpage)) {
+ ClearPageUptodate(rpage);
+ ClearPageError(rpage);
+ } else {
+ if (!verity || fsverity_verify_page(rpage))
+ SetPageUptodate(rpage);
+ else
+ SetPageError(rpage);
+ }
+ unlock_page(rpage);
+ }
+}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index a034cd0ce021..8bd9afa81c54 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -31,6 +31,47 @@
static struct kmem_cache *bio_post_read_ctx_cache;
static struct kmem_cache *bio_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
+static struct bio_set f2fs_bioset;
+
+#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
+
+int __init f2fs_init_bioset(void)
+{
+ if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE,
+ 0, BIOSET_NEED_BVECS))
+ return -ENOMEM;
+ return 0;
+}
+
+void f2fs_destroy_bioset(void)
+{
+ bioset_exit(&f2fs_bioset);
+}
+
+static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask,
+ unsigned int nr_iovecs)
+{
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset);
+}
+
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail)
+{
+ struct bio *bio;
+
+ if (no_fail) {
+ /* No failure on bio allocation */
+ bio = __f2fs_bio_alloc(GFP_NOIO, npages);
+ if (!bio)
+ bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
+ return bio;
+ }
+ if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
+ f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
+ return NULL;
+ }
+
+ return __f2fs_bio_alloc(GFP_KERNEL, npages);
+}
static bool __is_cp_guaranteed(struct page *page)
{
@@ -41,6 +82,9 @@ static bool __is_cp_guaranteed(struct page *page)
if (!mapping)
return false;
+ if (f2fs_is_compressed_page(page))
+ return false;
+
inode = mapping->host;
sbi = F2FS_I_SB(inode);
@@ -73,19 +117,19 @@ static enum count_type __read_io_type(struct page *page)
/* postprocessing steps for read bios */
enum bio_post_read_step {
- STEP_INITIAL = 0,
STEP_DECRYPT,
+ STEP_DECOMPRESS,
STEP_VERITY,
};
struct bio_post_read_ctx {
struct bio *bio;
+ struct f2fs_sb_info *sbi;
struct work_struct work;
- unsigned int cur_step;
unsigned int enabled_steps;
};
-static void __read_end_io(struct bio *bio)
+static void __read_end_io(struct bio *bio, bool compr, bool verity)
{
struct page *page;
struct bio_vec *bv;
@@ -94,6 +138,13 @@ static void __read_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, iter_all) {
page = bv->bv_page;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (compr && f2fs_is_compressed_page(page)) {
+ f2fs_decompress_pages(bio, page, verity);
+ continue;
+ }
+#endif
+
/* PG_error was set if any post_read step failed */
if (bio->bi_status || PageError(page)) {
ClearPageUptodate(page);
@@ -105,31 +156,107 @@ static void __read_end_io(struct bio *bio)
dec_page_count(F2FS_P_SB(page), __read_io_type(page));
unlock_page(page);
}
- if (bio->bi_private)
- mempool_free(bio->bi_private, bio_post_read_ctx_pool);
- bio_put(bio);
+}
+
+static void f2fs_release_read_bio(struct bio *bio);
+static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity)
+{
+ if (!compr)
+ __read_end_io(bio, false, verity);
+ f2fs_release_read_bio(bio);
+}
+
+static void f2fs_decompress_bio(struct bio *bio, bool verity)
+{
+ __read_end_io(bio, true, verity);
}
static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
-static void decrypt_work(struct work_struct *work)
+static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx)
+{
+ fscrypt_decrypt_bio(ctx->bio);
+}
+
+static void f2fs_decompress_work(struct bio_post_read_ctx *ctx)
+{
+ f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY));
+}
+
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size)
+{
+ f2fs_decompress_end_io(rpages, cluster_size, false, true);
+}
+
+static void f2fs_verify_bio(struct bio *bio)
+{
+ struct page *page = bio_first_page_all(bio);
+ struct decompress_io_ctx *dic =
+ (struct decompress_io_ctx *)page_private(page);
+
+ f2fs_verify_pages(dic->rpages, dic->cluster_size);
+ f2fs_free_dic(dic);
+}
+#endif
+
+static void f2fs_verity_work(struct work_struct *work)
{
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
+ struct bio *bio = ctx->bio;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ unsigned int enabled_steps = ctx->enabled_steps;
+#endif
- fscrypt_decrypt_bio(ctx->bio);
+ /*
+ * fsverity_verify_bio() may call readpages() again, and while verity
+ * will be disabled for this, decryption may still be needed, resulting
+ * in another bio_post_read_ctx being allocated. So to prevent
+ * deadlocks we need to release the current ctx to the mempool first.
+ * This assumes that verity is the last post-read step.
+ */
+ mempool_free(ctx, bio_post_read_ctx_pool);
+ bio->bi_private = NULL;
+
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ /* previous step is decompression */
+ if (enabled_steps & (1 << STEP_DECOMPRESS)) {
+ f2fs_verify_bio(bio);
+ f2fs_release_read_bio(bio);
+ return;
+ }
+#endif
- bio_post_read_processing(ctx);
+ fsverity_verify_bio(bio);
+ __f2fs_read_end_io(bio, false, false);
}
-static void verity_work(struct work_struct *work)
+static void f2fs_post_read_work(struct work_struct *work)
{
struct bio_post_read_ctx *ctx =
container_of(work, struct bio_post_read_ctx, work);
- fsverity_verify_bio(ctx->bio);
+ if (ctx->enabled_steps & (1 << STEP_DECRYPT))
+ f2fs_decrypt_work(ctx);
- bio_post_read_processing(ctx);
+ if (ctx->enabled_steps & (1 << STEP_DECOMPRESS))
+ f2fs_decompress_work(ctx);
+
+ if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+ INIT_WORK(&ctx->work, f2fs_verity_work);
+ fsverity_enqueue_verify_work(&ctx->work);
+ return;
+ }
+
+ __f2fs_read_end_io(ctx->bio,
+ ctx->enabled_steps & (1 << STEP_DECOMPRESS), false);
+}
+
+static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi,
+ struct work_struct *work)
+{
+ queue_work(sbi->post_read_wq, work);
}
static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
@@ -139,31 +266,26 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
* verity may require reading metadata pages that need decryption, and
* we shouldn't recurse to the same workqueue.
*/
- switch (++ctx->cur_step) {
- case STEP_DECRYPT:
- if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
- INIT_WORK(&ctx->work, decrypt_work);
- fscrypt_enqueue_decrypt_work(&ctx->work);
- return;
- }
- ctx->cur_step++;
- /* fall-through */
- case STEP_VERITY:
- if (ctx->enabled_steps & (1 << STEP_VERITY)) {
- INIT_WORK(&ctx->work, verity_work);
- fsverity_enqueue_verify_work(&ctx->work);
- return;
- }
- ctx->cur_step++;
- /* fall-through */
- default:
- __read_end_io(ctx->bio);
+
+ if (ctx->enabled_steps & (1 << STEP_DECRYPT) ||
+ ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
+ INIT_WORK(&ctx->work, f2fs_post_read_work);
+ f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work);
+ return;
}
+
+ if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+ INIT_WORK(&ctx->work, f2fs_verity_work);
+ fsverity_enqueue_verify_work(&ctx->work);
+ return;
+ }
+
+ __f2fs_read_end_io(ctx->bio, false, false);
}
static bool f2fs_bio_post_read_required(struct bio *bio)
{
- return bio->bi_private && !bio->bi_status;
+ return bio->bi_private;
}
static void f2fs_read_end_io(struct bio *bio)
@@ -178,12 +300,11 @@ static void f2fs_read_end_io(struct bio *bio)
if (f2fs_bio_post_read_required(bio)) {
struct bio_post_read_ctx *ctx = bio->bi_private;
- ctx->cur_step = STEP_INITIAL;
bio_post_read_processing(ctx);
return;
}
- __read_end_io(bio);
+ __f2fs_read_end_io(bio, false, false);
}
static void f2fs_write_end_io(struct bio *bio)
@@ -214,6 +335,13 @@ static void f2fs_write_end_io(struct bio *bio)
fscrypt_finalize_bounce_page(&page);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_is_compressed_page(page)) {
+ f2fs_compress_write_end_io(bio, page);
+ continue;
+ }
+#endif
+
if (unlikely(bio->bi_status)) {
mapping_set_error(page->mapping, -EIO);
if (type == F2FS_WB_CP_DATA)
@@ -358,6 +486,12 @@ submit_io:
submit_bio(bio);
}
+void f2fs_submit_bio(struct f2fs_sb_info *sbi,
+ struct bio *bio, enum page_type type)
+{
+ __submit_bio(sbi, bio, type);
+}
+
static void __submit_merged_bio(struct f2fs_bio_info *io)
{
struct f2fs_io_info *fio = &io->fio;
@@ -380,7 +514,6 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
struct page *page, nid_t ino)
{
struct bio_vec *bvec;
- struct page *target;
struct bvec_iter_all iter_all;
if (!bio)
@@ -390,10 +523,18 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
return true;
bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *target = bvec->bv_page;
- target = bvec->bv_page;
- if (fscrypt_is_bounce_page(target))
+ if (fscrypt_is_bounce_page(target)) {
target = fscrypt_pagecache_page(target);
+ if (IS_ERR(target))
+ continue;
+ }
+ if (f2fs_is_compressed_page(target)) {
+ target = f2fs_compress_control_page(target);
+ if (IS_ERR(target))
+ continue;
+ }
if (inode && inode == target->mapping->host)
return true;
@@ -588,7 +729,8 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio,
found = true;
- if (bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
+ if (bio_add_page(*bio, page, PAGE_SIZE, 0) ==
+ PAGE_SIZE) {
ret = 0;
break;
}
@@ -728,7 +870,12 @@ next:
verify_fio_blkaddr(fio);
- bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+ if (fio->encrypted_page)
+ bio_page = fio->encrypted_page;
+ else if (fio->compressed_page)
+ bio_page = fio->compressed_page;
+ else
+ bio_page = fio->page;
/* set submitted = true as a return value */
fio->submitted = true;
@@ -797,17 +944,16 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
if (f2fs_encrypted_file(inode))
post_read_steps |= 1 << STEP_DECRYPT;
-
+ if (f2fs_compressed_file(inode))
+ post_read_steps |= 1 << STEP_DECOMPRESS;
if (f2fs_need_verity(inode, first_idx))
post_read_steps |= 1 << STEP_VERITY;
if (post_read_steps) {
+ /* Due to the mempool, this never fails. */
ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
- if (!ctx) {
- bio_put(bio);
- return ERR_PTR(-ENOMEM);
- }
ctx->bio = bio;
+ ctx->sbi = sbi;
ctx->enabled_steps = post_read_steps;
bio->bi_private = ctx;
}
@@ -815,6 +961,13 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
return bio;
}
+static void f2fs_release_read_bio(struct bio *bio)
+{
+ if (bio->bi_private)
+ mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+ bio_put(bio);
+}
+
/* This can handle encryption stuffs */
static int f2fs_submit_page_read(struct inode *inode, struct page *page,
block_t blkaddr)
@@ -1180,19 +1333,6 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
int err = 0;
bool direct_io = iocb->ki_flags & IOCB_DIRECT;
- /* convert inline data for Direct I/O*/
- if (direct_io) {
- err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
-
- if (direct_io && allow_outplace_dio(inode, iocb, from))
- return 0;
-
- if (is_inode_flag_set(inode, FI_NO_PREALLOC))
- return 0;
-
map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
if (map.m_len > map.m_lblk)
@@ -1872,6 +2012,144 @@ out:
return ret;
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
+ unsigned nr_pages, sector_t *last_block_in_bio,
+ bool is_readahead)
+{
+ struct dnode_of_data dn;
+ struct inode *inode = cc->inode;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct bio *bio = *bio_ret;
+ unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size;
+ sector_t last_block_in_file;
+ const unsigned blkbits = inode->i_blkbits;
+ const unsigned blocksize = 1 << blkbits;
+ struct decompress_io_ctx *dic = NULL;
+ int i;
+ int ret = 0;
+
+ f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc));
+
+ last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+
+ /* get rid of pages beyond EOF */
+ for (i = 0; i < cc->cluster_size; i++) {
+ struct page *page = cc->rpages[i];
+
+ if (!page)
+ continue;
+ if ((sector_t)page->index >= last_block_in_file) {
+ zero_user_segment(page, 0, PAGE_SIZE);
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+ } else if (!PageUptodate(page)) {
+ continue;
+ }
+ unlock_page(page);
+ cc->rpages[i] = NULL;
+ cc->nr_rpages--;
+ }
+
+ /* we are done since all pages are beyond EOF */
+ if (f2fs_cluster_is_empty(cc))
+ goto out;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
+ if (ret)
+ goto out;
+
+ /* cluster was overwritten as normal cluster */
+ if (dn.data_blkaddr != COMPRESS_ADDR)
+ goto out;
+
+ for (i = 1; i < cc->cluster_size; i++) {
+ block_t blkaddr;
+
+ blkaddr = datablock_addr(dn.inode, dn.node_page,
+ dn.ofs_in_node + i);
+
+ if (!__is_valid_data_blkaddr(blkaddr))
+ break;
+
+ if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) {
+ ret = -EFAULT;
+ goto out_put_dnode;
+ }
+ cc->nr_cpages++;
+ }
+
+ /* nothing to decompress */
+ if (cc->nr_cpages == 0) {
+ ret = 0;
+ goto out_put_dnode;
+ }
+
+ dic = f2fs_alloc_dic(cc);
+ if (IS_ERR(dic)) {
+ ret = PTR_ERR(dic);
+ goto out_put_dnode;
+ }
+
+ for (i = 0; i < dic->nr_cpages; i++) {
+ struct page *page = dic->cpages[i];
+ block_t blkaddr;
+
+ blkaddr = datablock_addr(dn.inode, dn.node_page,
+ dn.ofs_in_node + i + 1);
+
+ if (bio && !page_is_mergeable(sbi, bio,
+ *last_block_in_bio, blkaddr)) {
+submit_and_realloc:
+ __submit_bio(sbi, bio, DATA);
+ bio = NULL;
+ }
+
+ if (!bio) {
+ bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
+ is_readahead ? REQ_RAHEAD : 0,
+ page->index);
+ if (IS_ERR(bio)) {
+ ret = PTR_ERR(bio);
+ bio = NULL;
+ dic->failed = true;
+ if (refcount_sub_and_test(dic->nr_cpages - i,
+ &dic->ref))
+ f2fs_decompress_end_io(dic->rpages,
+ cc->cluster_size, true,
+ false);
+ f2fs_free_dic(dic);
+ f2fs_put_dnode(&dn);
+ *bio_ret = bio;
+ return ret;
+ }
+ }
+
+ f2fs_wait_on_block_writeback(inode, blkaddr);
+
+ if (bio_add_page(bio, page, blocksize, 0) < blocksize)
+ goto submit_and_realloc;
+
+ inc_page_count(sbi, F2FS_RD_DATA);
+ ClearPageError(page);
+ *last_block_in_bio = blkaddr;
+ }
+
+ f2fs_put_dnode(&dn);
+
+ *bio_ret = bio;
+ return 0;
+
+out_put_dnode:
+ f2fs_put_dnode(&dn);
+out:
+ f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false);
+ *bio_ret = bio;
+ return ret;
+}
+#endif
+
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
* Major change was from block_size == page_size in f2fs by default.
@@ -1881,7 +2159,7 @@ out:
* use ->readpage() or do the necessary surgery to decouple ->readpages()
* from read-ahead.
*/
-static int f2fs_mpage_readpages(struct address_space *mapping,
+int f2fs_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages, bool is_readahead)
{
@@ -1889,6 +2167,19 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
sector_t last_block_in_bio = 0;
struct inode *inode = mapping->host;
struct f2fs_map_blocks map;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ struct compress_ctx cc = {
+ .inode = inode,
+ .log_cluster_size = F2FS_I(inode)->i_log_cluster_size,
+ .cluster_size = F2FS_I(inode)->i_cluster_size,
+ .cluster_idx = NULL_CLUSTER,
+ .rpages = NULL,
+ .cpages = NULL,
+ .nr_rpages = 0,
+ .nr_cpages = 0,
+ };
+#endif
+ unsigned max_nr_pages = nr_pages;
int ret = 0;
map.m_pblk = 0;
@@ -1912,9 +2203,41 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
goto next_page;
}
- ret = f2fs_read_single_page(inode, page, nr_pages, &map, &bio,
- &last_block_in_bio, is_readahead);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode)) {
+ /* there are remained comressed pages, submit them */
+ if (!f2fs_cluster_can_merge_page(&cc, page->index)) {
+ ret = f2fs_read_multi_pages(&cc, &bio,
+ max_nr_pages,
+ &last_block_in_bio,
+ is_readahead);
+ f2fs_destroy_compress_ctx(&cc);
+ if (ret)
+ goto set_error_page;
+ }
+ ret = f2fs_is_compressed_cluster(inode, page->index);
+ if (ret < 0)
+ goto set_error_page;
+ else if (!ret)
+ goto read_single_page;
+
+ ret = f2fs_init_compress_ctx(&cc);
+ if (ret)
+ goto set_error_page;
+
+ f2fs_compress_ctx_add_page(&cc, page);
+
+ goto next_page;
+ }
+read_single_page:
+#endif
+
+ ret = f2fs_read_single_page(inode, page, max_nr_pages, &map,
+ &bio, &last_block_in_bio, is_readahead);
if (ret) {
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+set_error_page:
+#endif
SetPageError(page);
zero_user_segment(page, 0, PAGE_SIZE);
unlock_page(page);
@@ -1922,6 +2245,19 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
next_page:
if (pages)
put_page(page);
+
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode)) {
+ /* last page */
+ if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc)) {
+ ret = f2fs_read_multi_pages(&cc, &bio,
+ max_nr_pages,
+ &last_block_in_bio,
+ is_readahead);
+ f2fs_destroy_compress_ctx(&cc);
+ }
+ }
+#endif
}
BUG_ON(pages && !list_empty(pages));
if (bio)
@@ -1936,6 +2272,11 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
trace_f2fs_readpage(page, DATA);
+ if (!f2fs_is_compress_backend_ready(inode)) {
+ unlock_page(page);
+ return -EOPNOTSUPP;
+ }
+
/* If the file has inline data, try to read it directly */
if (f2fs_has_inline_data(inode))
ret = f2fs_read_inline_data(inode, page);
@@ -1954,6 +2295,9 @@ static int f2fs_read_data_pages(struct file *file,
trace_f2fs_readpages(inode, page, nr_pages);
+ if (!f2fs_is_compress_backend_ready(inode))
+ return 0;
+
/* If the file has inline data, skip readpages */
if (f2fs_has_inline_data(inode))
return 0;
@@ -1961,22 +2305,23 @@ static int f2fs_read_data_pages(struct file *file,
return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true);
}
-static int encrypt_one_page(struct f2fs_io_info *fio)
+int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
{
struct inode *inode = fio->page->mapping->host;
- struct page *mpage;
+ struct page *mpage, *page;
gfp_t gfp_flags = GFP_NOFS;
if (!f2fs_encrypted_file(inode))
return 0;
+ page = fio->compressed_page ? fio->compressed_page : fio->page;
+
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
retry_encrypt:
- fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(fio->page,
- PAGE_SIZE, 0,
- gfp_flags);
+ fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page,
+ PAGE_SIZE, 0, gfp_flags);
if (IS_ERR(fio->encrypted_page)) {
/* flush pending IOs and wait for a while in the ENOMEM case */
if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
@@ -2136,7 +2481,7 @@ got_it:
if (ipu_force ||
(__is_valid_data_blkaddr(fio->old_blkaddr) &&
need_inplace_update(fio))) {
- err = encrypt_one_page(fio);
+ err = f2fs_encrypt_one_page(fio);
if (err)
goto out_writepage;
@@ -2172,13 +2517,16 @@ got_it:
fio->version = ni.version;
- err = encrypt_one_page(fio);
+ err = f2fs_encrypt_one_page(fio);
if (err)
goto out_writepage;
set_page_writeback(page);
ClearPageError(page);
+ if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR)
+ f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false);
+
/* LFS mode write path */
f2fs_outplace_write_data(&dn, fio);
trace_f2fs_do_write_data_page(page, OPU);
@@ -2193,16 +2541,17 @@ out:
return err;
}
-static int __write_data_page(struct page *page, bool *submitted,
+int f2fs_write_single_data_page(struct page *page, int *submitted,
struct bio **bio,
sector_t *last_block,
struct writeback_control *wbc,
- enum iostat_type io_type)
+ enum iostat_type io_type,
+ int compr_blocks)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = ((unsigned long long) i_size)
+ const pgoff_t end_index = ((unsigned long long)i_size)
>> PAGE_SHIFT;
loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT;
unsigned offset = 0;
@@ -2218,6 +2567,7 @@ static int __write_data_page(struct page *page, bool *submitted,
.page = page,
.encrypted_page = NULL,
.submitted = false,
+ .compr_blocks = compr_blocks,
.need_lock = LOCK_RETRY,
.io_type = io_type,
.io_wbc = wbc,
@@ -2242,7 +2592,9 @@ static int __write_data_page(struct page *page, bool *submitted,
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
- if (page->index < end_index || f2fs_verity_in_progress(inode))
+ if (page->index < end_index ||
+ f2fs_verity_in_progress(inode) ||
+ compr_blocks)
goto write;
/*
@@ -2318,7 +2670,6 @@ out:
f2fs_remove_dirty_inode(inode);
submitted = NULL;
}
-
unlock_page(page);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
!F2FS_I(inode)->cp_task)
@@ -2331,7 +2682,7 @@ out:
}
if (submitted)
- *submitted = fio.submitted;
+ *submitted = fio.submitted ? 1 : 0;
return 0;
@@ -2352,7 +2703,23 @@ redirty_out:
static int f2fs_write_data_page(struct page *page,
struct writeback_control *wbc)
{
- return __write_data_page(page, NULL, NULL, NULL, wbc, FS_DATA_IO);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ struct inode *inode = page->mapping->host;
+
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+ goto out;
+
+ if (f2fs_compressed_file(inode)) {
+ if (f2fs_is_compressed_cluster(inode, page->index)) {
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
+ }
+out:
+#endif
+
+ return f2fs_write_single_data_page(page, NULL, NULL, NULL,
+ wbc, FS_DATA_IO, 0);
}
/*
@@ -2365,11 +2732,27 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
enum iostat_type io_type)
{
int ret = 0;
- int done = 0;
+ int done = 0, retry = 0;
struct pagevec pvec;
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
struct bio *bio = NULL;
sector_t last_block;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ struct inode *inode = mapping->host;
+ struct compress_ctx cc = {
+ .inode = inode,
+ .log_cluster_size = F2FS_I(inode)->i_log_cluster_size,
+ .cluster_size = F2FS_I(inode)->i_cluster_size,
+ .cluster_idx = NULL_CLUSTER,
+ .rpages = NULL,
+ .nr_rpages = 0,
+ .cpages = NULL,
+ .rbuf = NULL,
+ .cbuf = NULL,
+ .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size,
+ .private = NULL,
+ };
+#endif
int nr_pages;
pgoff_t uninitialized_var(writeback_index);
pgoff_t index;
@@ -2379,6 +2762,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
int range_whole = 0;
xa_mark_t tag;
int nwritten = 0;
+ int submitted = 0;
+ int i;
pagevec_init(&pvec);
@@ -2408,12 +2793,11 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
else
tag = PAGECACHE_TAG_DIRTY;
retry:
+ retry = 0;
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
- while (!done && (index <= end)) {
- int i;
-
+ while (!done && !retry && (index <= end)) {
nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
tag);
if (nr_pages == 0)
@@ -2421,15 +2805,62 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- bool submitted = false;
+ bool need_readd;
+readd:
+ need_readd = false;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode)) {
+ ret = f2fs_init_compress_ctx(&cc);
+ if (ret) {
+ done = 1;
+ break;
+ }
+
+ if (!f2fs_cluster_can_merge_page(&cc,
+ page->index)) {
+ ret = f2fs_write_multi_pages(&cc,
+ &submitted, wbc, io_type);
+ if (!ret)
+ need_readd = true;
+ goto result;
+ }
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto lock_page;
+
+ if (f2fs_cluster_is_empty(&cc)) {
+ void *fsdata = NULL;
+ struct page *pagep;
+ int ret2;
+
+ ret2 = f2fs_prepare_compress_overwrite(
+ inode, &pagep,
+ page->index, &fsdata);
+ if (ret2 < 0) {
+ ret = ret2;
+ done = 1;
+ break;
+ } else if (ret2 &&
+ !f2fs_compress_write_end(inode,
+ fsdata, page->index,
+ 1)) {
+ retry = 1;
+ break;
+ }
+ } else {
+ goto lock_page;
+ }
+ }
+#endif
/* give a priority to WB_SYNC threads */
if (atomic_read(&sbi->wb_sync_req[DATA]) &&
wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
-
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+lock_page:
+#endif
done_index = page->index;
retry_write:
lock_page(page);
@@ -2456,45 +2887,71 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- ret = __write_data_page(page, &submitted, &bio,
- &last_block, wbc, io_type);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode)) {
+ get_page(page);
+ f2fs_compress_ctx_add_page(&cc, page);
+ continue;
+ }
+#endif
+ ret = f2fs_write_single_data_page(page, &submitted,
+ &bio, &last_block, wbc, io_type, 0);
+ if (ret == AOP_WRITEPAGE_ACTIVATE)
+ unlock_page(page);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+result:
+#endif
+ nwritten += submitted;
+ wbc->nr_to_write -= submitted;
+
if (unlikely(ret)) {
/*
* keep nr_to_write, since vfs uses this to
* get # of written pages.
*/
if (ret == AOP_WRITEPAGE_ACTIVATE) {
- unlock_page(page);
ret = 0;
- continue;
+ goto next;
} else if (ret == -EAGAIN) {
ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
cond_resched();
congestion_wait(BLK_RW_ASYNC,
- HZ/50);
+ HZ/50);
goto retry_write;
}
- continue;
+ goto next;
}
done_index = page->index + 1;
done = 1;
break;
- } else if (submitted) {
- nwritten++;
}
- if (--wbc->nr_to_write <= 0 &&
+ if (wbc->nr_to_write <= 0 &&
wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
+next:
+ if (need_readd)
+ goto readd;
}
pagevec_release(&pvec);
cond_resched();
}
-
- if (!cycled && !done) {
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ /* flush remained pages in compress cluster */
+ if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) {
+ ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type);
+ nwritten += submitted;
+ wbc->nr_to_write -= submitted;
+ if (ret) {
+ done = 1;
+ retry = 0;
+ }
+ }
+#endif
+ if ((!cycled && !done) || retry) {
cycled = 1;
index = 0;
end = writeback_index - 1;
@@ -2518,6 +2975,8 @@ static inline bool __should_serialize_io(struct inode *inode,
{
if (!S_ISREG(inode->i_mode))
return false;
+ if (f2fs_compressed_file(inode))
+ return true;
if (IS_NOQUOTA(inode))
return false;
/* to avoid deadlock in path of data flush */
@@ -2613,14 +3072,16 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
struct inode *inode = mapping->host;
loff_t i_size = i_size_read(inode);
+ if (IS_NOQUOTA(inode))
+ return;
+
/* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
if (to > i_size && !f2fs_verity_in_progress(inode)) {
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
- if (!IS_NOQUOTA(inode))
- f2fs_truncate_blocks(inode, i_size, true);
+ f2fs_truncate_blocks(inode, i_size, true);
up_write(&F2FS_I(inode)->i_mmap_sem);
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -2660,6 +3121,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
__do_map_lock(sbi, flag, true);
locked = true;
}
+
restart:
/* check inline_data */
ipage = f2fs_get_node_page(sbi, inode->i_ino);
@@ -2750,6 +3212,24 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
if (err)
goto fail;
}
+
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode)) {
+ int ret;
+
+ *fsdata = NULL;
+
+ ret = f2fs_prepare_compress_overwrite(inode, pagep,
+ index, fsdata);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ } else if (ret) {
+ return 0;
+ }
+ }
+#endif
+
repeat:
/*
* Do not use grab_cache_page_write_begin() to avoid deadlock due to
@@ -2762,6 +3242,8 @@ repeat:
goto fail;
}
+ /* TODO: cluster can be compressed due to race with .writepage */
+
*pagep = page;
err = prepare_write_begin(sbi, page, pos, len,
@@ -2845,6 +3327,16 @@ static int f2fs_write_end(struct file *file,
else
SetPageUptodate(page);
}
+
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ /* overwrite compressed file */
+ if (f2fs_compressed_file(inode) && fsdata) {
+ f2fs_compress_write_end(inode, fsdata, page->index, copied);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ return copied;
+ }
+#endif
+
if (!copied)
goto unlock_out;
@@ -3145,7 +3637,8 @@ int f2fs_migrate_page(struct address_space *mapping,
#ifdef CONFIG_SWAP
/* Copied from generic_swapfile_activate() to check any holes */
-static int check_swap_activate(struct file *swap_file, unsigned int max)
+static int check_swap_activate(struct swap_info_struct *sis,
+ struct file *swap_file, sector_t *span)
{
struct address_space *mapping = swap_file->f_mapping;
struct inode *inode = mapping->host;
@@ -3156,6 +3649,8 @@ static int check_swap_activate(struct file *swap_file, unsigned int max)
sector_t last_block;
sector_t lowest_block = -1;
sector_t highest_block = 0;
+ int nr_extents = 0;
+ int ret;
blkbits = inode->i_blkbits;
blocks_per_page = PAGE_SIZE >> blkbits;
@@ -3167,7 +3662,8 @@ static int check_swap_activate(struct file *swap_file, unsigned int max)
probe_block = 0;
page_no = 0;
last_block = i_size_read(inode) >> blkbits;
- while ((probe_block + blocks_per_page) <= last_block && page_no < max) {
+ while ((probe_block + blocks_per_page) <= last_block &&
+ page_no < sis->max) {
unsigned block_in_page;
sector_t first_block;
@@ -3207,13 +3703,27 @@ static int check_swap_activate(struct file *swap_file, unsigned int max)
highest_block = first_block;
}
+ /*
+ * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
+ */
+ ret = add_swap_extent(sis, page_no, 1, first_block);
+ if (ret < 0)
+ goto out;
+ nr_extents += ret;
page_no++;
probe_block += blocks_per_page;
reprobe:
continue;
}
- return 0;
-
+ ret = nr_extents;
+ *span = 1 + highest_block - lowest_block;
+ if (page_no == 0)
+ page_no = 1; /* force Empty message */
+ sis->max = page_no;
+ sis->pages = page_no - 1;
+ sis->highest_bit = page_no - 1;
+out:
+ return ret;
bad_bmap:
pr_err("swapon: swapfile has holes\n");
return -EINVAL;
@@ -3235,14 +3745,17 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (ret)
return ret;
- ret = check_swap_activate(file, sis->max);
- if (ret)
+ if (f2fs_disable_compressed_file(inode))
+ return -EINVAL;
+
+ ret = check_swap_activate(sis, file, span);
+ if (ret < 0)
return ret;
set_inode_flag(inode, FI_PIN_FILE);
f2fs_precache_extents(inode);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return 0;
+ return ret;
}
static void f2fs_swap_deactivate(struct file *file)
@@ -3319,6 +3832,27 @@ void f2fs_destroy_post_read_processing(void)
kmem_cache_destroy(bio_post_read_ctx_cache);
}
+int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi)
+{
+ if (!f2fs_sb_has_encrypt(sbi) &&
+ !f2fs_sb_has_verity(sbi) &&
+ !f2fs_sb_has_compression(sbi))
+ return 0;
+
+ sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq",
+ WQ_UNBOUND | WQ_HIGHPRI,
+ num_online_cpus());
+ if (!sbi->post_read_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi)
+{
+ if (sbi->post_read_wq)
+ destroy_workqueue(sbi->post_read_wq);
+}
+
int __init f2fs_init_bio_entry_cache(void)
{
bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab",
@@ -3328,7 +3862,7 @@ int __init f2fs_init_bio_entry_cache(void)
return 0;
}
-void __exit f2fs_destroy_bio_entry_cache(void)
+void f2fs_destroy_bio_entry_cache(void)
{
kmem_cache_destroy(bio_entry_slab);
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 9b0bedd82581..6b89eae5e4ca 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -21,9 +21,45 @@
#include "gc.h"
static LIST_HEAD(f2fs_stat_list);
-static struct dentry *f2fs_debugfs_root;
static DEFINE_MUTEX(f2fs_stat_mutex);
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *f2fs_debugfs_root;
+#endif
+
+/*
+ * This function calculates BDF of every segments
+ */
+void f2fs_update_sit_info(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
+ unsigned long long blks_per_sec, hblks_per_sec, total_vblocks;
+ unsigned long long bimodal, dist;
+ unsigned int segno, vblocks;
+ int ndirty = 0;
+
+ bimodal = 0;
+ total_vblocks = 0;
+ blks_per_sec = BLKS_PER_SEC(sbi);
+ hblks_per_sec = blks_per_sec / 2;
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+ vblocks = get_valid_blocks(sbi, segno, true);
+ dist = abs(vblocks - hblks_per_sec);
+ bimodal += dist * dist;
+
+ if (vblocks > 0 && vblocks < blks_per_sec) {
+ total_vblocks += vblocks;
+ ndirty++;
+ }
+ }
+ dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100);
+ si->bimodal = div64_u64(bimodal, dist);
+ if (si->dirty_count)
+ si->avg_vblocks = div_u64(total_vblocks, ndirty);
+ else
+ si->avg_vblocks = 0;
+}
+#ifdef CONFIG_DEBUG_FS
static void update_general_status(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
@@ -56,7 +92,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->nquota_files = sbi->nquota_files;
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
- si->aw_cnt = atomic_read(&sbi->aw_cnt);
+ si->aw_cnt = sbi->atomic_files;
si->vw_cnt = atomic_read(&sbi->vw_cnt);
si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
@@ -94,6 +130,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->inline_xattr = atomic_read(&sbi->inline_xattr);
si->inline_inode = atomic_read(&sbi->inline_inode);
si->inline_dir = atomic_read(&sbi->inline_dir);
+ si->compr_inode = atomic_read(&sbi->compr_inode);
+ si->compr_blocks = atomic_read(&sbi->compr_blocks);
si->append = sbi->im[APPEND_INO].ino_num;
si->update = sbi->im[UPDATE_INO].ino_num;
si->orphans = sbi->im[ORPHAN_INO].ino_num;
@@ -114,7 +152,6 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID];
si->avail_nids = NM_I(sbi)->available_nids;
si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
- si->bg_gc = sbi->bg_gc;
si->io_skip_bggc = sbi->io_skip_bggc;
si->other_skip_bggc = sbi->other_skip_bggc;
si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC];
@@ -146,39 +183,6 @@ static void update_general_status(struct f2fs_sb_info *sbi)
}
/*
- * This function calculates BDF of every segments
- */
-static void update_sit_info(struct f2fs_sb_info *sbi)
-{
- struct f2fs_stat_info *si = F2FS_STAT(sbi);
- unsigned long long blks_per_sec, hblks_per_sec, total_vblocks;
- unsigned long long bimodal, dist;
- unsigned int segno, vblocks;
- int ndirty = 0;
-
- bimodal = 0;
- total_vblocks = 0;
- blks_per_sec = BLKS_PER_SEC(sbi);
- hblks_per_sec = blks_per_sec / 2;
- for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
- vblocks = get_valid_blocks(sbi, segno, true);
- dist = abs(vblocks - hblks_per_sec);
- bimodal += dist * dist;
-
- if (vblocks > 0 && vblocks < blks_per_sec) {
- total_vblocks += vblocks;
- ndirty++;
- }
- }
- dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100);
- si->bimodal = div64_u64(bimodal, dist);
- if (si->dirty_count)
- si->avg_vblocks = div_u64(total_vblocks, ndirty);
- else
- si->avg_vblocks = 0;
-}
-
-/*
* This function calculates memory footprint.
*/
static void update_mem_info(struct f2fs_sb_info *sbi)
@@ -315,6 +319,8 @@ static int stat_show(struct seq_file *s, void *v)
si->inline_inode);
seq_printf(s, " - Inline_dentry Inode: %u\n",
si->inline_dir);
+ seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n",
+ si->compr_inode, si->compr_blocks);
seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n",
si->orphans, si->append, si->update);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
@@ -441,7 +447,7 @@ static int stat_show(struct seq_file *s, void *v)
si->block_count[LFS], si->segment_count[LFS]);
/* segment usage info */
- update_sit_info(si->sbi);
+ f2fs_update_sit_info(si->sbi);
seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n",
si->bimodal, si->avg_vblocks);
@@ -461,6 +467,7 @@ static int stat_show(struct seq_file *s, void *v)
}
DEFINE_SHOW_ATTRIBUTE(stat);
+#endif
int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
@@ -491,11 +498,12 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->inline_xattr, 0);
atomic_set(&sbi->inline_inode, 0);
atomic_set(&sbi->inline_dir, 0);
+ atomic_set(&sbi->compr_inode, 0);
+ atomic_set(&sbi->compr_blocks, 0);
atomic_set(&sbi->inplace_count, 0);
for (i = META_CP; i < META_MAX; i++)
atomic_set(&sbi->meta_count[i], 0);
- atomic_set(&sbi->aw_cnt, 0);
atomic_set(&sbi->vw_cnt, 0);
atomic_set(&sbi->max_aw_cnt, 0);
atomic_set(&sbi->max_vw_cnt, 0);
@@ -520,14 +528,18 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
void __init f2fs_create_root_stats(void)
{
+#ifdef CONFIG_DEBUG_FS
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL,
&stat_fops);
+#endif
}
void f2fs_destroy_root_stats(void)
{
+#ifdef CONFIG_DEBUG_FS
debugfs_remove_recursive(f2fs_debugfs_root);
f2fs_debugfs_root = NULL;
+#endif
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index c967cacf979e..27d0dd7a16d6 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -578,6 +578,20 @@ next:
goto next;
}
+bool f2fs_has_enough_room(struct inode *dir, struct page *ipage,
+ struct fscrypt_name *fname)
+{
+ struct f2fs_dentry_ptr d;
+ unsigned int bit_pos;
+ int slots = GET_DENTRY_SLOTS(fname_len(fname));
+
+ make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage));
+
+ bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max);
+
+ return bit_pos < d.max;
+}
+
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
const struct qstr *name, f2fs_hash_t name_hash,
unsigned int bit_pos)
@@ -987,7 +1001,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
- if (err && err != -ENOKEY)
+ if (err)
goto out;
err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
@@ -1069,24 +1083,27 @@ static int f2fs_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
struct qstr qstr = {.name = str, .len = len };
+ const struct dentry *parent = READ_ONCE(dentry->d_parent);
+ const struct inode *inode = READ_ONCE(parent->d_inode);
- if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) {
+ if (!inode || !IS_CASEFOLDED(inode)) {
if (len != name->len)
return -1;
- return memcmp(str, name, len);
+ return memcmp(str, name->name, len);
}
- return f2fs_ci_compare(dentry->d_parent->d_inode, name, &qstr, false);
+ return f2fs_ci_compare(inode, name, &qstr, false);
}
static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
const struct unicode_map *um = sbi->s_encoding;
+ const struct inode *inode = READ_ONCE(dentry->d_inode);
unsigned char *norm;
int len, ret = 0;
- if (!IS_CASEFOLDED(dentry->d_inode))
+ if (!inode || !IS_CASEFOLDED(inode))
return 0;
norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 5a888a063c7f..5355be6b6755 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -116,6 +116,8 @@ typedef u32 block_t; /*
*/
typedef u32 nid_t;
+#define COMPRESS_EXT_NUM 16
+
struct f2fs_mount_info {
unsigned int opt;
int write_io_size_bits; /* Write IO size bits */
@@ -140,6 +142,12 @@ struct f2fs_mount_info {
block_t unusable_cap; /* Amount of space allowed to be
* unusable when disabling checkpoint
*/
+
+ /* For compression */
+ unsigned char compress_algorithm; /* algorithm type */
+ unsigned compress_log_size; /* cluster log size */
+ unsigned char compress_ext_cnt; /* extension count */
+ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */
};
#define F2FS_FEATURE_ENCRYPT 0x0001
@@ -155,6 +163,7 @@ struct f2fs_mount_info {
#define F2FS_FEATURE_VERITY 0x0400
#define F2FS_FEATURE_SB_CHKSUM 0x0800
#define F2FS_FEATURE_CASEFOLD 0x1000
+#define F2FS_FEATURE_COMPRESSION 0x2000
#define __F2FS_HAS_FEATURE(raw_super, mask) \
((raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -712,6 +721,12 @@ struct f2fs_inode_info {
int i_inline_xattr_size; /* inline xattr size */
struct timespec64 i_crtime; /* inode creation time */
struct timespec64 i_disk_time[4];/* inode disk times */
+
+ /* for file compress */
+ u64 i_compr_blocks; /* # of compressed blocks */
+ unsigned char i_compress_algorithm; /* algorithm type */
+ unsigned char i_log_cluster_size; /* log of cluster size */
+ unsigned int i_cluster_size; /* cluster size */
};
static inline void get_extent_info(struct extent_info *ext,
@@ -1018,6 +1033,7 @@ enum need_lock_type {
enum cp_reason_type {
CP_NO_NEEDED,
CP_NON_REGULAR,
+ CP_COMPRESSED,
CP_HARDLINK,
CP_SB_NEED_CP,
CP_WRONG_PINO,
@@ -1056,12 +1072,15 @@ struct f2fs_io_info {
block_t old_blkaddr; /* old block address before Cow */
struct page *page; /* page to be written */
struct page *encrypted_page; /* encrypted page */
+ struct page *compressed_page; /* compressed page */
struct list_head list; /* serialize IOs */
bool submitted; /* indicate IO submission */
int need_lock; /* indicate we need to lock cp_rwsem */
bool in_list; /* indicate fio is in io_list */
bool is_por; /* indicate IO is from recovery or not */
bool retry; /* need to reallocate block address */
+ int compr_blocks; /* # of compressed block addresses */
+ bool encrypted; /* indicate file is encrypted */
enum iostat_type io_type; /* io type */
struct writeback_control *io_wbc; /* writeback control */
struct bio **bio; /* bio for ipu */
@@ -1169,6 +1188,18 @@ enum fsync_mode {
FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */
};
+/*
+ * this value is set in page as a private data which indicate that
+ * the page is atomically written, and it is in inmem_pages list.
+ */
+#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1)
+#define DUMMY_WRITTEN_PAGE ((unsigned long)-2)
+
+#define IS_ATOMIC_WRITTEN_PAGE(page) \
+ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
+#define IS_DUMMY_WRITTEN_PAGE(page) \
+ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE)
+
#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) \
(unlikely(F2FS_OPTION(sbi).test_dummy_encryption))
@@ -1176,6 +1207,75 @@ enum fsync_mode {
#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
#endif
+/* For compression */
+enum compress_algorithm_type {
+ COMPRESS_LZO,
+ COMPRESS_LZ4,
+ COMPRESS_MAX,
+};
+
+#define COMPRESS_DATA_RESERVED_SIZE 4
+struct compress_data {
+ __le32 clen; /* compressed data size */
+ __le32 chksum; /* checksum of compressed data */
+ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */
+ u8 cdata[]; /* compressed data */
+};
+
+#define COMPRESS_HEADER_SIZE (sizeof(struct compress_data))
+
+#define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000
+
+/* compress context */
+struct compress_ctx {
+ struct inode *inode; /* inode the context belong to */
+ pgoff_t cluster_idx; /* cluster index number */
+ unsigned int cluster_size; /* page count in cluster */
+ unsigned int log_cluster_size; /* log of cluster size */
+ struct page **rpages; /* pages store raw data in cluster */
+ unsigned int nr_rpages; /* total page number in rpages */
+ struct page **cpages; /* pages store compressed data in cluster */
+ unsigned int nr_cpages; /* total page number in cpages */
+ void *rbuf; /* virtual mapped address on rpages */
+ struct compress_data *cbuf; /* virtual mapped address on cpages */
+ size_t rlen; /* valid data length in rbuf */
+ size_t clen; /* valid data length in cbuf */
+ void *private; /* payload buffer for specified compression algorithm */
+};
+
+/* compress context for write IO path */
+struct compress_io_ctx {
+ u32 magic; /* magic number to indicate page is compressed */
+ struct inode *inode; /* inode the context belong to */
+ struct page **rpages; /* pages store raw data in cluster */
+ unsigned int nr_rpages; /* total page number in rpages */
+ refcount_t ref; /* referrence count of raw page */
+};
+
+/* decompress io context for read IO path */
+struct decompress_io_ctx {
+ u32 magic; /* magic number to indicate page is compressed */
+ struct inode *inode; /* inode the context belong to */
+ pgoff_t cluster_idx; /* cluster index number */
+ unsigned int cluster_size; /* page count in cluster */
+ unsigned int log_cluster_size; /* log of cluster size */
+ struct page **rpages; /* pages store raw data in cluster */
+ unsigned int nr_rpages; /* total page number in rpages */
+ struct page **cpages; /* pages store compressed data in cluster */
+ unsigned int nr_cpages; /* total page number in cpages */
+ struct page **tpages; /* temp pages to pad holes in cluster */
+ void *rbuf; /* virtual mapped address on rpages */
+ struct compress_data *cbuf; /* virtual mapped address on cpages */
+ size_t rlen; /* valid data length in rbuf */
+ size_t clen; /* valid data length in cbuf */
+ refcount_t ref; /* referrence count of compressed page */
+ bool failed; /* indicate IO error during decompression */
+};
+
+#define NULL_CLUSTER ((unsigned int)(~0))
+#define MIN_COMPRESS_LOG_SIZE 2
+#define MAX_COMPRESS_LOG_SIZE 8
+
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
@@ -1291,7 +1391,10 @@ struct f2fs_sb_info {
struct f2fs_mount_info mount_opt; /* mount options */
/* for cleaning operations */
- struct mutex gc_mutex; /* mutex for GC */
+ struct rw_semaphore gc_lock; /*
+ * semaphore for GC, avoid
+ * race between GC and GC or CP
+ */
struct f2fs_gc_kthread *gc_thread; /* GC thread */
unsigned int cur_victim_sec; /* current victim section num */
unsigned int gc_mode; /* current GC state */
@@ -1327,11 +1430,11 @@ struct f2fs_sb_info {
atomic_t inline_xattr; /* # of inline_xattr inodes */
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
- atomic_t aw_cnt; /* # of atomic writes */
+ atomic_t compr_inode; /* # of compressed inodes */
+ atomic_t compr_blocks; /* # of compressed blocks */
atomic_t vw_cnt; /* # of volatile writes */
atomic_t max_aw_cnt; /* max # of atomic writes */
atomic_t max_vw_cnt; /* max # of volatile writes */
- int bg_gc; /* background gc calls */
unsigned int io_skip_bggc; /* skip background gc for in-flight IO */
unsigned int other_skip_bggc; /* skip background gc for other reasons */
unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
@@ -1365,6 +1468,8 @@ struct f2fs_sb_info {
/* Precomputed FS UUID checksum for seeding other checksums */
__u32 s_chksum_seed;
+
+ struct workqueue_struct *post_read_wq; /* post read workqueue */
};
struct f2fs_private_dio {
@@ -2222,26 +2327,6 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
return entry;
}
-static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi,
- int npages, bool no_fail)
-{
- struct bio *bio;
-
- if (no_fail) {
- /* No failure on bio allocation */
- bio = bio_alloc(GFP_NOIO, npages);
- if (!bio)
- bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
- return bio;
- }
- if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
- f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
- return NULL;
- }
-
- return bio_alloc(GFP_KERNEL, npages);
-}
-
static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
{
if (sbi->gc_mode == GC_URGENT)
@@ -2378,11 +2463,13 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
/*
* On-disk inode flags (f2fs_inode::i_flags)
*/
+#define F2FS_COMPR_FL 0x00000004 /* Compress file */
#define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */
#define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */
#define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */
#define F2FS_NODUMP_FL 0x00000040 /* do not dump file */
#define F2FS_NOATIME_FL 0x00000080 /* do not update atime */
+#define F2FS_NOCOMP_FL 0x00000400 /* Don't compress */
#define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */
#define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
#define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
@@ -2391,7 +2478,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
/* Flags that should be inherited by new inodes from their parent. */
#define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \
F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
- F2FS_CASEFOLD_FL)
+ F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
@@ -2443,6 +2530,8 @@ enum {
FI_PIN_FILE, /* indicate file should not be gced */
FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
+ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */
+ FI_MMAP_FILE, /* indicate file was mmapped */
};
static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -2459,6 +2548,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
case FI_DATA_EXIST:
case FI_INLINE_DOTS:
case FI_PIN_FILE:
+ case FI_COMPRESSED_FILE:
f2fs_mark_inode_dirty_sync(inode, true);
}
}
@@ -2614,16 +2704,27 @@ static inline int f2fs_has_inline_xattr(struct inode *inode)
return is_inode_flag_set(inode, FI_INLINE_XATTR);
}
+static inline int f2fs_compressed_file(struct inode *inode)
+{
+ return S_ISREG(inode->i_mode) &&
+ is_inode_flag_set(inode, FI_COMPRESSED_FILE);
+}
+
static inline unsigned int addrs_per_inode(struct inode *inode)
{
unsigned int addrs = CUR_ADDRS_PER_INODE(inode) -
get_inline_xattr_addrs(inode);
- return ALIGN_DOWN(addrs, 1);
+
+ if (!f2fs_compressed_file(inode))
+ return addrs;
+ return ALIGN_DOWN(addrs, F2FS_I(inode)->i_cluster_size);
}
static inline unsigned int addrs_per_block(struct inode *inode)
{
- return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, 1);
+ if (!f2fs_compressed_file(inode))
+ return DEF_ADDRS_PER_BLOCK;
+ return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, F2FS_I(inode)->i_cluster_size);
}
static inline void *inline_xattr_addr(struct inode *inode, struct page *page)
@@ -2656,6 +2757,11 @@ static inline int f2fs_has_inline_dots(struct inode *inode)
return is_inode_flag_set(inode, FI_INLINE_DOTS);
}
+static inline int f2fs_is_mmap_file(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_MMAP_FILE);
+}
+
static inline bool f2fs_is_pinned_file(struct inode *inode)
{
return is_inode_flag_set(inode, FI_PIN_FILE);
@@ -2783,7 +2889,8 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (!test_opt(sbi, EXTENT_CACHE) ||
- is_inode_flag_set(inode, FI_NO_EXTENT))
+ is_inode_flag_set(inode, FI_NO_EXTENT) ||
+ is_inode_flag_set(inode, FI_COMPRESSED_FILE))
return false;
/*
@@ -2903,7 +3010,8 @@ static inline void verify_blkaddr(struct f2fs_sb_info *sbi,
static inline bool __is_valid_data_blkaddr(block_t blkaddr)
{
- if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR ||
+ blkaddr == COMPRESS_ADDR)
return false;
return true;
}
@@ -3001,6 +3109,8 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
struct page **page);
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode);
+bool f2fs_has_enough_room(struct inode *dir, struct page *ipage,
+ struct fscrypt_name *fname);
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
const struct qstr *name, f2fs_hash_t name_hash,
unsigned int bit_pos);
@@ -3155,6 +3265,8 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
unsigned int val, int alloc);
void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
+int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi);
+int f2fs_check_write_pointer(struct f2fs_sb_info *sbi);
int f2fs_build_segment_manager(struct f2fs_sb_info *sbi);
void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi);
int __init f2fs_create_segment_manager_caches(void);
@@ -3205,10 +3317,13 @@ void f2fs_destroy_checkpoint_caches(void);
/*
* data.c
*/
-int f2fs_init_post_read_processing(void);
-void f2fs_destroy_post_read_processing(void);
+int __init f2fs_init_bioset(void);
+void f2fs_destroy_bioset(void);
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
+void f2fs_submit_bio(struct f2fs_sb_info *sbi,
+ struct bio *bio, enum page_type type);
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
struct inode *inode, struct page *page,
@@ -3229,6 +3344,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn);
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
+int f2fs_mpage_readpages(struct address_space *mapping,
+ struct list_head *pages, struct page *page,
+ unsigned nr_pages, bool is_readahead);
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
int op_flags, bool for_write);
struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
@@ -3242,8 +3360,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
+int f2fs_encrypt_one_page(struct f2fs_io_info *fio);
bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio);
bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio);
+int f2fs_write_single_data_page(struct page *page, int *submitted,
+ struct bio **bio, sector_t *last_block,
+ struct writeback_control *wbc,
+ enum iostat_type io_type,
+ int compr_blocks);
void f2fs_invalidate_page(struct page *page, unsigned int offset,
unsigned int length);
int f2fs_release_page(struct page *page, gfp_t wait);
@@ -3253,6 +3377,10 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
#endif
bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len);
void f2fs_clear_page_cache_dirty_tag(struct page *page);
+int f2fs_init_post_read_processing(void);
+void f2fs_destroy_post_read_processing(void);
+int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi);
+void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi);
/*
* gc.c
@@ -3299,6 +3427,7 @@ struct f2fs_stat_info {
int nr_discard_cmd;
unsigned int undiscard_blks;
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
+ int compr_inode, compr_blocks;
int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
@@ -3330,7 +3459,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_inc_cp_count(si) ((si)->cp_count++)
#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++)
#define stat_inc_call_count(si) ((si)->call_count++)
-#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
+#define stat_inc_bggc_count(si) ((si)->bg_gc++)
#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++)
#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++)
#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
@@ -3369,6 +3498,20 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
if (f2fs_has_inline_dentry(inode)) \
(atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \
} while (0)
+#define stat_inc_compr_inode(inode) \
+ do { \
+ if (f2fs_compressed_file(inode)) \
+ (atomic_inc(&F2FS_I_SB(inode)->compr_inode)); \
+ } while (0)
+#define stat_dec_compr_inode(inode) \
+ do { \
+ if (f2fs_compressed_file(inode)) \
+ (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \
+ } while (0)
+#define stat_add_compr_blocks(inode, blocks) \
+ (atomic_add(blocks, &F2FS_I_SB(inode)->compr_blocks))
+#define stat_sub_compr_blocks(inode, blocks) \
+ (atomic_sub(blocks, &F2FS_I_SB(inode)->compr_blocks))
#define stat_inc_meta_count(sbi, blkaddr) \
do { \
if (blkaddr < SIT_I(sbi)->sit_base_addr) \
@@ -3386,13 +3529,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
((sbi)->block_count[(curseg)->alloc_type]++)
#define stat_inc_inplace_blocks(sbi) \
(atomic_inc(&(sbi)->inplace_count))
-#define stat_inc_atomic_write(inode) \
- (atomic_inc(&F2FS_I_SB(inode)->aw_cnt))
-#define stat_dec_atomic_write(inode) \
- (atomic_dec(&F2FS_I_SB(inode)->aw_cnt))
#define stat_update_max_atomic_write(inode) \
do { \
- int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \
+ int cur = F2FS_I_SB(inode)->atomic_files; \
int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \
if (cur > max) \
atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
@@ -3444,6 +3583,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi);
void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
void __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
+void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#else
#define stat_inc_cp_count(si) do { } while (0)
#define stat_inc_bg_cp_count(si) do { } while (0)
@@ -3453,8 +3593,8 @@ void f2fs_destroy_root_stats(void);
#define stat_other_skip_bggc_count(sbi) do { } while (0)
#define stat_inc_dirty_inode(sbi, type) do { } while (0)
#define stat_dec_dirty_inode(sbi, type) do { } while (0)
-#define stat_inc_total_hit(sb) do { } while (0)
-#define stat_inc_rbtree_node_hit(sb) do { } while (0)
+#define stat_inc_total_hit(sbi) do { } while (0)
+#define stat_inc_rbtree_node_hit(sbi) do { } while (0)
#define stat_inc_largest_node_hit(sbi) do { } while (0)
#define stat_inc_cached_node_hit(sbi) do { } while (0)
#define stat_inc_inline_xattr(inode) do { } while (0)
@@ -3463,6 +3603,10 @@ void f2fs_destroy_root_stats(void);
#define stat_dec_inline_inode(inode) do { } while (0)
#define stat_inc_inline_dir(inode) do { } while (0)
#define stat_dec_inline_dir(inode) do { } while (0)
+#define stat_inc_compr_inode(inode) do { } while (0)
+#define stat_dec_compr_inode(inode) do { } while (0)
+#define stat_add_compr_blocks(inode, blocks) do { } while (0)
+#define stat_sub_compr_blocks(inode, blocks) do { } while (0)
#define stat_inc_atomic_write(inode) do { } while (0)
#define stat_dec_atomic_write(inode) do { } while (0)
#define stat_update_max_atomic_write(inode) do { } while (0)
@@ -3482,6 +3626,7 @@ static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
static inline void __init f2fs_create_root_stats(void) { }
static inline void f2fs_destroy_root_stats(void) { }
+static inline void update_sit_info(struct f2fs_sb_info *sbi) {}
#endif
extern const struct file_operations f2fs_dir_operations;
@@ -3510,6 +3655,7 @@ void f2fs_truncate_inline_inode(struct inode *inode,
int f2fs_read_inline_data(struct inode *inode, struct page *page);
int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
int f2fs_convert_inline_inode(struct inode *inode);
+int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry);
int f2fs_write_inline_data(struct inode *inode, struct page *page);
bool f2fs_recover_inline_data(struct inode *inode, struct page *npage);
struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
@@ -3602,7 +3748,85 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
*/
static inline bool f2fs_post_read_required(struct inode *inode)
{
- return f2fs_encrypted_file(inode) || fsverity_active(inode);
+ return f2fs_encrypted_file(inode) || fsverity_active(inode) ||
+ f2fs_compressed_file(inode);
+}
+
+/*
+ * compress.c
+ */
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+bool f2fs_is_compressed_page(struct page *page);
+struct page *f2fs_compress_control_page(struct page *page);
+int f2fs_prepare_compress_overwrite(struct inode *inode,
+ struct page **pagep, pgoff_t index, void **fsdata);
+bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
+ pgoff_t index, unsigned copied);
+void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
+bool f2fs_is_compress_backend_ready(struct inode *inode);
+void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity);
+bool f2fs_cluster_is_empty(struct compress_ctx *cc);
+bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
+void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
+int f2fs_write_multi_pages(struct compress_ctx *cc,
+ int *submitted,
+ struct writeback_control *wbc,
+ enum iostat_type io_type);
+int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
+int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
+ unsigned nr_pages, sector_t *last_block_in_bio,
+ bool is_readahead);
+struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
+void f2fs_free_dic(struct decompress_io_ctx *dic);
+void f2fs_decompress_end_io(struct page **rpages,
+ unsigned int cluster_size, bool err, bool verity);
+int f2fs_init_compress_ctx(struct compress_ctx *cc);
+void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
+void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
+#else
+static inline bool f2fs_is_compressed_page(struct page *page) { return false; }
+static inline bool f2fs_is_compress_backend_ready(struct inode *inode)
+{
+ if (!f2fs_compressed_file(inode))
+ return true;
+ /* not support compression */
+ return false;
+}
+static inline struct page *f2fs_compress_control_page(struct page *page)
+{
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-EINVAL);
+}
+#endif
+
+static inline void set_compress_context(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ F2FS_I(inode)->i_compress_algorithm =
+ F2FS_OPTION(sbi).compress_algorithm;
+ F2FS_I(inode)->i_log_cluster_size =
+ F2FS_OPTION(sbi).compress_log_size;
+ F2FS_I(inode)->i_cluster_size =
+ 1 << F2FS_I(inode)->i_log_cluster_size;
+ F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
+ set_inode_flag(inode, FI_COMPRESSED_FILE);
+ stat_inc_compr_inode(inode);
+}
+
+static inline u64 f2fs_disable_compressed_file(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
+ if (!f2fs_compressed_file(inode))
+ return 0;
+ if (fi->i_compr_blocks)
+ return fi->i_compr_blocks;
+
+ fi->i_flags &= ~F2FS_COMPR_FL;
+ clear_inode_flag(inode, FI_COMPRESSED_FILE);
+ stat_dec_compr_inode(inode);
+ return 0;
}
#define F2FS_FEATURE_FUNCS(name, flagname) \
@@ -3623,6 +3847,7 @@ F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND);
F2FS_FEATURE_FUNCS(verity, VERITY);
F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM);
F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
+F2FS_FEATURE_FUNCS(compression, COMPRESSION);
#ifdef CONFIG_BLK_DEV_ZONED
static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
@@ -3704,6 +3929,30 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
#endif
}
+static inline bool f2fs_may_compress(struct inode *inode)
+{
+ if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) ||
+ f2fs_is_atomic_file(inode) ||
+ f2fs_is_volatile_file(inode))
+ return false;
+ return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
+}
+
+static inline void f2fs_i_compr_blocks_update(struct inode *inode,
+ u64 blocks, bool add)
+{
+ int diff = F2FS_I(inode)->i_cluster_size - blocks;
+
+ if (add) {
+ F2FS_I(inode)->i_compr_blocks += diff;
+ stat_add_compr_blocks(inode, diff);
+ } else {
+ F2FS_I(inode)->i_compr_blocks -= diff;
+ stat_sub_compr_blocks(inode, diff);
+ }
+ f2fs_mark_inode_dirty_sync(inode, true);
+}
+
static inline int block_unaligned_IO(struct inode *inode,
struct kiocb *iocb, struct iov_iter *iter)
{
@@ -3735,6 +3984,8 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
return true;
if (f2fs_is_multi_device(sbi))
return true;
+ if (f2fs_compressed_file(inode))
+ return true;
/*
* for blkzoned device, fallback direct IO to buffered IO, so
* all IOs can be serialized by log-structured write.
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 85af112e868d..86ddbb55d2b1 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -50,8 +50,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dnode_of_data dn = { .node_changed = false };
- int err;
+ struct dnode_of_data dn;
+ bool need_alloc = true;
+ int err = 0;
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
@@ -63,6 +64,26 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
goto err;
}
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode)) {
+ int ret = f2fs_is_compressed_cluster(inode, page->index);
+
+ if (ret < 0) {
+ err = ret;
+ goto err;
+ } else if (ret) {
+ if (ret < F2FS_I(inode)->i_cluster_size) {
+ err = -EAGAIN;
+ goto err;
+ }
+ need_alloc = false;
+ }
+ }
+#endif
+ /* should do out of any locked page */
+ if (need_alloc)
+ f2fs_balance_fs(sbi, true);
+
sb_start_pagefault(inode->i_sb);
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
@@ -78,15 +99,17 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
goto out_sem;
}
- /* block allocation */
- __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = f2fs_get_block(&dn, page->index);
- f2fs_put_dnode(&dn);
- __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
- if (err) {
- unlock_page(page);
- goto out_sem;
+ if (need_alloc) {
+ /* block allocation */
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = f2fs_get_block(&dn, page->index);
+ f2fs_put_dnode(&dn);
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+ if (err) {
+ unlock_page(page);
+ goto out_sem;
+ }
}
/* fill the page */
@@ -120,8 +143,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
out_sem:
up_read(&F2FS_I(inode)->i_mmap_sem);
- f2fs_balance_fs(sbi, dn.node_changed);
-
sb_end_pagefault(inode->i_sb);
err:
return block_page_mkwrite_return(err);
@@ -155,6 +176,8 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode)
if (!S_ISREG(inode->i_mode))
cp_reason = CP_NON_REGULAR;
+ else if (f2fs_compressed_file(inode))
+ cp_reason = CP_COMPRESSED;
else if (inode->i_nlink != 1)
cp_reason = CP_HARDLINK;
else if (is_sbi_flag_set(sbi, SBI_NEED_CP))
@@ -485,6 +508,9 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
+ if (!f2fs_is_compress_backend_ready(inode))
+ return -EOPNOTSUPP;
+
/* we don't need to use inline_data strictly */
err = f2fs_convert_inline_inode(inode);
if (err)
@@ -492,6 +518,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &f2fs_file_vm_ops;
+ set_inode_flag(inode, FI_MMAP_FILE);
return 0;
}
@@ -502,6 +529,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
+ if (!f2fs_is_compress_backend_ready(inode))
+ return -EOPNOTSUPP;
+
err = fsverity_file_open(inode, filp);
if (err)
return err;
@@ -518,6 +548,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
int nr_free = 0, ofs = dn->ofs_in_node, len = count;
__le32 *addr;
int base = 0;
+ bool compressed_cluster = false;
+ int cluster_index = 0, valid_blocks = 0;
+ int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
base = get_extra_isize(dn->inode);
@@ -525,26 +558,43 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
raw_node = F2FS_NODE(dn->node_page);
addr = blkaddr_in_node(raw_node) + base + ofs;
- for (; count > 0; count--, addr++, dn->ofs_in_node++) {
+ /* Assumption: truncateion starts with cluster */
+ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) {
block_t blkaddr = le32_to_cpu(*addr);
+ if (f2fs_compressed_file(dn->inode) &&
+ !(cluster_index & (cluster_size - 1))) {
+ if (compressed_cluster)
+ f2fs_i_compr_blocks_update(dn->inode,
+ valid_blocks, false);
+ compressed_cluster = (blkaddr == COMPRESS_ADDR);
+ valid_blocks = 0;
+ }
+
if (blkaddr == NULL_ADDR)
continue;
dn->data_blkaddr = NULL_ADDR;
f2fs_set_data_blkaddr(dn);
- if (__is_valid_data_blkaddr(blkaddr) &&
- !f2fs_is_valid_blkaddr(sbi, blkaddr,
+ if (__is_valid_data_blkaddr(blkaddr)) {
+ if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
DATA_GENERIC_ENHANCE))
- continue;
+ continue;
+ if (compressed_cluster)
+ valid_blocks++;
+ }
- f2fs_invalidate_blocks(sbi, blkaddr);
if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN);
+
+ f2fs_invalidate_blocks(sbi, blkaddr);
nr_free++;
}
+ if (compressed_cluster)
+ f2fs_i_compr_blocks_update(dn->inode, valid_blocks, false);
+
if (nr_free) {
pgoff_t fofs;
/*
@@ -587,6 +637,9 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
return 0;
}
+ if (f2fs_compressed_file(inode))
+ return 0;
+
page = f2fs_get_lock_data_page(inode, index, true);
if (IS_ERR(page))
return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
@@ -602,7 +655,7 @@ truncate_out:
return 0;
}
-int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
+static int do_truncate_blocks(struct inode *inode, u64 from, bool lock)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
@@ -667,6 +720,28 @@ free_partial:
return err;
}
+int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock)
+{
+ u64 free_from = from;
+
+ /*
+ * for compressed file, only support cluster size
+ * aligned truncation.
+ */
+ if (f2fs_compressed_file(inode)) {
+ size_t cluster_shift = PAGE_SHIFT +
+ F2FS_I(inode)->i_log_cluster_size;
+ size_t cluster_mask = (1 << cluster_shift) - 1;
+
+ free_from = from >> cluster_shift;
+ if (from & cluster_mask)
+ free_from++;
+ free_from <<= cluster_shift;
+ }
+
+ return do_truncate_blocks(inode, free_from, lock);
+}
+
int f2fs_truncate(struct inode *inode)
{
int err;
@@ -786,6 +861,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
+ if ((attr->ia_valid & ATTR_SIZE) &&
+ !f2fs_is_compress_backend_ready(inode))
+ return -EOPNOTSUPP;
+
err = setattr_prepare(dentry, attr);
if (err)
return err;
@@ -1026,8 +1105,8 @@ next_dnode:
} else if (ret == -ENOENT) {
if (dn.max_level == 0)
return -ENOENT;
- done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - dn.ofs_in_node,
- len);
+ done = min((pgoff_t)ADDRS_PER_BLOCK(inode) -
+ dn.ofs_in_node, len);
blkaddr += done;
do_replace += done;
goto next;
@@ -1190,13 +1269,13 @@ static int __exchange_data_block(struct inode *src_inode,
src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode),
array_size(olen, sizeof(block_t)),
- GFP_KERNEL);
+ GFP_NOFS);
if (!src_blkaddr)
return -ENOMEM;
do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode),
array_size(olen, sizeof(int)),
- GFP_KERNEL);
+ GFP_NOFS);
if (!do_replace) {
kvfree(src_blkaddr);
return -ENOMEM;
@@ -1563,7 +1642,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
next_alloc:
if (has_not_enough_free_secs(sbi, 0,
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
err = f2fs_gc(sbi, true, false, NULL_SEGNO);
if (err && err != -ENODATA && err != -EAGAIN)
goto out_err;
@@ -1621,6 +1700,8 @@ static long f2fs_fallocate(struct file *file, int mode,
return -EIO;
if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode)))
return -ENOSPC;
+ if (!f2fs_is_compress_backend_ready(inode))
+ return -EOPNOTSUPP;
/* f2fs only support ->fallocate for regular file */
if (!S_ISREG(inode->i_mode))
@@ -1630,6 +1711,11 @@ static long f2fs_fallocate(struct file *file, int mode,
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
+ if (f2fs_compressed_file(inode) &&
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)))
+ return -EOPNOTSUPP;
+
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
FALLOC_FL_INSERT_RANGE))
@@ -1719,7 +1805,40 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
return -ENOTEMPTY;
}
+ if (iflags & (F2FS_COMPR_FL | F2FS_NOCOMP_FL)) {
+ if (!f2fs_sb_has_compression(F2FS_I_SB(inode)))
+ return -EOPNOTSUPP;
+ if ((iflags & F2FS_COMPR_FL) && (iflags & F2FS_NOCOMP_FL))
+ return -EINVAL;
+ }
+
+ if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) {
+ if (S_ISREG(inode->i_mode) &&
+ (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) ||
+ F2FS_HAS_BLOCKS(inode)))
+ return -EINVAL;
+ if (iflags & F2FS_NOCOMP_FL)
+ return -EINVAL;
+ if (iflags & F2FS_COMPR_FL) {
+ int err = f2fs_convert_inline_inode(inode);
+
+ if (err)
+ return err;
+
+ if (!f2fs_may_compress(inode))
+ return -EINVAL;
+
+ set_compress_context(inode);
+ }
+ }
+ if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) {
+ if (fi->i_flags & F2FS_COMPR_FL)
+ return -EINVAL;
+ }
+
fi->i_flags = iflags | (fi->i_flags & ~mask);
+ f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) &&
+ (fi->i_flags & F2FS_NOCOMP_FL));
if (fi->i_flags & F2FS_PROJINHERIT_FL)
set_inode_flag(inode, FI_PROJ_INHERIT);
@@ -1745,11 +1864,13 @@ static const struct {
u32 iflag;
u32 fsflag;
} f2fs_fsflags_map[] = {
+ { F2FS_COMPR_FL, FS_COMPR_FL },
{ F2FS_SYNC_FL, FS_SYNC_FL },
{ F2FS_IMMUTABLE_FL, FS_IMMUTABLE_FL },
{ F2FS_APPEND_FL, FS_APPEND_FL },
{ F2FS_NODUMP_FL, FS_NODUMP_FL },
{ F2FS_NOATIME_FL, FS_NOATIME_FL },
+ { F2FS_NOCOMP_FL, FS_NOCOMP_FL },
{ F2FS_INDEX_FL, FS_INDEX_FL },
{ F2FS_DIRSYNC_FL, FS_DIRSYNC_FL },
{ F2FS_PROJINHERIT_FL, FS_PROJINHERIT_FL },
@@ -1757,11 +1878,13 @@ static const struct {
};
#define F2FS_GETTABLE_FS_FL ( \
+ FS_COMPR_FL | \
FS_SYNC_FL | \
FS_IMMUTABLE_FL | \
FS_APPEND_FL | \
FS_NODUMP_FL | \
FS_NOATIME_FL | \
+ FS_NOCOMP_FL | \
FS_INDEX_FL | \
FS_DIRSYNC_FL | \
FS_PROJINHERIT_FL | \
@@ -1772,11 +1895,13 @@ static const struct {
FS_CASEFOLD_FL)
#define F2FS_SETTABLE_FS_FL ( \
+ FS_COMPR_FL | \
FS_SYNC_FL | \
FS_IMMUTABLE_FL | \
FS_APPEND_FL | \
FS_NODUMP_FL | \
FS_NOATIME_FL | \
+ FS_NOCOMP_FL | \
FS_DIRSYNC_FL | \
FS_PROJINHERIT_FL | \
FS_CASEFOLD_FL)
@@ -1897,6 +2022,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
inode_lock(inode);
+ f2fs_disable_compressed_file(inode);
+
if (f2fs_is_atomic_file(inode)) {
if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
ret = -EINVAL;
@@ -1935,7 +2062,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
F2FS_I(inode)->inmem_task = current;
- stat_inc_atomic_write(inode);
stat_update_max_atomic_write(inode);
out:
inode_unlock(inode);
@@ -2324,12 +2450,12 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
return ret;
if (!sync) {
- if (!mutex_trylock(&sbi->gc_mutex)) {
+ if (!down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
} else {
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
}
ret = f2fs_gc(sbi, sync, true, NULL_SEGNO);
@@ -2367,12 +2493,12 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg)
do_more:
if (!range.sync) {
- if (!mutex_trylock(&sbi->gc_mutex)) {
+ if (!down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
} else {
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
}
ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start));
@@ -2803,7 +2929,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
end_segno = min(start_segno + range.segments, dev_end_segno);
while (start_segno < end_segno) {
- if (!mutex_trylock(&sbi->gc_mutex)) {
+ if (!down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
@@ -3098,10 +3224,16 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
ret = -EAGAIN;
goto out;
}
+
ret = f2fs_convert_inline_inode(inode);
if (ret)
goto out;
+ if (f2fs_disable_compressed_file(inode)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
set_inode_flag(inode, FI_PIN_FILE);
ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
done:
@@ -3350,6 +3482,17 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
}
+static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+
+ if (!f2fs_is_compress_backend_ready(inode))
+ return -EOPNOTSUPP;
+
+ return generic_file_read_iter(iocb, iter);
+}
+
static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -3361,6 +3504,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
+ if (!f2fs_is_compress_backend_ready(inode))
+ return -EOPNOTSUPP;
+
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode)) {
ret = -EAGAIN;
@@ -3389,18 +3535,41 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = -EAGAIN;
goto out;
}
- } else {
- preallocated = true;
- target_size = iocb->ki_pos + iov_iter_count(from);
+ goto write;
+ }
- err = f2fs_preallocate_blocks(iocb, from);
- if (err) {
- clear_inode_flag(inode, FI_NO_PREALLOC);
- inode_unlock(inode);
- ret = err;
- goto out;
- }
+ if (is_inode_flag_set(inode, FI_NO_PREALLOC))
+ goto write;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * Convert inline data for Direct I/O before entering
+ * f2fs_direct_IO().
+ */
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ goto out_err;
+ /*
+ * If force_buffere_io() is true, we have to allocate
+ * blocks all the time, since f2fs_direct_IO will fall
+ * back to buffered IO.
+ */
+ if (!f2fs_force_buffered_io(inode, iocb, from) &&
+ allow_outplace_dio(inode, iocb, from))
+ goto write;
+ }
+ preallocated = true;
+ target_size = iocb->ki_pos + iov_iter_count(from);
+
+ err = f2fs_preallocate_blocks(iocb, from);
+ if (err) {
+out_err:
+ clear_inode_flag(inode, FI_NO_PREALLOC);
+ inode_unlock(inode);
+ ret = err;
+ goto out;
}
+write:
ret = __generic_file_write_iter(iocb, from);
clear_inode_flag(inode, FI_NO_PREALLOC);
@@ -3475,7 +3644,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
const struct file_operations f2fs_file_operations = {
.llseek = f2fs_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = f2fs_file_read_iter,
.write_iter = f2fs_file_write_iter,
.open = f2fs_file_open,
.release = f2fs_release_file,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b3d399623290..db8725d473b5 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -78,18 +78,18 @@ static int gc_thread_func(void *data)
*/
if (sbi->gc_mode == GC_URGENT) {
wait_ms = gc_th->urgent_sleep_time;
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
goto do_gc;
}
- if (!mutex_trylock(&sbi->gc_mutex)) {
+ if (!down_write_trylock(&sbi->gc_lock)) {
stat_other_skip_bggc_count(sbi);
goto next;
}
if (!is_idle(sbi, GC_TIME)) {
increase_sleep_time(gc_th, &wait_ms);
- mutex_unlock(&sbi->gc_mutex);
+ up_write(&sbi->gc_lock);
stat_io_skip_bggc_count(sbi);
goto next;
}
@@ -99,7 +99,7 @@ static int gc_thread_func(void *data)
else
increase_sleep_time(gc_th, &wait_ms);
do_gc:
- stat_inc_bggc_count(sbi);
+ stat_inc_bggc_count(sbi->stat_info);
/* if return value is not zero, no victim was selected */
if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
@@ -1049,8 +1049,10 @@ next_step:
if (phase == 3) {
inode = f2fs_iget(sb, dni.ino);
- if (IS_ERR(inode) || is_bad_inode(inode))
+ if (IS_ERR(inode) || is_bad_inode(inode)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
continue;
+ }
if (!down_write_trylock(
&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
@@ -1368,7 +1370,7 @@ stop:
reserved_segments(sbi),
prefree_segments(sbi));
- mutex_unlock(&sbi->gc_mutex);
+ up_write(&sbi->gc_lock);
put_gc_inode(&gc_list);
@@ -1407,9 +1409,9 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start,
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
do_garbage_collect(sbi, segno, &gc_list, FG_GC);
- mutex_unlock(&sbi->gc_mutex);
+ up_write(&sbi->gc_lock);
put_gc_inode(&gc_list);
if (get_valid_blocks(sbi, segno, true))
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 896db0416f0e..4167e5408151 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -368,7 +368,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
struct f2fs_dentry_ptr src, dst;
int err;
- page = f2fs_grab_cache_page(dir->i_mapping, 0, false);
+ page = f2fs_grab_cache_page(dir->i_mapping, 0, true);
if (!page) {
f2fs_put_page(ipage, 1);
return -ENOMEM;
@@ -530,7 +530,7 @@ recover:
return err;
}
-static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
+static int do_convert_inline_dir(struct inode *dir, struct page *ipage,
void *inline_dentry)
{
if (!F2FS_I(dir)->i_dir_level)
@@ -539,6 +539,44 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry);
}
+int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct page *ipage;
+ struct fscrypt_name fname;
+ void *inline_dentry = NULL;
+ int err = 0;
+
+ if (!f2fs_has_inline_dentry(dir))
+ return 0;
+
+ f2fs_lock_op(sbi);
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (err)
+ goto out;
+
+ ipage = f2fs_get_node_page(sbi, dir->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto out;
+ }
+
+ if (f2fs_has_enough_room(dir, ipage, &fname)) {
+ f2fs_put_page(ipage, 1);
+ goto out;
+ }
+
+ inline_dentry = inline_data_addr(dir, ipage);
+
+ err = do_convert_inline_dir(dir, ipage, inline_dentry);
+ if (!err)
+ f2fs_put_page(ipage, 1);
+out:
+ f2fs_unlock_op(sbi);
+ return err;
+}
+
int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
const struct qstr *orig_name,
struct inode *inode, nid_t ino, umode_t mode)
@@ -562,7 +600,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max);
if (bit_pos >= d.max) {
- err = f2fs_convert_inline_dir(dir, ipage, inline_dentry);
+ err = do_convert_inline_dir(dir, ipage, inline_dentry);
if (err)
return err;
err = -EAGAIN;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 502bd491336a..78c3f1d70f1d 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -200,6 +200,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_inode *ri = F2FS_INODE(node_page);
unsigned long long iblocks;
iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks);
@@ -286,6 +287,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
+ if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) &&
+ fi->i_flags & F2FS_COMPR_FL &&
+ F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
+ i_log_cluster_size)) {
+ if (ri->i_compress_algorithm >= COMPRESS_MAX)
+ return false;
+ if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks)
+ return false;
+ if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
+ ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE)
+ return false;
+ }
+
return true;
}
@@ -407,6 +421,18 @@ static int do_read_inode(struct inode *inode)
fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec);
}
+ if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) &&
+ (fi->i_flags & F2FS_COMPR_FL)) {
+ if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
+ i_log_cluster_size)) {
+ fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks);
+ fi->i_compress_algorithm = ri->i_compress_algorithm;
+ fi->i_log_cluster_size = ri->i_log_cluster_size;
+ fi->i_cluster_size = 1 << fi->i_log_cluster_size;
+ set_inode_flag(inode, FI_COMPRESSED_FILE);
+ }
+ }
+
F2FS_I(inode)->i_disk_time[0] = inode->i_atime;
F2FS_I(inode)->i_disk_time[1] = inode->i_ctime;
F2FS_I(inode)->i_disk_time[2] = inode->i_mtime;
@@ -416,6 +442,8 @@ static int do_read_inode(struct inode *inode)
stat_inc_inline_xattr(inode);
stat_inc_inline_inode(inode);
stat_inc_inline_dir(inode);
+ stat_inc_compr_inode(inode);
+ stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks);
return 0;
}
@@ -569,6 +597,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
ri->i_crtime_nsec =
cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec);
}
+
+ if (f2fs_sb_has_compression(F2FS_I_SB(inode)) &&
+ F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
+ i_log_cluster_size)) {
+ ri->i_compr_blocks =
+ cpu_to_le64(F2FS_I(inode)->i_compr_blocks);
+ ri->i_compress_algorithm =
+ F2FS_I(inode)->i_compress_algorithm;
+ ri->i_log_cluster_size =
+ F2FS_I(inode)->i_log_cluster_size;
+ }
}
__set_inode_rdev(inode, ri);
@@ -711,6 +750,8 @@ no_delete:
stat_dec_inline_xattr(inode);
stat_dec_inline_dir(inode);
stat_dec_inline_inode(inode);
+ stat_dec_compr_inode(inode);
+ stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks);
if (likely(!f2fs_cp_error(sbi) &&
!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a1c507b0b4ac..2aa035422c0f 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -119,6 +119,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
set_inode_flag(inode, FI_PROJ_INHERIT);
+ if (f2fs_sb_has_compression(sbi)) {
+ /* Inherit the compression flag in directory */
+ if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) &&
+ f2fs_may_compress(inode))
+ set_compress_context(inode);
+ }
+
f2fs_set_inode_flags(inode);
trace_f2fs_new_inode(inode, 0);
@@ -149,6 +156,9 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub)
size_t sublen = strlen(sub);
int i;
+ if (sublen == 1 && *sub == '*')
+ return 1;
+
/*
* filename format of multimedia file should be defined as:
* "filename + '.' + extension + (optional: '.' + temp extension)".
@@ -262,6 +272,45 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
return 0;
}
+static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
+ const unsigned char *name)
+{
+ __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
+ unsigned char (*ext)[F2FS_EXTENSION_LEN];
+ unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ int i, cold_count, hot_count;
+
+ if (!f2fs_sb_has_compression(sbi) ||
+ is_inode_flag_set(inode, FI_COMPRESSED_FILE) ||
+ F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL ||
+ !f2fs_may_compress(inode))
+ return;
+
+ down_read(&sbi->sb_lock);
+
+ cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+ hot_count = sbi->raw_super->hot_ext_count;
+
+ for (i = cold_count; i < cold_count + hot_count; i++) {
+ if (is_extension_exist(name, extlist[i])) {
+ up_read(&sbi->sb_lock);
+ return;
+ }
+ }
+
+ up_read(&sbi->sb_lock);
+
+ ext = F2FS_OPTION(sbi).extensions;
+
+ for (i = 0; i < ext_cnt; i++) {
+ if (!is_extension_exist(name, ext[i]))
+ continue;
+
+ set_compress_context(inode);
+ return;
+ }
+}
+
static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
@@ -286,6 +335,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
set_file_temperature(sbi, inode, dentry->d_name.name);
+ set_compress_inode(sbi, inode, dentry->d_name.name);
+
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -797,6 +848,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
if (whiteout) {
f2fs_i_links_write(inode, false);
+ inode->i_state |= I_LINKABLE;
*whiteout = inode;
} else {
d_tmpfile(dentry, inode);
@@ -849,12 +901,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
struct inode *whiteout = NULL;
- struct page *old_dir_page;
+ struct page *old_dir_page = NULL;
struct page *old_page, *new_page = NULL;
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
- bool is_old_inline = f2fs_has_inline_dentry(old_dir);
int err;
if (unlikely(f2fs_cp_error(sbi)))
@@ -867,6 +918,26 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
F2FS_I(old_dentry->d_inode)->i_projid)))
return -EXDEV;
+ /*
+ * If new_inode is null, the below renaming flow will
+ * add a link in old_dir which can conver inline_dir.
+ * After then, if we failed to get the entry due to other
+ * reasons like ENOMEM, we had to remove the new entry.
+ * Instead of adding such the error handling routine, let's
+ * simply convert first here.
+ */
+ if (old_dir == new_dir && !new_inode) {
+ err = f2fs_try_convert_inline_dir(old_dir, new_dentry);
+ if (err)
+ return err;
+ }
+
+ if (flags & RENAME_WHITEOUT) {
+ err = f2fs_create_whiteout(old_dir, &whiteout);
+ if (err)
+ return err;
+ }
+
err = dquot_initialize(old_dir);
if (err)
goto out;
@@ -898,17 +969,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
- if (flags & RENAME_WHITEOUT) {
- err = f2fs_create_whiteout(old_dir, &whiteout);
- if (err)
- goto out_dir;
- }
-
if (new_inode) {
err = -ENOTEMPTY;
if (old_dir_entry && !f2fs_empty_dir(new_inode))
- goto out_whiteout;
+ goto out_dir;
err = -ENOENT;
new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
@@ -916,7 +981,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!new_entry) {
if (IS_ERR(new_page))
err = PTR_ERR(new_page);
- goto out_whiteout;
+ goto out_dir;
}
f2fs_balance_fs(sbi, true);
@@ -928,6 +993,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto put_out_dir;
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+ new_page = NULL;
new_inode->i_ctime = current_time(new_inode);
down_write(&F2FS_I(new_inode)->i_sem);
@@ -948,33 +1014,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
err = f2fs_add_link(new_dentry, old_inode);
if (err) {
f2fs_unlock_op(sbi);
- goto out_whiteout;
+ goto out_dir;
}
if (old_dir_entry)
f2fs_i_links_write(new_dir, true);
-
- /*
- * old entry and new entry can locate in the same inline
- * dentry in inode, when attaching new entry in inline dentry,
- * it could force inline dentry conversion, after that,
- * old_entry and old_page will point to wrong address, in
- * order to avoid this, let's do the check and update here.
- */
- if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) {
- f2fs_put_page(old_page, 0);
- old_page = NULL;
-
- old_entry = f2fs_find_entry(old_dir,
- &old_dentry->d_name, &old_page);
- if (!old_entry) {
- err = -ENOENT;
- if (IS_ERR(old_page))
- err = PTR_ERR(old_page);
- f2fs_unlock_op(sbi);
- goto out_whiteout;
- }
- }
}
down_write(&F2FS_I(old_inode)->i_sem);
@@ -989,9 +1033,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_mark_inode_dirty_sync(old_inode, false);
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
+ old_page = NULL;
if (whiteout) {
- whiteout->i_state |= I_LINKABLE;
set_inode_flag(whiteout, FI_INC_LINK);
err = f2fs_add_link(old_dentry, whiteout);
if (err)
@@ -1025,17 +1069,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
put_out_dir:
f2fs_unlock_op(sbi);
- if (new_page)
- f2fs_put_page(new_page, 0);
-out_whiteout:
- if (whiteout)
- iput(whiteout);
+ f2fs_put_page(new_page, 0);
out_dir:
if (old_dir_entry)
f2fs_put_page(old_dir_page, 0);
out_old:
f2fs_put_page(old_page, 0);
out:
+ if (whiteout)
+ iput(whiteout);
return err;
}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 76477f71d4ee..763d5c0951d1 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -723,6 +723,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
int ret = 0;
unsigned long s_flags = sbi->sb->s_flags;
bool need_writecp = false;
+ bool fix_curseg_write_pointer = false;
#ifdef CONFIG_QUOTA
int quota_enabled;
#endif
@@ -774,6 +775,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
sbi->sb->s_flags = s_flags;
}
skip:
+ fix_curseg_write_pointer = !check_only || list_empty(&inode_list);
+
destroy_fsync_dnodes(&inode_list, err);
destroy_fsync_dnodes(&tmp_inode_list, err);
@@ -784,9 +787,22 @@ skip:
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
truncate_inode_pages_final(META_MAPPING(sbi));
- } else {
- clear_sbi_flag(sbi, SBI_POR_DOING);
}
+
+ /*
+ * If fsync data succeeds or there is no fsync data to recover,
+ * and the f2fs is not read only, check and fix zoned block devices'
+ * write pointer consistency.
+ */
+ if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) &&
+ f2fs_sb_has_blkzoned(sbi)) {
+ err = f2fs_fix_curseg_write_pointer(sbi);
+ ret = err;
+ }
+
+ if (!err)
+ clear_sbi_flag(sbi, SBI_POR_DOING);
+
mutex_unlock(&sbi->cp_mutex);
/* let's drop all the directory inodes for clean checkpoint */
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 56e81447e2f3..cf0eb002cfd4 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -334,7 +334,6 @@ void f2fs_drop_inmem_pages(struct inode *inode)
}
fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
- stat_dec_atomic_write(inode);
spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
if (!list_empty(&fi->inmem_ilist))
@@ -505,7 +504,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
* dir/node pages without enough free segments.
*/
if (has_not_enough_free_secs(sbi, 0, 0)) {
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
f2fs_gc(sbi, false, false, NULL_SEGNO);
}
}
@@ -2225,7 +2224,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
struct sit_info *sit_i = SIT_I(sbi);
f2fs_bug_on(sbi, addr == NULL_ADDR);
- if (addr == NEW_ADDR)
+ if (addr == NEW_ADDR || addr == COMPRESS_ADDR)
return;
invalidate_mapping_pages(META_MAPPING(sbi), addr, addr);
@@ -2861,9 +2860,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
if (sbi->discard_blks == 0)
goto out;
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
- mutex_unlock(&sbi->gc_mutex);
+ up_write(&sbi->gc_lock);
if (err)
goto out;
@@ -3036,7 +3035,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
if (fio->type == DATA) {
struct inode *inode = fio->page->mapping->host;
- if (is_cold_data(fio->page) || file_is_cold(inode))
+ if (is_cold_data(fio->page) || file_is_cold(inode) ||
+ f2fs_compressed_file(inode))
return CURSEG_COLD_DATA;
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
@@ -3289,7 +3289,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
stat_inc_inplace_blocks(fio->sbi);
- if (fio->bio)
+ if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE)))
err = f2fs_merge_page_bio(fio);
else
err = f2fs_submit_page_bio(fio);
@@ -4368,6 +4368,263 @@ out:
return 0;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+
+static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
+ struct f2fs_dev_info *fdev,
+ struct blk_zone *zone)
+{
+ unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno;
+ block_t zone_block, wp_block, last_valid_block;
+ unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
+ int i, s, b, ret;
+ struct seg_entry *se;
+
+ if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
+ return 0;
+
+ wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block);
+ wp_segno = GET_SEGNO(sbi, wp_block);
+ wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
+ zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block);
+ zone_segno = GET_SEGNO(sbi, zone_block);
+ zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno);
+
+ if (zone_segno >= MAIN_SEGS(sbi))
+ return 0;
+
+ /*
+ * Skip check of zones cursegs point to, since
+ * fix_curseg_write_pointer() checks them.
+ */
+ for (i = 0; i < NO_CHECK_TYPE; i++)
+ if (zone_secno == GET_SEC_FROM_SEG(sbi,
+ CURSEG_I(sbi, i)->segno))
+ return 0;
+
+ /*
+ * Get last valid block of the zone.
+ */
+ last_valid_block = zone_block - 1;
+ for (s = sbi->segs_per_sec - 1; s >= 0; s--) {
+ segno = zone_segno + s;
+ se = get_seg_entry(sbi, segno);
+ for (b = sbi->blocks_per_seg - 1; b >= 0; b--)
+ if (f2fs_test_bit(b, se->cur_valid_map)) {
+ last_valid_block = START_BLOCK(sbi, segno) + b;
+ break;
+ }
+ if (last_valid_block >= zone_block)
+ break;
+ }
+
+ /*
+ * If last valid block is beyond the write pointer, report the
+ * inconsistency. This inconsistency does not cause write error
+ * because the zone will not be selected for write operation until
+ * it get discarded. Just report it.
+ */
+ if (last_valid_block >= wp_block) {
+ f2fs_notice(sbi, "Valid block beyond write pointer: "
+ "valid block[0x%x,0x%x] wp[0x%x,0x%x]",
+ GET_SEGNO(sbi, last_valid_block),
+ GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
+ wp_segno, wp_blkoff);
+ return 0;
+ }
+
+ /*
+ * If there is no valid block in the zone and if write pointer is
+ * not at zone start, reset the write pointer.
+ */
+ if (last_valid_block + 1 == zone_block && zone->wp != zone->start) {
+ f2fs_notice(sbi,
+ "Zone without valid block has non-zero write "
+ "pointer. Reset the write pointer: wp[0x%x,0x%x]",
+ wp_segno, wp_blkoff);
+ ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
+ zone->len >> log_sectors_per_block);
+ if (ret) {
+ f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
+ fdev->path, ret);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi,
+ block_t zone_blkaddr)
+{
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ if (!bdev_is_zoned(FDEV(i).bdev))
+ continue;
+ if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr &&
+ zone_blkaddr <= FDEV(i).end_blk))
+ return &FDEV(i);
+ }
+
+ return NULL;
+}
+
+static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx,
+ void *data) {
+ memcpy(data, zone, sizeof(struct blk_zone));
+ return 0;
+}
+
+static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
+{
+ struct curseg_info *cs = CURSEG_I(sbi, type);
+ struct f2fs_dev_info *zbd;
+ struct blk_zone zone;
+ unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off;
+ block_t cs_zone_block, wp_block;
+ unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
+ sector_t zone_sector;
+ int err;
+
+ cs_section = GET_SEC_FROM_SEG(sbi, cs->segno);
+ cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section));
+
+ zbd = get_target_zoned_dev(sbi, cs_zone_block);
+ if (!zbd)
+ return 0;
+
+ /* report zone for the sector the curseg points to */
+ zone_sector = (sector_t)(cs_zone_block - zbd->start_blk)
+ << log_sectors_per_block;
+ err = blkdev_report_zones(zbd->bdev, zone_sector, 1,
+ report_one_zone_cb, &zone);
+ if (err != 1) {
+ f2fs_err(sbi, "Report zone failed: %s errno=(%d)",
+ zbd->path, err);
+ return err;
+ }
+
+ if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
+ return 0;
+
+ wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
+ wp_segno = GET_SEGNO(sbi, wp_block);
+ wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
+ wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
+
+ if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
+ wp_sector_off == 0)
+ return 0;
+
+ f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
+ "curseg[0x%x,0x%x] wp[0x%x,0x%x]",
+ type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff);
+
+ f2fs_notice(sbi, "Assign new section to curseg[%d]: "
+ "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
+ allocate_segment_by_default(sbi, type, true);
+
+ /* check consistency of the zone curseg pointed to */
+ if (check_zone_write_pointer(sbi, zbd, &zone))
+ return -EIO;
+
+ /* check newly assigned zone */
+ cs_section = GET_SEC_FROM_SEG(sbi, cs->segno);
+ cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section));
+
+ zbd = get_target_zoned_dev(sbi, cs_zone_block);
+ if (!zbd)
+ return 0;
+
+ zone_sector = (sector_t)(cs_zone_block - zbd->start_blk)
+ << log_sectors_per_block;
+ err = blkdev_report_zones(zbd->bdev, zone_sector, 1,
+ report_one_zone_cb, &zone);
+ if (err != 1) {
+ f2fs_err(sbi, "Report zone failed: %s errno=(%d)",
+ zbd->path, err);
+ return err;
+ }
+
+ if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
+ return 0;
+
+ if (zone.wp != zone.start) {
+ f2fs_notice(sbi,
+ "New zone for curseg[%d] is not yet discarded. "
+ "Reset the zone: curseg[0x%x,0x%x]",
+ type, cs->segno, cs->next_blkoff);
+ err = __f2fs_issue_discard_zone(sbi, zbd->bdev,
+ zone_sector >> log_sectors_per_block,
+ zone.len >> log_sectors_per_block);
+ if (err) {
+ f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
+ zbd->path, err);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
+{
+ int i, ret;
+
+ for (i = 0; i < NO_CHECK_TYPE; i++) {
+ ret = fix_curseg_write_pointer(sbi, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+struct check_zone_write_pointer_args {
+ struct f2fs_sb_info *sbi;
+ struct f2fs_dev_info *fdev;
+};
+
+static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx,
+ void *data) {
+ struct check_zone_write_pointer_args *args;
+ args = (struct check_zone_write_pointer_args *)data;
+
+ return check_zone_write_pointer(args->sbi, args->fdev, zone);
+}
+
+int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
+{
+ int i, ret;
+ struct check_zone_write_pointer_args args;
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ if (!bdev_is_zoned(FDEV(i).bdev))
+ continue;
+
+ args.sbi = sbi;
+ args.fdev = &FDEV(i);
+ ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES,
+ check_zone_write_pointer_cb, &args);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+#else
+int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
+{
+ return 0;
+}
+
+int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
+{
+ return 0;
+}
+#endif
+
/*
* Update min, max modified time for cost-benefit GC algorithm
*/
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index a95467b202ea..459dc3901a57 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -200,18 +200,6 @@ struct segment_allocation {
void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
};
-/*
- * this value is set in page as a private data which indicate that
- * the page is atomically written, and it is in inmem_pages list.
- */
-#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1)
-#define DUMMY_WRITTEN_PAGE ((unsigned long)-2)
-
-#define IS_ATOMIC_WRITTEN_PAGE(page) \
- (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
-#define IS_DUMMY_WRITTEN_PAGE(page) \
- (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE)
-
#define MAX_SKIP_GC_COUNT 16
struct inmem_pages {
@@ -619,8 +607,10 @@ static inline int utilization(struct f2fs_sb_info *sbi)
* threashold,
* F2FS_IPU_FSYNC - activated in fsync path only for high performance flash
* storages. IPU will be triggered only if the # of dirty
- * pages over min_fsync_blocks.
- * F2FS_IPUT_DISABLE - disable IPU. (=default option)
+ * pages over min_fsync_blocks. (=default option)
+ * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests.
+ * F2FS_IPU_NOCACHE - disable IPU bio cache.
+ * F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode)
*/
#define DEF_MIN_IPU_UTIL 70
#define DEF_MIN_FSYNC_BLOCKS 8
@@ -635,6 +625,7 @@ enum {
F2FS_IPU_SSR_UTIL,
F2FS_IPU_FSYNC,
F2FS_IPU_ASYNC,
+ F2FS_IPU_NOCACHE,
};
static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 5111e1ffe58a..65a7a432dfee 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -141,6 +141,9 @@ enum {
Opt_checkpoint_disable_cap,
Opt_checkpoint_disable_cap_perc,
Opt_checkpoint_enable,
+ Opt_compress_algorithm,
+ Opt_compress_log_size,
+ Opt_compress_extension,
Opt_err,
};
@@ -203,6 +206,9 @@ static match_table_t f2fs_tokens = {
{Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
{Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"},
{Opt_checkpoint_enable, "checkpoint=enable"},
+ {Opt_compress_algorithm, "compress_algorithm=%s"},
+ {Opt_compress_log_size, "compress_log_size=%u"},
+ {Opt_compress_extension, "compress_extension=%s"},
{Opt_err, NULL},
};
@@ -391,8 +397,9 @@ static int parse_options(struct super_block *sb, char *options)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
substring_t args[MAX_OPT_ARGS];
+ unsigned char (*ext)[F2FS_EXTENSION_LEN];
char *p, *name;
- int arg = 0;
+ int arg = 0, ext_cnt;
kuid_t uid;
kgid_t gid;
#ifdef CONFIG_QUOTA
@@ -810,6 +817,66 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_checkpoint_enable:
clear_opt(sbi, DISABLE_CHECKPOINT);
break;
+ case Opt_compress_algorithm:
+ if (!f2fs_sb_has_compression(sbi)) {
+ f2fs_err(sbi, "Compression feature if off");
+ return -EINVAL;
+ }
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+ if (strlen(name) == 3 && !strcmp(name, "lzo")) {
+ F2FS_OPTION(sbi).compress_algorithm =
+ COMPRESS_LZO;
+ } else if (strlen(name) == 3 &&
+ !strcmp(name, "lz4")) {
+ F2FS_OPTION(sbi).compress_algorithm =
+ COMPRESS_LZ4;
+ } else {
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ break;
+ case Opt_compress_log_size:
+ if (!f2fs_sb_has_compression(sbi)) {
+ f2fs_err(sbi, "Compression feature is off");
+ return -EINVAL;
+ }
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ if (arg < MIN_COMPRESS_LOG_SIZE ||
+ arg > MAX_COMPRESS_LOG_SIZE) {
+ f2fs_err(sbi,
+ "Compress cluster log size is out of range");
+ return -EINVAL;
+ }
+ F2FS_OPTION(sbi).compress_log_size = arg;
+ break;
+ case Opt_compress_extension:
+ if (!f2fs_sb_has_compression(sbi)) {
+ f2fs_err(sbi, "Compression feature is off");
+ return -EINVAL;
+ }
+ name = match_strdup(&args[0]);
+ if (!name)
+ return -ENOMEM;
+
+ ext = F2FS_OPTION(sbi).extensions;
+ ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+
+ if (strlen(name) >= F2FS_EXTENSION_LEN ||
+ ext_cnt >= COMPRESS_EXT_NUM) {
+ f2fs_err(sbi,
+ "invalid extension length/number");
+ kfree(name);
+ return -EINVAL;
+ }
+
+ strcpy(ext[ext_cnt], name);
+ F2FS_OPTION(sbi).compress_ext_cnt++;
+ kfree(name);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -1125,6 +1192,8 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_destroy_node_manager(sbi);
f2fs_destroy_segment_manager(sbi);
+ f2fs_destroy_post_read_wq(sbi);
+
kvfree(sbi->ckpt);
f2fs_unregister_sysfs(sbi);
@@ -1169,9 +1238,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
cpc.reason = __get_cp_reason(sbi);
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
- mutex_unlock(&sbi->gc_mutex);
+ up_write(&sbi->gc_lock);
}
f2fs_trace_ios(NULL, 1);
@@ -1213,12 +1282,10 @@ static int f2fs_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = 0;
- if (dquot->dq_dqb.dqb_bsoftlimit)
- limit = dquot->dq_dqb.dqb_bsoftlimit;
- if (dquot->dq_dqb.dqb_bhardlimit &&
- (!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
- limit = dquot->dq_dqb.dqb_bhardlimit;
+ limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
+ dquot->dq_dqb.dqb_bhardlimit);
+ if (limit)
+ limit >>= sb->s_blocksize_bits;
if (limit && buf->f_blocks > limit) {
curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
@@ -1228,12 +1295,8 @@ static int f2fs_statfs_project(struct super_block *sb,
(buf->f_blocks - curblock) : 0;
}
- limit = 0;
- if (dquot->dq_dqb.dqb_isoftlimit)
- limit = dquot->dq_dqb.dqb_isoftlimit;
- if (dquot->dq_dqb.dqb_ihardlimit &&
- (!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
- limit = dquot->dq_dqb.dqb_ihardlimit;
+ limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
+ dquot->dq_dqb.dqb_ihardlimit);
if (limit && buf->f_files > limit) {
buf->f_files = limit;
@@ -1340,6 +1403,35 @@ static inline void f2fs_show_quota_options(struct seq_file *seq,
#endif
}
+static inline void f2fs_show_compress_options(struct seq_file *seq,
+ struct super_block *sb)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ char *algtype = "";
+ int i;
+
+ if (!f2fs_sb_has_compression(sbi))
+ return;
+
+ switch (F2FS_OPTION(sbi).compress_algorithm) {
+ case COMPRESS_LZO:
+ algtype = "lzo";
+ break;
+ case COMPRESS_LZ4:
+ algtype = "lz4";
+ break;
+ }
+ seq_printf(seq, ",compress_algorithm=%s", algtype);
+
+ seq_printf(seq, ",compress_log_size=%u",
+ F2FS_OPTION(sbi).compress_log_size);
+
+ for (i = 0; i < F2FS_OPTION(sbi).compress_ext_cnt; i++) {
+ seq_printf(seq, ",compress_extension=%s",
+ F2FS_OPTION(sbi).extensions[i]);
+ }
+}
+
static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
@@ -1462,6 +1554,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_printf(seq, ",fsync_mode=%s", "strict");
else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER)
seq_printf(seq, ",fsync_mode=%s", "nobarrier");
+
+ f2fs_show_compress_options(seq, sbi->sb);
return 0;
}
@@ -1476,6 +1570,9 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).test_dummy_encryption = false;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
+ F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO;
+ F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE;
+ F2FS_OPTION(sbi).compress_ext_cnt = 0;
set_opt(sbi, BG_GC);
set_opt(sbi, INLINE_XATTR);
@@ -1524,7 +1621,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
f2fs_update_time(sbi, DISABLE_TIME);
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
err = f2fs_gc(sbi, true, false, NULL_SEGNO);
if (err == -ENODATA) {
err = 0;
@@ -1546,7 +1643,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
goto restore_flag;
}
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
cpc.reason = CP_PAUSE;
set_sbi_flag(sbi, SBI_CP_DISABLED);
err = f2fs_write_checkpoint(sbi, &cpc);
@@ -1558,7 +1655,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
spin_unlock(&sbi->stat_lock);
out_unlock:
- mutex_unlock(&sbi->gc_mutex);
+ up_write(&sbi->gc_lock);
restore_flag:
sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
return err;
@@ -1566,12 +1663,12 @@ restore_flag:
static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
{
- mutex_lock(&sbi->gc_mutex);
+ down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
clear_sbi_flag(sbi, SBI_CP_DISABLED);
set_sbi_flag(sbi, SBI_IS_DIRTY);
- mutex_unlock(&sbi->gc_mutex);
+ up_write(&sbi->gc_lock);
f2fs_sync_fs(sbi->sb, 1);
}
@@ -2158,7 +2255,7 @@ static int f2fs_dquot_commit(struct dquot *dquot)
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
int ret;
- down_read(&sbi->quota_sem);
+ down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING);
ret = dquot_commit(dquot);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
@@ -2182,13 +2279,10 @@ static int f2fs_dquot_acquire(struct dquot *dquot)
static int f2fs_dquot_release(struct dquot *dquot)
{
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
- int ret;
+ int ret = dquot_release(dquot);
- down_read(&sbi->quota_sem);
- ret = dquot_release(dquot);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
return ret;
}
@@ -2196,29 +2290,22 @@ static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot)
{
struct super_block *sb = dquot->dq_sb;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
- int ret;
-
- down_read(&sbi->quota_sem);
- ret = dquot_mark_dquot_dirty(dquot);
+ int ret = dquot_mark_dquot_dirty(dquot);
/* if we are using journalled quota */
if (is_journalled_quota(sbi))
set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH);
- up_read(&sbi->quota_sem);
return ret;
}
static int f2fs_dquot_commit_info(struct super_block *sb, int type)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
- int ret;
+ int ret = dquot_commit_info(sb, type);
- down_read(&sbi->quota_sem);
- ret = dquot_commit_info(sb, type);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
return ret;
}
@@ -3311,7 +3398,7 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->valid_super_block = valid_super_block;
- mutex_init(&sbi->gc_mutex);
+ init_rwsem(&sbi->gc_lock);
mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
mutex_init(&sbi->resize_mutex);
@@ -3400,6 +3487,12 @@ try_onemore:
goto free_devices;
}
+ err = f2fs_init_post_read_wq(sbi);
+ if (err) {
+ f2fs_err(sbi, "Failed to initialize post read workqueue");
+ goto free_devices;
+ }
+
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
percpu_counter_set(&sbi->total_valid_inode_count,
@@ -3544,6 +3637,17 @@ try_onemore:
goto free_meta;
}
}
+
+ /*
+ * If the f2fs is not readonly and fsync data recovery succeeds,
+ * check zoned block devices' write pointer consistency.
+ */
+ if (!err && !f2fs_readonly(sb) && f2fs_sb_has_blkzoned(sbi)) {
+ err = f2fs_check_write_pointer(sbi);
+ if (err)
+ goto free_meta;
+ }
+
reset_checkpoint:
/* f2fs_recover_fsync_data() cleared this already */
clear_sbi_flag(sbi, SBI_POR_DOING);
@@ -3621,6 +3725,7 @@ free_nm:
f2fs_destroy_node_manager(sbi);
free_sm:
f2fs_destroy_segment_manager(sbi);
+ f2fs_destroy_post_read_wq(sbi);
free_devices:
destroy_device_list(sbi);
kvfree(sbi->ckpt);
@@ -3762,8 +3867,12 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_bio_entry_cache();
if (err)
goto free_post_read;
+ err = f2fs_init_bioset();
+ if (err)
+ goto free_bio_enrty_cache;
return 0;
-
+free_bio_enrty_cache:
+ f2fs_destroy_bio_entry_cache();
free_post_read:
f2fs_destroy_post_read_processing();
free_root_stats:
@@ -3789,6 +3898,7 @@ fail:
static void __exit exit_f2fs_fs(void)
{
+ f2fs_destroy_bioset();
f2fs_destroy_bio_entry_cache();
f2fs_destroy_post_read_processing();
f2fs_destroy_root_stats();
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 70945ceb9c0c..91d649790b1b 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -25,6 +25,9 @@ enum {
DCC_INFO, /* struct discard_cmd_control */
NM_INFO, /* struct f2fs_nm_info */
F2FS_SBI, /* struct f2fs_sb_info */
+#ifdef CONFIG_F2FS_STAT_FS
+ STAT_INFO, /* struct f2fs_stat_info */
+#endif
#ifdef CONFIG_F2FS_FAULT_INJECTION
FAULT_INFO_RATE, /* struct f2fs_fault_info */
FAULT_INFO_TYPE, /* struct f2fs_fault_info */
@@ -42,6 +45,9 @@ struct f2fs_attr {
int id;
};
+static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf);
+
static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
{
if (struct_type == GC_THREAD)
@@ -59,41 +65,25 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
struct_type == FAULT_INFO_TYPE)
return (unsigned char *)&F2FS_OPTION(sbi).fault_info;
#endif
+#ifdef CONFIG_F2FS_STAT_FS
+ else if (struct_type == STAT_INFO)
+ return (unsigned char *)F2FS_STAT(sbi);
+#endif
return NULL;
}
static ssize_t dirty_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)(dirty_segments(sbi)));
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)(dirty_segments(sbi)));
}
-static ssize_t unusable_show(struct f2fs_attr *a,
+static ssize_t free_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- block_t unusable;
-
- if (test_opt(sbi, DISABLE_CHECKPOINT))
- unusable = sbi->unusable_block_count;
- else
- unusable = f2fs_get_unusable_blocks(sbi);
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)unusable);
-}
-
-static ssize_t encoding_show(struct f2fs_attr *a,
- struct f2fs_sb_info *sbi, char *buf)
-{
-#ifdef CONFIG_UNICODE
- if (f2fs_sb_has_casefold(sbi))
- return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n",
- sbi->s_encoding->charset,
- (sbi->s_encoding->version >> 16) & 0xff,
- (sbi->s_encoding->version >> 8) & 0xff,
- sbi->s_encoding->version & 0xff);
-#endif
- return snprintf(buf, PAGE_SIZE, "(none)");
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)(free_segments(sbi)));
}
static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
@@ -102,10 +92,10 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
struct super_block *sb = sbi->sb;
if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
+ return sprintf(buf, "0\n");
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)(sbi->kbytes_written +
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)(sbi->kbytes_written +
BD_PART_WRITTEN(sbi)));
}
@@ -116,7 +106,7 @@ static ssize_t features_show(struct f2fs_attr *a,
int len = 0;
if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
+ return sprintf(buf, "0\n");
if (f2fs_sb_has_encrypt(sbi))
len += snprintf(buf, PAGE_SIZE - len, "%s",
@@ -154,6 +144,9 @@ static ssize_t features_show(struct f2fs_attr *a,
if (f2fs_sb_has_casefold(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "casefold");
+ if (f2fs_sb_has_compression(sbi))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "compression");
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "pin_file");
len += snprintf(buf + len, PAGE_SIZE - len, "\n");
@@ -163,9 +156,66 @@ static ssize_t features_show(struct f2fs_attr *a,
static ssize_t current_reserved_blocks_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks);
+ return sprintf(buf, "%u\n", sbi->current_reserved_blocks);
+}
+
+static ssize_t unusable_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ block_t unusable;
+
+ if (test_opt(sbi, DISABLE_CHECKPOINT))
+ unusable = sbi->unusable_block_count;
+ else
+ unusable = f2fs_get_unusable_blocks(sbi);
+ return sprintf(buf, "%llu\n", (unsigned long long)unusable);
+}
+
+static ssize_t encoding_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+#ifdef CONFIG_UNICODE
+ if (f2fs_sb_has_casefold(sbi))
+ return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n",
+ sbi->s_encoding->charset,
+ (sbi->s_encoding->version >> 16) & 0xff,
+ (sbi->s_encoding->version >> 8) & 0xff,
+ sbi->s_encoding->version & 0xff);
+#endif
+ return sprintf(buf, "(none)");
}
+#ifdef CONFIG_F2FS_STAT_FS
+static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
+
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)(si->tot_blks -
+ (si->bg_data_blks + si->bg_node_blks)));
+}
+
+static ssize_t moved_blocks_background_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
+
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)(si->bg_data_blks + si->bg_node_blks));
+}
+
+static ssize_t avg_vblocks_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct f2fs_stat_info *si = F2FS_STAT(sbi);
+
+ si->dirty_count = dirty_segments(sbi);
+ f2fs_update_sit_info(sbi);
+ return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks));
+}
+#endif
+
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -199,7 +249,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
ui = (unsigned int *)(ptr + a->offset);
- return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+ return sprintf(buf, "%u\n", *ui);
}
static ssize_t __sbi_store(struct f2fs_attr *a,
@@ -389,6 +439,7 @@ enum feat_id {
FEAT_VERITY,
FEAT_SB_CHECKSUM,
FEAT_CASEFOLD,
+ FEAT_COMPRESSION,
};
static ssize_t f2fs_feature_show(struct f2fs_attr *a,
@@ -408,7 +459,8 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a,
case FEAT_VERITY:
case FEAT_SB_CHECKSUM:
case FEAT_CASEFOLD:
- return snprintf(buf, PAGE_SIZE, "supported\n");
+ case FEAT_COMPRESSION:
+ return sprintf(buf, "supported\n");
}
return 0;
}
@@ -437,6 +489,14 @@ static struct f2fs_attr f2fs_attr_##_name = { \
.id = _id, \
}
+#define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \
+static struct f2fs_attr f2fs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = f2fs_sbi_show, \
+ .struct_type = _struct_type, \
+ .offset = offsetof(struct _struct_name, _elname), \
+}
+
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time,
urgent_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
@@ -478,11 +538,21 @@ F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
#endif
F2FS_GENERAL_RO_ATTR(dirty_segments);
+F2FS_GENERAL_RO_ATTR(free_segments);
F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
+#ifdef CONFIG_F2FS_STAT_FS
+F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
+F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
+F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_foreground_calls, call_count);
+F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_background_calls, bg_gc);
+F2FS_GENERAL_RO_ATTR(moved_blocks_background);
+F2FS_GENERAL_RO_ATTR(moved_blocks_foreground);
+F2FS_GENERAL_RO_ATTR(avg_vblocks);
+#endif
#ifdef CONFIG_FS_ENCRYPTION
F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
@@ -503,6 +573,7 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
#endif
F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
+F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -543,12 +614,22 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(inject_type),
#endif
ATTR_LIST(dirty_segments),
+ ATTR_LIST(free_segments),
ATTR_LIST(unusable),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(features),
ATTR_LIST(reserved_blocks),
ATTR_LIST(current_reserved_blocks),
ATTR_LIST(encoding),
+#ifdef CONFIG_F2FS_STAT_FS
+ ATTR_LIST(cp_foreground_calls),
+ ATTR_LIST(cp_background_calls),
+ ATTR_LIST(gc_foreground_calls),
+ ATTR_LIST(gc_background_calls),
+ ATTR_LIST(moved_blocks_foreground),
+ ATTR_LIST(moved_blocks_background),
+ ATTR_LIST(avg_vblocks),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -573,6 +654,7 @@ static struct attribute *f2fs_feat_attrs[] = {
#endif
ATTR_LIST(sb_checksum),
ATTR_LIST(casefold),
+ ATTR_LIST(compression),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_feat);
@@ -733,10 +815,12 @@ int __init f2fs_init_sysfs(void)
ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype,
NULL, "features");
- if (ret)
+ if (ret) {
+ kobject_put(&f2fs_feat);
kset_unregister(&f2fs_kset);
- else
+ } else {
f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
+ }
return ret;
}
@@ -757,8 +841,11 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
init_completion(&sbi->s_kobj_unregister);
err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL,
"%s", sb->s_id);
- if (err)
+ if (err) {
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
return err;
+ }
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -786,4 +873,5 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
}
kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
}
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index a401ef72bc82..d7d430a6f130 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -222,12 +222,55 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
return size;
}
+/*
+ * Prefetch some pages from the file's Merkle tree.
+ *
+ * This is basically a stripped-down version of __do_page_cache_readahead()
+ * which works on pages past i_size.
+ */
+static void f2fs_merkle_tree_readahead(struct address_space *mapping,
+ pgoff_t start_index, unsigned long count)
+{
+ LIST_HEAD(pages);
+ unsigned int nr_pages = 0;
+ struct page *page;
+ pgoff_t index;
+ struct blk_plug plug;
+
+ for (index = start_index; index < start_index + count; index++) {
+ page = xa_load(&mapping->i_pages, index);
+ if (!page || xa_is_value(page)) {
+ page = __page_cache_alloc(readahead_gfp_mask(mapping));
+ if (!page)
+ break;
+ page->index = index;
+ list_add(&page->lru, &pages);
+ nr_pages++;
+ }
+ }
+ blk_start_plug(&plug);
+ f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true);
+ blk_finish_plug(&plug);
+}
+
static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
- pgoff_t index)
+ pgoff_t index,
+ unsigned long num_ra_pages)
{
+ struct page *page;
+
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
- return read_mapping_page(inode->i_mapping, index, NULL);
+ page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+ if (!page || !PageUptodate(page)) {
+ if (page)
+ put_page(page);
+ else if (num_ra_pages > 1)
+ f2fs_merkle_tree_readahead(inode->i_mapping, index,
+ num_ra_pages);
+ page = read_mapping_page(inode->i_mapping, index, NULL);
+ }
+ return page;
}
static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5f04c5c810fb..594b05ae16c9 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -21,6 +21,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <asm/unaligned.h>
+#include <linux/random.h>
#include <linux/iversion.h>
#include "fat.h"
@@ -521,7 +522,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
inode->i_uid = sbi->options.fs_uid;
inode->i_gid = sbi->options.fs_gid;
inode_inc_iversion(inode);
- inode->i_generation = get_seconds();
+ inode->i_generation = prandom_u32();
if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
inode->i_generation &= ~1;
diff --git a/fs/file.c b/fs/file.c
index 2f4fcf985079..a364e1a9b7e8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -642,7 +642,9 @@ out_unlock:
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
/*
- * variant of __close_fd that gets a ref on the file for later fput
+ * variant of __close_fd that gets a ref on the file for later fput.
+ * The caller must ensure that filp_close() called on the file, and then
+ * an fput().
*/
int __close_fd_get_file(unsigned int fd, struct file **res)
{
@@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
spin_unlock(&files->file_lock);
get_file(file);
*res = file;
- return filp_close(file, files);
+ return 0;
out_unlock:
spin_unlock(&files->file_lock);
@@ -706,9 +708,9 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
-static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
+static struct file *__fget_files(struct files_struct *files, unsigned int fd,
+ fmode_t mask, unsigned int refs)
{
- struct files_struct *files = current->files;
struct file *file;
rcu_read_lock();
@@ -729,6 +731,12 @@ loop:
return file;
}
+static inline struct file *__fget(unsigned int fd, fmode_t mask,
+ unsigned int refs)
+{
+ return __fget_files(current->files, fd, mask, refs);
+}
+
struct file *fget_many(unsigned int fd, unsigned int refs)
{
return __fget(fd, FMODE_PATH, refs);
@@ -746,6 +754,18 @@ struct file *fget_raw(unsigned int fd)
}
EXPORT_SYMBOL(fget_raw);
+struct file *fget_task(struct task_struct *task, unsigned int fd)
+{
+ struct file *file = NULL;
+
+ task_lock(task);
+ if (task->files)
+ file = __fget_files(task->files, fd, 0, 1);
+ task_unlock(task);
+
+ return file;
+}
+
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
*
@@ -960,7 +980,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
return ksys_dup3(oldfd, newfd, 0);
}
-SYSCALL_DEFINE1(dup, unsigned int, fildes)
+int ksys_dup(unsigned int fildes)
{
int ret = -EBADF;
struct file *file = fget_raw(fildes);
@@ -975,6 +995,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
return ret;
}
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
+{
+ return ksys_dup(fildes);
+}
+
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
int err;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 335607b8c5c0..76ac9c7d32ec 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2063,7 +2063,7 @@ void wb_workfn(struct work_struct *work)
struct bdi_writeback, dwork);
long pages_written;
- set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
+ set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a63d779eac10..ce715380143c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -882,6 +882,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
struct fuse_args_pages *ap = &ia->ap;
loff_t pos = page_offset(ap->pages[0]);
size_t count = ap->num_pages << PAGE_SHIFT;
+ ssize_t res;
int err;
ap->args.out_pages = true;
@@ -896,7 +897,8 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
if (!err)
return;
} else {
- err = fuse_simple_request(fc, &ap->args);
+ res = fuse_simple_request(fc, &ap->args);
+ err = res < 0 ? res : 0;
}
fuse_readpages_end(fc, &ap->args, err);
}
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 6d0783e2e276..f71c384064c8 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -242,19 +242,35 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb);
/*
* There are two time systems. Both are based on seconds since
* a particular time/date.
- * Unix: unsigned lil-endian since 00:00 GMT, Jan. 1, 1970
+ * Unix: signed little-endian since 00:00 GMT, Jan. 1, 1970
* mac: unsigned big-endian since 00:00 GMT, Jan. 1, 1904
*
+ * HFS implementations are highly inconsistent, this one matches the
+ * traditional behavior of 64-bit Linux, giving the most useful
+ * time range between 1970 and 2106, by treating any on-disk timestamp
+ * under HFS_UTC_OFFSET (Jan 1 1970) as a time between 2040 and 2106.
*/
-#define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60)
-#define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60)
+#define HFS_UTC_OFFSET 2082844800U
+static inline time64_t __hfs_m_to_utime(__be32 mt)
+{
+ time64_t ut = (u32)(be32_to_cpu(mt) - HFS_UTC_OFFSET);
+
+ return ut + sys_tz.tz_minuteswest * 60;
+}
+
+static inline __be32 __hfs_u_to_mtime(time64_t ut)
+{
+ ut -= sys_tz.tz_minuteswest * 60;
+
+ return cpu_to_be32(lower_32_bits(ut) + HFS_UTC_OFFSET);
+}
#define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode))
#define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info)
-#define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) }
-#define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec)
-#define hfs_mtime() __hfs_u_to_mtime(get_seconds())
+#define hfs_m_to_utime(time) (struct timespec64){ .tv_sec = __hfs_m_to_utime(time) }
+#define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec)
+#define hfs_mtime() __hfs_u_to_mtime(ktime_get_real_seconds())
static inline const char *hfs_mdb_name(struct super_block *sb)
{
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index da243c84e93b..2f224b98ee94 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -351,7 +351,7 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_mode &= ~hsb->s_file_umask;
inode->i_mode |= S_IFREG;
inode->i_ctime = inode->i_atime = inode->i_mtime =
- timespec_to_timespec64(hfs_m_to_utime(rec->file.MdDat));
+ hfs_m_to_utime(rec->file.MdDat);
inode->i_op = &hfs_file_inode_operations;
inode->i_fop = &hfs_file_operations;
inode->i_mapping->a_ops = &hfs_aops;
@@ -362,7 +362,7 @@ static int hfs_read_inode(struct inode *inode, void *data)
HFS_I(inode)->fs_blocks = 0;
inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
inode->i_ctime = inode->i_atime = inode->i_mtime =
- timespec_to_timespec64(hfs_m_to_utime(rec->dir.MdDat));
+ hfs_m_to_utime(rec->dir.MdDat);
inode->i_op = &hfs_dir_inode_operations;
inode->i_fop = &hfs_dir_operations;
break;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index b8471bf05def..3b03fff68543 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -533,13 +533,31 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf,
void **data, int op, int op_flags);
int hfsplus_read_wrapper(struct super_block *sb);
-/* time macros */
-#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U)
-#define __hfsp_ut2mt(t) (cpu_to_be32(t + 2082844800U))
+/*
+ * time helpers: convert between 1904-base and 1970-base timestamps
+ *
+ * HFS+ implementations are highly inconsistent, this one matches the
+ * traditional behavior of 64-bit Linux, giving the most useful
+ * time range between 1970 and 2106, by treating any on-disk timestamp
+ * under HFSPLUS_UTC_OFFSET (Jan 1 1970) as a time between 2040 and 2106.
+ */
+#define HFSPLUS_UTC_OFFSET 2082844800U
+
+static inline time64_t __hfsp_mt2ut(__be32 mt)
+{
+ time64_t ut = (u32)(be32_to_cpu(mt) - HFSPLUS_UTC_OFFSET);
+
+ return ut;
+}
+
+static inline __be32 __hfsp_ut2mt(time64_t ut)
+{
+ return cpu_to_be32(lower_32_bits(ut) + HFSPLUS_UTC_OFFSET);
+}
/* compatibility */
-#define hfsp_mt2ut(t) (struct timespec){ .tv_sec = __hfsp_mt2ut(t) }
+#define hfsp_mt2ut(t) (struct timespec64){ .tv_sec = __hfsp_mt2ut(t) }
#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
-#define hfsp_now2mt() __hfsp_ut2mt(get_seconds())
+#define hfsp_now2mt() __hfsp_ut2mt(ktime_get_real_seconds())
#endif
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index d131c8ea7eb6..94bd83b36644 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -504,9 +504,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
hfsplus_get_perms(inode, &folder->permissions, 1);
set_nlink(inode, 1);
inode->i_size = 2 + be32_to_cpu(folder->valence);
- inode->i_atime = timespec_to_timespec64(hfsp_mt2ut(folder->access_date));
- inode->i_mtime = timespec_to_timespec64(hfsp_mt2ut(folder->content_mod_date));
- inode->i_ctime = timespec_to_timespec64(hfsp_mt2ut(folder->attribute_mod_date));
+ inode->i_atime = hfsp_mt2ut(folder->access_date);
+ inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
+ inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
HFSPLUS_I(inode)->create_date = folder->create_date;
HFSPLUS_I(inode)->fs_blocks = 0;
if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
@@ -542,9 +542,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
init_special_inode(inode, inode->i_mode,
be32_to_cpu(file->permissions.dev));
}
- inode->i_atime = timespec_to_timespec64(hfsp_mt2ut(file->access_date));
- inode->i_mtime = timespec_to_timespec64(hfsp_mt2ut(file->content_mod_date));
- inode->i_ctime = timespec_to_timespec64(hfsp_mt2ut(file->attribute_mod_date));
+ inode->i_atime = hfsp_mt2ut(file->access_date);
+ inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
+ inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
HFSPLUS_I(inode)->create_date = file->create_date;
} else {
pr_err("bad catalog entry used to create inode\n");
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index f4295aa19350..69cb796f6270 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -37,16 +37,20 @@
* is on, and remove the appropriate bits from attr->ia_mode (attr is a
* "struct iattr *"). -BlaisorBlade
*/
+struct hostfs_timespec {
+ long long tv_sec;
+ long long tv_nsec;
+};
struct hostfs_iattr {
- unsigned int ia_valid;
- unsigned short ia_mode;
- uid_t ia_uid;
- gid_t ia_gid;
- loff_t ia_size;
- struct timespec ia_atime;
- struct timespec ia_mtime;
- struct timespec ia_ctime;
+ unsigned int ia_valid;
+ unsigned short ia_mode;
+ uid_t ia_uid;
+ gid_t ia_gid;
+ loff_t ia_size;
+ struct hostfs_timespec ia_atime;
+ struct hostfs_timespec ia_mtime;
+ struct hostfs_timespec ia_ctime;
};
struct hostfs_stat {
@@ -56,7 +60,7 @@ struct hostfs_stat {
unsigned int uid;
unsigned int gid;
unsigned long long size;
- struct timespec atime, mtime, ctime;
+ struct hostfs_timespec atime, mtime, ctime;
unsigned int blksize;
unsigned long long blocks;
unsigned int maj;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 5a7eb0c79839..e6b8c49076bb 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -549,9 +549,9 @@ static int read_name(struct inode *ino, char *name)
set_nlink(ino, st.nlink);
i_uid_write(ino, st.uid);
i_gid_write(ino, st.gid);
- ino->i_atime = timespec_to_timespec64(st.atime);
- ino->i_mtime = timespec_to_timespec64(st.mtime);
- ino->i_ctime = timespec_to_timespec64(st.ctime);
+ ino->i_atime = (struct timespec64){ st.atime.tv_sec, st.atime.tv_nsec };
+ ino->i_mtime = (struct timespec64){ st.mtime.tv_sec, st.mtime.tv_nsec };
+ ino->i_ctime = (struct timespec64){ st.ctime.tv_sec, st.ctime.tv_nsec };
ino->i_size = st.size;
ino->i_blocks = st.blocks;
return 0;
@@ -820,15 +820,18 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_ATIME) {
attrs.ia_valid |= HOSTFS_ATTR_ATIME;
- attrs.ia_atime = timespec64_to_timespec(attr->ia_atime);
+ attrs.ia_atime = (struct hostfs_timespec)
+ { attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec };
}
if (attr->ia_valid & ATTR_MTIME) {
attrs.ia_valid |= HOSTFS_ATTR_MTIME;
- attrs.ia_mtime = timespec64_to_timespec(attr->ia_mtime);
+ attrs.ia_mtime = (struct hostfs_timespec)
+ { attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec };
}
if (attr->ia_valid & ATTR_CTIME) {
attrs.ia_valid |= HOSTFS_ATTR_CTIME;
- attrs.ia_ctime = timespec64_to_timespec(attr->ia_ctime);
+ attrs.ia_ctime = (struct hostfs_timespec)
+ { attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec };
}
if (attr->ia_valid & ATTR_ATIME_SET) {
attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d5c2a3158610..a66e425884d1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1498,8 +1498,10 @@ static int __init init_hugetlbfs_fs(void)
/* other hstates are optional */
i = 0;
for_each_hstate(h) {
- if (i == default_hstate_idx)
+ if (i == default_hstate_idx) {
+ i++;
continue;
+ }
mnt = mount_one_hugetlbfs(h);
if (IS_ERR(mnt))
diff --git a/fs/inode.c b/fs/inode.c
index 96d62d97694e..ea15c6d9f274 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -12,6 +12,7 @@
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
+#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
@@ -2252,7 +2253,7 @@ int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
- return 0;
+ return fscrypt_prepare_setflags(inode, oldflags, flags);
}
EXPORT_SYMBOL(vfs_ioc_setflags_prepare);
diff --git a/fs/internal.h b/fs/internal.h
index 4a7da1df573d..f3f280b952a3 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -38,7 +38,7 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
/*
* buffer.c
*/
-extern void guard_bio_eod(int rw, struct bio *bio);
+extern void guard_bio_eod(struct bio *bio);
extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block, struct iomap *iomap);
@@ -124,6 +124,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op);
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
+extern struct open_how build_open_how(int flags, umode_t mode);
+extern int build_open_flags(const struct open_how *how, struct open_flags *op);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
long do_faccessat(int dfd, const char __user *filename, int mode);
@@ -180,11 +182,11 @@ extern void mnt_pin_kill(struct mount *m);
*/
extern const struct dentry_operations ns_dentry_operations;
-/*
- * fs/ioctl.c
- */
-extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
- unsigned long arg);
-
/* direct-io.c: */
int sb_init_dio_done_wq(struct super_block *sb);
+
+/*
+ * fs/stat.c:
+ */
+unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags);
+int cp_statx(const struct kstat *stat, struct statx __user *buffer);
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 541c8a3e0bbb..cb60a42b9fdf 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -56,7 +56,8 @@ struct io_worker {
struct rcu_head rcu;
struct mm_struct *mm;
- const struct cred *creds;
+ const struct cred *cur_creds;
+ const struct cred *saved_creds;
struct files_struct *restore_files;
};
@@ -109,10 +110,10 @@ struct io_wq {
struct task_struct *manager;
struct user_struct *user;
- const struct cred *creds;
- struct mm_struct *mm;
refcount_t refs;
struct completion done;
+
+ refcount_t use_refs;
};
static bool io_worker_get(struct io_worker *worker)
@@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
{
bool dropped_lock = false;
- if (worker->creds) {
- revert_creds(worker->creds);
- worker->creds = NULL;
+ if (worker->saved_creds) {
+ revert_creds(worker->saved_creds);
+ worker->cur_creds = worker->saved_creds = NULL;
}
if (current->files != worker->restore_files) {
@@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
return NULL;
}
+static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
+{
+ if (worker->mm) {
+ unuse_mm(worker->mm);
+ mmput(worker->mm);
+ worker->mm = NULL;
+ }
+ if (!work->mm) {
+ set_fs(KERNEL_DS);
+ return;
+ }
+ if (mmget_not_zero(work->mm)) {
+ use_mm(work->mm);
+ if (!worker->mm)
+ set_fs(USER_DS);
+ worker->mm = work->mm;
+ /* hang on to this mm */
+ work->mm = NULL;
+ return;
+ }
+
+ /* failed grabbing mm, ensure work gets cancelled */
+ work->flags |= IO_WQ_WORK_CANCEL;
+}
+
+static void io_wq_switch_creds(struct io_worker *worker,
+ struct io_wq_work *work)
+{
+ const struct cred *old_creds = override_creds(work->creds);
+
+ worker->cur_creds = work->creds;
+ if (worker->saved_creds)
+ put_cred(old_creds); /* creds set by previous switch */
+ else
+ worker->saved_creds = old_creds;
+}
+
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
@@ -438,20 +476,19 @@ next:
if (work->flags & IO_WQ_WORK_CB)
work->func(&work);
- if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
- current->files != work->files) {
+ if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
task_unlock(current);
}
- if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
- wq->mm && mmget_not_zero(wq->mm)) {
- use_mm(wq->mm);
- set_fs(USER_DS);
- worker->mm = wq->mm;
- }
- if (!worker->creds)
- worker->creds = override_creds(wq->creds);
+ if (work->mm != worker->mm)
+ io_wq_switch_mm(worker, work);
+ if (worker->cur_creds != work->creds)
+ io_wq_switch_creds(worker, work);
+ /*
+ * OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
+ * the worker function will do the right thing.
+ */
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (worker->mm)
@@ -716,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ int work_flags;
unsigned long flags;
/*
@@ -730,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
+ work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags);
wq_list_add_tail(&work->list, &wqe->work_list);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
- if (!atomic_read(&acct->nr_running))
+ if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
+ !atomic_read(&acct->nr_running))
io_wqe_wake_worker(wqe, acct);
}
@@ -824,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
*/
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
+ !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
data->cancel(worker->cur_work, data->caller_data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
@@ -898,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
return false;
spin_lock_irqsave(&worker->lock, flags);
- if (worker->cur_work == work) {
+ if (worker->cur_work == work &&
+ !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
@@ -1022,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
/* caller must already hold a reference to this */
wq->user = data->user;
- wq->creds = data->creds;
for_each_node(node) {
struct io_wqe *wqe;
@@ -1049,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
init_completion(&wq->done);
- /* caller must have already done mmgrab() on this mm */
- wq->mm = data->mm;
-
wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
if (!IS_ERR(wq->manager)) {
wake_up_process(wq->manager);
@@ -1060,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
ret = -ENOMEM;
goto err;
}
+ refcount_set(&wq->use_refs, 1);
reinit_completion(&wq->done);
return wq;
}
@@ -1074,13 +1113,21 @@ err:
return ERR_PTR(ret);
}
+bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
+{
+ if (data->get_work != wq->get_work || data->put_work != wq->put_work)
+ return false;
+
+ return refcount_inc_not_zero(&wq->use_refs);
+}
+
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
wake_up_process(worker->task);
return false;
}
-void io_wq_destroy(struct io_wq *wq)
+static void __io_wq_destroy(struct io_wq *wq)
{
int node;
@@ -1100,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq)
kfree(wq->wqes);
kfree(wq);
}
+
+void io_wq_destroy(struct io_wq *wq)
+{
+ if (refcount_dec_and_test(&wq->use_refs))
+ __io_wq_destroy(wq);
+}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 3f5e356de980..50b3378febf2 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -7,11 +7,11 @@ enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
- IO_WQ_WORK_NEEDS_USER = 8,
- IO_WQ_WORK_NEEDS_FILES = 16,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_INTERNAL = 64,
IO_WQ_WORK_CB = 128,
+ IO_WQ_WORK_NO_CANCEL = 256,
+ IO_WQ_WORK_CONCURRENT = 512,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
@@ -72,6 +72,8 @@ struct io_wq_work {
};
void (*func)(struct io_wq_work **);
struct files_struct *files;
+ struct mm_struct *mm;
+ const struct cred *creds;
unsigned flags;
};
@@ -81,21 +83,22 @@ struct io_wq_work {
(work)->func = _func; \
(work)->flags = 0; \
(work)->files = NULL; \
+ (work)->mm = NULL; \
+ (work)->creds = NULL; \
} while (0) \
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
struct io_wq_data {
- struct mm_struct *mm;
struct user_struct *user;
- const struct cred *creds;
get_work_fn *get_work;
put_work_fn *put_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
+bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 562e3a1a1bf9..1806afddfea5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -46,6 +46,7 @@
#include <linux/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
+#include <linux/bits.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
@@ -70,6 +71,10 @@
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
+#include <linux/fadvise.h>
+#include <linux/eventpoll.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -177,6 +182,21 @@ struct fixed_file_table {
struct file **files;
};
+enum {
+ FFD_F_ATOMIC,
+};
+
+struct fixed_file_data {
+ struct fixed_file_table *table;
+ struct io_ring_ctx *ctx;
+
+ struct percpu_ref refs;
+ struct llist_head put_llist;
+ unsigned long state;
+ struct work_struct ref_work;
+ struct completion done;
+};
+
struct io_ring_ctx {
struct {
struct percpu_ref refs;
@@ -184,10 +204,11 @@ struct io_ring_ctx {
struct {
unsigned int flags;
- bool compat;
- bool account_mem;
- bool cq_overflow_flushed;
- bool drain_next;
+ int compat: 1;
+ int account_mem: 1;
+ int cq_overflow_flushed: 1;
+ int drain_next: 1;
+ int eventfd_async: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -207,13 +228,14 @@ struct io_ring_ctx {
unsigned sq_thread_idle;
unsigned cached_sq_dropped;
atomic_t cached_cq_overflow;
- struct io_uring_sqe *sq_sqes;
+ unsigned long sq_check_overflow;
struct list_head defer_list;
struct list_head timeout_list;
struct list_head cq_overflow_list;
wait_queue_head_t inflight_wait;
+ struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
struct io_rings *rings;
@@ -229,8 +251,10 @@ struct io_ring_ctx {
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
- struct fixed_file_table *file_table;
+ struct fixed_file_data *file_data;
unsigned nr_user_files;
+ int ring_fd;
+ struct file *ring_file;
/* if used, fixed mapped user buffers */
unsigned nr_user_bufs;
@@ -250,11 +274,14 @@ struct io_ring_ctx {
struct socket *ring_sock;
#endif
+ struct idr personality_idr;
+
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
unsigned cq_mask;
atomic_t cq_timeouts;
+ unsigned long cq_check_overflow;
struct wait_queue_head cq_wait;
struct fasync_struct *cq_fasync;
struct eventfd_ctx *cq_ev_fd;
@@ -267,7 +294,8 @@ struct io_ring_ctx {
struct {
spinlock_t completion_lock;
- bool poll_multi_file;
+ struct llist_head poll_llist;
+
/*
* ->poll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
@@ -277,6 +305,7 @@ struct io_ring_ctx {
struct list_head poll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
+ bool poll_multi_file;
spinlock_t inflight_lock;
struct list_head inflight_list;
@@ -299,6 +328,12 @@ struct io_poll_iocb {
struct wait_queue_entry wait;
};
+struct io_close {
+ struct file *file;
+ struct file *put_file;
+ int fd;
+};
+
struct io_timeout_data {
struct io_kiocb *req;
struct hrtimer timer;
@@ -319,6 +354,7 @@ struct io_sync {
loff_t len;
loff_t off;
int flags;
+ int mode;
};
struct io_cancel {
@@ -348,8 +384,52 @@ struct io_connect {
struct io_sr_msg {
struct file *file;
- struct user_msghdr __user *msg;
+ union {
+ struct user_msghdr __user *msg;
+ void __user *buf;
+ };
int msg_flags;
+ size_t len;
+};
+
+struct io_open {
+ struct file *file;
+ int dfd;
+ union {
+ unsigned mask;
+ };
+ struct filename *filename;
+ struct statx __user *buffer;
+ struct open_how how;
+};
+
+struct io_files_update {
+ struct file *file;
+ u64 arg;
+ u32 nr_args;
+ u32 offset;
+};
+
+struct io_fadvise {
+ struct file *file;
+ u64 offset;
+ u32 len;
+ u32 advice;
+};
+
+struct io_madvise {
+ struct file *file;
+ u64 addr;
+ u32 len;
+ u32 advice;
+};
+
+struct io_epoll {
+ struct file *file;
+ int epfd;
+ int op;
+ int fd;
+ struct epoll_event event;
};
struct io_async_connect {
@@ -370,15 +450,79 @@ struct io_async_rw {
ssize_t size;
};
+struct io_async_open {
+ struct filename *filename;
+};
+
struct io_async_ctx {
union {
struct io_async_rw rw;
struct io_async_msghdr msg;
struct io_async_connect connect;
struct io_timeout_data timeout;
+ struct io_async_open open;
};
};
+enum {
+ REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
+ REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
+ REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
+ REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
+ REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
+
+ REQ_F_LINK_NEXT_BIT,
+ REQ_F_FAIL_LINK_BIT,
+ REQ_F_INFLIGHT_BIT,
+ REQ_F_CUR_POS_BIT,
+ REQ_F_NOWAIT_BIT,
+ REQ_F_IOPOLL_COMPLETED_BIT,
+ REQ_F_LINK_TIMEOUT_BIT,
+ REQ_F_TIMEOUT_BIT,
+ REQ_F_ISREG_BIT,
+ REQ_F_MUST_PUNT_BIT,
+ REQ_F_TIMEOUT_NOSEQ_BIT,
+ REQ_F_COMP_LOCKED_BIT,
+};
+
+enum {
+ /* ctx owns file */
+ REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
+ /* drain existing IO first */
+ REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
+ /* linked sqes */
+ REQ_F_LINK = BIT(REQ_F_LINK_BIT),
+ /* doesn't sever on completion < 0 */
+ REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
+ /* IOSQE_ASYNC */
+ REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
+
+ /* already grabbed next link */
+ REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
+ /* fail rest of links */
+ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
+ /* on inflight list */
+ REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
+ /* read/write uses file position */
+ REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
+ /* must not punt to workers */
+ REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
+ /* polled IO has completed */
+ REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
+ /* has linked timeout */
+ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
+ /* timeout request */
+ REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
+ /* regular file */
+ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
+ /* must be punted even for NONBLOCK */
+ REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
+ /* no timeout sequence */
+ REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
+ /* completion under lock */
+ REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
+};
+
/*
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
@@ -396,11 +540,19 @@ struct io_kiocb {
struct io_timeout timeout;
struct io_connect connect;
struct io_sr_msg sr_msg;
+ struct io_open open;
+ struct io_close close;
+ struct io_files_update files_update;
+ struct io_fadvise fadvise;
+ struct io_madvise madvise;
+ struct io_epoll epoll;
};
struct io_async_ctx *io;
- struct file *ring_file;
- int ring_fd;
+ /*
+ * llist_node is only used for poll deferred completions
+ */
+ struct llist_node llist_node;
bool has_user;
bool in_async;
bool needs_fixed_file;
@@ -414,23 +566,6 @@ struct io_kiocb {
struct list_head link_list;
unsigned int flags;
refcount_t refs;
-#define REQ_F_NOWAIT 1 /* must not punt to workers */
-#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
-#define REQ_F_FIXED_FILE 4 /* ctx owns file */
-#define REQ_F_LINK_NEXT 8 /* already grabbed next link */
-#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
-#define REQ_F_IO_DRAINED 32 /* drain done */
-#define REQ_F_LINK 64 /* linked sqes */
-#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */
-#define REQ_F_FAIL_LINK 256 /* fail rest of links */
-#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */
-#define REQ_F_TIMEOUT 1024 /* timeout request */
-#define REQ_F_ISREG 2048 /* regular file */
-#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
-#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
-#define REQ_F_INFLIGHT 16384 /* on inflight list */
-#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
-#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */
u64 user_data;
u32 result;
u32 sequence;
@@ -463,14 +598,162 @@ struct io_submit_state {
unsigned int ios_left;
};
+struct io_op_def {
+ /* needs req->io allocated for deferral/async */
+ unsigned async_ctx : 1;
+ /* needs current->mm setup, does mm access */
+ unsigned needs_mm : 1;
+ /* needs req->file assigned */
+ unsigned needs_file : 1;
+ /* needs req->file assigned IFF fd is >= 0 */
+ unsigned fd_non_neg : 1;
+ /* hash wq insertion if file is a regular file */
+ unsigned hash_reg_file : 1;
+ /* unbound wq insertion if file is a non-regular file */
+ unsigned unbound_nonreg_file : 1;
+ /* opcode is not supported by this kernel */
+ unsigned not_supported : 1;
+ /* needs file table */
+ unsigned file_table : 1;
+};
+
+static const struct io_op_def io_op_defs[] = {
+ [IORING_OP_NOP] = {},
+ [IORING_OP_READV] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_WRITEV] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_FSYNC] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_READ_FIXED] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_WRITE_FIXED] = {
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_POLL_ADD] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_POLL_REMOVE] = {},
+ [IORING_OP_SYNC_FILE_RANGE] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_SENDMSG] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_RECVMSG] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_TIMEOUT] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ },
+ [IORING_OP_TIMEOUT_REMOVE] = {},
+ [IORING_OP_ACCEPT] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_ASYNC_CANCEL] = {},
+ [IORING_OP_LINK_TIMEOUT] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ },
+ [IORING_OP_CONNECT] = {
+ .async_ctx = 1,
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_FALLOCATE] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_OPENAT] = {
+ .needs_file = 1,
+ .fd_non_neg = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_CLOSE] = {
+ .needs_file = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_FILES_UPDATE] = {
+ .needs_mm = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_STATX] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .fd_non_neg = 1,
+ },
+ [IORING_OP_READ] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_WRITE] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_FADVISE] = {
+ .needs_file = 1,
+ },
+ [IORING_OP_MADVISE] = {
+ .needs_mm = 1,
+ },
+ [IORING_OP_SEND] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_RECV] = {
+ .needs_mm = 1,
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ },
+ [IORING_OP_OPENAT2] = {
+ .needs_file = 1,
+ .fd_non_neg = 1,
+ .file_table = 1,
+ },
+ [IORING_OP_EPOLL_CTL] = {
+ .unbound_nonreg_file = 1,
+ .file_table = 1,
+ },
+};
+
static void io_wq_submit_work(struct io_wq_work **workptr);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
-static void __io_free_req(struct io_kiocb *req);
static void io_put_req(struct io_kiocb *req);
-static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
+static int __io_sqe_files_update(struct io_ring_ctx *ctx,
+ struct io_uring_files_update *ip,
+ unsigned nr_args);
+static int io_grab_files(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -537,9 +820,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->completions[0]);
init_completion(&ctx->completions[1]);
+ idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
+ init_llist_head(&ctx->poll_llist);
INIT_LIST_HEAD(&ctx->poll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
@@ -566,7 +851,7 @@ static inline bool __req_need_defer(struct io_kiocb *req)
static inline bool req_need_defer(struct io_kiocb *req)
{
- if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
+ if (unlikely(req->flags & REQ_F_IO_DRAIN))
return __req_need_defer(req);
return false;
@@ -606,53 +891,53 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
- if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
- /* order cqe stores with ring update */
- smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
+ /* order cqe stores with ring update */
+ smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
- if (wq_has_sleeper(&ctx->cq_wait)) {
- wake_up_interruptible(&ctx->cq_wait);
- kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
- }
+ if (wq_has_sleeper(&ctx->cq_wait)) {
+ wake_up_interruptible(&ctx->cq_wait);
+ kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
}
}
-static inline bool io_req_needs_user(struct io_kiocb *req)
+static inline void io_req_work_grab_env(struct io_kiocb *req,
+ const struct io_op_def *def)
{
- return !(req->opcode == IORING_OP_READ_FIXED ||
- req->opcode == IORING_OP_WRITE_FIXED);
+ if (!req->work.mm && def->needs_mm) {
+ mmgrab(current->mm);
+ req->work.mm = current->mm;
+ }
+ if (!req->work.creds)
+ req->work.creds = get_current_cred();
+}
+
+static inline void io_req_work_drop_env(struct io_kiocb *req)
+{
+ if (req->work.mm) {
+ mmdrop(req->work.mm);
+ req->work.mm = NULL;
+ }
+ if (req->work.creds) {
+ put_cred(req->work.creds);
+ req->work.creds = NULL;
+ }
}
static inline bool io_prep_async_work(struct io_kiocb *req,
struct io_kiocb **link)
{
+ const struct io_op_def *def = &io_op_defs[req->opcode];
bool do_hashed = false;
- switch (req->opcode) {
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- /* only regular files should be hashed for writes */
- if (req->flags & REQ_F_ISREG)
+ if (req->flags & REQ_F_ISREG) {
+ if (def->hash_reg_file)
do_hashed = true;
- /* fall-through */
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_SENDMSG:
- case IORING_OP_RECVMSG:
- case IORING_OP_ACCEPT:
- case IORING_OP_POLL_ADD:
- case IORING_OP_CONNECT:
- /*
- * We know REQ_F_ISREG is not set on some of these
- * opcodes, but this enables us to keep the check in
- * just one place.
- */
- if (!(req->flags & REQ_F_ISREG))
+ } else {
+ if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
- break;
}
- if (io_req_needs_user(req))
- req->work.flags |= IO_WQ_WORK_NEEDS_USER;
+
+ io_req_work_grab_env(req, def);
*link = io_prep_linked_timeout(req);
return do_hashed;
@@ -711,10 +996,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
__io_commit_cqring(ctx);
- while ((req = io_get_deferred_req(ctx)) != NULL) {
- req->flags |= REQ_F_IO_DRAINED;
+ while ((req = io_get_deferred_req(ctx)) != NULL)
io_queue_async_work(req);
- }
}
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
@@ -735,13 +1018,20 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
return &rings->cqes[tail & ctx->cq_mask];
}
+static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
+{
+ if (!ctx->eventfd_async)
+ return true;
+ return io_wq_current_is_worker() || in_interrupt();
+}
+
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
if (waitqueue_active(&ctx->sqo_wait))
wake_up(&ctx->sqo_wait);
- if (ctx->cq_ev_fd)
+ if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
}
@@ -766,7 +1056,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
/* if force is set, the ring is going away. always drop after that */
if (force)
- ctx->cq_overflow_flushed = true;
+ ctx->cq_overflow_flushed = 1;
cqe = NULL;
while (!list_empty(&ctx->cq_overflow_list)) {
@@ -788,6 +1078,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
}
io_commit_cqring(ctx);
+ if (cqe) {
+ clear_bit(0, &ctx->sq_check_overflow);
+ clear_bit(0, &ctx->cq_check_overflow);
+ }
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
@@ -821,6 +1115,10 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
} else {
+ if (list_empty(&ctx->cq_overflow_list)) {
+ set_bit(0, &ctx->sq_check_overflow);
+ set_bit(0, &ctx->cq_check_overflow);
+ }
refcount_inc(&req->refs);
req->result = res;
list_add_tail(&req->list, &ctx->cq_overflow_list);
@@ -863,9 +1161,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
- if (!percpu_ref_tryget(&ctx->refs))
- return NULL;
-
if (!state) {
req = kmem_cache_alloc(req_cachep, gfp);
if (unlikely(!req))
@@ -898,7 +1193,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
got_it:
req->io = NULL;
- req->ring_file = NULL;
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
@@ -915,24 +1209,35 @@ fallback:
return NULL;
}
-static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
+static void __io_req_do_free(struct io_kiocb *req)
{
- if (*nr) {
- kmem_cache_free_bulk(req_cachep, *nr, reqs);
- percpu_ref_put_many(&ctx->refs, *nr);
- *nr = 0;
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
+}
+
+static void __io_req_aux_free(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ kfree(req->io);
+ if (req->file) {
+ if (req->flags & REQ_F_FIXED_FILE)
+ percpu_ref_put(&ctx->file_data->refs);
+ else
+ fput(req->file);
}
+
+ io_req_work_drop_env(req);
}
static void __io_free_req(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
+ __io_req_aux_free(req);
- if (req->io)
- kfree(req->io);
- if (req->file && !(req->flags & REQ_F_FIXED_FILE))
- fput(req->file);
if (req->flags & REQ_F_INFLIGHT) {
+ struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->inflight_lock, flags);
@@ -941,11 +1246,63 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
- percpu_ref_put(&ctx->refs);
- if (likely(!io_is_fallback_req(req)))
- kmem_cache_free(req_cachep, req);
- else
- clear_bit_unlock(0, (unsigned long *) ctx->fallback_req);
+
+ percpu_ref_put(&req->ctx->refs);
+ __io_req_do_free(req);
+}
+
+struct req_batch {
+ void *reqs[IO_IOPOLL_BATCH];
+ int to_free;
+ int need_iter;
+};
+
+static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
+{
+ int fixed_refs = rb->to_free;
+
+ if (!rb->to_free)
+ return;
+ if (rb->need_iter) {
+ int i, inflight = 0;
+ unsigned long flags;
+
+ fixed_refs = 0;
+ for (i = 0; i < rb->to_free; i++) {
+ struct io_kiocb *req = rb->reqs[i];
+
+ if (req->flags & REQ_F_FIXED_FILE) {
+ req->file = NULL;
+ fixed_refs++;
+ }
+ if (req->flags & REQ_F_INFLIGHT)
+ inflight++;
+ __io_req_aux_free(req);
+ }
+ if (!inflight)
+ goto do_free;
+
+ spin_lock_irqsave(&ctx->inflight_lock, flags);
+ for (i = 0; i < rb->to_free; i++) {
+ struct io_kiocb *req = rb->reqs[i];
+
+ if (req->flags & REQ_F_INFLIGHT) {
+ list_del(&req->inflight_entry);
+ if (!--inflight)
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+
+ if (waitqueue_active(&ctx->inflight_wait))
+ wake_up(&ctx->inflight_wait);
+ }
+do_free:
+ kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
+ if (fixed_refs)
+ percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
+ percpu_ref_put_many(&ctx->refs, rb->to_free);
+ rb->to_free = rb->need_iter = 0;
}
static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1118,19 +1475,21 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
{
struct io_rings *rings = ctx->rings;
- /*
- * noflush == true is from the waitqueue handler, just ensure we wake
- * up the task, and the next invocation will flush the entries. We
- * cannot safely to it from here.
- */
- if (noflush && !list_empty(&ctx->cq_overflow_list))
- return -1U;
+ if (test_bit(0, &ctx->cq_check_overflow)) {
+ /*
+ * noflush == true is from the waitqueue handler, just ensure
+ * we wake up the task, and the next invocation will flush the
+ * entries. We cannot safely to it from here.
+ */
+ if (noflush && !list_empty(&ctx->cq_overflow_list))
+ return -1U;
- io_cqring_overflow_flush(ctx, false);
+ io_cqring_overflow_flush(ctx, false);
+ }
/* See comment at the top of this file */
smp_rmb();
- return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
+ return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
}
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
@@ -1141,17 +1500,30 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
+static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
+{
+ if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
+ return false;
+
+ if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
+ rb->need_iter++;
+
+ rb->reqs[rb->to_free++] = req;
+ if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
+ io_free_req_many(req->ctx, rb);
+ return true;
+}
+
/*
* Find and free completed poll iocbs
*/
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
struct list_head *done)
{
- void *reqs[IO_IOPOLL_BATCH];
+ struct req_batch rb;
struct io_kiocb *req;
- int to_free;
- to_free = 0;
+ rb.to_free = rb.need_iter = 0;
while (!list_empty(done)) {
req = list_first_entry(done, struct io_kiocb, list);
list_del(&req->list);
@@ -1159,26 +1531,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
io_cqring_fill_event(req, req->result);
(*nr_events)++;
- if (refcount_dec_and_test(&req->refs)) {
- /* If we're not using fixed files, we have to pair the
- * completion part with the file put. Use regular
- * completions for those, only batch free for fixed
- * file and non-linked commands.
- */
- if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
- REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
- !req->io) {
- reqs[to_free++] = req;
- if (to_free == ARRAY_SIZE(reqs))
- io_free_req_many(ctx, reqs, &to_free);
- } else {
- io_free_req(req);
- }
- }
+ if (refcount_dec_and_test(&req->refs) &&
+ !io_req_multi_free(&rb, req))
+ io_free_req(req);
}
io_commit_cqring(ctx);
- io_free_req_many(ctx, reqs, &to_free);
+ io_free_req_many(ctx, &rb);
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
@@ -1503,6 +1862,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->flags |= REQ_F_ISREG;
kiocb->ki_pos = READ_ONCE(sqe->off);
+ if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
+ req->flags |= REQ_F_CUR_POS;
+ kiocb->ki_pos = req->file->f_pos;
+ }
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -1574,6 +1937,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
bool in_async)
{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+
+ if (req->flags & REQ_F_CUR_POS)
+ req->file->f_pos = kiocb->ki_pos;
if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
*nxt = __io_complete_rw(kiocb, ret);
else
@@ -1671,6 +2038,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
if (req->rw.kiocb.private)
return -EINVAL;
+ if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
+ ssize_t ret;
+ ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
+ *iovec = NULL;
+ return ret;
+ }
+
if (req->io) {
struct io_async_rw *iorw = &req->io->rw;
@@ -1767,6 +2141,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
static int io_alloc_async_ctx(struct io_kiocb *req)
{
+ if (!io_op_defs[req->opcode].async_ctx)
+ return 0;
req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
return req->io == NULL;
}
@@ -1786,6 +2162,8 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
struct iovec *iovec, struct iovec *fast_iov,
struct iov_iter *iter)
{
+ if (!io_op_defs[req->opcode].async_ctx)
+ return 0;
if (!req->io && io_alloc_async_ctx(req))
return -ENOMEM;
@@ -1840,6 +2218,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
if (!force_nonblock)
req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
+ req->result = 0;
io_size = ret;
if (req->flags & REQ_F_LINK)
req->result = io_size;
@@ -1863,18 +2242,6 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
else
ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
- /*
- * In case of a short read, punt to async. This can happen
- * if we have data partially cached. Alternatively we can
- * return the short read, in which case the application will
- * need to issue another SQE and wait for it. That SQE will
- * need async punt anyway, so it's more efficient to do it
- * here.
- */
- if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
- (req->flags & REQ_F_ISREG) &&
- ret2 > 0 && ret2 < io_size)
- ret2 = -EAGAIN;
/* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, nxt, req->in_async);
@@ -1939,6 +2306,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
if (!force_nonblock)
req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
+ req->result = 0;
io_size = ret;
if (req->flags & REQ_F_LINK)
req->result = io_size;
@@ -2046,6 +2414,28 @@ static bool io_req_cancelled(struct io_kiocb *req)
return false;
}
+static void io_link_work_cb(struct io_wq_work **workptr)
+{
+ struct io_wq_work *work = *workptr;
+ struct io_kiocb *link = work->data;
+
+ io_queue_linked_timeout(link);
+ work->func = io_wq_submit_work;
+}
+
+static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
+{
+ struct io_kiocb *link;
+
+ io_prep_async_work(nxt, &link);
+ *workptr = &nxt->work;
+ if (link) {
+ nxt->work.flags |= IO_WQ_WORK_CB;
+ nxt->work.func = io_link_work_cb;
+ nxt->work.data = link;
+ }
+}
+
static void io_fsync_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
@@ -2064,7 +2454,7 @@ static void io_fsync_finish(struct io_wq_work **workptr)
io_cqring_add_event(req, ret);
io_put_req_find_next(req, &nxt);
if (nxt)
- *workptr = &nxt->work;
+ io_wq_assign_next(workptr, nxt);
}
static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
@@ -2086,6 +2476,421 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
return 0;
}
+static void io_fallocate_finish(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ struct io_kiocb *nxt = NULL;
+ int ret;
+
+ ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
+ req->sync.len);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, &nxt);
+ if (nxt)
+ io_wq_assign_next(workptr, nxt);
+}
+
+static int io_fallocate_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
+ return -EINVAL;
+
+ req->sync.off = READ_ONCE(sqe->off);
+ req->sync.len = READ_ONCE(sqe->addr);
+ req->sync.mode = READ_ONCE(sqe->len);
+ return 0;
+}
+
+static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+ struct io_wq_work *work, *old_work;
+
+ /* fallocate always requiring blocking context */
+ if (force_nonblock) {
+ io_put_req(req);
+ req->work.func = io_fallocate_finish;
+ return -EAGAIN;
+ }
+
+ work = old_work = &req->work;
+ io_fallocate_finish(&work);
+ if (work && work != old_work)
+ *nxt = container_of(work, struct io_kiocb, work);
+
+ return 0;
+}
+
+static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ const char __user *fname;
+ int ret;
+
+ if (sqe->ioprio || sqe->buf_index)
+ return -EINVAL;
+
+ req->open.dfd = READ_ONCE(sqe->fd);
+ req->open.how.mode = READ_ONCE(sqe->len);
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ req->open.how.flags = READ_ONCE(sqe->open_flags);
+
+ req->open.filename = getname(fname);
+ if (IS_ERR(req->open.filename)) {
+ ret = PTR_ERR(req->open.filename);
+ req->open.filename = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct open_how __user *how;
+ const char __user *fname;
+ size_t len;
+ int ret;
+
+ if (sqe->ioprio || sqe->buf_index)
+ return -EINVAL;
+
+ req->open.dfd = READ_ONCE(sqe->fd);
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ len = READ_ONCE(sqe->len);
+
+ if (len < OPEN_HOW_SIZE_VER0)
+ return -EINVAL;
+
+ ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
+ len);
+ if (ret)
+ return ret;
+
+ if (!(req->open.how.flags & O_PATH) && force_o_largefile())
+ req->open.how.flags |= O_LARGEFILE;
+
+ req->open.filename = getname(fname);
+ if (IS_ERR(req->open.filename)) {
+ ret = PTR_ERR(req->open.filename);
+ req->open.filename = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+ struct open_flags op;
+ struct file *file;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = build_open_flags(&req->open.how, &op);
+ if (ret)
+ goto err;
+
+ ret = get_unused_fd_flags(req->open.how.flags);
+ if (ret < 0)
+ goto err;
+
+ file = do_filp_open(req->open.dfd, req->open.filename, &op);
+ if (IS_ERR(file)) {
+ put_unused_fd(ret);
+ ret = PTR_ERR(file);
+ } else {
+ fsnotify_open(file);
+ fd_install(ret, file);
+ }
+err:
+ putname(req->open.filename);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
+ return 0;
+}
+
+static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+ req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
+ return io_openat2(req, nxt, force_nonblock);
+}
+
+static int io_epoll_ctl_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+#if defined(CONFIG_EPOLL)
+ if (sqe->ioprio || sqe->buf_index)
+ return -EINVAL;
+
+ req->epoll.epfd = READ_ONCE(sqe->fd);
+ req->epoll.op = READ_ONCE(sqe->len);
+ req->epoll.fd = READ_ONCE(sqe->off);
+
+ if (ep_op_has_event(req->epoll.op)) {
+ struct epoll_event __user *ev;
+
+ ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
+ return -EFAULT;
+ }
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+#if defined(CONFIG_EPOLL)
+ struct io_epoll *ie = &req->epoll;
+ int ret;
+
+ ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
+ if (force_nonblock && ret == -EAGAIN)
+ return -EAGAIN;
+
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
+ if (sqe->ioprio || sqe->buf_index || sqe->off)
+ return -EINVAL;
+
+ req->madvise.addr = READ_ONCE(sqe->addr);
+ req->madvise.len = READ_ONCE(sqe->len);
+ req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
+ struct io_madvise *ma = &req->madvise;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ ret = do_madvise(ma->addr, ma->len, ma->advice);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ if (sqe->ioprio || sqe->buf_index || sqe->addr)
+ return -EINVAL;
+
+ req->fadvise.offset = READ_ONCE(sqe->off);
+ req->fadvise.len = READ_ONCE(sqe->len);
+ req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
+ return 0;
+}
+
+static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+ struct io_fadvise *fa = &req->fadvise;
+ int ret;
+
+ /* DONTNEED may block, others _should_ not */
+ if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock)
+ return -EAGAIN;
+
+ ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
+ return 0;
+}
+
+static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ const char __user *fname;
+ unsigned lookup_flags;
+ int ret;
+
+ if (sqe->ioprio || sqe->buf_index)
+ return -EINVAL;
+
+ req->open.dfd = READ_ONCE(sqe->fd);
+ req->open.mask = READ_ONCE(sqe->len);
+ fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ req->open.how.flags = READ_ONCE(sqe->statx_flags);
+
+ if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
+ return -EINVAL;
+
+ req->open.filename = getname_flags(fname, lookup_flags, NULL);
+ if (IS_ERR(req->open.filename)) {
+ ret = PTR_ERR(req->open.filename);
+ req->open.filename = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+ struct io_open *ctx = &req->open;
+ unsigned lookup_flags;
+ struct path path;
+ struct kstat stat;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
+ return -EINVAL;
+
+retry:
+ /* filename_lookup() drops it, keep a reference */
+ ctx->filename->refcnt++;
+
+ ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
+ NULL);
+ if (ret)
+ goto err;
+
+ ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
+ path_put(&path);
+ if (retry_estale(ret, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+ if (!ret)
+ ret = cp_statx(&stat, ctx->buffer);
+err:
+ putname(ctx->filename);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
+ return 0;
+}
+
+static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ /*
+ * If we queue this for async, it must not be cancellable. That would
+ * leave the 'file' in an undeterminate state.
+ */
+ req->work.flags |= IO_WQ_WORK_NO_CANCEL;
+
+ if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
+ sqe->rw_flags || sqe->buf_index)
+ return -EINVAL;
+ if (sqe->flags & IOSQE_FIXED_FILE)
+ return -EINVAL;
+
+ req->close.fd = READ_ONCE(sqe->fd);
+ if (req->file->f_op == &io_uring_fops ||
+ req->close.fd == req->ctx->ring_fd)
+ return -EBADF;
+
+ return 0;
+}
+
+static void io_close_finish(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ struct io_kiocb *nxt = NULL;
+
+ /* Invoked with files, we need to do the close */
+ if (req->work.files) {
+ int ret;
+
+ ret = filp_close(req->close.put_file, req->work.files);
+ if (ret < 0) {
+ req_set_fail_links(req);
+ }
+ io_cqring_add_event(req, ret);
+ }
+
+ fput(req->close.put_file);
+
+ /* we bypassed the re-issue, drop the submission reference */
+ io_put_req(req);
+ io_put_req_find_next(req, &nxt);
+ if (nxt)
+ io_wq_assign_next(workptr, nxt);
+}
+
+static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+ int ret;
+
+ req->close.put_file = NULL;
+ ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
+ if (ret < 0)
+ return ret;
+
+ /* if the file has a flush method, be safe and punt to async */
+ if (req->close.put_file->f_op->flush && !io_wq_current_is_worker())
+ goto eagain;
+
+ /*
+ * No ->flush(), safely close from here and just punt the
+ * fput() to async context.
+ */
+ ret = filp_close(req->close.put_file, current->files);
+
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+
+ if (io_wq_current_is_worker()) {
+ struct io_wq_work *old_work, *work;
+
+ old_work = work = &req->work;
+ io_close_finish(&work);
+ if (work && work != old_work)
+ *nxt = container_of(work, struct io_kiocb, work);
+ return 0;
+ }
+
+eagain:
+ req->work.func = io_close_finish;
+ return -EAGAIN;
+}
+
static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -2120,7 +2925,7 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr)
io_cqring_add_event(req, ret);
io_put_req_find_next(req, &nxt);
if (nxt)
- *workptr = &nxt->work;
+ io_wq_assign_next(workptr, nxt);
}
static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
@@ -2163,8 +2968,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ sr->len = READ_ONCE(sqe->len);
- if (!io)
+ if (!io || req->opcode == IORING_OP_SEND)
return 0;
io->msg.iov = io->msg.fast_iov;
@@ -2244,6 +3050,56 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
#endif
}
+static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct socket *sock;
+ int ret;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
+ sock = sock_from_file(req->file, &ret);
+ if (sock) {
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct msghdr msg;
+ struct iovec iov;
+ unsigned flags;
+
+ ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
+ &msg.msg_iter);
+ if (ret)
+ return ret;
+
+ msg.msg_name = NULL;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_namelen = 0;
+
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
+
+ ret = __sys_sendmsg_sock(sock, &msg, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ }
+
+ io_cqring_add_event(req, ret);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_put_req_find_next(req, nxt);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
@@ -2254,7 +3110,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
- if (!io)
+ if (!io || req->opcode == IORING_OP_RECV)
return 0;
io->msg.iov = io->msg.fast_iov;
@@ -2336,6 +3192,59 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
#endif
}
+static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct socket *sock;
+ int ret;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
+ sock = sock_from_file(req->file, &ret);
+ if (sock) {
+ struct io_sr_msg *sr = &req->sr_msg;
+ struct msghdr msg;
+ struct iovec iov;
+ unsigned flags;
+
+ ret = import_single_range(READ, sr->buf, sr->len, &iov,
+ &msg.msg_iter);
+ if (ret)
+ return ret;
+
+ msg.msg_name = NULL;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iocb = NULL;
+ msg.msg_flags = 0;
+
+ flags = req->sr_msg.msg_flags;
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
+
+ ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ }
+
+ io_cqring_add_event(req, ret);
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_put_req_find_next(req, nxt);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
@@ -2386,7 +3295,7 @@ static void io_accept_finish(struct io_wq_work **workptr)
return;
__io_accept(req, &nxt, false);
if (nxt)
- *workptr = &nxt->work;
+ io_wq_assign_next(workptr, nxt);
}
#endif
@@ -2399,7 +3308,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
ret = __io_accept(req, nxt, force_nonblock);
if (ret == -EAGAIN && force_nonblock) {
req->work.func = io_accept_finish;
- req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
io_put_req(req);
return -EAGAIN;
}
@@ -2617,7 +3525,40 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
req_set_fail_links(req);
io_put_req_find_next(req, &nxt);
if (nxt)
- *workptr = &nxt->work;
+ io_wq_assign_next(workptr, nxt);
+}
+
+static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
+{
+ struct io_kiocb *req, *tmp;
+ struct req_batch rb;
+
+ rb.to_free = rb.need_iter = 0;
+ spin_lock_irq(&ctx->completion_lock);
+ llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
+ hash_del(&req->hash_node);
+ io_poll_complete(req, req->result, 0);
+
+ if (refcount_dec_and_test(&req->refs) &&
+ !io_req_multi_free(&rb, req)) {
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_free_req(req);
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+ io_cqring_ev_posted(ctx);
+ io_free_req_many(ctx, &rb);
+}
+
+static void io_poll_flush(struct io_wq_work **workptr)
+{
+ struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+ struct llist_node *nodes;
+
+ nodes = llist_del_all(&req->ctx->poll_llist);
+ if (nodes)
+ __io_poll_flush(req->ctx, nodes);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -2627,7 +3568,6 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
struct io_ring_ctx *ctx = req->ctx;
__poll_t mask = key_to_poll(key);
- unsigned long flags;
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
@@ -2641,17 +3581,31 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
* If we have a link timeout we're going to need the completion_lock
* for finalizing the request, mark us as having grabbed that already.
*/
- if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
- hash_del(&req->hash_node);
- io_poll_complete(req, mask, 0);
- req->flags |= REQ_F_COMP_LOCKED;
- io_put_req(req);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ if (mask) {
+ unsigned long flags;
- io_cqring_ev_posted(ctx);
- } else {
- io_queue_async_work(req);
+ if (llist_empty(&ctx->poll_llist) &&
+ spin_trylock_irqsave(&ctx->completion_lock, flags)) {
+ hash_del(&req->hash_node);
+ io_poll_complete(req, mask, 0);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ io_cqring_ev_posted(ctx);
+ req = NULL;
+ } else {
+ req->result = mask;
+ req->llist_node.next = NULL;
+ /* if the list wasn't empty, we're done */
+ if (!llist_add(&req->llist_node, &ctx->poll_llist))
+ req = NULL;
+ else
+ req->work.func = io_poll_flush;
+ }
}
+ if (req)
+ io_queue_async_work(req);
return 1;
}
@@ -3056,20 +4010,67 @@ static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
return 0;
}
+static int io_files_update_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (sqe->flags || sqe->ioprio || sqe->rw_flags)
+ return -EINVAL;
+
+ req->files_update.offset = READ_ONCE(sqe->off);
+ req->files_update.nr_args = READ_ONCE(sqe->len);
+ if (!req->files_update.nr_args)
+ return -EINVAL;
+ req->files_update.arg = READ_ONCE(sqe->addr);
+ return 0;
+}
+
+static int io_files_update(struct io_kiocb *req, bool force_nonblock)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_files_update up;
+ int ret;
+
+ if (force_nonblock)
+ return -EAGAIN;
+
+ up.offset = req->files_update.offset;
+ up.fds = req->files_update.arg;
+
+ mutex_lock(&ctx->uring_lock);
+ ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
+ mutex_unlock(&ctx->uring_lock);
+
+ if (ret < 0)
+ req_set_fail_links(req);
+ io_cqring_add_event(req, ret);
+ io_put_req(req);
+ return 0;
+}
+
static int io_req_defer_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
ssize_t ret = 0;
+ if (io_op_defs[req->opcode].file_table) {
+ ret = io_grab_files(req);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ io_req_work_grab_env(req, &io_op_defs[req->opcode]);
+
switch (req->opcode) {
case IORING_OP_NOP:
break;
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
ret = io_read_prep(req, sqe, true);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
+ case IORING_OP_WRITE:
ret = io_write_prep(req, sqe, true);
break;
case IORING_OP_POLL_ADD:
@@ -3085,9 +4086,11 @@ static int io_req_defer_prep(struct io_kiocb *req,
ret = io_prep_sfr(req, sqe);
break;
case IORING_OP_SENDMSG:
+ case IORING_OP_SEND:
ret = io_sendmsg_prep(req, sqe);
break;
case IORING_OP_RECVMSG:
+ case IORING_OP_RECV:
ret = io_recvmsg_prep(req, sqe);
break;
case IORING_OP_CONNECT:
@@ -3108,6 +4111,33 @@ static int io_req_defer_prep(struct io_kiocb *req,
case IORING_OP_ACCEPT:
ret = io_accept_prep(req, sqe);
break;
+ case IORING_OP_FALLOCATE:
+ ret = io_fallocate_prep(req, sqe);
+ break;
+ case IORING_OP_OPENAT:
+ ret = io_openat_prep(req, sqe);
+ break;
+ case IORING_OP_CLOSE:
+ ret = io_close_prep(req, sqe);
+ break;
+ case IORING_OP_FILES_UPDATE:
+ ret = io_files_update_prep(req, sqe);
+ break;
+ case IORING_OP_STATX:
+ ret = io_statx_prep(req, sqe);
+ break;
+ case IORING_OP_FADVISE:
+ ret = io_fadvise_prep(req, sqe);
+ break;
+ case IORING_OP_MADVISE:
+ ret = io_madvise_prep(req, sqe);
+ break;
+ case IORING_OP_OPENAT2:
+ ret = io_openat2_prep(req, sqe);
+ break;
+ case IORING_OP_EPOLL_CTL:
+ ret = io_epoll_ctl_prep(req, sqe);
+ break;
default:
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
@@ -3158,6 +4188,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
+ case IORING_OP_READ:
if (sqe) {
ret = io_read_prep(req, sqe, force_nonblock);
if (ret < 0)
@@ -3167,6 +4198,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
+ case IORING_OP_WRITE:
if (sqe) {
ret = io_write_prep(req, sqe, force_nonblock);
if (ret < 0)
@@ -3207,20 +4239,28 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ret = io_sync_file_range(req, nxt, force_nonblock);
break;
case IORING_OP_SENDMSG:
+ case IORING_OP_SEND:
if (sqe) {
ret = io_sendmsg_prep(req, sqe);
if (ret < 0)
break;
}
- ret = io_sendmsg(req, nxt, force_nonblock);
+ if (req->opcode == IORING_OP_SENDMSG)
+ ret = io_sendmsg(req, nxt, force_nonblock);
+ else
+ ret = io_send(req, nxt, force_nonblock);
break;
case IORING_OP_RECVMSG:
+ case IORING_OP_RECV:
if (sqe) {
ret = io_recvmsg_prep(req, sqe);
if (ret)
break;
}
- ret = io_recvmsg(req, nxt, force_nonblock);
+ if (req->opcode == IORING_OP_RECVMSG)
+ ret = io_recvmsg(req, nxt, force_nonblock);
+ else
+ ret = io_recv(req, nxt, force_nonblock);
break;
case IORING_OP_TIMEOUT:
if (sqe) {
@@ -3262,6 +4302,78 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
ret = io_async_cancel(req, nxt);
break;
+ case IORING_OP_FALLOCATE:
+ if (sqe) {
+ ret = io_fallocate_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_fallocate(req, nxt, force_nonblock);
+ break;
+ case IORING_OP_OPENAT:
+ if (sqe) {
+ ret = io_openat_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_openat(req, nxt, force_nonblock);
+ break;
+ case IORING_OP_CLOSE:
+ if (sqe) {
+ ret = io_close_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_close(req, nxt, force_nonblock);
+ break;
+ case IORING_OP_FILES_UPDATE:
+ if (sqe) {
+ ret = io_files_update_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_files_update(req, force_nonblock);
+ break;
+ case IORING_OP_STATX:
+ if (sqe) {
+ ret = io_statx_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_statx(req, nxt, force_nonblock);
+ break;
+ case IORING_OP_FADVISE:
+ if (sqe) {
+ ret = io_fadvise_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_fadvise(req, nxt, force_nonblock);
+ break;
+ case IORING_OP_MADVISE:
+ if (sqe) {
+ ret = io_madvise_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_madvise(req, nxt, force_nonblock);
+ break;
+ case IORING_OP_OPENAT2:
+ if (sqe) {
+ ret = io_openat2_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_openat2(req, nxt, force_nonblock);
+ break;
+ case IORING_OP_EPOLL_CTL:
+ if (sqe) {
+ ret = io_epoll_ctl_prep(req, sqe);
+ if (ret)
+ break;
+ }
+ ret = io_epoll_ctl(req, nxt, force_nonblock);
+ break;
default:
ret = -EINVAL;
break;
@@ -3271,24 +4383,24 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return ret;
if (ctx->flags & IORING_SETUP_IOPOLL) {
+ const bool in_async = io_wq_current_is_worker();
+
if (req->result == -EAGAIN)
return -EAGAIN;
+ /* workqueue context doesn't hold uring_lock, grab it now */
+ if (in_async)
+ mutex_lock(&ctx->uring_lock);
+
io_iopoll_req_issued(req);
+
+ if (in_async)
+ mutex_unlock(&ctx->uring_lock);
}
return 0;
}
-static void io_link_work_cb(struct io_wq_work **workptr)
-{
- struct io_wq_work *work = *workptr;
- struct io_kiocb *link = work->data;
-
- io_queue_linked_timeout(link);
- work->func = io_wq_submit_work;
-}
-
static void io_wq_submit_work(struct io_wq_work **workptr)
{
struct io_wq_work *work = *workptr;
@@ -3296,8 +4408,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
struct io_kiocb *nxt = NULL;
int ret = 0;
- if (work->flags & IO_WQ_WORK_CANCEL)
+ /* if NO_CANCEL is set, we must still run the work */
+ if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
+ IO_WQ_WORK_CANCEL) {
ret = -ECANCELED;
+ }
if (!ret) {
req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
@@ -3325,39 +4440,17 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
}
/* if a dependent link is ready, pass it back */
- if (!ret && nxt) {
- struct io_kiocb *link;
-
- io_prep_async_work(nxt, &link);
- *workptr = &nxt->work;
- if (link) {
- nxt->work.flags |= IO_WQ_WORK_CB;
- nxt->work.func = io_link_work_cb;
- nxt->work.data = link;
- }
- }
-}
-
-static bool io_req_op_valid(int op)
-{
- return op >= IORING_OP_NOP && op < IORING_OP_LAST;
+ if (!ret && nxt)
+ io_wq_assign_next(workptr, nxt);
}
-static int io_req_needs_file(struct io_kiocb *req)
+static int io_req_needs_file(struct io_kiocb *req, int fd)
{
- switch (req->opcode) {
- case IORING_OP_NOP:
- case IORING_OP_POLL_REMOVE:
- case IORING_OP_TIMEOUT:
- case IORING_OP_TIMEOUT_REMOVE:
- case IORING_OP_ASYNC_CANCEL:
- case IORING_OP_LINK_TIMEOUT:
+ if (!io_op_defs[req->opcode].needs_file)
return 0;
- default:
- if (io_req_op_valid(req->opcode))
- return 1;
- return -EINVAL;
- }
+ if (fd == -1 && io_op_defs[req->opcode].fd_non_neg)
+ return 0;
+ return 1;
}
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
@@ -3365,8 +4458,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
{
struct fixed_file_table *table;
- table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
- return table->files[index & IORING_FILE_TABLE_MASK];
+ table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
+ return table->files[index & IORING_FILE_TABLE_MASK];;
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
@@ -3374,20 +4467,16 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
{
struct io_ring_ctx *ctx = req->ctx;
unsigned flags;
- int fd, ret;
+ int fd;
flags = READ_ONCE(sqe->flags);
fd = READ_ONCE(sqe->fd);
- if (flags & IOSQE_IO_DRAIN)
- req->flags |= REQ_F_IO_DRAIN;
-
- ret = io_req_needs_file(req);
- if (ret <= 0)
- return ret;
+ if (!io_req_needs_file(req, fd))
+ return 0;
if (flags & IOSQE_FIXED_FILE) {
- if (unlikely(!ctx->file_table ||
+ if (unlikely(!ctx->file_data ||
(unsigned) fd >= ctx->nr_user_files))
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
@@ -3395,6 +4484,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
if (!req->file)
return -EBADF;
req->flags |= REQ_F_FIXED_FILE;
+ percpu_ref_get(&ctx->file_data->refs);
} else {
if (req->needs_fixed_file)
return -EBADF;
@@ -3412,6 +4502,11 @@ static int io_grab_files(struct io_kiocb *req)
int ret = -EBADF;
struct io_ring_ctx *ctx = req->ctx;
+ if (req->work.files)
+ return 0;
+ if (!ctx->ring_file)
+ return -EBADF;
+
rcu_read_lock();
spin_lock_irq(&ctx->inflight_lock);
/*
@@ -3420,7 +4515,7 @@ static int io_grab_files(struct io_kiocb *req)
* the fd has changed since we started down this path, and disallow
* this operation if it has.
*/
- if (fcheck(req->ring_fd) == req->ring_file) {
+ if (fcheck(ctx->ring_fd) == ctx->ring_file) {
list_add(&req->inflight_entry, &ctx->inflight_list);
req->flags |= REQ_F_INFLIGHT;
req->work.files = current->files;
@@ -3526,7 +4621,8 @@ again:
*/
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) {
- if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
+punt:
+ if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (ret)
goto err;
@@ -3561,6 +4657,9 @@ done_req:
if (nxt) {
req = nxt;
nxt = NULL;
+
+ if (req->flags & REQ_F_FORCE_ASYNC)
+ goto punt;
goto again;
}
}
@@ -3569,21 +4668,27 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
int ret;
- if (unlikely(req->ctx->drain_next)) {
- req->flags |= REQ_F_IO_DRAIN;
- req->ctx->drain_next = false;
- }
- req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
-
ret = io_req_defer(req, sqe);
if (ret) {
if (ret != -EIOCBQUEUED) {
+fail_req:
io_cqring_add_event(req, ret);
req_set_fail_links(req);
io_double_put_req(req);
}
- } else
+ } else if (req->flags & REQ_F_FORCE_ASYNC) {
+ ret = io_req_defer_prep(req, sqe);
+ if (unlikely(ret < 0))
+ goto fail_req;
+ /*
+ * Never try inline submit of IOSQE_ASYNC is set, go straight
+ * to async execution.
+ */
+ req->work.flags |= IO_WQ_WORK_CONCURRENT;
+ io_queue_async_work(req);
+ } else {
__io_queue_sqe(req, sqe);
+ }
}
static inline void io_queue_link_head(struct io_kiocb *req)
@@ -3596,25 +4701,47 @@ static inline void io_queue_link_head(struct io_kiocb *req)
}
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK)
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC)
static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
{
+ const struct cred *old_creds = NULL;
struct io_ring_ctx *ctx = req->ctx;
- int ret;
+ unsigned int sqe_flags;
+ int ret, id;
+
+ sqe_flags = READ_ONCE(sqe->flags);
/* enforce forwards compatibility on users */
- if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) {
+ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
ret = -EINVAL;
goto err_req;
}
+ id = READ_ONCE(sqe->personality);
+ if (id) {
+ const struct cred *personality_creds;
+
+ personality_creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!personality_creds)) {
+ ret = -EINVAL;
+ goto err_req;
+ }
+ old_creds = override_creds(personality_creds);
+ }
+
+ /* same numerical values with corresponding REQ_F_*, safe to copy */
+ req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
+ IOSQE_ASYNC);
+
ret = io_req_set_file(state, req, sqe);
if (unlikely(ret)) {
err_req:
io_cqring_add_event(req, ret);
io_double_put_req(req);
+ if (old_creds)
+ revert_creds(old_creds);
return false;
}
@@ -3626,14 +4753,19 @@ err_req:
* conditions are true (normal request), then just queue it.
*/
if (*link) {
- struct io_kiocb *prev = *link;
-
- if (sqe->flags & IOSQE_IO_DRAIN)
- (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
-
- if (sqe->flags & IOSQE_IO_HARDLINK)
- req->flags |= REQ_F_HARDLINK;
+ struct io_kiocb *head = *link;
+ /*
+ * Taking sequential execution of a link, draining both sides
+ * of the link also fullfils IOSQE_IO_DRAIN semantics for all
+ * requests in the link. So, it drains the head and the
+ * next after the link request. The last one is done via
+ * drain_next flag to persist the effect across calls.
+ */
+ if (sqe_flags & IOSQE_IO_DRAIN) {
+ head->flags |= REQ_F_IO_DRAIN;
+ ctx->drain_next = 1;
+ }
if (io_alloc_async_ctx(req)) {
ret = -EAGAIN;
goto err_req;
@@ -3642,25 +4774,36 @@ err_req:
ret = io_req_defer_prep(req, sqe);
if (ret) {
/* fail even hard links since we don't submit */
- prev->flags |= REQ_F_FAIL_LINK;
+ head->flags |= REQ_F_FAIL_LINK;
goto err_req;
}
- trace_io_uring_link(ctx, req, prev);
- list_add_tail(&req->link_list, &prev->link_list);
- } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
- req->flags |= REQ_F_LINK;
- if (sqe->flags & IOSQE_IO_HARDLINK)
- req->flags |= REQ_F_HARDLINK;
-
- INIT_LIST_HEAD(&req->link_list);
- ret = io_req_defer_prep(req, sqe);
- if (ret)
- req->flags |= REQ_F_FAIL_LINK;
- *link = req;
+ trace_io_uring_link(ctx, req, head);
+ list_add_tail(&req->link_list, &head->link_list);
+
+ /* last request of a link, enqueue the link */
+ if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
+ io_queue_link_head(head);
+ *link = NULL;
+ }
} else {
- io_queue_sqe(req, sqe);
+ if (unlikely(ctx->drain_next)) {
+ req->flags |= REQ_F_IO_DRAIN;
+ req->ctx->drain_next = 0;
+ }
+ if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
+ req->flags |= REQ_F_LINK;
+ INIT_LIST_HEAD(&req->link_list);
+ ret = io_req_defer_prep(req, sqe);
+ if (ret)
+ req->flags |= REQ_F_FAIL_LINK;
+ *link = req;
+ } else {
+ io_queue_sqe(req, sqe);
+ }
}
+ if (old_creds)
+ revert_creds(old_creds);
return true;
}
@@ -3692,14 +4835,12 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
- if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
- /*
- * Ensure any loads from the SQEs are done at this point,
- * since once we write the new head, the application could
- * write new data to them.
- */
- smp_store_release(&rings->sq.head, ctx->cached_sq_head);
- }
+ /*
+ * Ensure any loads from the SQEs are done at this point,
+ * since once we write the new head, the application could
+ * write new data to them.
+ */
+ smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}
/*
@@ -3713,7 +4854,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe **sqe_ptr)
{
- struct io_rings *rings = ctx->rings;
u32 *sq_array = ctx->sq_array;
unsigned head;
@@ -3725,12 +4865,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
* 2) allows the kernel side to track the head on its own, even
* though the application is the one updating it.
*/
- head = ctx->cached_sq_head;
- /* make sure SQ entry isn't read before tail */
- if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
- return false;
-
- head = READ_ONCE(sq_array[head & ctx->sq_mask]);
+ head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
if (likely(head < ctx->sq_entries)) {
/*
* All io need record the previous position, if LINK vs DARIN,
@@ -3748,7 +4883,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* drop invalid entries */
ctx->cached_sq_head++;
ctx->cached_sq_dropped++;
- WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
+ WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
return false;
}
@@ -3762,19 +4897,29 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
bool mm_fault = false;
/* if we have a backlog and couldn't flush it all, return BUSY */
- if (!list_empty(&ctx->cq_overflow_list) &&
- !io_cqring_overflow_flush(ctx, false))
- return -EBUSY;
+ if (test_bit(0, &ctx->sq_check_overflow)) {
+ if (!list_empty(&ctx->cq_overflow_list) &&
+ !io_cqring_overflow_flush(ctx, false))
+ return -EBUSY;
+ }
+
+ /* make sure SQ entry isn't read before tail */
+ nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
+
+ if (!percpu_ref_tryget_many(&ctx->refs, nr))
+ return -EAGAIN;
if (nr > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, nr);
statep = &state;
}
+ ctx->ring_fd = ring_fd;
+ ctx->ring_file = ring_file;
+
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
- unsigned int sqe_flags;
req = io_get_req(ctx, statep);
if (unlikely(!req)) {
@@ -3783,11 +4928,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
break;
}
if (!io_get_sqring(ctx, req, &sqe)) {
- __io_free_req(req);
+ __io_req_do_free(req);
break;
}
- if (io_req_needs_user(req) && !*mm) {
+ /* will complete beyond this point, count as submitted */
+ submitted++;
+
+ if (unlikely(req->opcode >= IORING_OP_LAST)) {
+ io_cqring_add_event(req, -EINVAL);
+ io_double_put_req(req);
+ break;
+ }
+
+ if (io_op_defs[req->opcode].needs_mm && !*mm) {
mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
if (!mm_fault) {
use_mm(ctx->sqo_mm);
@@ -3795,27 +4949,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
}
}
- submitted++;
- sqe_flags = sqe->flags;
-
- req->ring_file = ring_file;
- req->ring_fd = ring_fd;
req->has_user = *mm != NULL;
req->in_async = async;
req->needs_fixed_file = async;
- trace_io_uring_submit_sqe(ctx, req->user_data, true, async);
+ trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
+ true, async);
if (!io_submit_sqe(req, sqe, statep, &link))
break;
- /*
- * If previous wasn't linked and we have a linked command,
- * that's the end of the chain. Submit the previous link.
- */
- if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
- io_queue_link_head(link);
- link = NULL;
- }
}
+ if (unlikely(submitted != nr)) {
+ int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
+
+ percpu_ref_put_many(&ctx->refs, nr - ref_used);
+ }
if (link)
io_queue_link_head(link);
if (statep)
@@ -3938,7 +5085,6 @@ static int io_sq_thread(void *data)
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
}
- to_submit = min(to_submit, ctx->sq_entries);
mutex_lock(&ctx->uring_lock);
ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
mutex_unlock(&ctx->uring_lock);
@@ -4069,19 +5215,40 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
#endif
}
+static void io_file_ref_kill(struct percpu_ref *ref)
+{
+ struct fixed_file_data *data;
+
+ data = container_of(ref, struct fixed_file_data, refs);
+ complete(&data->done);
+}
+
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
+ struct fixed_file_data *data = ctx->file_data;
unsigned nr_tables, i;
- if (!ctx->file_table)
+ if (!data)
return -ENXIO;
+ /* protect against inflight atomic switch, which drops the ref */
+ percpu_ref_get(&data->refs);
+ /* wait for existing switches */
+ flush_work(&data->ref_work);
+ percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
+ wait_for_completion(&data->done);
+ percpu_ref_put(&data->refs);
+ /* flush potential new switch */
+ flush_work(&data->ref_work);
+ percpu_ref_exit(&data->refs);
+
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
- kfree(ctx->file_table[i].files);
- kfree(ctx->file_table);
- ctx->file_table = NULL;
+ kfree(data->table[i].files);
+ kfree(data->table);
+ kfree(data);
+ ctx->file_data = NULL;
ctx->nr_user_files = 0;
return 0;
}
@@ -4112,16 +5279,6 @@ static void io_finish_async(struct io_ring_ctx *ctx)
}
#if defined(CONFIG_UNIX)
-static void io_destruct_skb(struct sk_buff *skb)
-{
- struct io_ring_ctx *ctx = skb->sk->sk_user_data;
-
- if (ctx->io_wq)
- io_wq_flush(ctx->io_wq);
-
- unix_destruct_scm(skb);
-}
-
/*
* Ensure the UNIX gc is aware of our file set, so we are certain that
* the io_uring can be safely unregistered on process exit, even if we have
@@ -4169,7 +5326,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
fpl->max = SCM_MAX_FD;
fpl->count = nr_files;
UNIXCB(skb).fp = fpl;
- skb->destructor = io_destruct_skb;
+ skb->destructor = unix_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_queue_head(&sk->sk_receive_queue, skb);
@@ -4231,7 +5388,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
int i;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &ctx->file_table[i];
+ struct fixed_file_table *table = &ctx->file_data->table[i];
unsigned this_files;
this_files = min(nr_files, IORING_MAX_FILES_TABLE);
@@ -4246,36 +5403,159 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
return 0;
for (i = 0; i < nr_tables; i++) {
- struct fixed_file_table *table = &ctx->file_table[i];
+ struct fixed_file_table *table = &ctx->file_data->table[i];
kfree(table->files);
}
return 1;
}
+static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
+{
+#if defined(CONFIG_UNIX)
+ struct sock *sock = ctx->ring_sock->sk;
+ struct sk_buff_head list, *head = &sock->sk_receive_queue;
+ struct sk_buff *skb;
+ int i;
+
+ __skb_queue_head_init(&list);
+
+ /*
+ * Find the skb that holds this file in its SCM_RIGHTS. When found,
+ * remove this entry and rearrange the file array.
+ */
+ skb = skb_dequeue(head);
+ while (skb) {
+ struct scm_fp_list *fp;
+
+ fp = UNIXCB(skb).fp;
+ for (i = 0; i < fp->count; i++) {
+ int left;
+
+ if (fp->fp[i] != file)
+ continue;
+
+ unix_notinflight(fp->user, fp->fp[i]);
+ left = fp->count - 1 - i;
+ if (left) {
+ memmove(&fp->fp[i], &fp->fp[i + 1],
+ left * sizeof(struct file *));
+ }
+ fp->count--;
+ if (!fp->count) {
+ kfree_skb(skb);
+ skb = NULL;
+ } else {
+ __skb_queue_tail(&list, skb);
+ }
+ fput(file);
+ file = NULL;
+ break;
+ }
+
+ if (!file)
+ break;
+
+ __skb_queue_tail(&list, skb);
+
+ skb = skb_dequeue(head);
+ }
+
+ if (skb_peek(&list)) {
+ spin_lock_irq(&head->lock);
+ while ((skb = __skb_dequeue(&list)) != NULL)
+ __skb_queue_tail(head, skb);
+ spin_unlock_irq(&head->lock);
+ }
+#else
+ fput(file);
+#endif
+}
+
+struct io_file_put {
+ struct llist_node llist;
+ struct file *file;
+ struct completion *done;
+};
+
+static void io_ring_file_ref_switch(struct work_struct *work)
+{
+ struct io_file_put *pfile, *tmp;
+ struct fixed_file_data *data;
+ struct llist_node *node;
+
+ data = container_of(work, struct fixed_file_data, ref_work);
+
+ while ((node = llist_del_all(&data->put_llist)) != NULL) {
+ llist_for_each_entry_safe(pfile, tmp, node, llist) {
+ io_ring_file_put(data->ctx, pfile->file);
+ if (pfile->done)
+ complete(pfile->done);
+ else
+ kfree(pfile);
+ }
+ }
+
+ percpu_ref_get(&data->refs);
+ percpu_ref_switch_to_percpu(&data->refs);
+}
+
+static void io_file_data_ref_zero(struct percpu_ref *ref)
+{
+ struct fixed_file_data *data;
+
+ data = container_of(ref, struct fixed_file_data, refs);
+
+ /* we can't safely switch from inside this context, punt to wq */
+ queue_work(system_wq, &data->ref_work);
+}
+
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
__s32 __user *fds = (__s32 __user *) arg;
unsigned nr_tables;
+ struct file *file;
int fd, ret = 0;
unsigned i;
- if (ctx->file_table)
+ if (ctx->file_data)
return -EBUSY;
if (!nr_args)
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
+ ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
+ if (!ctx->file_data)
+ return -ENOMEM;
+ ctx->file_data->ctx = ctx;
+ init_completion(&ctx->file_data->done);
+
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
- ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
+ ctx->file_data->table = kcalloc(nr_tables,
+ sizeof(struct fixed_file_table),
GFP_KERNEL);
- if (!ctx->file_table)
+ if (!ctx->file_data->table) {
+ kfree(ctx->file_data);
+ ctx->file_data = NULL;
return -ENOMEM;
+ }
+
+ if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+ kfree(ctx->file_data->table);
+ kfree(ctx->file_data);
+ ctx->file_data = NULL;
+ return -ENOMEM;
+ }
+ ctx->file_data->put_llist.first = NULL;
+ INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
- kfree(ctx->file_table);
- ctx->file_table = NULL;
+ percpu_ref_exit(&ctx->file_data->refs);
+ kfree(ctx->file_data->table);
+ kfree(ctx->file_data);
+ ctx->file_data = NULL;
return -ENOMEM;
}
@@ -4292,13 +5572,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
continue;
}
- table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+ table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
index = i & IORING_FILE_TABLE_MASK;
- table->files[index] = fget(fd);
+ file = fget(fd);
ret = -EBADF;
- if (!table->files[index])
+ if (!file)
break;
+
/*
* Don't allow io_uring instances to be registered. If UNIX
* isn't enabled, then this causes a reference cycle and this
@@ -4306,26 +5587,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
* handle it just fine, but there's still no point in allowing
* a ring fd as it doesn't support regular read/write anyway.
*/
- if (table->files[index]->f_op == &io_uring_fops) {
- fput(table->files[index]);
+ if (file->f_op == &io_uring_fops) {
+ fput(file);
break;
}
ret = 0;
+ table->files[index] = file;
}
if (ret) {
for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file;
-
file = io_file_from_index(ctx, i);
if (file)
fput(file);
}
for (i = 0; i < nr_tables; i++)
- kfree(ctx->file_table[i].files);
+ kfree(ctx->file_data->table[i].files);
- kfree(ctx->file_table);
- ctx->file_table = NULL;
+ kfree(ctx->file_data->table);
+ kfree(ctx->file_data);
+ ctx->file_data = NULL;
ctx->nr_user_files = 0;
return ret;
}
@@ -4337,69 +5618,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
-{
-#if defined(CONFIG_UNIX)
- struct file *file = io_file_from_index(ctx, index);
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff_head list, *head = &sock->sk_receive_queue;
- struct sk_buff *skb;
- int i;
-
- __skb_queue_head_init(&list);
-
- /*
- * Find the skb that holds this file in its SCM_RIGHTS. When found,
- * remove this entry and rearrange the file array.
- */
- skb = skb_dequeue(head);
- while (skb) {
- struct scm_fp_list *fp;
-
- fp = UNIXCB(skb).fp;
- for (i = 0; i < fp->count; i++) {
- int left;
-
- if (fp->fp[i] != file)
- continue;
-
- unix_notinflight(fp->user, fp->fp[i]);
- left = fp->count - 1 - i;
- if (left) {
- memmove(&fp->fp[i], &fp->fp[i + 1],
- left * sizeof(struct file *));
- }
- fp->count--;
- if (!fp->count) {
- kfree_skb(skb);
- skb = NULL;
- } else {
- __skb_queue_tail(&list, skb);
- }
- fput(file);
- file = NULL;
- break;
- }
-
- if (!file)
- break;
-
- __skb_queue_tail(&list, skb);
-
- skb = skb_dequeue(head);
- }
-
- if (skb_peek(&list)) {
- spin_lock_irq(&head->lock);
- while ((skb = __skb_dequeue(&list)) != NULL)
- __skb_queue_tail(head, skb);
- spin_unlock_irq(&head->lock);
- }
-#else
- fput(io_file_from_index(ctx, index));
-#endif
-}
-
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
int index)
{
@@ -4443,27 +5661,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
-static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
- unsigned nr_args)
+static void io_atomic_switch(struct percpu_ref *ref)
{
- struct io_uring_files_update up;
+ struct fixed_file_data *data;
+
+ data = container_of(ref, struct fixed_file_data, refs);
+ clear_bit(FFD_F_ATOMIC, &data->state);
+}
+
+static bool io_queue_file_removal(struct fixed_file_data *data,
+ struct file *file)
+{
+ struct io_file_put *pfile, pfile_stack;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ /*
+ * If we fail allocating the struct we need for doing async reomval
+ * of this file, just punt to sync and wait for it.
+ */
+ pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
+ if (!pfile) {
+ pfile = &pfile_stack;
+ pfile->done = &done;
+ }
+
+ pfile->file = file;
+ llist_add(&pfile->llist, &data->put_llist);
+
+ if (pfile == &pfile_stack) {
+ if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
+ percpu_ref_put(&data->refs);
+ percpu_ref_switch_to_atomic(&data->refs,
+ io_atomic_switch);
+ }
+ wait_for_completion(&done);
+ flush_work(&data->ref_work);
+ return false;
+ }
+
+ return true;
+}
+
+static int __io_sqe_files_update(struct io_ring_ctx *ctx,
+ struct io_uring_files_update *up,
+ unsigned nr_args)
+{
+ struct fixed_file_data *data = ctx->file_data;
+ bool ref_switch = false;
+ struct file *file;
__s32 __user *fds;
int fd, i, err;
__u32 done;
- if (!ctx->file_table)
- return -ENXIO;
- if (!nr_args)
- return -EINVAL;
- if (copy_from_user(&up, arg, sizeof(up)))
- return -EFAULT;
- if (check_add_overflow(up.offset, nr_args, &done))
+ if (check_add_overflow(up->offset, nr_args, &done))
return -EOVERFLOW;
if (done > ctx->nr_user_files)
return -EINVAL;
done = 0;
- fds = (__s32 __user *) up.fds;
+ fds = u64_to_user_ptr(up->fds);
while (nr_args) {
struct fixed_file_table *table;
unsigned index;
@@ -4473,16 +5729,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
err = -EFAULT;
break;
}
- i = array_index_nospec(up.offset, ctx->nr_user_files);
- table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+ i = array_index_nospec(up->offset, ctx->nr_user_files);
+ table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
index = i & IORING_FILE_TABLE_MASK;
if (table->files[index]) {
- io_sqe_file_unregister(ctx, i);
+ file = io_file_from_index(ctx, index);
table->files[index] = NULL;
+ if (io_queue_file_removal(data, file))
+ ref_switch = true;
}
if (fd != -1) {
- struct file *file;
-
file = fget(fd);
if (!file) {
err = -EBADF;
@@ -4508,11 +5764,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
}
nr_args--;
done++;
- up.offset++;
+ up->offset++;
+ }
+
+ if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
+ percpu_ref_put(&data->refs);
+ percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
}
return done ? done : err;
}
+static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct io_uring_files_update up;
+
+ if (!ctx->file_data)
+ return -ENXIO;
+ if (!nr_args)
+ return -EINVAL;
+ if (copy_from_user(&up, arg, sizeof(up)))
+ return -EFAULT;
+ if (up.resv)
+ return -EINVAL;
+
+ return __io_sqe_files_update(ctx, &up, nr_args);
+}
static void io_put_work(struct io_wq_work *work)
{
@@ -4528,11 +5805,56 @@ static void io_get_work(struct io_wq_work *work)
refcount_inc(&req->refs);
}
+static int io_init_wq_offload(struct io_ring_ctx *ctx,
+ struct io_uring_params *p)
+{
+ struct io_wq_data data;
+ struct fd f;
+ struct io_ring_ctx *ctx_attach;
+ unsigned int concurrency;
+ int ret = 0;
+
+ data.user = ctx->user;
+ data.get_work = io_get_work;
+ data.put_work = io_put_work;
+
+ if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
+ /* Do QD, or 4 * CPUS, whatever is smallest */
+ concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
+
+ ctx->io_wq = io_wq_create(concurrency, &data);
+ if (IS_ERR(ctx->io_wq)) {
+ ret = PTR_ERR(ctx->io_wq);
+ ctx->io_wq = NULL;
+ }
+ return ret;
+ }
+
+ f = fdget(p->wq_fd);
+ if (!f.file)
+ return -EBADF;
+
+ if (f.file->f_op != &io_uring_fops) {
+ ret = -EINVAL;
+ goto out_fput;
+ }
+
+ ctx_attach = f.file->private_data;
+ /* @io_wq is protected by holding the fd */
+ if (!io_wq_get(ctx_attach->io_wq, &data)) {
+ ret = -EINVAL;
+ goto out_fput;
+ }
+
+ ctx->io_wq = ctx_attach->io_wq;
+out_fput:
+ fdput(f);
+ return ret;
+}
+
static int io_sq_offload_start(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
- struct io_wq_data data;
- unsigned concurrency;
int ret;
init_waitqueue_head(&ctx->sqo_wait);
@@ -4576,20 +5898,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
goto err;
}
- data.mm = ctx->sqo_mm;
- data.user = ctx->user;
- data.creds = ctx->creds;
- data.get_work = io_get_work;
- data.put_work = io_put_work;
-
- /* Do QD, or 4 * CPUS, whatever is smallest */
- concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
- ctx->io_wq = io_wq_create(concurrency, &data);
- if (IS_ERR(ctx->io_wq)) {
- ret = PTR_ERR(ctx->io_wq);
- ctx->io_wq = NULL;
+ ret = io_init_wq_offload(ctx, p);
+ if (ret)
goto err;
- }
return 0;
err:
@@ -4694,7 +6005,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
for (j = 0; j < imu->nr_bvecs; j++)
- put_user_page(imu->bvec[j].bv_page);
+ unpin_user_page(imu->bvec[j].bv_page);
if (ctx->account_mem)
io_unaccount_mem(ctx->user, imu->nr_bvecs);
@@ -4815,7 +6126,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
ret = 0;
down_read(&current->mm->mmap_sem);
- pret = get_user_pages(ubuf, nr_pages,
+ pret = pin_user_pages(ubuf, nr_pages,
FOLL_WRITE | FOLL_LONGTERM,
pages, vmas);
if (pret == nr_pages) {
@@ -4839,7 +6150,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
* release any pages we did get
*/
if (pret > 0)
- put_user_pages(pages, pret);
+ unpin_user_pages(pages, pret);
if (ctx->account_mem)
io_unaccount_mem(ctx->user, nr_pages);
kvfree(imu->bvec);
@@ -4967,6 +6278,17 @@ static int io_uring_fasync(int fd, struct file *file, int on)
return fasync_helper(fd, file, on, &ctx->cq_fasync);
}
+static int io_remove_personalities(int id, void *p, void *data)
+{
+ struct io_ring_ctx *ctx = data;
+ const struct cred *cred;
+
+ cred = idr_remove(&ctx->personality_idr, id);
+ if (cred)
+ put_cred(cred);
+ return 0;
+}
+
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
@@ -4983,6 +6305,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
/* if we failed setting up the ctx, we might not have any rings */
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
+ idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
wait_for_completion(&ctx->completions[0]);
io_ring_ctx_free(ctx);
}
@@ -5036,10 +6359,6 @@ static int io_uring_flush(struct file *file, void *data)
struct io_ring_ctx *ctx = file->private_data;
io_uring_cancel_files(ctx, data);
- if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
- io_cqring_overflow_flush(ctx, true);
- io_wq_cancel_all(ctx->io_wq);
- }
return 0;
}
@@ -5153,7 +6472,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
} else if (to_submit) {
struct mm_struct *cur_mm;
- to_submit = min(to_submit, ctx->sq_entries);
mutex_lock(&ctx->uring_lock);
/* already have mm, so io_submit_sqes() won't try to grab it */
cur_mm = ctx->sqo_mm;
@@ -5269,7 +6587,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
#if defined(CONFIG_UNIX)
ctx->ring_sock->file = file;
- ctx->ring_sock->sk->sk_user_data = ctx;
#endif
fd_install(ret, file);
return ret;
@@ -5288,8 +6605,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
bool account_mem;
int ret;
- if (!entries || entries > IORING_MAX_ENTRIES)
+ if (!entries)
return -EINVAL;
+ if (entries > IORING_MAX_ENTRIES) {
+ if (!(p->flags & IORING_SETUP_CLAMP))
+ return -EINVAL;
+ entries = IORING_MAX_ENTRIES;
+ }
/*
* Use twice as many entries for the CQ ring. It's possible for the
@@ -5306,8 +6628,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
* to a power-of-two, if it isn't already. We do NOT impose
* any cq vs sq ring sizing.
*/
- if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
+ if (p->cq_entries < p->sq_entries)
return -EINVAL;
+ if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
+ if (!(p->flags & IORING_SETUP_CLAMP))
+ return -EINVAL;
+ p->cq_entries = IORING_MAX_CQ_ENTRIES;
+ }
p->cq_entries = roundup_pow_of_two(p->cq_entries);
} else {
p->cq_entries = 2 * p->sq_entries;
@@ -5372,7 +6699,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
goto err;
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
- IORING_FEAT_SUBMIT_STABLE;
+ IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
+ IORING_FEAT_CUR_PERSONALITY;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
@@ -5399,7 +6727,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
}
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
- IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
+ IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
+ IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
return -EINVAL;
ret = io_uring_create(entries, &p);
@@ -5418,6 +6747,84 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
+static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+ struct io_uring_probe *p;
+ size_t size;
+ int i, ret;
+
+ size = struct_size(p, ops, nr_args);
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+ p = kzalloc(size, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ ret = -EFAULT;
+ if (copy_from_user(p, arg, size))
+ goto out;
+ ret = -EINVAL;
+ if (memchr_inv(p, 0, size))
+ goto out;
+
+ p->last_op = IORING_OP_LAST - 1;
+ if (nr_args > IORING_OP_LAST)
+ nr_args = IORING_OP_LAST;
+
+ for (i = 0; i < nr_args; i++) {
+ p->ops[i].op = i;
+ if (!io_op_defs[i].not_supported)
+ p->ops[i].flags = IO_URING_OP_SUPPORTED;
+ }
+ p->ops_len = i;
+
+ ret = 0;
+ if (copy_to_user(arg, p, size))
+ ret = -EFAULT;
+out:
+ kfree(p);
+ return ret;
+}
+
+static int io_register_personality(struct io_ring_ctx *ctx)
+{
+ const struct cred *creds = get_current_cred();
+ int id;
+
+ id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
+ USHRT_MAX, GFP_KERNEL);
+ if (id < 0)
+ put_cred(creds);
+ return id;
+}
+
+static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
+{
+ const struct cred *old_creds;
+
+ old_creds = idr_remove(&ctx->personality_idr, id);
+ if (old_creds) {
+ put_cred(old_creds);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static bool io_register_op_must_quiesce(int op)
+{
+ switch (op) {
+ case IORING_UNREGISTER_FILES:
+ case IORING_REGISTER_FILES_UPDATE:
+ case IORING_REGISTER_PROBE:
+ case IORING_REGISTER_PERSONALITY:
+ case IORING_UNREGISTER_PERSONALITY:
+ return false;
+ default:
+ return true;
+ }
+}
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -5433,18 +6840,26 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
if (percpu_ref_is_dying(&ctx->refs))
return -ENXIO;
- percpu_ref_kill(&ctx->refs);
+ if (io_register_op_must_quiesce(opcode)) {
+ percpu_ref_kill(&ctx->refs);
- /*
- * Drop uring mutex before waiting for references to exit. If another
- * thread is currently inside io_uring_enter() it might need to grab
- * the uring_lock to make progress. If we hold it here across the drain
- * wait, then we can deadlock. It's safe to drop the mutex here, since
- * no new references will come in after we've killed the percpu ref.
- */
- mutex_unlock(&ctx->uring_lock);
- wait_for_completion(&ctx->completions[0]);
- mutex_lock(&ctx->uring_lock);
+ /*
+ * Drop uring mutex before waiting for references to exit. If
+ * another thread is currently inside io_uring_enter() it might
+ * need to grab the uring_lock to make progress. If we hold it
+ * here across the drain wait, then we can deadlock. It's safe
+ * to drop the mutex here, since no new references will come in
+ * after we've killed the percpu ref.
+ */
+ mutex_unlock(&ctx->uring_lock);
+ ret = wait_for_completion_interruptible(&ctx->completions[0]);
+ mutex_lock(&ctx->uring_lock);
+ if (ret) {
+ percpu_ref_resurrect(&ctx->refs);
+ ret = -EINTR;
+ goto out;
+ }
+ }
switch (opcode) {
case IORING_REGISTER_BUFFERS:
@@ -5469,10 +6884,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_sqe_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
+ case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_eventfd_register(ctx, arg);
+ if (ret)
+ break;
+ if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
+ ctx->eventfd_async = 1;
+ else
+ ctx->eventfd_async = 0;
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
@@ -5480,14 +6902,35 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_eventfd_unregister(ctx);
break;
+ case IORING_REGISTER_PROBE:
+ ret = -EINVAL;
+ if (!arg || nr_args > 256)
+ break;
+ ret = io_probe(ctx, arg, nr_args);
+ break;
+ case IORING_REGISTER_PERSONALITY:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_register_personality(ctx);
+ break;
+ case IORING_UNREGISTER_PERSONALITY:
+ ret = -EINVAL;
+ if (arg)
+ break;
+ ret = io_unregister_personality(ctx, nr_args);
+ break;
default:
ret = -EINVAL;
break;
}
- /* bring the ctx back to life */
- reinit_completion(&ctx->completions[0]);
- percpu_ref_reinit(&ctx->refs);
+ if (io_register_op_must_quiesce(opcode)) {
+ /* bring the ctx back to life */
+ percpu_ref_reinit(&ctx->refs);
+out:
+ reinit_completion(&ctx->completions[0]);
+ }
return ret;
}
@@ -5520,6 +6963,7 @@ out_fput:
static int __init io_uring_init(void)
{
+ BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
return 0;
};
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 2f5e4e5b97e1..7c9a5df5a597 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -467,7 +467,7 @@ EXPORT_SYMBOL(generic_block_fiemap);
* Only the l_start, l_len and l_whence fields of the 'struct space_resv'
* are used here, rest are ignored.
*/
-int ioctl_preallocate(struct file *filp, int mode, void __user *argp)
+static int ioctl_preallocate(struct file *filp, int mode, void __user *argp)
{
struct inode *inode = file_inode(filp);
struct space_resv sr;
@@ -495,8 +495,8 @@ int ioctl_preallocate(struct file *filp, int mode, void __user *argp)
/* on ia32 l_start is on a 32-bit boundary */
#if defined CONFIG_COMPAT && defined(CONFIG_X86_64)
/* just account for different alignment */
-int compat_ioctl_preallocate(struct file *file, int mode,
- struct space_resv_32 __user *argp)
+static int compat_ioctl_preallocate(struct file *file, int mode,
+ struct space_resv_32 __user *argp)
{
struct inode *inode = file_inode(file);
struct space_resv_32 sr;
@@ -521,11 +521,9 @@ int compat_ioctl_preallocate(struct file *file, int mode,
}
#endif
-static int file_ioctl(struct file *filp, unsigned int cmd,
- unsigned long arg)
+static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p)
{
struct inode *inode = file_inode(filp);
- int __user *p = (int __user *)arg;
switch (cmd) {
case FIBMAP:
@@ -542,7 +540,7 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p);
}
- return vfs_ioctl(filp, cmd, arg);
+ return -ENOIOCTLCMD;
}
static int ioctl_fionbio(struct file *filp, int __user *argp)
@@ -661,53 +659,48 @@ out:
}
/*
- * When you add any new common ioctls to the switches above and below
- * please update compat_sys_ioctl() too.
- *
* do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d.
* It's just a simple helper for sys_ioctl and compat_sys_ioctl.
+ *
+ * When you add any new common ioctls to the switches above and below,
+ * please ensure they have compatible arguments in compat mode.
*/
-int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
- unsigned long arg)
+static int do_vfs_ioctl(struct file *filp, unsigned int fd,
+ unsigned int cmd, unsigned long arg)
{
- int error = 0;
void __user *argp = (void __user *)arg;
struct inode *inode = file_inode(filp);
switch (cmd) {
case FIOCLEX:
set_close_on_exec(fd, 1);
- break;
+ return 0;
case FIONCLEX:
set_close_on_exec(fd, 0);
- break;
+ return 0;
case FIONBIO:
- error = ioctl_fionbio(filp, argp);
- break;
+ return ioctl_fionbio(filp, argp);
case FIOASYNC:
- error = ioctl_fioasync(fd, filp, argp);
- break;
+ return ioctl_fioasync(fd, filp, argp);
case FIOQSIZE:
if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
S_ISLNK(inode->i_mode)) {
loff_t res = inode_get_bytes(inode);
- error = copy_to_user(argp, &res, sizeof(res)) ?
- -EFAULT : 0;
- } else
- error = -ENOTTY;
- break;
+ return copy_to_user(argp, &res, sizeof(res)) ?
+ -EFAULT : 0;
+ }
+
+ return -ENOTTY;
case FIFREEZE:
- error = ioctl_fsfreeze(filp);
- break;
+ return ioctl_fsfreeze(filp);
case FITHAW:
- error = ioctl_fsthaw(filp);
- break;
+ return ioctl_fsthaw(filp);
case FS_IOC_FIEMAP:
return ioctl_fiemap(filp, argp);
@@ -716,6 +709,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
/* anon_bdev filesystems may not have a block size */
if (!inode->i_sb->s_blocksize)
return -EINVAL;
+
return put_user(inode->i_sb->s_blocksize, (int __user *)argp);
case FICLONE:
@@ -729,24 +723,30 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
default:
if (S_ISREG(inode->i_mode))
- error = file_ioctl(filp, cmd, arg);
- else
- error = vfs_ioctl(filp, cmd, arg);
+ return file_ioctl(filp, cmd, argp);
break;
}
- return error;
+
+ return -ENOIOCTLCMD;
}
int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
{
- int error;
struct fd f = fdget(fd);
+ int error;
if (!f.file)
return -EBADF;
+
error = security_file_ioctl(f.file, cmd, arg);
- if (!error)
- error = do_vfs_ioctl(f.file, fd, cmd, arg);
+ if (error)
+ goto out;
+
+ error = do_vfs_ioctl(f.file, fd, cmd, arg);
+ if (error == -ENOIOCTLCMD)
+ error = vfs_ioctl(f.file, cmd, arg);
+
+out:
fdput(f);
return error;
}
@@ -788,4 +788,65 @@ long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
EXPORT_SYMBOL(compat_ptr_ioctl);
+
+COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg)
+{
+ struct fd f = fdget(fd);
+ int error;
+
+ if (!f.file)
+ return -EBADF;
+
+ /* RED-PEN how should LSM module know it's handling 32bit? */
+ error = security_file_ioctl(f.file, cmd, arg);
+ if (error)
+ goto out;
+
+ switch (cmd) {
+ /* FICLONE takes an int argument, so don't use compat_ptr() */
+ case FICLONE:
+ error = ioctl_file_clone(f.file, arg, 0, 0, 0);
+ break;
+
+#if defined(CONFIG_X86_64)
+ /* these get messy on amd64 due to alignment differences */
+ case FS_IOC_RESVSP_32:
+ case FS_IOC_RESVSP64_32:
+ error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg));
+ break;
+ case FS_IOC_UNRESVSP_32:
+ case FS_IOC_UNRESVSP64_32:
+ error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE,
+ compat_ptr(arg));
+ break;
+ case FS_IOC_ZERO_RANGE_32:
+ error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE,
+ compat_ptr(arg));
+ break;
+#endif
+
+ /*
+ * everything else in do_vfs_ioctl() takes either a compatible
+ * pointer argument or no argument -- call it with a modified
+ * argument.
+ */
+ default:
+ error = do_vfs_ioctl(f.file, fd, cmd,
+ (unsigned long)compat_ptr(arg));
+ if (error != -ENOIOCTLCMD)
+ break;
+
+ if (f.file->f_op->compat_ioctl)
+ error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
+ if (error == -ENOIOCTLCMD)
+ error = -ENOTTY;
+ break;
+ }
+
+ out:
+ fdput(f);
+
+ return error;
+}
#endif
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 8fff6677a5da..96bf33986d03 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -164,7 +164,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
"journal space in %s\n", __func__,
journal->j_devname);
WARN_ON(1);
- jbd2_journal_abort(journal, 0);
+ jbd2_journal_abort(journal, -EIO);
}
write_lock(&journal->j_state_lock);
} else {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7f0b362b3842..2494095e0340 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -782,7 +782,7 @@ start_journal_io:
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
- __jbd2_journal_abort_hard(journal);
+ jbd2_journal_abort(journal, err);
}
blk_finish_plug(&plug);
@@ -875,7 +875,7 @@ start_journal_io:
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
- __jbd2_journal_abort_hard(journal);
+ jbd2_journal_abort(journal, err);
}
if (cbh)
err = journal_wait_on_commit_record(journal, cbh);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5e408ee24a1a..60bf8ff78913 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -96,7 +96,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
EXPORT_SYMBOL(jbd2_inode_cache);
-static void __journal_abort_soft (journal_t *journal, int errno);
static int jbd2_journal_create_slab(size_t slab_size);
#ifdef CONFIG_JBD2_DEBUG
@@ -805,7 +804,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
"at offset %lu on %s\n",
__func__, blocknr, journal->j_devname);
err = -EIO;
- __journal_abort_soft(journal, err);
+ jbd2_journal_abort(journal, err);
}
} else {
*retp = blocknr; /* +journal->j_blk_offset */
@@ -982,6 +981,7 @@ static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ (*pos)++;
return NULL;
}
@@ -1710,6 +1710,11 @@ int jbd2_journal_load(journal_t *journal)
journal->j_devname);
return -EFSCORRUPTED;
}
+ /*
+ * clear JBD2_ABORT flag initialized in journal_init_common
+ * here to update log tail information with the newest seq.
+ */
+ journal->j_flags &= ~JBD2_ABORT;
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
@@ -1717,7 +1722,6 @@ int jbd2_journal_load(journal_t *journal)
if (journal_reset(journal))
goto recovery_error;
- journal->j_flags &= ~JBD2_ABORT;
journal->j_flags |= JBD2_LOADED;
return 0;
@@ -2098,67 +2102,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
return err;
}
-/*
- * Journal abort has very specific semantics, which we describe
- * for journal abort.
- *
- * Two internal functions, which provide abort to the jbd layer
- * itself are here.
- */
-
-/*
- * Quick version for internal journal use (doesn't lock the journal).
- * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
- * and don't attempt to make any other journal updates.
- */
-void __jbd2_journal_abort_hard(journal_t *journal)
-{
- transaction_t *transaction;
-
- if (journal->j_flags & JBD2_ABORT)
- return;
-
- printk(KERN_ERR "Aborting journal on device %s.\n",
- journal->j_devname);
-
- write_lock(&journal->j_state_lock);
- journal->j_flags |= JBD2_ABORT;
- transaction = journal->j_running_transaction;
- if (transaction)
- __jbd2_log_start_commit(journal, transaction->t_tid);
- write_unlock(&journal->j_state_lock);
-}
-
-/* Soft abort: record the abort error status in the journal superblock,
- * but don't do any other IO. */
-static void __journal_abort_soft (journal_t *journal, int errno)
-{
- int old_errno;
-
- write_lock(&journal->j_state_lock);
- old_errno = journal->j_errno;
- if (!journal->j_errno || errno == -ESHUTDOWN)
- journal->j_errno = errno;
-
- if (journal->j_flags & JBD2_ABORT) {
- write_unlock(&journal->j_state_lock);
- if (!old_errno && old_errno != -ESHUTDOWN &&
- errno == -ESHUTDOWN)
- jbd2_journal_update_sb_errno(journal);
- return;
- }
- write_unlock(&journal->j_state_lock);
-
- __jbd2_journal_abort_hard(journal);
-
- if (errno) {
- jbd2_journal_update_sb_errno(journal);
- write_lock(&journal->j_state_lock);
- journal->j_flags |= JBD2_REC_ERR;
- write_unlock(&journal->j_state_lock);
- }
-}
-
/**
* void jbd2_journal_abort () - Shutdown the journal immediately.
* @journal: the journal to shutdown.
@@ -2198,16 +2141,51 @@ static void __journal_abort_soft (journal_t *journal, int errno)
* failure to disk. ext3_error, for example, now uses this
* functionality.
*
- * Errors which originate from within the journaling layer will NOT
- * supply an errno; a null errno implies that absolutely no further
- * writes are done to the journal (unless there are any already in
- * progress).
- *
*/
void jbd2_journal_abort(journal_t *journal, int errno)
{
- __journal_abort_soft(journal, errno);
+ transaction_t *transaction;
+
+ /*
+ * ESHUTDOWN always takes precedence because a file system check
+ * caused by any other journal abort error is not required after
+ * a shutdown triggered.
+ */
+ write_lock(&journal->j_state_lock);
+ if (journal->j_flags & JBD2_ABORT) {
+ int old_errno = journal->j_errno;
+
+ write_unlock(&journal->j_state_lock);
+ if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) {
+ journal->j_errno = errno;
+ jbd2_journal_update_sb_errno(journal);
+ }
+ return;
+ }
+
+ /*
+ * Mark the abort as occurred and start current running transaction
+ * to release all journaled buffer.
+ */
+ pr_err("Aborting journal on device %s.\n", journal->j_devname);
+
+ journal->j_flags |= JBD2_ABORT;
+ journal->j_errno = errno;
+ transaction = journal->j_running_transaction;
+ if (transaction)
+ __jbd2_log_start_commit(journal, transaction->t_tid);
+ write_unlock(&journal->j_state_lock);
+
+ /*
+ * Record errno to the journal super block, so that fsck and jbd2
+ * layer could realise that a filesystem check is needed.
+ */
+ jbd2_journal_update_sb_errno(journal);
+
+ write_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_REC_ERR;
+ write_unlock(&journal->j_state_lock);
}
/**
@@ -2556,7 +2534,6 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
{
struct journal_head *jh = bh2jh(bh);
- J_ASSERT_JH(jh, jh->b_jcount >= 0);
J_ASSERT_JH(jh, jh->b_transaction == NULL);
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 27b9f9dee434..e77a5a0b4e46 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -525,7 +525,7 @@ EXPORT_SYMBOL(jbd2__journal_start);
* modified buffers in the log. We block until the log can guarantee
* that much space. Additionally, if rsv_blocks > 0, we also create another
* handle with rsv_blocks reserved blocks in the journal. This handle is
- * is stored in h_rsv_handle. It is not attached to any particular transaction
+ * stored in h_rsv_handle. It is not attached to any particular transaction
* and thus doesn't block transaction commit. If the caller uses this reserved
* handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
* on the parent handle will dispose the reserved one. Reserved handle has to
@@ -1595,7 +1595,7 @@ out:
* Allow this call even if the handle has aborted --- it may be part of
* the caller's cleanup after an abort.
*/
-int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
+int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 9d96e6871e1a..9aec80b9d7c6 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1266,7 +1266,7 @@ void kernfs_activate(struct kernfs_node *kn)
pos = NULL;
while ((pos = kernfs_next_descendant_post(pos, kn))) {
- if (!pos || (pos->flags & KERNFS_ACTIVATED))
+ if (pos->flags & KERNFS_ACTIVATED)
continue;
WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
diff --git a/fs/mpage.c b/fs/mpage.c
index a63620cdb73a..ccba3c4c4479 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -62,7 +62,7 @@ static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
{
bio->bi_end_io = mpage_end_io;
bio_set_op_attrs(bio, op, op_flags);
- guard_bio_eod(op, bio);
+ guard_bio_eod(bio);
submit_bio(bio);
return NULL;
}
diff --git a/fs/namei.c b/fs/namei.c
index d6c91d1e88cb..4167109297e0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -491,7 +491,7 @@ struct nameidata {
struct path root;
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags;
- unsigned seq, m_seq;
+ unsigned seq, m_seq, r_seq;
int last_type;
unsigned depth;
int total_link_count;
@@ -641,6 +641,14 @@ static bool legitimize_links(struct nameidata *nd)
static bool legitimize_root(struct nameidata *nd)
{
+ /*
+ * For scoped-lookups (where nd->root has been zeroed), we need to
+ * restart the whole lookup from scratch -- because set_root() is wrong
+ * for these lookups (nd->dfd is the root, not the filesystem root).
+ */
+ if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED))
+ return false;
+ /* Nothing to do if nd->root is zero or is managed by the VFS user. */
if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT))
return true;
nd->flags |= LOOKUP_ROOT_GRABBED;
@@ -776,12 +784,37 @@ static int complete_walk(struct nameidata *nd)
int status;
if (nd->flags & LOOKUP_RCU) {
- if (!(nd->flags & LOOKUP_ROOT))
+ /*
+ * We don't want to zero nd->root for scoped-lookups or
+ * externally-managed nd->root.
+ */
+ if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
nd->root.mnt = NULL;
if (unlikely(unlazy_walk(nd)))
return -ECHILD;
}
+ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
+ /*
+ * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
+ * ever step outside the root during lookup" and should already
+ * be guaranteed by the rest of namei, we want to avoid a namei
+ * BUG resulting in userspace being given a path that was not
+ * scoped within the root at some point during the lookup.
+ *
+ * So, do a final sanity-check to make sure that in the
+ * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
+ * we won't silently return an fd completely outside of the
+ * requested root to userspace.
+ *
+ * Userspace could move the path outside the root after this
+ * check, but as discussed elsewhere this is not a concern (the
+ * resolved file was inside the root at some point).
+ */
+ if (!path_is_under(&nd->path, &nd->root))
+ return -EXDEV;
+ }
+
if (likely(!(nd->flags & LOOKUP_JUMPED)))
return 0;
@@ -798,10 +831,18 @@ static int complete_walk(struct nameidata *nd)
return status;
}
-static void set_root(struct nameidata *nd)
+static int set_root(struct nameidata *nd)
{
struct fs_struct *fs = current->fs;
+ /*
+ * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
+ * still have to ensure it doesn't happen because it will cause a breakout
+ * from the dirfd.
+ */
+ if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
+ return -ENOTRECOVERABLE;
+
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
@@ -814,6 +855,7 @@ static void set_root(struct nameidata *nd)
get_fs_root(fs, &nd->root);
nd->flags |= LOOKUP_ROOT_GRABBED;
}
+ return 0;
}
static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -837,6 +879,18 @@ static inline void path_to_nameidata(const struct path *path,
static int nd_jump_root(struct nameidata *nd)
{
+ if (unlikely(nd->flags & LOOKUP_BENEATH))
+ return -EXDEV;
+ if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
+ /* Absolute path arguments to path_init() are allowed. */
+ if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
+ return -EXDEV;
+ }
+ if (!nd->root.mnt) {
+ int error = set_root(nd);
+ if (error)
+ return error;
+ }
if (nd->flags & LOOKUP_RCU) {
struct dentry *d;
nd->path = nd->root;
@@ -859,14 +913,32 @@ static int nd_jump_root(struct nameidata *nd)
* Helper to directly jump to a known parsed path from ->get_link,
* caller must have taken a reference to path beforehand.
*/
-void nd_jump_link(struct path *path)
+int nd_jump_link(struct path *path)
{
+ int error = -ELOOP;
struct nameidata *nd = current->nameidata;
- path_put(&nd->path);
+ if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
+ goto err;
+
+ error = -EXDEV;
+ if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
+ if (nd->path.mnt != path->mnt)
+ goto err;
+ }
+ /* Not currently safe for scoped-lookups. */
+ if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
+ goto err;
+
+ path_put(&nd->path);
nd->path = *path;
nd->inode = nd->path.dentry->d_inode;
nd->flags |= LOOKUP_JUMPED;
+ return 0;
+
+err:
+ path_put(path);
+ return error;
}
static inline void put_link(struct nameidata *nd)
@@ -1001,7 +1073,8 @@ static int may_linkat(struct path *link)
* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
* should be allowed, or not, on files that already
* exist.
- * @dir: the sticky parent directory
+ * @dir_mode: mode bits of directory
+ * @dir_uid: owner of directory
* @inode: the inode of the file to open
*
* Block an O_CREAT open of a FIFO (or a regular file) when:
@@ -1017,18 +1090,18 @@ static int may_linkat(struct path *link)
*
* Returns 0 if the open is allowed, -ve on error.
*/
-static int may_create_in_sticky(struct dentry * const dir,
+static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
struct inode * const inode)
{
if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
(!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
- likely(!(dir->d_inode->i_mode & S_ISVTX)) ||
- uid_eq(inode->i_uid, dir->d_inode->i_uid) ||
+ likely(!(dir_mode & S_ISVTX)) ||
+ uid_eq(inode->i_uid, dir_uid) ||
uid_eq(current_fsuid(), inode->i_uid))
return 0;
- if (likely(dir->d_inode->i_mode & 0002) ||
- (dir->d_inode->i_mode & 0020 &&
+ if (likely(dir_mode & 0002) ||
+ (dir_mode & 0020 &&
((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
(sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
const char *operation = S_ISFIFO(inode->i_mode) ?
@@ -1049,6 +1122,9 @@ const char *get_link(struct nameidata *nd)
int error;
const char *res;
+ if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS))
+ return ERR_PTR(-ELOOP);
+
if (!(nd->flags & LOOKUP_RCU)) {
touch_atime(&last->link);
cond_resched();
@@ -1083,10 +1159,9 @@ const char *get_link(struct nameidata *nd)
return res;
}
if (*res == '/') {
- if (!nd->root.mnt)
- set_root(nd);
- if (unlikely(nd_jump_root(nd)))
- return ERR_PTR(-ECHILD);
+ error = nd_jump_root(nd);
+ if (unlikely(error))
+ return ERR_PTR(error);
while (unlikely(*++res == '/'))
;
}
@@ -1232,6 +1307,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
ret = path->dentry->d_op->d_manage(path, false);
+ flags = smp_load_acquire(&path->dentry->d_flags);
if (ret < 0)
break;
}
@@ -1267,10 +1343,14 @@ static int follow_managed(struct path *path, struct nameidata *nd)
break;
}
- if (need_mntput && path->mnt == mnt)
- mntput(path->mnt);
- if (need_mntput)
- nd->flags |= LOOKUP_JUMPED;
+ if (need_mntput) {
+ if (path->mnt == mnt)
+ mntput(path->mnt);
+ if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+ ret = -EXDEV;
+ else
+ nd->flags |= LOOKUP_JUMPED;
+ }
if (ret == -EISDIR || !ret)
ret = 1;
if (ret > 0 && unlikely(d_flags_negative(flags)))
@@ -1331,6 +1411,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
mounted = __lookup_mnt(path->mnt, path->dentry);
if (!mounted)
break;
+ if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+ return false;
path->mnt = &mounted->mnt;
path->dentry = mounted->mnt.mnt_root;
nd->flags |= LOOKUP_JUMPED;
@@ -1351,8 +1433,11 @@ static int follow_dotdot_rcu(struct nameidata *nd)
struct inode *inode = nd->inode;
while (1) {
- if (path_equal(&nd->path, &nd->root))
+ if (path_equal(&nd->path, &nd->root)) {
+ if (unlikely(nd->flags & LOOKUP_BENEATH))
+ return -ECHILD;
break;
+ }
if (nd->path.dentry != nd->path.mnt->mnt_root) {
struct dentry *old = nd->path.dentry;
struct dentry *parent = old->d_parent;
@@ -1365,7 +1450,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
nd->path.dentry = parent;
nd->seq = seq;
if (unlikely(!path_connected(&nd->path)))
- return -ENOENT;
+ return -ECHILD;
break;
} else {
struct mount *mnt = real_mount(nd->path.mnt);
@@ -1377,6 +1462,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
return -ECHILD;
if (&mparent->mnt == nd->path.mnt)
break;
+ if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+ return -ECHILD;
/* we know that mountpoint was pinned */
nd->path.dentry = mountpoint;
nd->path.mnt = &mparent->mnt;
@@ -1391,6 +1478,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
return -ECHILD;
if (!mounted)
break;
+ if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+ return -ECHILD;
nd->path.mnt = &mounted->mnt;
nd->path.dentry = mounted->mnt.mnt_root;
inode = nd->path.dentry->d_inode;
@@ -1478,9 +1567,12 @@ static int path_parent_directory(struct path *path)
static int follow_dotdot(struct nameidata *nd)
{
- while(1) {
- if (path_equal(&nd->path, &nd->root))
+ while (1) {
+ if (path_equal(&nd->path, &nd->root)) {
+ if (unlikely(nd->flags & LOOKUP_BENEATH))
+ return -EXDEV;
break;
+ }
if (nd->path.dentry != nd->path.mnt->mnt_root) {
int ret = path_parent_directory(&nd->path);
if (ret)
@@ -1489,6 +1581,8 @@ static int follow_dotdot(struct nameidata *nd)
}
if (!follow_up(&nd->path))
break;
+ if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+ return -EXDEV;
}
follow_mount(&nd->path);
nd->inode = nd->path.dentry->d_inode;
@@ -1649,17 +1743,15 @@ again:
if (IS_ERR(dentry))
return dentry;
if (unlikely(!d_in_lookup(dentry))) {
- if (!(flags & LOOKUP_NO_REVAL)) {
- int error = d_revalidate(dentry, flags);
- if (unlikely(error <= 0)) {
- if (!error) {
- d_invalidate(dentry);
- dput(dentry);
- goto again;
- }
+ int error = d_revalidate(dentry, flags);
+ if (unlikely(error <= 0)) {
+ if (!error) {
+ d_invalidate(dentry);
dput(dentry);
- dentry = ERR_PTR(error);
+ goto again;
}
+ dput(dentry);
+ dentry = ERR_PTR(error);
}
} else {
old = inode->i_op->lookup(inode, dentry, flags);
@@ -1699,12 +1791,33 @@ static inline int may_lookup(struct nameidata *nd)
static inline int handle_dots(struct nameidata *nd, int type)
{
if (type == LAST_DOTDOT) {
- if (!nd->root.mnt)
- set_root(nd);
- if (nd->flags & LOOKUP_RCU) {
- return follow_dotdot_rcu(nd);
- } else
- return follow_dotdot(nd);
+ int error = 0;
+
+ if (!nd->root.mnt) {
+ error = set_root(nd);
+ if (error)
+ return error;
+ }
+ if (nd->flags & LOOKUP_RCU)
+ error = follow_dotdot_rcu(nd);
+ else
+ error = follow_dotdot(nd);
+ if (error)
+ return error;
+
+ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
+ /*
+ * If there was a racing rename or mount along our
+ * path, then we can't be sure that ".." hasn't jumped
+ * above nd->root (and so userspace should retry or use
+ * some fallback).
+ */
+ smp_rmb();
+ if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
+ return -EAGAIN;
+ if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
+ return -EAGAIN;
+ }
}
return 0;
}
@@ -2158,6 +2271,7 @@ OK:
/* must be paired with terminate_walk() */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
+ int error;
const char *s = nd->name->name;
if (!*s)
@@ -2168,6 +2282,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
nd->depth = 0;
+
+ nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
+ nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
+ smp_rmb();
+
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
@@ -2176,9 +2295,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
- nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+ nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
nd->root_seq = nd->seq;
- nd->m_seq = read_seqbegin(&mount_lock);
} else {
path_get(&nd->path);
}
@@ -2189,13 +2307,16 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->path.mnt = NULL;
nd->path.dentry = NULL;
- nd->m_seq = read_seqbegin(&mount_lock);
- if (*s == '/') {
- set_root(nd);
- if (likely(!nd_jump_root(nd)))
- return s;
- return ERR_PTR(-ECHILD);
- } else if (nd->dfd == AT_FDCWD) {
+ /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
+ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
+ error = nd_jump_root(nd);
+ if (unlikely(error))
+ return ERR_PTR(error);
+ return s;
+ }
+
+ /* Relative pathname -- get the starting-point it is relative to. */
+ if (nd->dfd == AT_FDCWD) {
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
unsigned seq;
@@ -2210,7 +2331,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
get_fs_pwd(current->fs, &nd->path);
nd->inode = nd->path.dentry->d_inode;
}
- return s;
} else {
/* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(nd->dfd);
@@ -2235,8 +2355,19 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->inode = nd->path.dentry->d_inode;
}
fdput(f);
- return s;
}
+
+ /* For scoped-lookups we need to set the root to the dirfd as well. */
+ if (flags & LOOKUP_IS_SCOPED) {
+ nd->root = nd->path;
+ if (flags & LOOKUP_RCU) {
+ nd->root_seq = nd->seq;
+ } else {
+ path_get(&nd->root);
+ nd->flags |= LOOKUP_ROOT_GRABBED;
+ }
+ }
+ return s;
}
static const char *trailing_symlink(struct nameidata *nd)
@@ -2618,72 +2749,6 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
EXPORT_SYMBOL(user_path_at_empty);
/**
- * mountpoint_last - look up last component for umount
- * @nd: pathwalk nameidata - currently pointing at parent directory of "last"
- *
- * This is a special lookup_last function just for umount. In this case, we
- * need to resolve the path without doing any revalidation.
- *
- * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
- * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
- * in almost all cases, this lookup will be served out of the dcache. The only
- * cases where it won't are if nd->last refers to a symlink or the path is
- * bogus and it doesn't exist.
- *
- * Returns:
- * -error: if there was an error during lookup. This includes -ENOENT if the
- * lookup found a negative dentry.
- *
- * 0: if we successfully resolved nd->last and found it to not to be a
- * symlink that needs to be followed.
- *
- * 1: if we successfully resolved nd->last and found it to be a symlink
- * that needs to be followed.
- */
-static int
-mountpoint_last(struct nameidata *nd)
-{
- int error = 0;
- struct dentry *dir = nd->path.dentry;
- struct path path;
-
- /* If we're in rcuwalk, drop out of it to handle last component */
- if (nd->flags & LOOKUP_RCU) {
- if (unlazy_walk(nd))
- return -ECHILD;
- }
-
- nd->flags &= ~LOOKUP_PARENT;
-
- if (unlikely(nd->last_type != LAST_NORM)) {
- error = handle_dots(nd, nd->last_type);
- if (error)
- return error;
- path.dentry = dget(nd->path.dentry);
- } else {
- path.dentry = d_lookup(dir, &nd->last);
- if (!path.dentry) {
- /*
- * No cached dentry. Mounted dentries are pinned in the
- * cache, so that means that this dentry is probably
- * a symlink or the path doesn't actually point
- * to a mounted dentry.
- */
- path.dentry = lookup_slow(&nd->last, dir,
- nd->flags | LOOKUP_NO_REVAL);
- if (IS_ERR(path.dentry))
- return PTR_ERR(path.dentry);
- }
- }
- if (d_flags_negative(smp_load_acquire(&path.dentry->d_flags))) {
- dput(path.dentry);
- return -ENOENT;
- }
- path.mnt = nd->path.mnt;
- return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0);
-}
-
-/**
* path_mountpoint - look up a path to be umounted
* @nd: lookup context
* @flags: lookup flags
@@ -2699,14 +2764,17 @@ path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
int err;
while (!(err = link_path_walk(s, nd)) &&
- (err = mountpoint_last(nd)) > 0) {
+ (err = lookup_last(nd)) > 0) {
s = trailing_symlink(nd);
}
+ if (!err && (nd->flags & LOOKUP_RCU))
+ err = unlazy_walk(nd);
+ if (!err)
+ err = handle_lookup_down(nd);
if (!err) {
*path = nd->path;
nd->path.mnt = NULL;
nd->path.dentry = NULL;
- follow_mount(path);
}
terminate_walk(nd);
return err;
@@ -3265,6 +3333,8 @@ static int do_last(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct dentry *dir = nd->path.dentry;
+ kuid_t dir_uid = dir->d_inode->i_uid;
+ umode_t dir_mode = dir->d_inode->i_mode;
int open_flag = op->open_flag;
bool will_truncate = (open_flag & O_TRUNC) != 0;
bool got_write = false;
@@ -3395,7 +3465,7 @@ finish_open:
error = -EISDIR;
if (d_is_dir(nd->path.dentry))
goto out;
- error = may_create_in_sticky(dir,
+ error = may_create_in_sticky(dir_mode, dir_uid,
d_backing_inode(nd->path.dentry));
if (unlikely(error))
goto out;
diff --git a/fs/namespace.c b/fs/namespace.c
index be601d3a8008..5e1bf611a9eb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1728,7 +1728,7 @@ static bool is_mnt_ns_file(struct dentry *dentry)
dentry->d_fsdata == &mntns_operations;
}
-struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
+static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
return container_of(ns, struct mnt_namespace, ns);
}
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 15f271401dcc..573b1da9342c 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -84,8 +84,10 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
return FSCACHE_CHECKAUX_OBSOLETE;
memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime);
- auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime);
+ auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
+ auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
+ auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
+ auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 3800ab6f08fa..7def925d3af5 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -238,8 +238,10 @@ void nfs_fscache_init_inode(struct inode *inode)
return;
memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime);
- auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime);
+ auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
+ auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
+ auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
+ auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
@@ -263,8 +265,10 @@ void nfs_fscache_clear_inode(struct inode *inode)
dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime);
- auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime);
+ auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
+ auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
+ auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
+ auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
fscache_relinquish_cookie(cookie, &auxdata, false);
nfsi->fscache = NULL;
}
@@ -305,8 +309,10 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp)
return;
memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime);
- auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime);
+ auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
+ auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
+ auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
+ auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
if (inode_is_open_for_write(inode)) {
dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index ad041cfbf9ec..6754c8607230 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -62,9 +62,11 @@ struct nfs_fscache_key {
* cache object.
*/
struct nfs_fscache_inode_auxdata {
- struct timespec mtime;
- struct timespec ctime;
- u64 change_attr;
+ s64 mtime_sec;
+ s64 mtime_nsec;
+ s64 ctime_sec;
+ s64 ctime_nsec;
+ u64 change_attr;
};
/*
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 936c57779ff4..728d88b6a698 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4097,7 +4097,7 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
status = NFS_ATTR_FATTR_ATIME;
bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
}
- dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
+ dprintk("%s: atime=%lld\n", __func__, time->tv_sec);
return status;
}
@@ -4115,7 +4115,7 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
status = NFS_ATTR_FATTR_CTIME;
bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
}
- dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
+ dprintk("%s: ctime=%lld\n", __func__, time->tv_sec);
return status;
}
@@ -4132,8 +4132,8 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
status = decode_attr_time(xdr, time);
bitmap[1] &= ~FATTR4_WORD1_TIME_DELTA;
}
- dprintk("%s: time_delta=%ld %ld\n", __func__, (long)time->tv_sec,
- (long)time->tv_nsec);
+ dprintk("%s: time_delta=%lld %ld\n", __func__, time->tv_sec,
+ time->tv_nsec);
return status;
}
@@ -4197,7 +4197,7 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
status = NFS_ATTR_FATTR_MTIME;
bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
}
- dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
+ dprintk("%s: mtime=%lld\n", __func__, time->tv_sec);
return status;
}
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index f64a33d2a1d1..2a82dcce5fc1 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -206,7 +206,6 @@ TRACE_DEFINE_ENUM(LOOKUP_AUTOMOUNT);
TRACE_DEFINE_ENUM(LOOKUP_PARENT);
TRACE_DEFINE_ENUM(LOOKUP_REVAL);
TRACE_DEFINE_ENUM(LOOKUP_RCU);
-TRACE_DEFINE_ENUM(LOOKUP_NO_REVAL);
TRACE_DEFINE_ENUM(LOOKUP_OPEN);
TRACE_DEFINE_ENUM(LOOKUP_CREATE);
TRACE_DEFINE_ENUM(LOOKUP_EXCL);
@@ -224,7 +223,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN);
{ LOOKUP_PARENT, "PARENT" }, \
{ LOOKUP_REVAL, "REVAL" }, \
{ LOOKUP_RCU, "RCU" }, \
- { LOOKUP_NO_REVAL, "NO_REVAL" }, \
{ LOOKUP_OPEN, "OPEN" }, \
{ LOOKUP_CREATE, "CREATE" }, \
{ LOOKUP_EXCL, "EXCL" }, \
diff --git a/fs/nsfs.c b/fs/nsfs.c
index a0431642c6b5..b13bfd406820 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -3,6 +3,7 @@
#include <linux/pseudo_fs.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/ktime.h>
@@ -11,6 +12,8 @@
#include <linux/nsfs.h>
#include <linux/uaccess.h>
+#include "internal.h"
+
static struct vfsmount *nsfs_mnt;
static long ns_ioctl(struct file *filp, unsigned int ioctl,
@@ -52,7 +55,7 @@ static void nsfs_evict(struct inode *inode)
ns->ops->put(ns);
}
-static void *__ns_get_path(struct path *path, struct ns_common *ns)
+static int __ns_get_path(struct path *path, struct ns_common *ns)
{
struct vfsmount *mnt = nsfs_mnt;
struct dentry *dentry;
@@ -71,13 +74,13 @@ static void *__ns_get_path(struct path *path, struct ns_common *ns)
got_it:
path->mnt = mntget(mnt);
path->dentry = dentry;
- return NULL;
+ return 0;
slow:
rcu_read_unlock();
inode = new_inode_pseudo(mnt->mnt_sb);
if (!inode) {
ns->ops->put(ns);
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
inode->i_ino = ns->inum;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
@@ -89,7 +92,7 @@ slow:
dentry = d_alloc_anon(mnt->mnt_sb);
if (!dentry) {
iput(inode);
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
d_instantiate(dentry, inode);
dentry->d_fsdata = (void *)ns->ops;
@@ -98,23 +101,22 @@ slow:
d_delete(dentry); /* make sure ->d_prune() does nothing */
dput(dentry);
cpu_relax();
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
}
goto got_it;
}
-void *ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
+int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
void *private_data)
{
- void *ret;
+ int ret;
do {
struct ns_common *ns = ns_get_cb(private_data);
if (!ns)
- return ERR_PTR(-ENOENT);
-
+ return -ENOENT;
ret = __ns_get_path(path, ns);
- } while (ret == ERR_PTR(-EAGAIN));
+ } while (ret == -EAGAIN);
return ret;
}
@@ -131,7 +133,7 @@ static struct ns_common *ns_get_path_task(void *private_data)
return args->ns_ops->get(args->task);
}
-void *ns_get_path(struct path *path, struct task_struct *task,
+int ns_get_path(struct path *path, struct task_struct *task,
const struct proc_ns_operations *ns_ops)
{
struct ns_get_path_task_args args = {
@@ -147,7 +149,7 @@ int open_related_ns(struct ns_common *ns,
{
struct path path = {};
struct file *f;
- void *err;
+ int err;
int fd;
fd = get_unused_fd_flags(O_CLOEXEC);
@@ -164,11 +166,11 @@ int open_related_ns(struct ns_common *ns,
}
err = __ns_get_path(&path, relative);
- } while (err == ERR_PTR(-EAGAIN));
+ } while (err == -EAGAIN);
- if (IS_ERR(err)) {
+ if (err) {
put_unused_fd(fd);
- return PTR_ERR(err);
+ return err;
}
f = dentry_open(&path, O_RDONLY, current_cred());
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 5c424a099280..1ef24574f481 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -73,7 +73,7 @@ static void o2quo_fence_self(void)
"system by restarting ***\n");
emergency_restart();
break;
- };
+ }
}
/* Indicate that a timeout occurred on a heartbeat region write. The
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 38b224372776..5e700b45d32d 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-ccflags-y := -I $(srctree)/$(src)/..
-
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 4de89af96abf..6abaded3ff6b 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -23,15 +23,15 @@
#include <linux/spinlock.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index aaf24548b02a..0463dce65bb2 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -688,10 +688,6 @@ struct dlm_begin_reco
__be32 pad2;
};
-
-#define BITS_PER_BYTE 8
-#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE)
-
struct dlm_query_join_request
{
u8 node_idx;
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 965f45dbe17b..6051edc33aef 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -23,9 +23,9 @@
#include <linux/spinlock.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -33,7 +33,7 @@
#include "dlmconvert.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
/* NOTE: __dlmconvert_master is the only function in here that
* needs a spinlock held on entry (res->spinlock) and it is the
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 4d0b452012b2..c5c6efba7b5e 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -17,9 +17,9 @@
#include <linux/debugfs.h>
#include <linux/export.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -27,7 +27,7 @@
#include "dlmdebug.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static int stringify_lockname(const char *lockname, int locklen, char *buf,
int len);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index ee6f459f9770..357cfc702ce3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -20,9 +20,9 @@
#include <linux/debugfs.h>
#include <linux/sched/signal.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -30,7 +30,7 @@
#include "dlmdebug.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
/*
* ocfs2 node maps are array of long int, which limits to send them freely
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index baff087f3863..83f0760e4fba 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -25,9 +25,9 @@
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -35,7 +35,7 @@
#include "dlmconvert.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static struct kmem_cache *dlm_lock_cache;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 74b768ca1cd8..900f7e466d11 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -25,9 +25,9 @@
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -35,7 +35,7 @@
#include "dlmdebug.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
@@ -2554,8 +2554,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
if (!dlm_grab(dlm))
return -EINVAL;
- BUG_ON(target == O2NM_MAX_NODES);
-
name = res->lockname.name;
namelen = res->lockname.len;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 064ce5bbc3f6..4b566e88582f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -26,16 +26,16 @@
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
@@ -1668,7 +1668,7 @@ static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
u8 nodenum, u8 *real_master)
{
- int ret = -EINVAL;
+ int ret;
struct dlm_master_requery req;
int status = DLM_LOCK_RES_OWNER_UNKNOWN;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 61c51c268460..fd40c17cd022 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -25,16 +25,16 @@
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static int dlm_thread(void *data);
static void dlm_flush_asts(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 3883633e82eb..dcb17ca8ae74 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -23,15 +23,15 @@
#include <linux/spinlock.h>
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
#define DLM_UNLOCK_FREE_LOCK 0x00000001
#define DLM_UNLOCK_CALL_AST 0x00000002
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index a9874e441bd4..c7895f65be0e 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,6 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-ccflags-y := -I $(srctree)/$(src)/..
-
obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 4f1668c81e1f..8e4f1ace467c 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -33,11 +33,11 @@
#include <linux/uaccess.h>
-#include "stackglue.h"
+#include "../stackglue.h"
#include "userdlm.h"
#define MLOG_MASK_PREFIX ML_DLMFS
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static const struct super_operations dlmfs_ops;
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 525b14ddfba5..3df5be25bfb1 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -21,12 +21,12 @@
#include <linux/types.h>
#include <linux/crc32.h>
-#include "ocfs2_lockingver.h"
-#include "stackglue.h"
+#include "../ocfs2_lockingver.h"
+#include "../stackglue.h"
#include "userdlm.h"
#define MLOG_MASK_PREFIX ML_DLMFS
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c4c51f3df60..cb9e6a73bea9 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -570,7 +570,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
mlog_bug_on_msg(1, "type: %d\n", type);
ops = NULL; /* thanks, gcc */
break;
- };
+ }
ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
generation, res->l_name);
@@ -3282,6 +3282,7 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb)
debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
&dlm_debug->d_filter_secs);
+ ocfs2_get_dlm_debug(dlm_debug);
}
static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 1afe57f425a0..68ba354cf361 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1066,6 +1066,14 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
+ if (replayed) {
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3103ba7f97a2..bfe611ed1b1d 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -597,9 +597,11 @@ static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- oi->i_sync_tid = handle->h_transaction->t_tid;
- if (datasync)
- oi->i_datasync_tid = handle->h_transaction->t_tid;
+ if (!is_handle_aborted(handle)) {
+ oi->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ oi->i_datasync_tid = handle->h_transaction->t_tid;
+ }
}
#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8ea51cf27b97..da65251ef815 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -586,8 +586,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
mlog_errno(status);
}
- oi->i_sync_tid = handle->h_transaction->t_tid;
- oi->i_datasync_tid = handle->h_transaction->t_tid;
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
leave:
if (status < 0) {
diff --git a/fs/open.c b/fs/open.c
index b62f5c0923a8..0788b3715731 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -955,48 +955,83 @@ struct file *open_with_fake_path(const struct path *path, int flags,
}
EXPORT_SYMBOL(open_with_fake_path);
-static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
+#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
+#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
+
+inline struct open_how build_open_how(int flags, umode_t mode)
+{
+ struct open_how how = {
+ .flags = flags & VALID_OPEN_FLAGS,
+ .mode = mode & S_IALLUGO,
+ };
+
+ /* O_PATH beats everything else. */
+ if (how.flags & O_PATH)
+ how.flags &= O_PATH_FLAGS;
+ /* Modes should only be set for create-like flags. */
+ if (!WILL_CREATE(how.flags))
+ how.mode = 0;
+ return how;
+}
+
+inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
+ int flags = how->flags;
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
+ /* Must never be set by userspace */
+ flags &= ~(FMODE_NONOTIFY | O_CLOEXEC);
+
/*
- * Clear out all open flags we don't know about so that we don't report
- * them in fcntl(F_GETFD) or similar interfaces.
+ * Older syscalls implicitly clear all of the invalid flags or argument
+ * values before calling build_open_flags(), but openat2(2) checks all
+ * of its arguments.
*/
- flags &= VALID_OPEN_FLAGS;
+ if (flags & ~VALID_OPEN_FLAGS)
+ return -EINVAL;
+ if (how->resolve & ~VALID_RESOLVE_FLAGS)
+ return -EINVAL;
- if (flags & (O_CREAT | __O_TMPFILE))
- op->mode = (mode & S_IALLUGO) | S_IFREG;
- else
+ /* Deal with the mode. */
+ if (WILL_CREATE(flags)) {
+ if (how->mode & ~S_IALLUGO)
+ return -EINVAL;
+ op->mode = how->mode | S_IFREG;
+ } else {
+ if (how->mode != 0)
+ return -EINVAL;
op->mode = 0;
-
- /* Must never be set by userspace */
- flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
+ }
/*
- * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
- * check for O_DSYNC if the need any syncing at all we enforce it's
- * always set instead of having to deal with possibly weird behaviour
- * for malicious applications setting only __O_SYNC.
+ * In order to ensure programs get explicit errors when trying to use
+ * O_TMPFILE on old kernels, O_TMPFILE is implemented such that it
+ * looks like (O_DIRECTORY|O_RDWR & ~O_CREAT) to old kernels. But we
+ * have to require userspace to explicitly set it.
*/
- if (flags & __O_SYNC)
- flags |= O_DSYNC;
-
if (flags & __O_TMPFILE) {
if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
return -EINVAL;
if (!(acc_mode & MAY_WRITE))
return -EINVAL;
- } else if (flags & O_PATH) {
- /*
- * If we have O_PATH in the open flag. Then we
- * cannot have anything other than the below set of flags
- */
- flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
+ }
+ if (flags & O_PATH) {
+ /* O_PATH only permits certain other flags to be set. */
+ if (flags & ~O_PATH_FLAGS)
+ return -EINVAL;
acc_mode = 0;
}
+ /*
+ * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
+ * check for O_DSYNC if the need any syncing at all we enforce it's
+ * always set instead of having to deal with possibly weird behaviour
+ * for malicious applications setting only __O_SYNC.
+ */
+ if (flags & __O_SYNC)
+ flags |= O_DSYNC;
+
op->open_flag = flags;
/* O_TRUNC implies we need access checks for write permissions */
@@ -1022,6 +1057,18 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
lookup_flags |= LOOKUP_DIRECTORY;
if (!(flags & O_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
+
+ if (how->resolve & RESOLVE_NO_XDEV)
+ lookup_flags |= LOOKUP_NO_XDEV;
+ if (how->resolve & RESOLVE_NO_MAGICLINKS)
+ lookup_flags |= LOOKUP_NO_MAGICLINKS;
+ if (how->resolve & RESOLVE_NO_SYMLINKS)
+ lookup_flags |= LOOKUP_NO_SYMLINKS;
+ if (how->resolve & RESOLVE_BENEATH)
+ lookup_flags |= LOOKUP_BENEATH;
+ if (how->resolve & RESOLVE_IN_ROOT)
+ lookup_flags |= LOOKUP_IN_ROOT;
+
op->lookup_flags = lookup_flags;
return 0;
}
@@ -1040,8 +1087,11 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
struct file *file_open_name(struct filename *name, int flags, umode_t mode)
{
struct open_flags op;
- int err = build_open_flags(flags, mode, &op);
- return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
+ struct open_how how = build_open_how(flags, mode);
+ int err = build_open_flags(&how, &op);
+ if (err)
+ return ERR_PTR(err);
+ return do_filp_open(AT_FDCWD, name, &op);
}
/**
@@ -1072,17 +1122,19 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
const char *filename, int flags, umode_t mode)
{
struct open_flags op;
- int err = build_open_flags(flags, mode, &op);
+ struct open_how how = build_open_how(flags, mode);
+ int err = build_open_flags(&how, &op);
if (err)
return ERR_PTR(err);
return do_file_open_root(dentry, mnt, filename, &op);
}
EXPORT_SYMBOL(file_open_root);
-long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
+static long do_sys_openat2(int dfd, const char __user *filename,
+ struct open_how *how)
{
struct open_flags op;
- int fd = build_open_flags(flags, mode, &op);
+ int fd = build_open_flags(how, &op);
struct filename *tmp;
if (fd)
@@ -1092,7 +1144,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
if (IS_ERR(tmp))
return PTR_ERR(tmp);
- fd = get_unused_fd_flags(flags);
+ fd = get_unused_fd_flags(how->flags);
if (fd >= 0) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
@@ -1107,12 +1159,16 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
return fd;
}
-SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
+long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
- if (force_o_largefile())
- flags |= O_LARGEFILE;
+ struct open_how how = build_open_how(flags, mode);
+ return do_sys_openat2(dfd, filename, &how);
+}
- return do_sys_open(AT_FDCWD, filename, flags, mode);
+
+SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
+{
+ return ksys_open(filename, flags, mode);
}
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
@@ -1120,10 +1176,32 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
{
if (force_o_largefile())
flags |= O_LARGEFILE;
-
return do_sys_open(dfd, filename, flags, mode);
}
+SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
+ struct open_how __user *, how, size_t, usize)
+{
+ int err;
+ struct open_how tmp;
+
+ BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);
+
+ if (unlikely(usize < OPEN_HOW_SIZE_VER0))
+ return -EINVAL;
+
+ err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize);
+ if (err)
+ return err;
+
+ /* O_LARGEFILE is only allowed for non-O_PATH. */
+ if (!(tmp.flags & O_PATH) && force_o_largefile())
+ tmp.flags |= O_LARGEFILE;
+
+ return do_sys_openat2(dfd, filename, &tmp);
+}
+
#ifdef CONFIG_COMPAT
/*
* Exactly like sys_open(), except that it doesn't set the
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 84ad1c90d535..249672bf54fe 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -631,12 +631,15 @@ EXPORT_SYMBOL_GPL(posix_acl_create);
/**
* posix_acl_update_mode - update mode in set_acl
+ * @inode: target inode
+ * @mode_p: mode (pointer) for update
+ * @acl: acl pointer
*
* Update the file mode when setting an ACL: compute the new file permission
* bits based on the ACL. In addition, if the ACL is equivalent to the new
- * file mode, set *acl to NULL to indicate that no ACL should be set.
+ * file mode, set *@acl to NULL to indicate that no ACL should be set.
*
- * As with chmod, clear the setgit bit if the caller is not in the owning group
+ * As with chmod, clear the setgid bit if the caller is not in the owning group
* or capable of CAP_FSETID (see inode_change_ok).
*
* Called from set_acl inode operations.
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 733881a6387b..27ef84d99f59 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -103,3 +103,7 @@ config PROC_CHILDREN
config PROC_PID_ARCH_STATUS
def_bool n
depends on PROC_FS
+
+config PROC_CPU_RESCTRL
+ def_bool n
+ depends on PROC_FS
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ebea9501afb8..c7c64272b0fa 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -94,6 +94,8 @@
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
#include <linux/posix-timers.h>
+#include <linux/time_namespace.h>
+#include <linux/resctrl.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -1533,6 +1535,96 @@ static const struct file_operations proc_pid_sched_autogroup_operations = {
#endif /* CONFIG_SCHED_AUTOGROUP */
+#ifdef CONFIG_TIME_NS
+static int timens_offsets_show(struct seq_file *m, void *v)
+{
+ struct task_struct *p;
+
+ p = get_proc_task(file_inode(m->file));
+ if (!p)
+ return -ESRCH;
+ proc_timens_show_offsets(p, m);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode *inode = file_inode(file);
+ struct proc_timens_offset offsets[2];
+ char *kbuf = NULL, *pos, *next_line;
+ struct task_struct *p;
+ int ret, noffsets;
+
+ /* Only allow < page size writes at the beginning of the file */
+ if ((*ppos != 0) || (count >= PAGE_SIZE))
+ return -EINVAL;
+
+ /* Slurp in the user data */
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ /* Parse the user data */
+ ret = -EINVAL;
+ noffsets = 0;
+ for (pos = kbuf; pos; pos = next_line) {
+ struct proc_timens_offset *off = &offsets[noffsets];
+ int err;
+
+ /* Find the end of line and ensure we don't look past it */
+ next_line = strchr(pos, '\n');
+ if (next_line) {
+ *next_line = '\0';
+ next_line++;
+ if (*next_line == '\0')
+ next_line = NULL;
+ }
+
+ err = sscanf(pos, "%u %lld %lu", &off->clockid,
+ &off->val.tv_sec, &off->val.tv_nsec);
+ if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
+ goto out;
+ noffsets++;
+ if (noffsets == ARRAY_SIZE(offsets)) {
+ if (next_line)
+ count = next_line - kbuf;
+ break;
+ }
+ }
+
+ ret = -ESRCH;
+ p = get_proc_task(inode);
+ if (!p)
+ goto out;
+ ret = proc_timens_set_offset(file, p, offsets, noffsets);
+ put_task_struct(p);
+ if (ret)
+ goto out;
+
+ ret = count;
+out:
+ kfree(kbuf);
+ return ret;
+}
+
+static int timens_offsets_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timens_offsets_show, inode);
+}
+
+static const struct file_operations proc_timens_offsets_operations = {
+ .open = timens_offsets_open,
+ .read = seq_read,
+ .write = timens_offsets_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_TIME_NS */
+
static ssize_t comm_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
@@ -1626,8 +1718,7 @@ static const char *proc_pid_get_link(struct dentry *dentry,
if (error)
goto out;
- nd_jump_link(&path);
- return NULL;
+ error = nd_jump_link(&path);
out:
return ERR_PTR(error);
}
@@ -3016,6 +3107,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SCHED_AUTOGROUP
REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
+#ifdef CONFIG_TIME_NS
+ REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
+#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
@@ -3061,6 +3155,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_CGROUPS
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
+#ifdef CONFIG_PROC_CPU_RESCTRL
+ ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
+#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
@@ -3461,6 +3558,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_CGROUPS
ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
+#ifdef CONFIG_PROC_CPU_RESCTRL
+ ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
+#endif
ONE("oom_score", S_IRUGO, proc_oom_score),
REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index dd2b35f78b09..8e159fc78c0a 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = {
#ifdef CONFIG_CGROUPS
&cgroupns_operations,
#endif
+#ifdef CONFIG_TIME_NS
+ &timens_operations,
+ &timens_for_children_operations,
+#endif
};
static const char *proc_ns_get_link(struct dentry *dentry,
@@ -42,22 +46,26 @@ static const char *proc_ns_get_link(struct dentry *dentry,
const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
struct path ns_path;
- void *error = ERR_PTR(-EACCES);
+ int error = -EACCES;
if (!dentry)
return ERR_PTR(-ECHILD);
task = get_proc_task(inode);
if (!task)
- return error;
+ return ERR_PTR(-EACCES);
- if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
- error = ns_get_path(&ns_path, task, ns_ops);
- if (!error)
- nd_jump_link(&ns_path);
- }
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ goto out;
+
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (error)
+ goto out;
+
+ error = nd_jump_link(&ns_path);
+out:
put_task_struct(task);
- return error;
+ return ERR_PTR(error);
}
static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index a4c2791ab70b..5a1b228964fb 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/time_namespace.h>
#include <linux/kernel_stat.h>
static int uptime_proc_show(struct seq_file *m, void *v)
@@ -20,6 +21,8 @@ static int uptime_proc_show(struct seq_file *m, void *v)
nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
ktime_get_boottime_ts64(&uptime);
+ timens_add_boottime(&uptime);
+
idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 8caff834f002..013486b5125e 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -407,6 +407,17 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
prz = cxt->dprzs[cxt->dump_write_cnt];
+ /*
+ * Since this is a new crash dump, we need to reset the buffer in
+ * case it still has an old dump present. Without this, the new dump
+ * will get appended, which would seriously confuse anything trying
+ * to check dump file contents. Specifically, ramoops_read_kmsg_hdr()
+ * expects to find a dump header in the beginning of buffer data, so
+ * we must to reset the buffer values, in order to ensure that the
+ * header will be written to the beginning of the buffer.
+ */
+ persistent_ram_zap(prz);
+
/* Build header and append record contents. */
hlen = ramoops_write_kmsg_hdr(prz, record);
if (!hlen)
@@ -572,6 +583,7 @@ static int ramoops_init_przs(const char *name,
prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig,
&cxt->ecc_info,
cxt->memtype, flags, label);
+ kfree(label);
if (IS_ERR(prz_ar[i])) {
err = PTR_ERR(prz_ar[i]);
dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n",
@@ -617,6 +629,7 @@ static int ramoops_init_prz(const char *name,
label = kasprintf(GFP_KERNEL, "ramoops:%s", name);
*prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info,
cxt->memtype, PRZ_FLAG_ZAP_OLD, label);
+ kfree(label);
if (IS_ERR(*prz)) {
int err = PTR_ERR(*prz);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 8823f65888f0..1f4d8c06f9be 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -574,7 +574,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
/* Initialize general buffer state. */
raw_spin_lock_init(&prz->buffer_lock);
prz->flags = flags;
- prz->label = label;
+ prz->label = kstrdup(label, GFP_KERNEL);
ret = persistent_ram_buffer_map(start, size, prz, memtype);
if (ret)
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 53429c29c784..58fc2a7c7fd1 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -22,8 +22,6 @@ MODULE_AUTHOR("Jan Kara");
MODULE_DESCRIPTION("Quota format v2 support");
MODULE_LICENSE("GPL");
-#define __QUOTA_V2_PARANOIA
-
static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot);
static void v2r0_disk2memdqb(struct dquot *dquot, void *dp);
static int v2r0_is_id(void *dp, struct dquot *dquot);
diff --git a/fs/quota/quotaio_v1.h b/fs/quota/quotaio_v1.h
index bd11e2c08119..31dca9a89176 100644
--- a/fs/quota/quotaio_v1.h
+++ b/fs/quota/quotaio_v1.h
@@ -25,8 +25,10 @@ struct v1_disk_dqblk {
__u32 dqb_ihardlimit; /* absolute limit on allocated inodes */
__u32 dqb_isoftlimit; /* preferred inode limit */
__u32 dqb_curinodes; /* current # allocated inodes */
- time_t dqb_btime; /* time limit for excessive disk use */
- time_t dqb_itime; /* time limit for excessive inode use */
+
+ /* below fields differ in length on 32-bit vs 64-bit architectures */
+ unsigned long dqb_btime; /* time limit for excessive disk use */
+ unsigned long dqb_itime; /* time limit for excessive inode use */
};
#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk)))
diff --git a/fs/read_write.c b/fs/read_write.c
index 5bbf587f5bc1..7458fccc59e1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1777,10 +1777,9 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
* else. Assume that the offsets have already been checked for block
* alignment.
*
- * For deduplication we always scale down to the previous block because we
- * can't meaningfully compare post-EOF contents.
- *
- * For clone we only link a partial EOF block above the destination file's EOF.
+ * For clone we only link a partial EOF block above or at the destination file's
+ * EOF. For deduplication we accept a partial EOF block only if it ends at the
+ * destination file's EOF (can not link it into the middle of a file).
*
* Shorten the request if possible.
*/
@@ -1796,8 +1795,7 @@ static int generic_remap_check_len(struct inode *inode_in,
if ((*len & blkmask) == 0)
return 0;
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
+ if (pos_out + *len < i_size_read(inode_out))
new_len &= ~blkmask;
if (new_len == *len)
diff --git a/fs/readdir.c b/fs/readdir.c
index d26d5ea4de7b..de2eceffdee8 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -102,10 +102,14 @@ EXPORT_SYMBOL(iterate_dir);
* filename length, and the above "soft error" worry means
* that it's probably better left alone until we have that
* issue clarified.
+ *
+ * Note the PATH_MAX check - it's arbitrary but the real
+ * kernel limit on a possible path component, not NAME_MAX,
+ * which is the technical standard limit.
*/
static int verify_dirent_name(const char *name, int len)
{
- if (!len)
+ if (len <= 0 || len >= PATH_MAX)
return -EIO;
if (memchr(name, '/', len))
return -EIO;
@@ -206,7 +210,7 @@ struct linux_dirent {
struct getdents_callback {
struct dir_context ctx;
struct linux_dirent __user * current_dir;
- struct linux_dirent __user * previous;
+ int prev_reclen;
int count;
int error;
};
@@ -214,12 +218,13 @@ struct getdents_callback {
static int filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
- struct linux_dirent __user * dirent;
+ struct linux_dirent __user *dirent, *prev;
struct getdents_callback *buf =
container_of(ctx, struct getdents_callback, ctx);
unsigned long d_ino;
int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
sizeof(long));
+ int prev_reclen;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
@@ -232,28 +237,24 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
buf->error = -EOVERFLOW;
return -EOVERFLOW;
}
- dirent = buf->previous;
- if (dirent && signal_pending(current))
+ prev_reclen = buf->prev_reclen;
+ if (prev_reclen && signal_pending(current))
return -EINTR;
-
- /*
- * Note! This range-checks 'previous' (which may be NULL).
- * The real range was checked in getdents
- */
- if (!user_access_begin(dirent, sizeof(*dirent)))
- goto efault;
- if (dirent)
- unsafe_put_user(offset, &dirent->d_off, efault_end);
dirent = buf->current_dir;
+ prev = (void __user *) dirent - prev_reclen;
+ if (!user_access_begin(prev, reclen + prev_reclen))
+ goto efault;
+
+ /* This might be 'dirent->d_off', but if so it will get overwritten */
+ unsafe_put_user(offset, &prev->d_off, efault_end);
unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end);
unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
user_access_end();
- buf->previous = dirent;
- dirent = (void __user *)dirent + reclen;
- buf->current_dir = dirent;
+ buf->current_dir = (void __user *)dirent + reclen;
+ buf->prev_reclen = reclen;
buf->count -= reclen;
return 0;
efault_end:
@@ -267,7 +268,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct linux_dirent __user *, dirent, unsigned int, count)
{
struct fd f;
- struct linux_dirent __user * lastdirent;
struct getdents_callback buf = {
.ctx.actor = filldir,
.count = count,
@@ -285,8 +285,10 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
- lastdirent = buf.previous;
- if (lastdirent) {
+ if (buf.prev_reclen) {
+ struct linux_dirent __user * lastdirent;
+ lastdirent = (void __user *)buf.current_dir - buf.prev_reclen;
+
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
@@ -299,7 +301,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct getdents_callback64 {
struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
- struct linux_dirent64 __user * previous;
+ int prev_reclen;
int count;
int error;
};
@@ -307,11 +309,12 @@ struct getdents_callback64 {
static int filldir64(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
- struct linux_dirent64 __user *dirent;
+ struct linux_dirent64 __user *dirent, *prev;
struct getdents_callback64 *buf =
container_of(ctx, struct getdents_callback64, ctx);
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
+ int prev_reclen;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
@@ -319,30 +322,27 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
return -EINVAL;
- dirent = buf->previous;
- if (dirent && signal_pending(current))
+ prev_reclen = buf->prev_reclen;
+ if (prev_reclen && signal_pending(current))
return -EINTR;
-
- /*
- * Note! This range-checks 'previous' (which may be NULL).
- * The real range was checked in getdents
- */
- if (!user_access_begin(dirent, sizeof(*dirent)))
- goto efault;
- if (dirent)
- unsafe_put_user(offset, &dirent->d_off, efault_end);
dirent = buf->current_dir;
+ prev = (void __user *)dirent - prev_reclen;
+ if (!user_access_begin(prev, reclen + prev_reclen))
+ goto efault;
+
+ /* This might be 'dirent->d_off', but if so it will get overwritten */
+ unsafe_put_user(offset, &prev->d_off, efault_end);
unsafe_put_user(ino, &dirent->d_ino, efault_end);
unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
unsafe_put_user(d_type, &dirent->d_type, efault_end);
unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
user_access_end();
- buf->previous = dirent;
- dirent = (void __user *)dirent + reclen;
- buf->current_dir = dirent;
+ buf->prev_reclen = reclen;
+ buf->current_dir = (void __user *)dirent + reclen;
buf->count -= reclen;
return 0;
+
efault_end:
user_access_end();
efault:
@@ -354,7 +354,6 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent,
unsigned int count)
{
struct fd f;
- struct linux_dirent64 __user * lastdirent;
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
.count = count,
@@ -372,9 +371,11 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent,
error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
- lastdirent = buf.previous;
- if (lastdirent) {
+ if (buf.prev_reclen) {
+ struct linux_dirent64 __user * lastdirent;
typeof(lastdirent->d_off) d_off = buf.ctx.pos;
+
+ lastdirent = (void __user *) buf.current_dir - buf.prev_reclen;
if (__put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 4b3e3e73b512..072156c4f895 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -56,8 +56,6 @@
/* gets a struct reiserfs_journal_list * from a list head */
#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
j_list))
-#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
- j_working_list))
/* must be correct to keep the desc and commit structs at 4k */
#define JOURNAL_TRANS_HALF 1018
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index f2cf3441fdfc..ff336513c254 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -63,7 +63,6 @@ static int show_version(struct seq_file *m, void *unused)
#define MAP( i ) D4C( objectid_map( sb, rs )[ i ] )
#define DJF( x ) le32_to_cpu( rs -> x )
-#define DJV( x ) le32_to_cpu( s_v1 -> x )
#define DJP( x ) le32_to_cpu( jp -> x )
#define JF( x ) ( r -> s_journal -> x )
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index da9ebe33882b..8bf88d690729 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -918,12 +918,6 @@ int comp_items(const struct item_head *stored_ih, const struct treepath *path)
return memcmp(stored_ih, ih, IH_SIZE);
}
-/* unformatted nodes are not logged anymore, ever. This is safe now */
-#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1)
-
-/* block can not be forgotten as it is in I/O or held by someone */
-#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh)))
-
/* prepare for delete or cut of direct item */
static inline int prepare_for_direct_item(struct treepath *path,
struct item_head *le_ih,
@@ -2246,7 +2240,8 @@ error_out:
/* also releases the path */
unfix_nodes(&s_ins_balance);
#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
+ if (inode)
+ reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
"reiserquota insert_item(): freeing %u id=%u type=%c",
quota_bytes, inode->i_uid, head2type(ih));
#endif
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3244037b1286..a6bce5b1fb1d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -629,6 +629,7 @@ static void reiserfs_put_super(struct super_block *s)
reiserfs_write_unlock(s);
mutex_destroy(&REISERFS_SB(s)->lock);
destroy_workqueue(REISERFS_SB(s)->commit_wq);
+ kfree(REISERFS_SB(s)->s_jdev);
kfree(s->s_fs_info);
s->s_fs_info = NULL;
}
@@ -1947,7 +1948,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (!sbi->s_jdev) {
SWARN(silent, s, "", "Cannot allocate memory for "
"journal device name");
- goto error;
+ goto error_unlocked;
}
}
#ifdef CONFIG_QUOTA
@@ -2240,6 +2241,7 @@ error_unlocked:
kfree(qf_names[j]);
}
#endif
+ kfree(sbi->s_jdev);
kfree(sbi);
s->s_fs_info = NULL;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 62b40df36c98..28b241cd6987 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -319,8 +319,12 @@ static int reiserfs_for_each_xattr(struct inode *inode,
out_dir:
dput(dir);
out:
- /* -ENODATA isn't an error */
- if (err == -ENODATA)
+ /*
+ * -ENODATA: this object doesn't have any xattrs
+ * -EOPNOTSUPP: this file system doesn't have xattrs enabled on disk.
+ * Neither are errors
+ */
+ if (err == -ENODATA || err == -EOPNOTSUPP)
err = 0;
return err;
}
diff --git a/fs/stack.c b/fs/stack.c
index 4ef2c056269d..c9830924eb12 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -23,7 +23,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
/*
* But on 32-bit, we ought to make an effort to keep the two halves of
- * i_blocks in sync despite SMP or PREEMPT - though stat's
+ * i_blocks in sync despite SMP or PREEMPTION - though stat's
* generic_fillattr() doesn't bother, and we won't be applying quotas
* (where i_blocks does become important) at the upper level.
*
@@ -38,14 +38,14 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
spin_unlock(&src->i_lock);
/*
- * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for
+ * If CONFIG_SMP or CONFIG_PREEMPTION on 32-bit, it's vital for
* fsstack_copy_inode_size() to hold some lock around
* i_size_write(), otherwise i_size_read() may spin forever (see
* include/linux/fs.h). We don't necessarily hold i_mutex when this
* is called, so take i_lock for that case.
*
* And if on 32-bit, continue our effort to keep the two halves of
- * i_blocks in sync despite SMP or PREEMPT: use i_lock for that case
+ * i_blocks in sync despite SMP or PREEMPTION: use i_lock for that case
* too, and do both at once by combining the tests.
*
* There is none of this locking overhead in the 64-bit case.
diff --git a/fs/stat.c b/fs/stat.c
index c38e4c2e1221..030008796479 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -21,6 +21,8 @@
#include <linux/uaccess.h>
#include <asm/unistd.h>
+#include "internal.h"
+
/**
* generic_fillattr - Fill in the basic attributes from the inode struct
* @inode: Inode to use as the source
@@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat,
}
EXPORT_SYMBOL(vfs_statx_fd);
+inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags)
+{
+ if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+ AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
+ return -EINVAL;
+
+ *lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ *lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & AT_NO_AUTOMOUNT)
+ *lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (flags & AT_EMPTY_PATH)
+ *lookup_flags |= LOOKUP_EMPTY;
+
+ return 0;
+}
+
/**
* vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename
@@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags,
{
struct path path;
int error = -EINVAL;
- unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+ unsigned lookup_flags;
- if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
- AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
+ if (vfs_stat_set_lookup_flags(&lookup_flags, flags))
return -EINVAL;
-
- if (flags & AT_SYMLINK_NOFOLLOW)
- lookup_flags &= ~LOOKUP_FOLLOW;
- if (flags & AT_NO_AUTOMOUNT)
- lookup_flags &= ~LOOKUP_AUTOMOUNT;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
-
retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
@@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
}
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
-static noinline_for_stack int
+noinline_for_stack int
cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
struct statx tmp;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index ac7f59a58f94..c5509d2448e3 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -26,6 +26,7 @@
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/rcupdate.h>
+#include <linux/time_namespace.h>
struct timerfd_ctx {
union {
@@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
}
if (texp != 0) {
+ if (flags & TFD_TIMER_ABSTIME)
+ texp = timens_ktime_to_host(clockid, texp);
if (isalarm(ctx)) {
if (flags & TFD_TIMER_ABSTIME)
alarm_start(&ctx->t.alarm, texp);
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 69932bcfa920..45d3d207fb99 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -12,6 +12,7 @@ config UBIFS_FS
select CRYPTO_ZSTD if UBIFS_FS_ZSTD
select CRYPTO_HASH_INFO
select UBIFS_FS_XATTR if FS_ENCRYPTION
+ select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
depends on MTD_UBI
help
UBIFS is a file system for flash devices which works on top of UBI.
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 0b98e3c8b461..ef85ec167a84 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -81,7 +81,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
struct ubifs_inode *ui;
bool encrypted = false;
- if (ubifs_crypt_is_encrypted(dir)) {
+ if (IS_ENCRYPTED(dir)) {
err = fscrypt_get_encryption_info(dir);
if (err) {
ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err);
@@ -225,9 +225,9 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
goto done;
}
- if (nm.hash) {
- ubifs_assert(c, fname_len(&nm) == 0);
- ubifs_assert(c, fname_name(&nm) == NULL);
+ if (fname_name(&nm) == NULL) {
+ if (nm.hash & ~UBIFS_S_KEY_HASH_MASK)
+ goto done; /* ENOENT */
dent_key_init_hash(c, &key, dir->i_ino, nm.hash);
err = ubifs_tnc_lookup_dh(c, &key, dent, nm.minor_hash);
} else {
@@ -261,7 +261,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
goto done;
}
- if (ubifs_crypt_is_encrypted(dir) &&
+ if (IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu",
@@ -499,7 +499,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
struct ubifs_dent_node *dent;
struct inode *dir = file_inode(file);
struct ubifs_info *c = dir->i_sb->s_fs_info;
- bool encrypted = ubifs_crypt_is_encrypted(dir);
+ bool encrypted = IS_ENCRYPTED(dir);
dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
@@ -512,7 +512,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
if (encrypted) {
err = fscrypt_get_encryption_info(dir);
- if (err && err != -ENOKEY)
+ if (err)
return err;
err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr);
@@ -1618,7 +1618,7 @@ int ubifs_getattr(const struct path *path, struct kstat *stat,
static int ubifs_dir_open(struct inode *dir, struct file *file)
{
- if (ubifs_crypt_is_encrypted(dir))
+ if (IS_ENCRYPTED(dir))
return fscrypt_get_encryption_info(dir) ? -EACCES : 0;
return 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index cd52585c8f4f..bc4dec5b1633 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -67,7 +67,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
- if (ubifs_crypt_is_encrypted(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = ubifs_decrypt(inode, dn, &dlen, block);
if (err)
goto dump;
@@ -647,7 +647,7 @@ static int populate_page(struct ubifs_info *c, struct page *page,
dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
out_len = UBIFS_BLOCK_SIZE;
- if (ubifs_crypt_is_encrypted(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = ubifs_decrypt(inode, dn, &dlen, page_block);
if (err)
goto out_err;
@@ -786,7 +786,9 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
if (page_offset > end_index)
break;
- page = find_or_create_page(mapping, page_offset, ra_gfp_mask);
+ page = pagecache_get_page(mapping, page_offset,
+ FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT,
+ ra_gfp_mask);
if (!page)
break;
if (!PageUptodate(page))
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 5dc5abca11c7..d49fc04f2d7d 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -17,10 +17,14 @@
#include "ubifs.h"
/* Need to be kept consistent with checked flags in ioctl2ubifs() */
-#define UBIFS_SUPPORTED_IOCTL_FLAGS \
+#define UBIFS_SETTABLE_IOCTL_FLAGS \
(FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \
FS_IMMUTABLE_FL | FS_DIRSYNC_FL)
+/* Need to be kept consistent with checked flags in ubifs2ioctl() */
+#define UBIFS_GETTABLE_IOCTL_FLAGS \
+ (UBIFS_SETTABLE_IOCTL_FLAGS | FS_ENCRYPT_FL)
+
/**
* ubifs_set_inode_flags - set VFS inode flags.
* @inode: VFS inode to set flags for
@@ -91,6 +95,8 @@ static int ubifs2ioctl(int ubifs_flags)
ioctl_flags |= FS_IMMUTABLE_FL;
if (ubifs_flags & UBIFS_DIRSYNC_FL)
ioctl_flags |= FS_DIRSYNC_FL;
+ if (ubifs_flags & UBIFS_CRYPT_FL)
+ ioctl_flags |= FS_ENCRYPT_FL;
return ioctl_flags;
}
@@ -113,7 +119,8 @@ static int setflags(struct inode *inode, int flags)
if (err)
goto out_unlock;
- ui->flags = ioctl2ubifs(flags);
+ ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS);
+ ui->flags |= ioctl2ubifs(flags);
ubifs_set_inode_flags(inode);
inode->i_ctime = current_time(inode);
release = ui->dirty;
@@ -155,8 +162,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS)
+ if (flags & ~UBIFS_GETTABLE_IOCTL_FLAGS)
return -EOPNOTSUPP;
+ flags &= UBIFS_SETTABLE_IOCTL_FLAGS;
if (!S_ISDIR(inode->i_mode))
flags &= ~FS_DIRSYNC_FL;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 388fe8f5dc51..3bf8b1fda9d7 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -588,7 +588,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
if (!xent) {
dent->ch.node_type = UBIFS_DENT_NODE;
- if (nm->hash)
+ if (fname_name(nm) == NULL)
dent_key_init_hash(c, &dent_key, dir->i_ino, nm->hash);
else
dent_key_init(c, &dent_key, dir->i_ino, nm);
@@ -646,7 +646,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
ubifs_add_auth_dirt(c, lnum);
if (deletion) {
- if (nm->hash)
+ if (fname_name(nm) == NULL)
err = ubifs_tnc_remove_dh(c, &dent_key, nm->minor_hash);
else
err = ubifs_tnc_remove_nm(c, &dent_key, nm);
@@ -727,7 +727,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
int write_len;
struct ubifs_inode *ui = ubifs_inode(inode);
- bool encrypted = ubifs_crypt_is_encrypted(inode);
+ bool encrypted = IS_ENCRYPTED(inode);
u8 hash[UBIFS_HASH_ARR_SZ];
dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
@@ -1449,7 +1449,7 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in
dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
compr_type = le16_to_cpu(dn->compr_type);
- if (ubifs_crypt_is_encrypted(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = ubifs_decrypt(inode, dn, &dlen, block);
if (err)
goto out;
@@ -1465,7 +1465,7 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in
ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type);
}
- if (ubifs_crypt_is_encrypted(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block);
if (err)
goto out;
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index afa704ff5ca0..8142d9d6fe5d 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -150,7 +150,6 @@ static inline void dent_key_init(const struct ubifs_info *c,
uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm));
ubifs_assert(c, !(hash & ~UBIFS_S_KEY_HASH_MASK));
- ubifs_assert(c, !nm->hash && !nm->minor_hash);
key->u32[0] = inum;
key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS);
}
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 54d6db61106f..edf43ddd7dce 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -129,7 +129,7 @@ static void __orphan_drop(struct ubifs_info *c, struct ubifs_orphan *o)
static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph)
{
if (orph->del) {
- dbg_gen("deleted twice ino %lu", orph->inum);
+ dbg_gen("deleted twice ino %lu", (unsigned long)orph->inum);
return;
}
@@ -137,7 +137,7 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph)
orph->del = 1;
orph->dnext = c->orph_dnext;
c->orph_dnext = orph;
- dbg_gen("delete later ino %lu", orph->inum);
+ dbg_gen("delete later ino %lu", (unsigned long)orph->inum);
return;
}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 2b7c04bf8983..17c90dff7266 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -161,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c)
sup = kzalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_KERNEL);
mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
idx_node_size = ubifs_idx_node_sz(c, 1);
- idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);
+ idx = kzalloc(ALIGN(idx_node_size, c->min_io_size), GFP_KERNEL);
ino = kzalloc(ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size), GFP_KERNEL);
cs = kzalloc(ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size), GFP_KERNEL);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 5e1e8ec0589e..7fc2f3f07c16 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1599,6 +1599,7 @@ out_free:
vfree(c->ileb_buf);
vfree(c->sbuf);
kfree(c->bottom_up_buf);
+ kfree(c->sup_node);
ubifs_debugging_exit(c);
return err;
}
@@ -1641,6 +1642,7 @@ static void ubifs_umount(struct ubifs_info *c)
vfree(c->ileb_buf);
vfree(c->sbuf);
kfree(c->bottom_up_buf);
+ kfree(c->sup_node);
ubifs_debugging_exit(c);
}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c55f212dcb75..bff682309fbe 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -2095,13 +2095,6 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
extern const struct fscrypt_operations ubifs_crypt_operations;
-static inline bool ubifs_crypt_is_encrypted(const struct inode *inode)
-{
- const struct ubifs_inode *ui = ubifs_inode(inode);
-
- return ui->flags & UBIFS_CRYPT_FL;
-}
-
/* Normal UBIFS messages */
__printf(2, 3)
void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...);
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index fb7f2c7bec9c..3fd85464abd5 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -4,7 +4,8 @@
* This file is based on ECMA-167 3rd edition (June 1997)
* http://www.ecma.ch
*
- * Copyright (c) 2001-2002 Ben Fennema <bfennema@falcon.csc.calpoly.edu>
+ * Copyright (c) 2001-2002 Ben Fennema
+ * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -32,11 +33,19 @@
* SUCH DAMAGE.
*/
+/**
+ * @file
+ * ECMA-167r3 defines and structure definitions
+ */
+
#include <linux/types.h>
#ifndef _ECMA_167_H
#define _ECMA_167_H 1
+/* Character sets and coding - d-characters (ECMA 167r3 1/7.2) */
+typedef uint8_t dchars;
+
/* Character set specification (ECMA 167r3 1/7.2.1) */
struct charspec {
uint8_t charSetType;
@@ -54,6 +63,7 @@ struct charspec {
#define CHARSPEC_TYPE_CS7 0x07 /* (1/7.2.9) */
#define CHARSPEC_TYPE_CS8 0x08 /* (1/7.2.10) */
+/* Fixed-length character fields - d-string (EMCA 167r3 1/7.2.12) */
typedef uint8_t dstring;
/* Timestamp (ECMA 167r3 1/7.3) */
@@ -85,22 +95,8 @@ struct regid {
} __packed;
/* Flags (ECMA 167r3 1/7.4.1) */
-#define ENTITYID_FLAGS_DIRTY 0x00
-#define ENTITYID_FLAGS_PROTECTED 0x01
-
-/* OSTA UDF 2.1.5.2 */
-#define UDF_ID_COMPLIANT "*OSTA UDF Compliant"
-
-/* OSTA UDF 2.1.5.3 */
-struct domainEntityIDSuffix {
- uint16_t revision;
- uint8_t flags;
- uint8_t reserved[5];
-};
-
-/* OSTA UDF 2.1.5.3 */
-#define ENTITYIDSUFFIX_FLAGS_HARDWRITEPROTECT 0
-#define ENTITYIDSUFFIX_FLAGS_SOFTWRITEPROTECT 1
+#define ENTITYID_FLAGS_DIRTY 0x01
+#define ENTITYID_FLAGS_PROTECTED 0x02
/* Volume Structure Descriptor (ECMA 167r3 2/9.1) */
#define VSD_STD_ID_LEN 5
@@ -202,6 +198,13 @@ struct NSRDesc {
uint8_t structData[2040];
} __packed;
+/* Generic Descriptor */
+struct genericDesc {
+ struct tag descTag;
+ __le32 volDescSeqNum;
+ uint8_t reserved[492];
+} __packed;
+
/* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
struct primaryVolDesc {
struct tag descTag;
@@ -316,7 +319,7 @@ struct genericPartitionMap {
/* Partition Map Type (ECMA 167r3 3/10.7.1.1) */
#define GP_PARTITION_MAP_TYPE_UNDEF 0x00
-#define GP_PARTIITON_MAP_TYPE_1 0x01
+#define GP_PARTITION_MAP_TYPE_1 0x01
#define GP_PARTITION_MAP_TYPE_2 0x02
/* Type 1 Partition Map (ECMA 167r3 3/10.7.2) */
@@ -723,6 +726,7 @@ struct appUseExtAttr {
#define EXTATTR_DEV_SPEC 12
#define EXTATTR_IMP_USE 2048
#define EXTATTR_APP_USE 65536
+#define EXTATTR_SUBTYPE 1
/* Unallocated Space Entry (ECMA 167r3 4/14.11) */
struct unallocSpaceEntry {
@@ -754,10 +758,12 @@ struct partitionIntegrityEntry {
/* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
/* Extent Length (ECMA 167r3 4/14.14.1.1) */
+#define EXT_LENGTH_MASK 0x3FFFFFFF
+#define EXT_TYPE_MASK 0xC0000000
#define EXT_RECORDED_ALLOCATED 0x00000000
#define EXT_NOT_RECORDED_ALLOCATED 0x40000000
#define EXT_NOT_RECORDED_NOT_ALLOCATED 0x80000000
-#define EXT_NEXT_EXTENT_ALLOCDECS 0xC0000000
+#define EXT_NEXT_EXTENT_ALLOCDESCS 0xC0000000
/* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
@@ -774,7 +780,7 @@ struct pathComponent {
uint8_t componentType;
uint8_t lengthComponentIdent;
__le16 componentFileVersionNum;
- dstring componentIdent[0];
+ dchars componentIdent[0];
} __packed;
/* File Entry (ECMA 167r3 4/14.17) */
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ea80036d7897..e875bc5668ee 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1981,10 +1981,10 @@ int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block,
__udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1);
udf_write_aext(inode, epos, &nepos.block,
- sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+ sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0);
} else {
__udf_add_aext(inode, epos, &nepos.block,
- sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+ sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0);
}
brelse(epos->bh);
@@ -2143,7 +2143,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
unsigned int indirections = 0;
while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) ==
- (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
+ (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) {
udf_pblk_t block;
if (++indirections > UDF_MAX_INDIR_EXTS) {
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index a4da59e38b7f..35e61b2cacfe 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -1,10 +1,11 @@
/*
* osta_udf.h
*
- * This file is based on OSTA UDF(tm) 2.50 (April 30, 2003)
+ * This file is based on OSTA UDF(tm) 2.60 (March 1, 2005)
* http://www.osta.org
*
- * Copyright (c) 2001-2004 Ben Fennema <bfennema@falcon.csc.calpoly.edu>
+ * Copyright (c) 2001-2004 Ben Fennema
+ * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -32,38 +33,57 @@
* SUCH DAMAGE.
*/
+/**
+ * @file
+ * OSTA-UDF defines and structure definitions
+ */
+
#include "ecma_167.h"
#ifndef _OSTA_UDF_H
#define _OSTA_UDF_H 1
-/* OSTA CS0 Charspec (UDF 2.50 2.1.2) */
+/* OSTA CS0 Charspec (UDF 2.60 2.1.2) */
#define UDF_CHAR_SET_TYPE 0
#define UDF_CHAR_SET_INFO "OSTA Compressed Unicode"
-/* Entity Identifier (UDF 2.50 2.1.5) */
-/* Identifiers (UDF 2.50 2.1.5.2) */
+/* Entity Identifier (UDF 2.60 2.1.5) */
+/* Identifiers (UDF 2.60 2.1.5.2) */
+/* Implementation Use Extended Attribute (UDF 2.60 3.3.4.5) */
+/* Virtual Allocation Table (UDF 1.50 2.2.10) */
+/* Logical Volume Extended Information (UDF 1.50 Errata, DCN 5003, 3.3.4.5.1.3) */
+/* OS2EA (UDF 1.50 3.3.4.5.3.1) */
+/* MacUniqueIDTable (UDF 1.50 3.3.4.5.4.3) */
+/* MacResourceFork (UDF 1.50 3.3.4.5.4.4) */
#define UDF_ID_DEVELOPER "*Linux UDFFS"
#define UDF_ID_COMPLIANT "*OSTA UDF Compliant"
#define UDF_ID_LV_INFO "*UDF LV Info"
#define UDF_ID_FREE_EA "*UDF FreeEASpace"
#define UDF_ID_FREE_APP_EA "*UDF FreeAppEASpace"
#define UDF_ID_DVD_CGMS "*UDF DVD CGMS Info"
+#define UDF_ID_VAT_LVEXTENSION "*UDF VAT LVExtension"
#define UDF_ID_OS2_EA "*UDF OS/2 EA"
#define UDF_ID_OS2_EA_LENGTH "*UDF OS/2 EALength"
#define UDF_ID_MAC_VOLUME "*UDF Mac VolumeInfo"
#define UDF_ID_MAC_FINDER "*UDF Mac FinderInfo"
#define UDF_ID_MAC_UNIQUE "*UDF Mac UniqueIDTable"
#define UDF_ID_MAC_RESOURCE "*UDF Mac ResourceFork"
+#define UDF_ID_OS400_DIRINFO "*UDF OS/400 DirInfo"
#define UDF_ID_VIRTUAL "*UDF Virtual Partition"
#define UDF_ID_SPARABLE "*UDF Sparable Partition"
#define UDF_ID_ALLOC "*UDF Virtual Alloc Tbl"
#define UDF_ID_SPARING "*UDF Sparing Table"
#define UDF_ID_METADATA "*UDF Metadata Partition"
-/* Identifier Suffix (UDF 2.50 2.1.5.3) */
-#define IS_DF_HARD_WRITE_PROTECT 0x01
-#define IS_DF_SOFT_WRITE_PROTECT 0x02
+/* Identifier Suffix (UDF 2.60 2.1.5.3) */
+#define DOMAIN_FLAGS_HARD_WRITE_PROTECT 0x01
+#define DOMAIN_FLAGS_SOFT_WRITE_PROTECT 0x02
+
+struct domainIdentSuffix {
+ __le16 UDFRevision;
+ uint8_t domainFlags;
+ uint8_t reserved[5];
+} __packed;
struct UDFIdentSuffix {
__le16 UDFRevision;
@@ -75,15 +95,15 @@ struct UDFIdentSuffix {
struct impIdentSuffix {
uint8_t OSClass;
uint8_t OSIdentifier;
- uint8_t reserved[6];
+ uint8_t impUse[6];
} __packed;
struct appIdentSuffix {
uint8_t impUse[8];
} __packed;
-/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
-/* Implementation Use (UDF 2.50 2.2.6.4) */
+/* Logical Volume Integrity Descriptor (UDF 2.60 2.2.6) */
+/* Implementation Use (UDF 2.60 2.2.6.4) */
struct logicalVolIntegrityDescImpUse {
struct regid impIdent;
__le32 numFiles;
@@ -94,8 +114,8 @@ struct logicalVolIntegrityDescImpUse {
uint8_t impUse[0];
} __packed;
-/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
-/* Implementation Use (UDF 2.50 2.2.7.2) */
+/* Implementation Use Volume Descriptor (UDF 2.60 2.2.7) */
+/* Implementation Use (UDF 2.60 2.2.7.2) */
struct impUseVolDescImpUse {
struct charspec LVICharset;
dstring logicalVolIdent[128];
@@ -115,7 +135,7 @@ struct udfPartitionMap2 {
__le16 partitionNum;
} __packed;
-/* Virtual Partition Map (UDF 2.50 2.2.8) */
+/* Virtual Partition Map (UDF 2.60 2.2.8) */
struct virtualPartitionMap {
uint8_t partitionMapType;
uint8_t partitionMapLength;
@@ -126,7 +146,7 @@ struct virtualPartitionMap {
uint8_t reserved2[24];
} __packed;
-/* Sparable Partition Map (UDF 2.50 2.2.9) */
+/* Sparable Partition Map (UDF 2.60 2.2.9) */
struct sparablePartitionMap {
uint8_t partitionMapType;
uint8_t partitionMapLength;
@@ -141,7 +161,7 @@ struct sparablePartitionMap {
__le32 locSparingTable[4];
} __packed;
-/* Metadata Partition Map (UDF 2.4.0 2.2.10) */
+/* Metadata Partition Map (UDF 2.60 2.2.10) */
struct metadataPartitionMap {
uint8_t partitionMapType;
uint8_t partitionMapLength;
@@ -160,14 +180,14 @@ struct metadataPartitionMap {
/* Virtual Allocation Table (UDF 1.5 2.2.10) */
struct virtualAllocationTable15 {
- __le32 VirtualSector[0];
+ __le32 vatEntry[0];
struct regid vatIdent;
__le32 previousVATICBLoc;
} __packed;
#define ICBTAG_FILE_TYPE_VAT15 0x00U
-/* Virtual Allocation Table (UDF 2.50 2.2.11) */
+/* Virtual Allocation Table (UDF 2.60 2.2.11) */
struct virtualAllocationTable20 {
__le16 lengthHeader;
__le16 lengthImpUse;
@@ -175,9 +195,9 @@ struct virtualAllocationTable20 {
__le32 previousVATICBLoc;
__le32 numFiles;
__le32 numDirs;
- __le16 minReadRevision;
- __le16 minWriteRevision;
- __le16 maxWriteRevision;
+ __le16 minUDFReadRev;
+ __le16 minUDFWriteRev;
+ __le16 maxUDFWriteRev;
__le16 reserved;
uint8_t impUse[0];
__le32 vatEntry[0];
@@ -185,7 +205,7 @@ struct virtualAllocationTable20 {
#define ICBTAG_FILE_TYPE_VAT20 0xF8U
-/* Sparing Table (UDF 2.50 2.2.12) */
+/* Sparing Table (UDF 2.60 2.2.12) */
struct sparingEntry {
__le32 origLocation;
__le32 mappedLocation;
@@ -201,12 +221,12 @@ struct sparingTable {
mapEntry[0];
} __packed;
-/* Metadata File (and Metadata Mirror File) (UDF 2.50 2.2.13.1) */
+/* Metadata File (and Metadata Mirror File) (UDF 2.60 2.2.13.1) */
#define ICBTAG_FILE_TYPE_MAIN 0xFA
#define ICBTAG_FILE_TYPE_MIRROR 0xFB
#define ICBTAG_FILE_TYPE_BITMAP 0xFC
-/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
+/* struct struct long_ad ICB - ADImpUse (UDF 2.60 2.2.4.3) */
struct allocDescImpUse {
__le16 flags;
uint8_t impUse[4];
@@ -214,17 +234,17 @@ struct allocDescImpUse {
#define AD_IU_EXT_ERASED 0x0001
-/* Real-Time Files (UDF 2.50 6.11) */
+/* Real-Time Files (UDF 2.60 6.11) */
#define ICBTAG_FILE_TYPE_REALTIME 0xF9U
-/* Implementation Use Extended Attribute (UDF 2.50 3.3.4.5) */
-/* FreeEASpace (UDF 2.50 3.3.4.5.1.1) */
+/* Implementation Use Extended Attribute (UDF 2.60 3.3.4.5) */
+/* FreeEASpace (UDF 2.60 3.3.4.5.1.1) */
struct freeEaSpace {
__le16 headerChecksum;
uint8_t freeEASpace[0];
} __packed;
-/* DVD Copyright Management Information (UDF 2.50 3.3.4.5.1.2) */
+/* DVD Copyright Management Information (UDF 2.60 3.3.4.5.1.2) */
struct DVDCopyrightImpUse {
__le16 headerChecksum;
uint8_t CGMSInfo;
@@ -232,20 +252,35 @@ struct DVDCopyrightImpUse {
uint8_t protectionSystemInfo[4];
} __packed;
-/* Application Use Extended Attribute (UDF 2.50 3.3.4.6) */
-/* FreeAppEASpace (UDF 2.50 3.3.4.6.1) */
+/* Logical Volume Extended Information (UDF 1.50 Errata, DCN 5003, 3.3.4.5.1.3) */
+struct LVExtensionEA {
+ __le16 headerChecksum;
+ __le64 verificationID;
+ __le32 numFiles;
+ __le32 numDirs;
+ dstring logicalVolIdent[128];
+} __packed;
+
+/* Application Use Extended Attribute (UDF 2.60 3.3.4.6) */
+/* FreeAppEASpace (UDF 2.60 3.3.4.6.1) */
struct freeAppEASpace {
__le16 headerChecksum;
uint8_t freeEASpace[0];
} __packed;
-/* UDF Defined System Stream (UDF 2.50 3.3.7) */
+/* UDF Defined System Stream (UDF 2.60 3.3.7) */
#define UDF_ID_UNIQUE_ID "*UDF Unique ID Mapping Data"
#define UDF_ID_NON_ALLOC "*UDF Non-Allocatable Space"
#define UDF_ID_POWER_CAL "*UDF Power Cal Table"
#define UDF_ID_BACKUP "*UDF Backup"
-/* Operating System Identifiers (UDF 2.50 6.3) */
+/* UDF Defined Non-System Streams (UDF 2.60 3.3.8) */
+#define UDF_ID_MAC_RESOURCE_FORK_STREAM "*UDF Macintosh Resource Fork"
+/* #define UDF_ID_OS2_EA "*UDF OS/2 EA" */
+#define UDF_ID_NT_ACL "*UDF NT ACL"
+#define UDF_ID_UNIX_ACL "*UDF UNIX ACL"
+
+/* Operating System Identifiers (UDF 2.60 6.3) */
#define UDF_OS_CLASS_UNDEF 0x00U
#define UDF_OS_CLASS_DOS 0x01U
#define UDF_OS_CLASS_OS2 0x02U
@@ -270,6 +305,7 @@ struct freeAppEASpace {
#define UDF_OS_ID_LINUX 0x05U
#define UDF_OS_ID_MKLINUX 0x06U
#define UDF_OS_ID_FREEBSD 0x07U
+#define UDF_OS_ID_NETBSD 0x08U
#define UDF_OS_ID_WIN9X 0x00U
#define UDF_OS_ID_WINNT 0x00U
#define UDF_OS_ID_OS400 0x00U
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 8c28e93e9b73..f747bf72edbe 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -767,20 +767,20 @@ static int udf_check_vsd(struct super_block *sb)
static int udf_verify_domain_identifier(struct super_block *sb,
struct regid *ident, char *dname)
{
- struct domainEntityIDSuffix *suffix;
+ struct domainIdentSuffix *suffix;
if (memcmp(ident->ident, UDF_ID_COMPLIANT, strlen(UDF_ID_COMPLIANT))) {
udf_warn(sb, "Not OSTA UDF compliant %s descriptor.\n", dname);
goto force_ro;
}
- if (ident->flags & (1 << ENTITYID_FLAGS_DIRTY)) {
+ if (ident->flags & ENTITYID_FLAGS_DIRTY) {
udf_warn(sb, "Possibly not OSTA UDF compliant %s descriptor.\n",
dname);
goto force_ro;
}
- suffix = (struct domainEntityIDSuffix *)ident->identSuffix;
- if (suffix->flags & (1 << ENTITYIDSUFFIX_FLAGS_HARDWRITEPROTECT) ||
- suffix->flags & (1 << ENTITYIDSUFFIX_FLAGS_SOFTWRITEPROTECT)) {
+ suffix = (struct domainIdentSuffix *)ident->identSuffix;
+ if ((suffix->domainFlags & DOMAIN_FLAGS_HARD_WRITE_PROTECT) ||
+ (suffix->domainFlags & DOMAIN_FLAGS_SOFT_WRITE_PROTECT)) {
if (!sb_rdonly(sb)) {
udf_warn(sb, "Descriptor for %s marked write protected."
" Forcing read only mount.\n", dname);
@@ -1035,7 +1035,6 @@ static int check_partition_desc(struct super_block *sb,
switch (le32_to_cpu(p->accessType)) {
case PD_ACCESS_TYPE_READ_ONLY:
case PD_ACCESS_TYPE_WRITE_ONCE:
- case PD_ACCESS_TYPE_REWRITABLE:
case PD_ACCESS_TYPE_NONE:
goto force_ro;
}
@@ -1063,7 +1062,8 @@ static int check_partition_desc(struct super_block *sb,
goto force_ro;
if (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
- map->s_partition_type == UDF_VIRTUAL_MAP20)
+ map->s_partition_type == UDF_VIRTUAL_MAP20 ||
+ map->s_partition_type == UDF_METADATA_MAP25)
goto force_ro;
return 0;
@@ -2402,6 +2402,10 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len;
buf->f_bfree = udf_count_free(sb);
buf->f_bavail = buf->f_bfree;
+ /*
+ * Let's pretend each free block is also a free 'inode' since UDF does
+ * not have separate preallocated table of inodes.
+ */
buf->f_files = (lvidiu != NULL ? (le32_to_cpu(lvidiu->numFiles) +
le32_to_cpu(lvidiu->numDirs)) : 0)
+ buf->f_bfree;
@@ -2492,17 +2496,29 @@ static unsigned int udf_count_free_table(struct super_block *sb,
static unsigned int udf_count_free(struct super_block *sb)
{
unsigned int accum = 0;
- struct udf_sb_info *sbi;
+ struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map;
+ unsigned int part = sbi->s_partition;
+ int ptype = sbi->s_partmaps[part].s_partition_type;
+
+ if (ptype == UDF_METADATA_MAP25) {
+ part = sbi->s_partmaps[part].s_type_specific.s_metadata.
+ s_phys_partition_ref;
+ } else if (ptype == UDF_VIRTUAL_MAP15 || ptype == UDF_VIRTUAL_MAP20) {
+ /*
+ * Filesystems with VAT are append-only and we cannot write to
+ * them. Let's just report 0 here.
+ */
+ return 0;
+ }
- sbi = UDF_SB(sb);
if (sbi->s_lvid_bh) {
struct logicalVolIntegrityDesc *lvid =
(struct logicalVolIntegrityDesc *)
sbi->s_lvid_bh->b_data;
- if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) {
+ if (le32_to_cpu(lvid->numOfPartitions) > part) {
accum = le32_to_cpu(
- lvid->freeSpaceTable[sbi->s_partition]);
+ lvid->freeSpaceTable[part]);
if (accum == 0xFFFFFFFF)
accum = 0;
}
@@ -2511,7 +2527,7 @@ static unsigned int udf_count_free(struct super_block *sb)
if (accum)
return accum;
- map = &sbi->s_partmaps[sbi->s_partition];
+ map = &sbi->s_partmaps[part];
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
accum += udf_count_free_bitmap(sb,
map->s_uspace.s_bitmap);
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 63a47f1e1d52..532cda99644e 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -241,7 +241,7 @@ int udf_truncate_extents(struct inode *inode)
while ((etype = udf_current_aext(inode, &epos, &eloc,
&elen, 0)) != -1) {
- if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
+ if (etype == (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) {
udf_write_aext(inode, &epos, &neloc, nelen, 0);
if (indirect_ext_len) {
/* We managed to free all extents in the
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index b79e3fd19d11..d98bea308fd7 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -8,18 +8,48 @@
#include "fsverity_private.h"
#include <crypto/hash.h>
+#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
-static int build_merkle_tree_level(struct inode *inode, unsigned int level,
+/*
+ * Read a file data page for Merkle tree construction. Do aggressive readahead,
+ * since we're sequentially reading the entire file.
+ */
+static struct page *read_file_data_page(struct file *filp, pgoff_t index,
+ struct file_ra_state *ra,
+ unsigned long remaining_pages)
+{
+ struct page *page;
+
+ page = find_get_page_flags(filp->f_mapping, index, FGP_ACCESSED);
+ if (!page || !PageUptodate(page)) {
+ if (page)
+ put_page(page);
+ else
+ page_cache_sync_readahead(filp->f_mapping, ra, filp,
+ index, remaining_pages);
+ page = read_mapping_page(filp->f_mapping, index, NULL);
+ if (IS_ERR(page))
+ return page;
+ }
+ if (PageReadahead(page))
+ page_cache_async_readahead(filp->f_mapping, ra, filp, page,
+ index, remaining_pages);
+ return page;
+}
+
+static int build_merkle_tree_level(struct file *filp, unsigned int level,
u64 num_blocks_to_hash,
const struct merkle_tree_params *params,
u8 *pending_hashes,
struct ahash_request *req)
{
+ struct inode *inode = file_inode(filp);
const struct fsverity_operations *vops = inode->i_sb->s_vop;
+ struct file_ra_state ra = { 0 };
unsigned int pending_size = 0;
u64 dst_block_num;
u64 i;
@@ -36,6 +66,8 @@ static int build_merkle_tree_level(struct inode *inode, unsigned int level,
dst_block_num = 0; /* unused */
}
+ file_ra_state_init(&ra, filp->f_mapping);
+
for (i = 0; i < num_blocks_to_hash; i++) {
struct page *src_page;
@@ -45,7 +77,8 @@ static int build_merkle_tree_level(struct inode *inode, unsigned int level,
if (level == 0) {
/* Leaf: hashing a data block */
- src_page = read_mapping_page(inode->i_mapping, i, NULL);
+ src_page = read_file_data_page(filp, i, &ra,
+ num_blocks_to_hash - i);
if (IS_ERR(src_page)) {
err = PTR_ERR(src_page);
fsverity_err(inode,
@@ -54,9 +87,14 @@ static int build_merkle_tree_level(struct inode *inode, unsigned int level,
return err;
}
} else {
+ unsigned long num_ra_pages =
+ min_t(unsigned long, num_blocks_to_hash - i,
+ inode->i_sb->s_bdi->io_pages);
+
/* Non-leaf: hashing hash block from level below */
src_page = vops->read_merkle_tree_page(inode,
- params->level_start[level - 1] + i);
+ params->level_start[level - 1] + i,
+ num_ra_pages);
if (IS_ERR(src_page)) {
err = PTR_ERR(src_page);
fsverity_err(inode,
@@ -103,17 +141,18 @@ static int build_merkle_tree_level(struct inode *inode, unsigned int level,
}
/*
- * Build the Merkle tree for the given inode using the given parameters, and
+ * Build the Merkle tree for the given file using the given parameters, and
* return the root hash in @root_hash.
*
* The tree is written to a filesystem-specific location as determined by the
* ->write_merkle_tree_block() method. However, the blocks that comprise the
* tree are the same for all filesystems.
*/
-static int build_merkle_tree(struct inode *inode,
+static int build_merkle_tree(struct file *filp,
const struct merkle_tree_params *params,
u8 *root_hash)
{
+ struct inode *inode = file_inode(filp);
u8 *pending_hashes;
struct ahash_request *req;
u64 blocks;
@@ -126,9 +165,11 @@ static int build_merkle_tree(struct inode *inode,
return 0;
}
+ /* This allocation never fails, since it's mempool-backed. */
+ req = fsverity_alloc_hash_request(params->hash_alg, GFP_KERNEL);
+
pending_hashes = kmalloc(params->block_size, GFP_KERNEL);
- req = ahash_request_alloc(params->hash_alg->tfm, GFP_KERNEL);
- if (!pending_hashes || !req)
+ if (!pending_hashes)
goto out;
/*
@@ -139,7 +180,7 @@ static int build_merkle_tree(struct inode *inode,
blocks = (inode->i_size + params->block_size - 1) >>
params->log_blocksize;
for (level = 0; level <= params->num_levels; level++) {
- err = build_merkle_tree_level(inode, level, blocks, params,
+ err = build_merkle_tree_level(filp, level, blocks, params,
pending_hashes, req);
if (err)
goto out;
@@ -150,7 +191,7 @@ static int build_merkle_tree(struct inode *inode,
err = 0;
out:
kfree(pending_hashes);
- ahash_request_free(req);
+ fsverity_free_hash_request(params->hash_alg, req);
return err;
}
@@ -175,8 +216,7 @@ static int enable_verity(struct file *filp,
/* Get the salt if the user provided one */
if (arg->salt_size &&
- copy_from_user(desc->salt,
- (const u8 __user *)(uintptr_t)arg->salt_ptr,
+ copy_from_user(desc->salt, u64_to_user_ptr(arg->salt_ptr),
arg->salt_size)) {
err = -EFAULT;
goto out;
@@ -185,8 +225,7 @@ static int enable_verity(struct file *filp,
/* Get the signature if the user provided one */
if (arg->sig_size &&
- copy_from_user(desc->signature,
- (const u8 __user *)(uintptr_t)arg->sig_ptr,
+ copy_from_user(desc->signature, u64_to_user_ptr(arg->sig_ptr),
arg->sig_size)) {
err = -EFAULT;
goto out;
@@ -227,7 +266,7 @@ static int enable_verity(struct file *filp,
*/
pr_debug("Building Merkle tree...\n");
BUILD_BUG_ON(sizeof(desc->root_hash) < FS_VERITY_MAX_DIGEST_SIZE);
- err = build_merkle_tree(inode, &params, desc->root_hash);
+ err = build_merkle_tree(filp, &params, desc->root_hash);
if (err) {
fsverity_err(inode, "Error %d building Merkle tree", err);
goto rollback;
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index e74c79b64d88..74768cf539da 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -16,6 +16,7 @@
#include <crypto/sha.h>
#include <linux/fsverity.h>
+#include <linux/mempool.h>
struct ahash_request;
@@ -37,11 +38,12 @@ struct fsverity_hash_alg {
const char *name; /* crypto API name, e.g. sha256 */
unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */
unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */
+ mempool_t req_pool; /* mempool with a preallocated hash request */
};
/* Merkle tree parameters: hash algorithm, initial hash state, and topology */
struct merkle_tree_params {
- const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
+ struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
const u8 *hashstate; /* initial hash state or NULL */
unsigned int digest_size; /* same as hash_alg->digest_size */
unsigned int block_size; /* size of data and tree blocks */
@@ -50,6 +52,7 @@ struct merkle_tree_params {
unsigned int log_arity; /* log2(hashes_per_block) */
unsigned int num_levels; /* number of levels in Merkle tree */
u64 tree_size; /* Merkle tree size in bytes */
+ unsigned long level0_blocks; /* number of blocks in tree level 0 */
/*
* Starting block index for each tree level, ordered from leaf level (0)
@@ -114,14 +117,18 @@ struct fsverity_signed_digest {
extern struct fsverity_hash_alg fsverity_hash_algs[];
-const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
- unsigned int num);
-const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
+ unsigned int num);
+struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg,
+ gfp_t gfp_flags);
+void fsverity_free_hash_request(struct fsverity_hash_alg *alg,
+ struct ahash_request *req);
+const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg,
const u8 *salt, size_t salt_size);
int fsverity_hash_page(const struct merkle_tree_params *params,
const struct inode *inode,
struct ahash_request *req, struct page *page, u8 *out);
-int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+int fsverity_hash_buffer(struct fsverity_hash_alg *alg,
const void *data, size_t size, u8 *out);
void __init fsverity_check_hash_algs(void);
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 31e6d7d2389a..c37e186ebeb6 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -24,6 +24,8 @@ struct fsverity_hash_alg fsverity_hash_algs[] = {
},
};
+static DEFINE_MUTEX(fsverity_hash_alg_init_mutex);
+
/**
* fsverity_get_hash_alg() - validate and prepare a hash algorithm
* @inode: optional inode for logging purposes
@@ -36,8 +38,8 @@ struct fsverity_hash_alg fsverity_hash_algs[] = {
*
* Return: pointer to the hash alg on success, else an ERR_PTR()
*/
-const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
- unsigned int num)
+struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
+ unsigned int num)
{
struct fsverity_hash_alg *alg;
struct crypto_ahash *tfm;
@@ -50,10 +52,15 @@ const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
}
alg = &fsverity_hash_algs[num];
- /* pairs with cmpxchg() below */
- tfm = READ_ONCE(alg->tfm);
- if (likely(tfm != NULL))
+ /* pairs with smp_store_release() below */
+ if (likely(smp_load_acquire(&alg->tfm) != NULL))
return alg;
+
+ mutex_lock(&fsverity_hash_alg_init_mutex);
+
+ if (alg->tfm != NULL)
+ goto out_unlock;
+
/*
* Using the shash API would make things a bit simpler, but the ahash
* API is preferable as it allows the use of crypto accelerators.
@@ -64,12 +71,14 @@ const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
fsverity_warn(inode,
"Missing crypto API support for hash algorithm \"%s\"",
alg->name);
- return ERR_PTR(-ENOPKG);
+ alg = ERR_PTR(-ENOPKG);
+ goto out_unlock;
}
fsverity_err(inode,
"Error allocating hash algorithm \"%s\": %ld",
alg->name, PTR_ERR(tfm));
- return ERR_CAST(tfm);
+ alg = ERR_CAST(tfm);
+ goto out_unlock;
}
err = -EINVAL;
@@ -78,18 +87,61 @@ const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
if (WARN_ON(alg->block_size != crypto_ahash_blocksize(tfm)))
goto err_free_tfm;
+ err = mempool_init_kmalloc_pool(&alg->req_pool, 1,
+ sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(tfm));
+ if (err)
+ goto err_free_tfm;
+
pr_info("%s using implementation \"%s\"\n",
alg->name, crypto_ahash_driver_name(tfm));
- /* pairs with READ_ONCE() above */
- if (cmpxchg(&alg->tfm, NULL, tfm) != NULL)
- crypto_free_ahash(tfm);
-
- return alg;
+ /* pairs with smp_load_acquire() above */
+ smp_store_release(&alg->tfm, tfm);
+ goto out_unlock;
err_free_tfm:
crypto_free_ahash(tfm);
- return ERR_PTR(err);
+ alg = ERR_PTR(err);
+out_unlock:
+ mutex_unlock(&fsverity_hash_alg_init_mutex);
+ return alg;
+}
+
+/**
+ * fsverity_alloc_hash_request() - allocate a hash request object
+ * @alg: the hash algorithm for which to allocate the request
+ * @gfp_flags: memory allocation flags
+ *
+ * This is mempool-backed, so this never fails if __GFP_DIRECT_RECLAIM is set in
+ * @gfp_flags. However, in that case this might need to wait for all
+ * previously-allocated requests to be freed. So to avoid deadlocks, callers
+ * must never need multiple requests at a time to make forward progress.
+ *
+ * Return: the request object on success; NULL on failure (but see above)
+ */
+struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg,
+ gfp_t gfp_flags)
+{
+ struct ahash_request *req = mempool_alloc(&alg->req_pool, gfp_flags);
+
+ if (req)
+ ahash_request_set_tfm(req, alg->tfm);
+ return req;
+}
+
+/**
+ * fsverity_free_hash_request() - free a hash request object
+ * @alg: the hash algorithm
+ * @req: the hash request object to free
+ */
+void fsverity_free_hash_request(struct fsverity_hash_alg *alg,
+ struct ahash_request *req)
+{
+ if (req) {
+ ahash_request_zero(req);
+ mempool_free(req, &alg->req_pool);
+ }
}
/**
@@ -101,7 +153,7 @@ err_free_tfm:
* Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed
* initial hash state on success or an ERR_PTR() on failure.
*/
-const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg,
const u8 *salt, size_t salt_size)
{
u8 *hashstate = NULL;
@@ -119,11 +171,8 @@ const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
if (!hashstate)
return ERR_PTR(-ENOMEM);
- req = ahash_request_alloc(alg->tfm, GFP_KERNEL);
- if (!req) {
- err = -ENOMEM;
- goto err_free;
- }
+ /* This allocation never fails, since it's mempool-backed. */
+ req = fsverity_alloc_hash_request(alg, GFP_KERNEL);
/*
* Zero-pad the salt to the next multiple of the input size of the hash
@@ -158,7 +207,7 @@ const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
if (err)
goto err_free;
out:
- ahash_request_free(req);
+ fsverity_free_hash_request(alg, req);
kfree(padded_salt);
return hashstate;
@@ -229,7 +278,7 @@ int fsverity_hash_page(const struct merkle_tree_params *params,
*
* Return: 0 on success, -errno on failure
*/
-int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+int fsverity_hash_buffer(struct fsverity_hash_alg *alg,
const void *data, size_t size, u8 *out)
{
struct ahash_request *req;
@@ -237,9 +286,8 @@ int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
DECLARE_CRYPTO_WAIT(wait);
int err;
- req = ahash_request_alloc(alg->tfm, GFP_KERNEL);
- if (!req)
- return -ENOMEM;
+ /* This allocation never fails, since it's mempool-backed. */
+ req = fsverity_alloc_hash_request(alg, GFP_KERNEL);
sg_init_one(&sg, data, size);
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
@@ -249,7 +297,7 @@ int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
err = crypto_wait_req(crypto_ahash_digest(req), &wait);
- ahash_request_free(req);
+ fsverity_free_hash_request(alg, req);
return err;
}
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 63d1004b688c..c5fe6948e262 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -31,7 +31,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
unsigned int log_blocksize,
const u8 *salt, size_t salt_size)
{
- const struct fsverity_hash_alg *hash_alg;
+ struct fsverity_hash_alg *hash_alg;
int err;
u64 blocks;
u64 offset;
@@ -102,6 +102,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
/* temporarily using level_start[] to store blocks in level */
params->level_start[params->num_levels++] = blocks;
}
+ params->level0_blocks = params->level_start[0];
/* Compute the starting block of each level */
offset = 0;
@@ -126,7 +127,7 @@ out_err:
* Compute the file measurement by hashing the fsverity_descriptor excluding the
* signature and with the sig_size field set to 0.
*/
-static int compute_file_measurement(const struct fsverity_hash_alg *hash_alg,
+static int compute_file_measurement(struct fsverity_hash_alg *hash_alg,
struct fsverity_descriptor *desc,
u8 *measurement)
{
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 3e8f2de44667..e0cb62da3864 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -84,7 +84,8 @@ static inline int cmp_hashes(const struct fsverity_info *vi,
* Return: true if the page is valid, else false.
*/
static bool verify_page(struct inode *inode, const struct fsverity_info *vi,
- struct ahash_request *req, struct page *data_page)
+ struct ahash_request *req, struct page *data_page,
+ unsigned long level0_ra_pages)
{
const struct merkle_tree_params *params = &vi->tree_params;
const unsigned int hsize = params->digest_size;
@@ -117,8 +118,8 @@ static bool verify_page(struct inode *inode, const struct fsverity_info *vi,
pr_debug_ratelimited("Level %d: hindex=%lu, hoffset=%u\n",
level, hindex, hoffset);
- hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
- hindex);
+ hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, hindex,
+ level == 0 ? level0_ra_pages : 0);
if (IS_ERR(hpage)) {
err = PTR_ERR(hpage);
fsverity_err(inode,
@@ -191,13 +192,12 @@ bool fsverity_verify_page(struct page *page)
struct ahash_request *req;
bool valid;
- req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS);
- if (unlikely(!req))
- return false;
+ /* This allocation never fails, since it's mempool-backed. */
+ req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS);
- valid = verify_page(inode, vi, req, page);
+ valid = verify_page(inode, vi, req, page, 0);
- ahash_request_free(req);
+ fsverity_free_hash_request(vi->tree_params.hash_alg, req);
return valid;
}
@@ -222,25 +222,42 @@ void fsverity_verify_bio(struct bio *bio)
{
struct inode *inode = bio_first_page_all(bio)->mapping->host;
const struct fsverity_info *vi = inode->i_verity_info;
+ const struct merkle_tree_params *params = &vi->tree_params;
struct ahash_request *req;
struct bio_vec *bv;
struct bvec_iter_all iter_all;
-
- req = ahash_request_alloc(vi->tree_params.hash_alg->tfm, GFP_NOFS);
- if (unlikely(!req)) {
+ unsigned long max_ra_pages = 0;
+
+ /* This allocation never fails, since it's mempool-backed. */
+ req = fsverity_alloc_hash_request(params->hash_alg, GFP_NOFS);
+
+ if (bio->bi_opf & REQ_RAHEAD) {
+ /*
+ * If this bio is for data readahead, then we also do readahead
+ * of the first (largest) level of the Merkle tree. Namely,
+ * when a Merkle tree page is read, we also try to piggy-back on
+ * some additional pages -- up to 1/4 the number of data pages.
+ *
+ * This improves sequential read performance, as it greatly
+ * reduces the number of I/O requests made to the Merkle tree.
+ */
bio_for_each_segment_all(bv, bio, iter_all)
- SetPageError(bv->bv_page);
- return;
+ max_ra_pages++;
+ max_ra_pages /= 4;
}
bio_for_each_segment_all(bv, bio, iter_all) {
struct page *page = bv->bv_page;
+ unsigned long level0_index = page->index >> params->log_arity;
+ unsigned long level0_ra_pages =
+ min(max_ra_pages, params->level0_blocks - level0_index);
- if (!PageError(page) && !verify_page(inode, vi, req, page))
+ if (!PageError(page) &&
+ !verify_page(inode, vi, req, page, level0_ra_pages))
SetPageError(page);
}
- ahash_request_free(req);
+ fsverity_free_hash_request(params->hash_alg, req);
}
EXPORT_SYMBOL_GPL(fsverity_verify_bio);
#endif /* CONFIG_BLOCK */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 0d7fcc983b3d..e6149720ce02 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -62,6 +62,7 @@ xfs_attr_args_init(
struct xfs_da_args *args,
struct xfs_inode *dp,
const unsigned char *name,
+ size_t namelen,
int flags)
{
@@ -74,7 +75,7 @@ xfs_attr_args_init(
args->dp = dp;
args->flags = flags;
args->name = name;
- args->namelen = strlen((const char *)name);
+ args->namelen = namelen;
if (args->namelen >= MAXNAMELEN)
return -EFAULT; /* match IRIX behaviour */
@@ -139,6 +140,7 @@ int
xfs_attr_get(
struct xfs_inode *ip,
const unsigned char *name,
+ size_t namelen,
unsigned char **value,
int *valuelenp,
int flags)
@@ -154,7 +156,7 @@ xfs_attr_get(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- error = xfs_attr_args_init(&args, ip, name, flags);
+ error = xfs_attr_args_init(&args, ip, name, namelen, flags);
if (error)
return error;
@@ -338,6 +340,7 @@ int
xfs_attr_set(
struct xfs_inode *dp,
const unsigned char *name,
+ size_t namelen,
unsigned char *value,
int valuelen,
int flags)
@@ -353,7 +356,7 @@ xfs_attr_set(
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- error = xfs_attr_args_init(&args, dp, name, flags);
+ error = xfs_attr_args_init(&args, dp, name, namelen, flags);
if (error)
return error;
@@ -442,6 +445,7 @@ int
xfs_attr_remove(
struct xfs_inode *dp,
const unsigned char *name,
+ size_t namelen,
int flags)
{
struct xfs_mount *mp = dp->i_mount;
@@ -453,7 +457,7 @@ xfs_attr_remove(
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- error = xfs_attr_args_init(&args, dp, name, flags);
+ error = xfs_attr_args_init(&args, dp, name, namelen, flags);
if (error)
return error;
@@ -1007,7 +1011,7 @@ restart:
* The INCOMPLETE flag means that we will find the "old"
* attr, not the "new" one.
*/
- args->flags |= XFS_ATTR_INCOMPLETE;
+ args->op_flags |= XFS_DA_OP_INCOMPLETE;
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 94badfa1743e..4243b2272642 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -26,7 +26,7 @@ struct xfs_attr_list_context;
*========================================================================*/
-#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */
+#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */
#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */
#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */
#define ATTR_SECURE 0x0008 /* use attrs in security namespace */
@@ -37,7 +37,10 @@ struct xfs_attr_list_context;
#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */
-#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */
+#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */
+
+#define ATTR_KERNEL_FLAGS \
+ (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC)
#define XFS_ATTR_FLAGS \
{ ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
@@ -145,11 +148,13 @@ int xfs_attr_list_int(struct xfs_attr_list_context *);
int xfs_inode_hasattr(struct xfs_inode *ip);
int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
- unsigned char **value, int *valuelenp, int flags);
+ size_t namelen, unsigned char **value, int *valuelenp,
+ int flags);
int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
- unsigned char *value, int valuelen, int flags);
+ size_t namelen, unsigned char *value, int valuelen, int flags);
int xfs_attr_set_args(struct xfs_da_args *args);
-int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name,
+ size_t namelen, int flags);
int xfs_attr_remove_args(struct xfs_da_args *args);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 08d4b10ae2d5..fed537a4353d 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -2403,8 +2403,8 @@ xfs_attr3_leaf_lookup_int(
* If we are looking for INCOMPLETE entries, show only those.
* If we are looking for complete entries, show only those.
*/
- if ((args->flags & XFS_ATTR_INCOMPLETE) !=
- (entry->flags & XFS_ATTR_INCOMPLETE)) {
+ if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) !=
+ !!(entry->flags & XFS_ATTR_INCOMPLETE)) {
continue;
}
if (entry->flags & XFS_ATTR_LOCAL) {
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index f4a188e28b7b..73615b1dd1a8 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -39,15 +39,6 @@ struct xfs_attr3_icleaf_hdr {
} freemap[XFS_ATTR_LEAF_MAPSIZE];
};
-/*
- * Used to keep a list of "remote value" extents when unlinking an inode.
- */
-typedef struct xfs_attr_inactive_list {
- xfs_dablk_t valueblk; /* block number of value bytes */
- int valuelen; /* number of bytes in value */
-} xfs_attr_inactive_list_t;
-
-
/*========================================================================
* Function prototypes for the kernel.
*========================================================================*/
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index a6ef5df42669..a266d05df146 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -26,6 +26,23 @@
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
/*
+ * Remote Attribute Values
+ * =======================
+ *
+ * Remote extended attribute values are conceptually simple -- they're written
+ * to data blocks mapped by an inode's attribute fork, and they have an upper
+ * size limit of 64k. Setting a value does not involve the XFS log.
+ *
+ * However, on a v5 filesystem, maximally sized remote attr values require one
+ * block more than 64k worth of space to hold both the remote attribute value
+ * header (64 bytes). On a 4k block filesystem this results in a 68k buffer;
+ * on a 64k block filesystem, this would be a 128k buffer. Note that the log
+ * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k).
+ * Therefore, we /must/ ensure that remote attribute value buffers never touch
+ * the logging system and therefore never have a log item.
+ */
+
+/*
* Each contiguous block has a header, so it is not just a simple attribute
* length to FSB conversion.
*/
@@ -401,17 +418,25 @@ xfs_attr_rmtval_get(
(map[i].br_startblock != HOLESTARTBLOCK));
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- error = xfs_trans_read_buf(mp, args->trans,
- mp->m_ddev_targp,
- dblkno, dblkcnt, 0, &bp,
- &xfs_attr3_rmt_buf_ops);
- if (error)
+ bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0,
+ &xfs_attr3_rmt_buf_ops);
+ if (!bp)
+ return -ENOMEM;
+ error = bp->b_error;
+ if (error) {
+ xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_relse(bp);
+
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
return error;
+ }
error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
&offset, &valuelen,
&dst);
- xfs_trans_brelse(args->trans, bp);
+ xfs_buf_relse(bp);
if (error)
return error;
@@ -552,6 +577,33 @@ xfs_attr_rmtval_set(
return 0;
}
+/* Mark stale any incore buffers for the remote value. */
+int
+xfs_attr_rmtval_stale(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *map,
+ xfs_buf_flags_t incore_flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buf *bp;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) ||
+ XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
+ return -EFSCORRUPTED;
+
+ bp = xfs_buf_incore(mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map->br_startblock),
+ XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags);
+ if (bp) {
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ }
+
+ return 0;
+}
+
/*
* Remove the value associated with an attribute by deleting the
* out-of-line buffer that it is stored on.
@@ -560,7 +612,6 @@ int
xfs_attr_rmtval_remove(
struct xfs_da_args *args)
{
- struct xfs_mount *mp = args->dp->i_mount;
xfs_dablk_t lblkno;
int blkcnt;
int error;
@@ -575,9 +626,6 @@ xfs_attr_rmtval_remove(
blkcnt = args->rmtblkcnt;
while (blkcnt > 0) {
struct xfs_bmbt_irec map;
- struct xfs_buf *bp;
- xfs_daddr_t dblkno;
- int dblkcnt;
int nmap;
/*
@@ -588,22 +636,11 @@ xfs_attr_rmtval_remove(
blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
if (error)
return error;
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
- /*
- * If the "remote" value is in the cache, remove it.
- */
- bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
- if (bp) {
- xfs_buf_stale(bp);
- xfs_buf_relse(bp);
- bp = NULL;
- }
+ if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1))
+ return -EFSCORRUPTED;
+ error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
+ if (error)
+ return error;
lblkno += map.br_blockcount;
blkcnt -= map.br_blockcount;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index 9d20b66ad379..6fb4572845ce 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_set(struct xfs_da_args *args);
int xfs_attr_rmtval_remove(struct xfs_da_args *args);
+int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
+ xfs_buf_flags_t incore_flags);
#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index e2cc98931552..b22c7e928eb1 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2389,8 +2389,6 @@ xfs_btree_lshift(
XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
if (level > 0) {
/* It's a nonleaf. operate on keys and ptrs */
- int i; /* loop index */
-
for (i = 0; i < rrecs; i++) {
error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
if (error)
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index e16610d1c14f..0f4fbb0889ff 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -89,6 +89,7 @@ typedef struct xfs_da_args {
#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */
+#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
@@ -96,7 +97,8 @@ typedef struct xfs_da_args {
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }
+ { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \
+ { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" }
/*
* Storage for holding state during Btree searches and split/join ops.
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 3dee33043e09..734837a9b51a 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -217,7 +217,7 @@ typedef struct xfs_dir2_sf_entry {
* A 64-bit or 32-bit inode number follows here, at a variable offset
* after the name.
*/
-} xfs_dir2_sf_entry_t;
+} __packed xfs_dir2_sf_entry_t;
static inline int xfs_dir2_sf_hdr_size(int i8count)
{
@@ -683,8 +683,6 @@ struct xfs_attr3_leafblock {
/*
* Flags used in the leaf_entry[i].flags field.
- * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
- * on the system call, they are "or"ed together for various operations.
*/
#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 1b7dcbae051c..77e9fa385980 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1540,6 +1540,13 @@ typedef struct xfs_bmdr_block {
#define BMBT_BLOCKCOUNT_BITLEN 21
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
+#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
+
+/*
+ * bmbt records have a file offset (block) field that is 54 bits wide, so this
+ * is the largest xfs_fileoff_t that we ever expect to see.
+ */
+#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK)
typedef struct xfs_bmbt_rec {
__be64 l0, l1;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 8ef31d71a9c7..9bac0d2e56dc 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -462,11 +462,20 @@ static inline uint xfs_log_dinode_size(int version)
#define XFS_BLF_GDQUOT_BUF (1<<4)
/*
- * This is the structure used to lay out a buf log item in the
- * log. The data map describes which 128 byte chunks of the buffer
- * have been logged.
- */
-#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+ * This is the structure used to lay out a buf log item in the log. The data
+ * map describes which 128 byte chunks of the buffer have been logged.
+ *
+ * The placement of blf_map_size causes blf_data_map to start at an odd
+ * multiple of sizeof(unsigned int) offset within the struct. Because the data
+ * bitmap size will always be an even number, the end of the data_map (and
+ * therefore the structure) will also be at an odd multiple of sizeof(unsigned
+ * int). Some 64-bit compilers will insert padding at the end of the struct to
+ * ensure 64-bit alignment of blf_blkno, but 32-bit ones will not. Therefore,
+ * XFS_BLF_DATAMAP_SIZE must be an odd number to make the padding explicit and
+ * keep the structure size consistent between 32-bit and 64-bit platforms.
+ */
+#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1)
typedef struct xfs_buf_log_format {
unsigned short blf_type; /* buf log item type indicator */
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 60c61d7052a8..c3422403b169 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -75,7 +75,6 @@ static inline xfs_extlen_t
xrep_calc_ag_resblks(
struct xfs_scrub *sc)
{
- ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR));
return 0;
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 3362bae28b46..096203119934 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -329,7 +329,7 @@ TRACE_EVENT(xchk_btree_op_error,
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
- __field(int, ptr);
+ __field(int, ptr)
__field(int, error)
__field(void *, ret_ip)
),
@@ -414,7 +414,7 @@ TRACE_EVENT(xchk_btree_error,
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
- __field(int, ptr);
+ __field(int, ptr)
__field(void *, ret_ip)
),
TP_fast_assign(
@@ -452,7 +452,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
- __field(int, ptr);
+ __field(int, ptr)
__field(void *, ret_ip)
),
TP_fast_assign(
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 91693fce34a8..cd743fad8478 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -145,7 +145,8 @@ xfs_get_acl(struct inode *inode, int type)
* go out to the disk.
*/
len = XFS_ACL_MAX_SIZE(ip->i_mount);
- error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
+ error = xfs_attr_get(ip, ea_name, strlen(ea_name),
+ (unsigned char **)&xfs_acl, &len,
ATTR_ALLOC | ATTR_ROOT);
if (error) {
/*
@@ -196,15 +197,17 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
len -= sizeof(struct xfs_acl_entry) *
(XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
- error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
- len, ATTR_ROOT);
+ error = xfs_attr_set(ip, ea_name, strlen(ea_name),
+ (unsigned char *)xfs_acl, len, ATTR_ROOT);
kmem_free(xfs_acl);
} else {
/*
* A NULL ACL argument means we want to remove the ACL.
*/
- error = xfs_attr_remove(ip, ea_name, ATTR_ROOT);
+ error = xfs_attr_remove(ip, ea_name,
+ strlen(ea_name),
+ ATTR_ROOT);
/*
* If the attribute didn't exist to start with that's fine.
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 5ff49523d8ea..8fbb841cd6fe 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -25,22 +25,18 @@
#include "xfs_error.h"
/*
- * Look at all the extents for this logical region,
- * invalidate any buffers that are incore/in transactions.
+ * Invalidate any incore buffers associated with this remote attribute value
+ * extent. We never log remote attribute value buffers, which means that they
+ * won't be attached to a transaction and are therefore safe to mark stale.
+ * The actual bunmapi will be taken care of later.
*/
STATIC int
-xfs_attr3_leaf_freextent(
- struct xfs_trans **trans,
+xfs_attr3_rmt_stale(
struct xfs_inode *dp,
xfs_dablk_t blkno,
int blkcnt)
{
struct xfs_bmbt_irec map;
- struct xfs_buf *bp;
- xfs_dablk_t tblkno;
- xfs_daddr_t dblkno;
- int tblkcnt;
- int dblkcnt;
int nmap;
int error;
@@ -48,47 +44,29 @@ xfs_attr3_leaf_freextent(
* Roll through the "value", invalidating the attribute value's
* blocks.
*/
- tblkno = blkno;
- tblkcnt = blkcnt;
- while (tblkcnt > 0) {
+ while (blkcnt > 0) {
/*
* Try to remember where we decided to put the value.
*/
nmap = 1;
- error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt,
&map, &nmap, XFS_BMAPI_ATTRFORK);
- if (error) {
+ if (error)
return error;
- }
- ASSERT(nmap == 1);
- ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+ if (XFS_IS_CORRUPT(dp->i_mount, nmap != 1))
+ return -EFSCORRUPTED;
/*
- * If it's a hole, these are already unmapped
- * so there's nothing to invalidate.
+ * Mark any incore buffers for the remote value as stale. We
+ * never log remote attr value buffers, so the buffer should be
+ * easy to kill.
*/
- if (map.br_startblock != HOLESTARTBLOCK) {
-
- dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
- map.br_startblock);
- dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
- map.br_blockcount);
- bp = xfs_trans_get_buf(*trans,
- dp->i_mount->m_ddev_targp,
- dblkno, dblkcnt, 0);
- if (!bp)
- return -ENOMEM;
- xfs_trans_binval(*trans, bp);
- /*
- * Roll to next transaction.
- */
- error = xfs_trans_roll_inode(trans, dp);
- if (error)
- return error;
- }
+ error = xfs_attr_rmtval_stale(dp, &map, 0);
+ if (error)
+ return error;
- tblkno += map.br_blockcount;
- tblkcnt -= map.br_blockcount;
+ blkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
}
return 0;
@@ -102,86 +80,45 @@ xfs_attr3_leaf_freextent(
*/
STATIC int
xfs_attr3_leaf_inactive(
- struct xfs_trans **trans,
- struct xfs_inode *dp,
- struct xfs_buf *bp)
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
{
- struct xfs_attr_leafblock *leaf;
- struct xfs_attr3_icleaf_hdr ichdr;
- struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
struct xfs_attr_leaf_name_remote *name_rmt;
- struct xfs_attr_inactive_list *list;
- struct xfs_attr_inactive_list *lp;
- int error;
- int count;
- int size;
- int tmp;
- int i;
- struct xfs_mount *mp = bp->b_mount;
+ int error = 0;
+ int i;
- leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
/*
- * Count the number of "remote" value extents.
+ * Find the remote value extents for this leaf and invalidate their
+ * incore buffers.
*/
- count = 0;
entry = xfs_attr3_leaf_entryp(leaf);
for (i = 0; i < ichdr.count; entry++, i++) {
- if (be16_to_cpu(entry->nameidx) &&
- ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
- if (name_rmt->valueblk)
- count++;
- }
- }
+ int blkcnt;
- /*
- * If there are no "remote" values, we're done.
- */
- if (count == 0) {
- xfs_trans_brelse(*trans, bp);
- return 0;
- }
-
- /*
- * Allocate storage for a list of all the "remote" value extents.
- */
- size = count * sizeof(xfs_attr_inactive_list_t);
- list = kmem_alloc(size, 0);
-
- /*
- * Identify each of the "remote" value extents.
- */
- lp = list;
- entry = xfs_attr3_leaf_entryp(leaf);
- for (i = 0; i < ichdr.count; entry++, i++) {
- if (be16_to_cpu(entry->nameidx) &&
- ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
- if (name_rmt->valueblk) {
- lp->valueblk = be32_to_cpu(name_rmt->valueblk);
- lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
- be32_to_cpu(name_rmt->valuelen));
- lp++;
- }
- }
- }
- xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
+ if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL))
+ continue;
- /*
- * Invalidate each of the "remote" value extents.
- */
- error = 0;
- for (lp = list, i = 0; i < count; i++, lp++) {
- tmp = xfs_attr3_leaf_freextent(trans, dp,
- lp->valueblk, lp->valuelen);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ if (!name_rmt->valueblk)
+ continue;
- if (error == 0)
- error = tmp; /* save only the 1st errno */
+ blkcnt = xfs_attr3_rmt_blocks(dp->i_mount,
+ be32_to_cpu(name_rmt->valuelen));
+ error = xfs_attr3_rmt_stale(dp,
+ be32_to_cpu(name_rmt->valueblk), blkcnt);
+ if (error)
+ goto err;
}
- kmem_free(list);
+ xfs_trans_brelse(*trans, bp);
+err:
return error;
}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3984779e5911..5be8973a452c 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -27,6 +27,23 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
+/* Is this log iovec plausibly large enough to contain the buffer log format? */
+bool
+xfs_buf_log_check_iovec(
+ struct xfs_log_iovec *iovec)
+{
+ struct xfs_buf_log_format *blfp = iovec->i_addr;
+ char *bmp_end;
+ char *item_end;
+
+ if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
+ return false;
+
+ item_end = (char *)iovec->i_addr + iovec->i_len;
+ bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
+ return bmp_end <= item_end;
+}
+
static inline int
xfs_buf_log_format_size(
struct xfs_buf_log_format *blfp)
@@ -688,7 +705,7 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_push = xfs_buf_item_push,
};
-STATIC int
+STATIC void
xfs_buf_item_get_format(
struct xfs_buf_log_item *bip,
int count)
@@ -698,14 +715,11 @@ xfs_buf_item_get_format(
if (count == 1) {
bip->bli_formats = &bip->__bli_format;
- return 0;
+ return;
}
bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
0);
- if (!bip->bli_formats)
- return -ENOMEM;
- return 0;
}
STATIC void
@@ -731,7 +745,6 @@ xfs_buf_item_init(
struct xfs_buf_log_item *bip = bp->b_log_item;
int chunks;
int map_size;
- int error;
int i;
/*
@@ -760,19 +773,22 @@ xfs_buf_item_init(
* Discontiguous buffer support follows the layout of the underlying
* buffer. This makes the implementation as simple as possible.
*/
- error = xfs_buf_item_get_format(bip, bp->b_map_count);
- ASSERT(error == 0);
- if (error) { /* to stop gcc throwing set-but-unused warnings */
- kmem_cache_free(xfs_buf_item_zone, bip);
- return error;
- }
-
+ xfs_buf_item_get_format(bip, bp->b_map_count);
for (i = 0; i < bip->bli_format_count; i++) {
chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
XFS_BLF_CHUNK);
map_size = DIV_ROUND_UP(chunks, NBWORD);
+ if (map_size > XFS_BLF_DATAMAP_SIZE) {
+ kmem_cache_free(xfs_buf_item_zone, bip);
+ xfs_err(mp,
+ "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
+ map_size,
+ BBTOB(bp->b_maps[i].bm_len));
+ return -EFSCORRUPTED;
+ }
+
bip->bli_formats[i].blf_type = XFS_LI_BUF;
bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
@@ -805,6 +821,9 @@ xfs_buf_item_log_segment(
uint end_bit;
uint mask;
+ ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
+ ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
+
/*
* Convert byte offsets to bit numbers.
*/
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 4a054b11011a..30114b510332 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -61,6 +61,7 @@ void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
struct list_head *);
+bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
extern kmem_zone_t *xfs_buf_item_zone;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 2bff21ca9d78..9cfd3209f52b 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -137,7 +137,7 @@ xfs_qm_adjust_dqtimers(
(d->d_blk_hardlimit &&
(be64_to_cpu(d->d_bcount) >
be64_to_cpu(d->d_blk_hardlimit)))) {
- d->d_btimer = cpu_to_be32(get_seconds() +
+ d->d_btimer = cpu_to_be32(ktime_get_real_seconds() +
mp->m_quotainfo->qi_btimelimit);
} else {
d->d_bwarns = 0;
@@ -160,7 +160,7 @@ xfs_qm_adjust_dqtimers(
(d->d_ino_hardlimit &&
(be64_to_cpu(d->d_icount) >
be64_to_cpu(d->d_ino_hardlimit)))) {
- d->d_itimer = cpu_to_be32(get_seconds() +
+ d->d_itimer = cpu_to_be32(ktime_get_real_seconds() +
mp->m_quotainfo->qi_itimelimit);
} else {
d->d_iwarns = 0;
@@ -183,7 +183,7 @@ xfs_qm_adjust_dqtimers(
(d->d_rtb_hardlimit &&
(be64_to_cpu(d->d_rtbcount) >
be64_to_cpu(d->d_rtb_hardlimit)))) {
- d->d_rtbtimer = cpu_to_be32(get_seconds() +
+ d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() +
mp->m_quotainfo->qi_rtbtimelimit);
} else {
d->d_rtbwarns = 0;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c93250108952..b8a4a3f29b36 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -187,7 +187,12 @@ xfs_file_dio_aio_read(
file_accessed(iocb->ki_filp);
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ }
ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
is_sync_kiocb(iocb));
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 401da197f012..1979a0055763 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1518,10 +1518,8 @@ xfs_itruncate_extents_flags(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
xfs_fileoff_t first_unmap_block;
- xfs_fileoff_t last_block;
xfs_filblks_t unmap_len;
int error = 0;
- int done = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
@@ -1541,21 +1539,22 @@ xfs_itruncate_extents_flags(
* the end of the file (in a crash where the space is allocated
* but the inode size is not yet updated), simply remove any
* blocks which show up between the new EOF and the maximum
- * possible file size. If the first block to be removed is
- * beyond the maximum file size (ie it is the same as last_block),
- * then there is nothing to do.
+ * possible file size.
+ *
+ * We have to free all the blocks to the bmbt maximum offset, even if
+ * the page cache can't scale that far.
*/
first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
- last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- if (first_unmap_block == last_block)
+ if (first_unmap_block >= XFS_MAX_FILEOFF) {
+ WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
return 0;
+ }
- ASSERT(first_unmap_block < last_block);
- unmap_len = last_block - first_unmap_block + 1;
- while (!done) {
+ unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
+ while (unmap_len > 0) {
ASSERT(tp->t_firstblock == NULLFSBLOCK);
- error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
- XFS_ITRUNC_MAX_EXTENTS, &done);
+ error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
+ flags, XFS_ITRUNC_MAX_EXTENTS);
if (error)
goto out;
@@ -1575,7 +1574,7 @@ xfs_itruncate_extents_flags(
if (whichfork == XFS_DATA_FORK) {
/* Remove all pending CoW reservations. */
error = xfs_reflink_cancel_cow_blocks(ip, &tp,
- first_unmap_block, last_block, true);
+ first_unmap_block, XFS_MAX_FILEOFF, true);
if (error)
goto out;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 7b35d62ede9f..d42de92cb283 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -357,6 +357,7 @@ xfs_attrmulti_attr_get(
{
unsigned char *kbuf;
int error = -EFAULT;
+ size_t namelen;
if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
@@ -364,7 +365,9 @@ xfs_attrmulti_attr_get(
if (!kbuf)
return -ENOMEM;
- error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags);
+ namelen = strlen(name);
+ error = xfs_attr_get(XFS_I(inode), name, namelen, &kbuf, (int *)len,
+ flags);
if (error)
goto out_kfree;
@@ -386,6 +389,7 @@ xfs_attrmulti_attr_set(
{
unsigned char *kbuf;
int error;
+ size_t namelen;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
@@ -396,7 +400,8 @@ xfs_attrmulti_attr_set(
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ namelen = strlen(name);
+ error = xfs_attr_set(XFS_I(inode), name, namelen, kbuf, len, flags);
if (!error)
xfs_forget_acl(inode, name, flags);
kfree(kbuf);
@@ -410,10 +415,12 @@ xfs_attrmulti_attr_remove(
uint32_t flags)
{
int error;
+ size_t namelen;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- error = xfs_attr_remove(XFS_I(inode), name, flags);
+ namelen = strlen(name);
+ error = xfs_attr_remove(XFS_I(inode), name, namelen, flags);
if (!error)
xfs_forget_acl(inode, name, flags);
return error;
@@ -462,6 +469,13 @@ xfs_attrmulti_by_handle(
error = 0;
for (i = 0; i < am_hreq.opcount; i++) {
+ if ((ops[i].am_flags & ATTR_ROOT) &&
+ (ops[i].am_flags & ATTR_SECURE)) {
+ ops[i].am_error = -EINVAL;
+ continue;
+ }
+ ops[i].am_flags &= ~ATTR_KERNEL_FLAGS;
+
ops[i].am_error = strncpy_from_user((char *)attr_name,
ops[i].am_attrname, MAXNAMELEN);
if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index c4c4f09113d3..769581a79c58 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -107,7 +107,7 @@ xfs_ioctl32_bstime_copyin(
xfs_bstime_t *bstime,
compat_xfs_bstime_t __user *bstime32)
{
- compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */
+ old_time32_t sec32; /* tv_sec differs on 64 vs. 32 */
if (get_user(sec32, &bstime32->tv_sec) ||
get_user(bstime->tv_nsec, &bstime32->tv_nsec))
@@ -450,6 +450,13 @@ xfs_compat_attrmulti_by_handle(
error = 0;
for (i = 0; i < am_hreq.opcount; i++) {
+ if ((ops[i].am_flags & ATTR_ROOT) &&
+ (ops[i].am_flags & ATTR_SECURE)) {
+ ops[i].am_error = -EINVAL;
+ continue;
+ }
+ ops[i].am_flags &= ~ATTR_KERNEL_FLAGS;
+
ops[i].am_error = strncpy_from_user((char *)attr_name,
compat_ptr(ops[i].am_attrname),
MAXNAMELEN);
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 8c7743cd490e..053de7d894cd 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -32,7 +32,7 @@
#endif
typedef struct compat_xfs_bstime {
- compat_time_t tv_sec; /* seconds */
+ old_time32_t tv_sec; /* seconds */
__s32 tv_nsec; /* and nanoseconds */
} compat_xfs_bstime_t;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 28e2d1f37267..bb590a267a7f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -923,7 +923,7 @@ xfs_buffered_write_iomap_begin(
xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_inode_need_cow(ip, &imap, &shared);
+ error = xfs_bmap_trim_cow(ip, &imap, &shared);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 8afe69ca188b..81f2f93caec0 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -50,8 +50,10 @@ xfs_initxattrs(
int error = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- error = xfs_attr_set(ip, xattr->name, xattr->value,
- xattr->value_len, ATTR_SECURE);
+ error = xfs_attr_set(ip, xattr->name,
+ strlen(xattr->name),
+ xattr->value, xattr->value_len,
+ ATTR_SECURE);
if (error < 0)
break;
}
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 99ec3fba4548..0d683fb96396 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1934,6 +1934,12 @@ xlog_recover_buffer_pass1(
struct list_head *bucket;
struct xfs_buf_cancel *bcp;
+ if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) {
+ xfs_err(log->l_mp, "bad buffer log item size (%d)",
+ item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
/*
* If this isn't a cancel buffer item, then just return.
*/
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index b6701b4f59a9..5f04d8a5ab2a 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -111,6 +111,7 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
/* log structures */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88);
XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28);
XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32);
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 7823af39008b..4e57edca8bce 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -64,9 +64,9 @@ struct xfs_quotainfo {
struct xfs_inode *qi_pquotaip; /* project quota inode */
struct list_lru qi_lru;
int qi_dquots;
- time_t qi_btimelimit; /* limit for blks timer */
- time_t qi_itimelimit; /* limit for inodes timer */
- time_t qi_rtbtimelimit;/* limit for rt blks timer */
+ time64_t qi_btimelimit; /* limit for blks timer */
+ time64_t qi_itimelimit; /* limit for inodes timer */
+ time64_t qi_rtbtimelimit;/* limit for rt blks timer */
xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */
xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */
xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index c7de17deeae6..38669e827206 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -37,9 +37,9 @@ xfs_qm_fill_state(
tstate->flags |= QCI_SYSFILE;
tstate->blocks = ip->i_d.di_nblocks;
tstate->nextents = ip->i_d.di_nextents;
- tstate->spc_timelimit = q->qi_btimelimit;
- tstate->ino_timelimit = q->qi_itimelimit;
- tstate->rt_spc_timelimit = q->qi_rtbtimelimit;
+ tstate->spc_timelimit = (u32)q->qi_btimelimit;
+ tstate->ino_timelimit = (u32)q->qi_itimelimit;
+ tstate->rt_spc_timelimit = (u32)q->qi_rtbtimelimit;
tstate->spc_warnlimit = q->qi_bwarnlimit;
tstate->ino_warnlimit = q->qi_iwarnlimit;
tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index de451235c4ee..e723b267a247 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -223,8 +223,8 @@ xfs_reflink_trim_around_shared(
}
}
-bool
-xfs_inode_need_cow(
+int
+xfs_bmap_trim_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
bool *shared)
@@ -327,7 +327,7 @@ xfs_find_trim_cow_extent(
if (cmap->br_startoff > offset_fsb) {
xfs_trim_extent(imap, imap->br_startoff,
cmap->br_startoff - imap->br_startoff);
- return xfs_inode_need_cow(ip, imap, shared);
+ return xfs_bmap_trim_cow(ip, imap, shared);
}
*shared = true;
@@ -1457,7 +1457,8 @@ xfs_reflink_clear_inode_flag(
* We didn't find any shared blocks so turn off the reflink flag.
* First, get rid of any leftover CoW mappings.
*/
- error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
+ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
+ true);
if (error)
return error;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index d18ad7f4fb64..3e4fd46373ab 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -22,7 +22,7 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
-bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
bool *shared);
int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d9ae27ddf253..760901783944 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -193,32 +193,6 @@ xfs_fs_show_options(
return 0;
}
-static uint64_t
-xfs_max_file_offset(
- unsigned int blockshift)
-{
- unsigned int pagefactor = 1;
- unsigned int bitshift = BITS_PER_LONG - 1;
-
- /* Figure out maximum filesize, on Linux this can depend on
- * the filesystem blocksize (on 32 bit platforms).
- * __block_write_begin does this in an [unsigned] long long...
- * page->index << (PAGE_SHIFT - bbits)
- * So, for page sized blocks (4K on 32 bit platforms),
- * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
- * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
- * but for smaller blocksizes it is less (bbits = log2 bsize).
- */
-
-#if BITS_PER_LONG == 32
- ASSERT(sizeof(sector_t) == 8);
- pagefactor = PAGE_SIZE;
- bitshift = BITS_PER_LONG;
-#endif
-
- return (((uint64_t)pagefactor) << bitshift) - 1;
-}
-
/*
* Set parameters for inode allocation heuristics, taking into account
* filesystem size and inode32/inode64 mount options; i.e. specifically
@@ -1424,6 +1398,26 @@ xfs_fc_fill_super(
if (error)
goto out_free_sb;
+ /*
+ * XFS block mappings use 54 bits to store the logical block offset.
+ * This should suffice to handle the maximum file size that the VFS
+ * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT
+ * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes
+ * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON
+ * to check this assertion.
+ *
+ * Avoid integer overflow by comparing the maximum bmbt offset to the
+ * maximum pagecache offset in units of fs blocks.
+ */
+ if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) {
+ xfs_warn(mp,
+"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!",
+ XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE),
+ XFS_MAX_FILEOFF);
+ error = -EINVAL;
+ goto out_free_sb;
+ }
+
error = xfs_filestream_mount(mp);
if (error)
goto out_free_sb;
@@ -1435,7 +1429,7 @@ xfs_fc_fill_super(
sb->s_magic = XFS_SUPER_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
- sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
sb->s_time_min = S32_MIN;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a86be7f807ee..e242988f57fb 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -218,8 +218,8 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(void *, leaf);
- __field(int, pos);
+ __field(void *, leaf)
+ __field(int, pos)
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index a6fe2d8dc40f..d1b9869bc5fa 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -580,7 +580,7 @@ xfs_trans_dqresv(
{
xfs_qcnt_t hardlimit;
xfs_qcnt_t softlimit;
- time_t timer;
+ time64_t timer;
xfs_qwarncnt_t warns;
xfs_qwarncnt_t warnlimit;
xfs_qcnt_t total_count;
@@ -635,7 +635,8 @@ xfs_trans_dqresv(
goto error_return;
}
if (softlimit && total_count > softlimit) {
- if ((timer != 0 && get_seconds() > timer) ||
+ if ((timer != 0 &&
+ ktime_get_real_seconds() > timer) ||
(warns != 0 && warns >= warnlimit)) {
xfs_quota_warn(mp, dqp,
QUOTA_NL_BSOFTLONGWARN);
@@ -662,7 +663,8 @@ xfs_trans_dqresv(
goto error_return;
}
if (softlimit && total_count > softlimit) {
- if ((timer != 0 && get_seconds() > timer) ||
+ if ((timer != 0 &&
+ ktime_get_real_seconds() > timer) ||
(warns != 0 && warns >= warnlimit)) {
xfs_quota_warn(mp, dqp,
QUOTA_NL_ISOFTLONGWARN);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 383f0203d103..b0fedb543f97 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -24,6 +24,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
int xflags = handler->flags;
struct xfs_inode *ip = XFS_I(inode);
int error, asize = size;
+ size_t namelen = strlen(name);
/* Convert Linux syscall to XFS internal ATTR flags */
if (!size) {
@@ -31,7 +32,8 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
value = NULL;
}
- error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags);
+ error = xfs_attr_get(ip, name, namelen, (unsigned char **)&value,
+ &asize, xflags);
if (error)
return error;
return asize;
@@ -67,6 +69,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
int xflags = handler->flags;
struct xfs_inode *ip = XFS_I(inode);
int error;
+ size_t namelen = strlen(name);
/* Convert Linux syscall to XFS internal ATTR flags */
if (flags & XATTR_CREATE)
@@ -74,10 +77,11 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
if (flags & XATTR_REPLACE)
xflags |= ATTR_REPLACE;
- if (!value)
- return xfs_attr_remove(ip, (unsigned char *)name, xflags);
- error = xfs_attr_set(ip, (unsigned char *)name,
- (void *)value, size, xflags);
+ if (value)
+ error = xfs_attr_set(ip, name, namelen, (void *)value, size,
+ xflags);
+ else
+ error = xfs_attr_remove(ip, name, namelen, xflags);
if (!error)
xfs_forget_acl(inode, name, xflags);