summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/affs/affs.h4
-rw-r--r--fs/affs/amigaffs.h144
-rw-r--r--fs/affs/dir.c2
-rw-r--r--fs/affs/file.c10
-rw-r--r--fs/affs/inode.c1
-rw-r--r--fs/affs/namei.c87
-rw-r--r--fs/befs/linuxvfs.c15
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/block_dev.c117
-rw-r--r--fs/btrfs/ctree.c9
-rw-r--r--fs/btrfs/free-space-tree.c3
-rw-r--r--fs/btrfs/ioctl.c9
-rw-r--r--fs/btrfs/send.c27
-rw-r--r--fs/buffer.c32
-rw-r--r--fs/ceph/file.c9
-rw-r--r--fs/ceph/mds_client.c4
-rw-r--r--fs/cifs/cifs_unicode.c6
-rw-r--r--fs/cifs/cifs_unicode.h5
-rw-r--r--fs/cifs/cifsencrypt.c4
-rw-r--r--fs/cifs/cifsfs.c15
-rw-r--r--fs/cifs/cifsglob.h20
-rw-r--r--fs/cifs/cifsproto.h3
-rw-r--r--fs/cifs/cifssmb.c13
-rw-r--r--fs/cifs/connect.c9
-rw-r--r--fs/cifs/file.c357
-rw-r--r--fs/cifs/inode.c28
-rw-r--r--fs/cifs/ioctl.c4
-rw-r--r--fs/cifs/misc.c122
-rw-r--r--fs/cifs/netmisc.c6
-rw-r--r--fs/cifs/smb2misc.c5
-rw-r--r--fs/cifs/smb2ops.c1
-rw-r--r--fs/cifs/smb2pdu.c14
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/crypto/fname.c90
-rw-r--r--fs/crypto/fscrypt_private.h13
-rw-r--r--fs/crypto/keyinfo.c3
-rw-r--r--fs/crypto/policy.c98
-rw-r--r--fs/dax.c337
-rw-r--r--fs/dcache.c4
-rw-r--r--fs/debugfs/inode.c2
-rw-r--r--fs/exofs/dir.c3
-rw-r--r--fs/ext2/inode.c9
-rw-r--r--fs/ext4/Makefile10
-rw-r--r--fs/ext4/ext4.h13
-rw-r--r--fs/ext4/fsmap.c722
-rw-r--r--fs/ext4/fsmap.h69
-rw-r--r--fs/ext4/ialloc.c17
-rw-r--r--fs/ext4/inline.c2
-rw-r--r--fs/ext4/inode.c49
-rw-r--r--fs/ext4/ioctl.c92
-rw-r--r--fs/ext4/mballoc.c53
-rw-r--r--fs/ext4/mballoc.h17
-rw-r--r--fs/ext4/namei.c129
-rw-r--r--fs/ext4/page-io.c11
-rw-r--r--fs/ext4/super.c18
-rw-r--r--fs/ext4/sysfs.c8
-rw-r--r--fs/ext4/xattr.c63
-rw-r--r--fs/f2fs/checkpoint.c77
-rw-r--r--fs/f2fs/data.c200
-rw-r--r--fs/f2fs/debug.c62
-rw-r--r--fs/f2fs/dir.c71
-rw-r--r--fs/f2fs/extent_cache.c326
-rw-r--r--fs/f2fs/f2fs.h343
-rw-r--r--fs/f2fs/file.c177
-rw-r--r--fs/f2fs/gc.c93
-rw-r--r--fs/f2fs/hash.c7
-rw-r--r--fs/f2fs/inline.c38
-rw-r--r--fs/f2fs/inode.c23
-rw-r--r--fs/f2fs/namei.c60
-rw-r--r--fs/f2fs/node.c150
-rw-r--r--fs/f2fs/node.h31
-rw-r--r--fs/f2fs/recovery.c8
-rw-r--r--fs/f2fs/segment.c792
-rw-r--r--fs/f2fs/segment.h144
-rw-r--r--fs/f2fs/super.c48
-rw-r--r--fs/f2fs/trace.c4
-rw-r--r--fs/f2fs/xattr.c31
-rw-r--r--fs/f2fs/xattr.h8
-rw-r--r--fs/fcntl.c14
-rw-r--r--fs/file.c2
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/gfs2/bmap.c741
-rw-r--r--fs/gfs2/file.c6
-rw-r--r--fs/gfs2/glock.c81
-rw-r--r--fs/gfs2/incore.h8
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/rgrp.c7
-rw-r--r--fs/gfs2/rgrp.h7
-rw-r--r--fs/gfs2/super.c11
-rw-r--r--fs/hfs/extent.c4
-rw-r--r--fs/hfsplus/extents.c5
-rw-r--r--fs/inode.c5
-rw-r--r--fs/internal.h2
-rw-r--r--fs/iomap.c19
-rw-r--r--fs/jbd2/journal.c28
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/namei.c2
-rw-r--r--fs/nfs/internal.h6
-rw-r--r--fs/nfs/super.c28
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nsfs.c4
-rw-r--r--fs/open.c6
-rw-r--r--fs/orangefs/devorangefs-req.c4
-rw-r--r--fs/orangefs/dir.c640
-rw-r--r--fs/orangefs/downcall.h16
-rw-r--r--fs/orangefs/file.c6
-rw-r--r--fs/orangefs/inode.c19
-rw-r--r--fs/orangefs/namei.c5
-rw-r--r--fs/orangefs/orangefs-debugfs.c3
-rw-r--r--fs/orangefs/orangefs-dev-proto.h7
-rw-r--r--fs/orangefs/orangefs-kernel.h9
-rw-r--r--fs/orangefs/orangefs-utils.c98
-rw-r--r--fs/orangefs/protocol.h9
-rw-r--r--fs/orangefs/super.c28
-rw-r--r--fs/orangefs/waitqueue.c9
-rw-r--r--fs/orangefs/xattr.c26
-rw-r--r--fs/proc/base.c5
-rw-r--r--fs/proc/generic.c1
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/namespaces.c1
-rw-r--r--fs/proc/proc_sysctl.c4
-rw-r--r--fs/reiserfs/item_ops.c24
-rw-r--r--fs/select.c5
-rw-r--r--fs/seq_file.c16
-rw-r--r--fs/tracefs/inode.c2
-rw-r--r--fs/ubifs/dir.c23
-rw-r--r--fs/ubifs/file.c12
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/misc.h10
-rw-r--r--fs/ubifs/sb.c14
-rw-r--r--fs/ubifs/xattr.c6
-rw-r--r--fs/ufs/ialloc.c6
-rw-r--r--fs/xattr.c27
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/kmem.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c57
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h12
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c172
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c354
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h14
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c43
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h22
-rw-r--r--fs/xfs/libxfs/xfs_btree.c15
-rw-r--r--fs/xfs/libxfs/xfs_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c7
-rw-r--r--fs/xfs/libxfs/xfs_format.h11
-rw-r--r--fs/xfs/libxfs/xfs_fs.h13
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c90
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c56
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h4
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c70
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h23
-rw-r--r--fs/xfs/xfs_aops.c12
-rw-r--r--fs/xfs/xfs_bmap_item.c6
-rw-r--r--fs/xfs/xfs_bmap_util.c20
-rw-r--r--fs/xfs/xfs_buf.c24
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_dir2_readdir.c15
-rw-r--r--fs/xfs/xfs_discard.c10
-rw-r--r--fs/xfs/xfs_extfree_item.c1
-rw-r--r--fs/xfs/xfs_fsmap.c940
-rw-r--r--fs/xfs/xfs_fsmap.h53
-rw-r--r--fs/xfs/xfs_icache.c58
-rw-r--r--fs/xfs/xfs_icache.h8
-rw-r--r--fs/xfs/xfs_inode.c9
-rw-r--r--fs/xfs/xfs_inode.h4
-rw-r--r--fs/xfs/xfs_inode_item.c29
-rw-r--r--fs/xfs/xfs_ioctl.c89
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c18
-rw-r--r--fs/xfs/xfs_linux.h85
-rw-r--r--fs/xfs/xfs_log.c4
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_qm.c11
-rw-r--r--fs/xfs/xfs_qm_syscalls.c3
-rw-r--r--fs/xfs/xfs_refcount_item.c1
-rw-r--r--fs/xfs/xfs_reflink.c39
-rw-r--r--fs/xfs/xfs_rmap_item.c1
-rw-r--r--fs/xfs/xfs_rtalloc.h22
-rw-r--r--fs/xfs/xfs_super.c8
-rw-r--r--fs/xfs/xfs_trace.c1
-rw-r--r--fs/xfs/xfs_trace.h144
-rw-r--r--fs/xfs/xfs_trans.c39
-rw-r--r--fs/xfs/xfs_trans.h3
-rw-r--r--fs/xfs/xfs_trans_ail.c71
-rw-r--r--fs/xfs/xfs_trans_priv.h15
189 files changed, 7126 insertions, 3251 deletions
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 2f8bab390d13..773749be8290 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -7,7 +7,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/amigaffs.h>
+#include "amigaffs.h"
#include <linux/mutex.h>
#include <linux/workqueue.h>
@@ -173,7 +173,7 @@ extern int affs_link(struct dentry *olddentry, struct inode *dir,
struct dentry *dentry);
extern int affs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname);
-extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry,
+extern int affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags);
diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h
new file mode 100644
index 000000000000..43b41c06aa37
--- /dev/null
+++ b/fs/affs/amigaffs.h
@@ -0,0 +1,144 @@
+#ifndef AMIGAFFS_H
+#define AMIGAFFS_H
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+#define FS_OFS 0x444F5300
+#define FS_FFS 0x444F5301
+#define FS_INTLOFS 0x444F5302
+#define FS_INTLFFS 0x444F5303
+#define FS_DCOFS 0x444F5304
+#define FS_DCFFS 0x444F5305
+#define MUFS_FS 0x6d754653 /* 'muFS' */
+#define MUFS_OFS 0x6d754600 /* 'muF\0' */
+#define MUFS_FFS 0x6d754601 /* 'muF\1' */
+#define MUFS_INTLOFS 0x6d754602 /* 'muF\2' */
+#define MUFS_INTLFFS 0x6d754603 /* 'muF\3' */
+#define MUFS_DCOFS 0x6d754604 /* 'muF\4' */
+#define MUFS_DCFFS 0x6d754605 /* 'muF\5' */
+
+#define T_SHORT 2
+#define T_LIST 16
+#define T_DATA 8
+
+#define ST_LINKFILE -4
+#define ST_FILE -3
+#define ST_ROOT 1
+#define ST_USERDIR 2
+#define ST_SOFTLINK 3
+#define ST_LINKDIR 4
+
+#define AFFS_ROOT_BMAPS 25
+
+struct affs_date { /* timestamp made of three big-endian 32-bit counters; embedded in affs_root_tail and affs_tail */
+ __be32 days;
+ __be32 mins;
+ __be32 ticks; /* NOTE(review): epoch and tick rate are not visible in this header - confirm against the users */
+};
+
+struct affs_short_date { /* same three field names as affs_date, but each is a big-endian 16-bit value */
+ __be16 days;
+ __be16 mins;
+ __be16 ticks;
+};
+
+struct affs_root_head { /* big-endian fields; NOTE(review): by name the leading part of the root header, paired with affs_root_tail - confirm */
+ __be32 ptype;
+ __be32 spare1;
+ __be32 spare2;
+ __be32 hash_size;
+ __be32 spare3;
+ __be32 checksum;
+ __be32 hashtable[1]; /* declared with a single slot; presumably hash_size entries follow in the block - TODO confirm */
+};
+
+struct affs_root_tail { /* big-endian fields plus three embedded affs_date timestamps and a 32-byte name buffer */
+ __be32 bm_flag;
+ __be32 bm_blk[AFFS_ROOT_BMAPS]; /* AFFS_ROOT_BMAPS (25) entries */
+ __be32 bm_ext;
+ struct affs_date root_change;
+ u8 disk_name[32];
+ __be32 spare1;
+ __be32 spare2;
+ struct affs_date disk_change;
+ struct affs_date disk_create;
+ __be32 spare3;
+ __be32 spare4;
+ __be32 dcache;
+ __be32 stype; /* NOTE(review): presumably holds one of the ST_* values defined above - confirm */
+};
+
+struct affs_head { /* big-endian header fields followed by a variable-use table area */
+ __be32 ptype;
+ __be32 key;
+ __be32 block_count;
+ __be32 spare1;
+ __be32 first_data;
+ __be32 checksum;
+ __be32 table[1]; /* declared with one slot; for ST_SOFTLINK inodes affs_iget() runs strlen() on it, i.e. reads it as a NUL-terminated string */
+};
+
+struct affs_tail { /* per-object trailer reached through AFFS_TAIL(sb, bh); multi-byte fields are big-endian */
+ __be32 spare1;
+ __be16 uid; /* 16-bit big-endian */
+ __be16 gid; /* 16-bit big-endian */
+ __be32 protect; /* NOTE(review): presumably the FIBF_* permission bits defined below - confirm */
+ __be32 size;
+ u8 comment[92];
+ struct affs_date change;
+ u8 name[32]; /* rewritten through affs_copy_name() when affs_xrename() swaps two entries */
+ __be32 spare2;
+ __be32 original;
+ __be32 link_chain;
+ __be32 spare[5];
+ __be32 hash_chain;
+ __be32 parent;
+ __be32 extension;
+ __be32 stype; /* NOTE(review): presumably holds one of the ST_* values defined above - confirm */
+};
+
+struct slink_front /* six big-endian header words followed by the symname bytes */
+{
+ __be32 ptype;
+ __be32 key;
+ __be32 spare1[3];
+ __be32 checksum;
+ u8 symname[1]; /* depends on block size; declared with one byte only */
+};
+
+struct affs_data_head /* six big-endian header words followed by the data bytes */
+{
+ __be32 ptype;
+ __be32 key;
+ __be32 sequence;
+ __be32 size;
+ __be32 next;
+ __be32 checksum;
+ u8 data[1]; /* depends on block size; declared with one byte only */
+};
+
+/* Permission bits */
+
+#define FIBF_OTR_READ 0x8000
+#define FIBF_OTR_WRITE 0x4000
+#define FIBF_OTR_EXECUTE 0x2000
+#define FIBF_OTR_DELETE 0x1000
+#define FIBF_GRP_READ 0x0800
+#define FIBF_GRP_WRITE 0x0400
+#define FIBF_GRP_EXECUTE 0x0200
+#define FIBF_GRP_DELETE 0x0100
+
+#define FIBF_HIDDEN 0x0080
+#define FIBF_SCRIPT 0x0040
+#define FIBF_PURE 0x0020 /* no use under linux */
+#define FIBF_ARCHIVED 0x0010 /* never set, always cleared on write */
+#define FIBF_NOREAD 0x0008 /* 0 means allowed */
+#define FIBF_NOWRITE 0x0004 /* 0 means allowed */
+#define FIBF_NOEXECUTE 0x0002 /* 0 means allowed, ignored under linux */
+#define FIBF_NODELETE 0x0001 /* 0 means allowed */
+
+#define FIBF_OWNER 0x000F /* Bits pertaining to owner */
+#define FIBF_MASK 0xEE0E /* Bits modified by Linux */
+
+#endif
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index f1e7294381c5..591ecd7f3063 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -35,7 +35,7 @@ const struct inode_operations affs_dir_inode_operations = {
.symlink = affs_symlink,
.mkdir = affs_mkdir,
.rmdir = affs_rmdir,
- .rename = affs_rename,
+ .rename = affs_rename2,
.setattr = affs_notify_change,
};
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0deec9cc2362..196ee7f6fdc4 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -499,7 +499,7 @@ affs_getemptyblk_ino(struct inode *inode, int block)
}
static int
-affs_do_readpage_ofs(struct page *page, unsigned to)
+affs_do_readpage_ofs(struct page *page, unsigned to, int create)
{
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
@@ -518,7 +518,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
boff = tmp % bsize;
while (pos < to) {
- bh = affs_bread_ino(inode, bidx, 0);
+ bh = affs_bread_ino(inode, bidx, create);
if (IS_ERR(bh))
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
@@ -620,7 +620,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
memset(page_address(page) + to, 0, PAGE_SIZE - to);
}
- err = affs_do_readpage_ofs(page, to);
+ err = affs_do_readpage_ofs(page, to, 0);
if (!err)
SetPageUptodate(page);
unlock_page(page);
@@ -657,7 +657,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(page, PAGE_SIZE);
+ err = affs_do_readpage_ofs(page, PAGE_SIZE, 1);
if (err) {
unlock_page(page);
put_page(page);
@@ -679,7 +679,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
int written;
from = pos & (PAGE_SIZE - 1);
- to = pos + len;
+ to = from + len;
/*
* XXX: not sure if this can handle short copies (len < copied), but
* we don't have to, because the page should always be uptodate here,
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index abcc59899229..fd4ef3c40e40 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &affs_file_operations;
break;
case ST_SOFTLINK:
+ inode->i_size = strlen((char *)AFFS_HEAD(bh)->table);
inode->i_mode |= S_IFLNK;
inode_nohighmem(inode);
inode->i_op = &affs_symlink_inode_operations;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 96dd1d09a273..46d3ace6761d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -365,6 +365,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
symname++;
}
*p = 0;
+ inode->i_size = i + 1;
mark_buffer_dirty_inode(bh, inode);
affs_brelse(bh);
mark_inode_dirty(inode);
@@ -393,21 +394,14 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
}
-int
+static int
affs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+ struct inode *new_dir, struct dentry *new_dentry)
{
struct super_block *sb = old_dir->i_sb;
struct buffer_head *bh = NULL;
int retval;
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
- old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
-
retval = affs_check_name(new_dentry->d_name.name,
new_dentry->d_name.len,
affs_nofilenametruncate(old_dentry));
@@ -447,6 +441,76 @@ done:
return retval;
}
+/*
+ * affs_xrename - exchange two existing directory entries (RENAME_EXCHANGE).
+ * @old_dir:    directory currently holding @old_dentry
+ * @old_dentry: first entry taking part in the exchange
+ * @new_dir:    directory currently holding @new_dentry
+ * @new_dentry: second entry taking part in the exchange
+ *
+ * Reads the header block of both objects, removes each from the hash of its
+ * parent directory, then re-inserts each one under the other's name in the
+ * other's directory.
+ *
+ * Returns 0 on success, -EIO when a header block cannot be read, or the
+ * first error reported by affs_remove_hash() / affs_insert_hash().
+ */
+static int
+affs_xrename(struct inode *old_dir, struct dentry *old_dentry,
+	     struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct super_block *sb = old_dir->i_sb;
+	struct buffer_head *bh_old = NULL;
+	struct buffer_head *bh_new = NULL;
+	int retval, err;
+
+	bh_old = affs_bread(sb, d_inode(old_dentry)->i_ino);
+	if (!bh_old)
+		return -EIO;
+
+	bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino);
+	if (!bh_new) {
+		/* Release the header that was already read instead of leaking it. */
+		affs_brelse(bh_old);
+		return -EIO;
+	}
+
+	/* Remove old header from its parent directory. */
+	affs_lock_dir(old_dir);
+	retval = affs_remove_hash(old_dir, bh_old);
+	affs_unlock_dir(old_dir);
+	if (retval)
+		goto done;
+
+	/* Remove new header from its parent directory. */
+	affs_lock_dir(new_dir);
+	retval = affs_remove_hash(new_dir, bh_new);
+	affs_unlock_dir(new_dir);
+	if (retval)
+		goto done;
+
+	/* Insert old into the new directory with the new name. */
+	affs_copy_name(AFFS_TAIL(sb, bh_old)->name, new_dentry);
+	affs_fix_checksum(sb, bh_old);
+	affs_lock_dir(new_dir);
+	retval = affs_insert_hash(new_dir, bh_old);
+	affs_unlock_dir(new_dir);
+
+	/*
+	 * Insert new into the old directory with the old name.  This is still
+	 * attempted when the insert above failed, so the second header is not
+	 * left unhashed; the first failure is the one reported to the caller.
+	 */
+	affs_copy_name(AFFS_TAIL(sb, bh_new)->name, old_dentry);
+	affs_fix_checksum(sb, bh_new);
+	affs_lock_dir(old_dir);
+	err = affs_insert_hash(old_dir, bh_new);
+	affs_unlock_dir(old_dir);
+	if (!retval)
+		retval = err;
+done:
+	mark_buffer_dirty_inode(bh_old, new_dir);
+	mark_buffer_dirty_inode(bh_new, old_dir);
+	affs_brelse(bh_old);
+	affs_brelse(bh_new);
+	return retval;
+}
+
+/*
+ * affs_rename2 - rename entry point installed as .rename in
+ * affs_dir_inode_operations.
+ *
+ * Any flag other than RENAME_NOREPLACE or RENAME_EXCHANGE is rejected with
+ * -EINVAL.  RENAME_EXCHANGE requests go to affs_xrename(); everything else
+ * goes to affs_rename().
+ */
+int affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+		 struct inode *new_dir, struct dentry *new_dentry,
+		 unsigned int flags)
+{
+	const unsigned int supported = RENAME_NOREPLACE | RENAME_EXCHANGE;
+
+	if (flags & ~supported)
+		return -EINVAL;
+
+	pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
+		 old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
+
+	if (!(flags & RENAME_EXCHANGE))
+		return affs_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+	return affs_xrename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
static struct dentry *affs_get_parent(struct dentry *child)
{
struct inode *parent;
@@ -477,11 +541,6 @@ static struct inode *affs_nfs_get_inode(struct super_block *sb, u64 ino,
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (generation && inode->i_generation != generation) {
- iput(inode);
- return ERR_PTR(-ESTALE);
- }
-
return inode;
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index c500e954debb..63e7c4760bfb 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -58,6 +58,7 @@ static struct dentry *befs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
static struct dentry *befs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
+static struct dentry *befs_get_parent(struct dentry *child);
static const struct super_operations befs_sops = {
.alloc_inode = befs_alloc_inode, /* allocate a new inode */
@@ -93,6 +94,7 @@ static const struct address_space_operations befs_symlink_aops = {
static const struct export_operations befs_export_operations = {
.fh_to_dentry = befs_fh_to_dentry,
.fh_to_parent = befs_fh_to_parent,
+ .get_parent = befs_get_parent,
};
/*
@@ -667,6 +669,19 @@ static struct dentry *befs_fh_to_parent(struct super_block *sb,
befs_nfs_get_inode);
}
+/*
+ * befs_get_parent - .get_parent callback of befs_export_operations.
+ *
+ * Loads the inode whose number is the i_parent.start value recorded in the
+ * child's befs_inode_info and returns a dentry for it via d_obtain_alias().
+ * A failed befs_iget() is passed back as an ERR_PTR.
+ */
+static struct dentry *befs_get_parent(struct dentry *child)
+{
+	struct befs_inode_info *info = BEFS_I(d_inode(child));
+	unsigned long parent_ino = (unsigned long)info->i_parent.start;
+	struct inode *dir = befs_iget(child->d_sb, parent_ino);
+
+	if (IS_ERR(dir))
+		return ERR_CAST(dir);
+
+	return d_obtain_alias(dir);
+}
+
enum {
Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
};
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index bee1a36bc2ec..f4718098ac31 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -818,7 +818,7 @@ static const struct super_operations s_ops = {
static int bm_fill_super(struct super_block *sb, void *data, int silent)
{
int err;
- static struct tree_descr bm_files[] = {
+ static const struct tree_descr bm_files[] = {
[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
[3] = {"register", &bm_register_operations, S_IWUSR},
/* last one */ {""}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0d435c794d76..2a305c1a2d88 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
+#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
@@ -716,50 +717,18 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(bdev_write_page);
-/**
- * bdev_direct_access() - Get the address for directly-accessibly memory
- * @bdev: The device containing the memory
- * @dax: control and output parameters for ->direct_access
- *
- * If a block device is made up of directly addressable memory, this function
- * will tell the caller the PFN and the address of the memory. The address
- * may be directly dereferenced within the kernel without the need to call
- * ioremap(), kmap() or similar. The PFN is suitable for inserting into
- * page tables.
- *
- * Return: negative errno if an error occurs, otherwise the number of bytes
- * accessible at this address.
- */
-long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
+int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
+ pgoff_t *pgoff)
{
- sector_t sector = dax->sector;
- long avail, size = dax->size;
- const struct block_device_operations *ops = bdev->bd_disk->fops;
+ phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;
- /*
- * The device driver is allowed to sleep, in order to make the
- * memory directly accessible.
- */
- might_sleep();
-
- if (size < 0)
- return size;
- if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
- return -EOPNOTSUPP;
- if ((sector + DIV_ROUND_UP(size, 512)) >
- part_nr_sects_read(bdev->bd_part))
- return -ERANGE;
- sector += get_start_sect(bdev);
- if (sector % (PAGE_SIZE / 512))
+ if (pgoff)
+ *pgoff = PHYS_PFN(phys_off);
+ if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
return -EINVAL;
- avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
- if (!avail)
- return -ERANGE;
- if (avail > 0 && avail & ~PAGE_MASK)
- return -ENXIO;
- return min(avail, size);
+ return 0;
}
-EXPORT_SYMBOL_GPL(bdev_direct_access);
+EXPORT_SYMBOL(bdev_dax_pgoff);
/**
* bdev_dax_supported() - Check if the device supports dax for filesystem
@@ -773,62 +742,46 @@ EXPORT_SYMBOL_GPL(bdev_direct_access);
*/
int bdev_dax_supported(struct super_block *sb, int blocksize)
{
- struct blk_dax_ctl dax = {
- .sector = 0,
- .size = PAGE_SIZE,
- };
- int err;
+ struct block_device *bdev = sb->s_bdev;
+ struct dax_device *dax_dev;
+ pgoff_t pgoff;
+ int err, id;
+ void *kaddr;
+ pfn_t pfn;
+ long len;
if (blocksize != PAGE_SIZE) {
vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
return -EINVAL;
}
- err = bdev_direct_access(sb->s_bdev, &dax);
- if (err < 0) {
- switch (err) {
- case -EOPNOTSUPP:
- vfs_msg(sb, KERN_ERR,
- "error: device does not support dax");
- break;
- case -EINVAL:
- vfs_msg(sb, KERN_ERR,
- "error: unaligned partition for dax");
- break;
- default:
- vfs_msg(sb, KERN_ERR,
- "error: dax access failed (%d)", err);
- }
+ err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
+ if (err) {
+ vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax");
return err;
}
- return 0;
-}
-EXPORT_SYMBOL_GPL(bdev_dax_supported);
-
-/**
- * bdev_dax_capable() - Return if the raw device is capable for dax
- * @bdev: The device for raw block device access
- */
-bool bdev_dax_capable(struct block_device *bdev)
-{
- struct blk_dax_ctl dax = {
- .size = PAGE_SIZE,
- };
+ dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ if (!dax_dev) {
+ vfs_msg(sb, KERN_ERR, "error: device does not support dax");
+ return -EOPNOTSUPP;
+ }
- if (!IS_ENABLED(CONFIG_FS_DAX))
- return false;
+ id = dax_read_lock();
+ len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
+ dax_read_unlock(id);
- dax.sector = 0;
- if (bdev_direct_access(bdev, &dax) < 0)
- return false;
+ put_dax(dax_dev);
- dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
- if (bdev_direct_access(bdev, &dax) < 0)
- return false;
+ if (len < 1) {
+ vfs_msg(sb, KERN_ERR,
+ "error: dax access failed (%ld)", len);
+ return len < 0 ? len : -EIO;
+ }
- return true;
+ return 0;
}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
/*
* pseudo-fs
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7dc8844037e0..1c3b6c54d5ee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5392,13 +5392,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
+ tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
if (!tmp_buf) {
- tmp_buf = vmalloc(fs_info->nodesize);
- if (!tmp_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
left_path->search_commit_root = 1;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index dd7fb22a955a..fc0bd8406758 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -167,8 +167,7 @@ static u8 *alloc_bitmap(u32 bitmap_size)
if (mem)
return mem;
- return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
+ return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL);
}
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dabfc7ac48a6..922a66fce401 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3539,12 +3539,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 last_dest_end = destoff;
ret = -ENOMEM;
- buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
- if (!buf) {
- buf = vmalloc(fs_info->nodesize);
- if (!buf)
- return ret;
- }
+ buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
+ if (!buf)
+ return ret;
path = btrfs_alloc_path();
if (!path) {
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a60d5bfb8a49..3f645cd67b54 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6360,22 +6360,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sctx->clone_roots_cnt = arg->clone_sources_count;
sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
- sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN);
+ sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
if (!sctx->send_buf) {
- sctx->send_buf = vmalloc(sctx->send_max_size);
- if (!sctx->send_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
- sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN);
+ sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
if (!sctx->read_buf) {
- sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
- if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
sctx->pending_dir_moves = RB_ROOT;
@@ -6396,13 +6390,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
if (arg->clone_sources_count) {
- clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
+ clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
if (!clone_sources_tmp) {
- clone_sources_tmp = vmalloc(alloc_size);
- if (!clone_sources_tmp) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = -ENOMEM;
+ goto out;
}
ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
diff --git a/fs/buffer.c b/fs/buffer.c
index 9196f2a270da..161be58c5cb0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -49,7 +49,6 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- unsigned long bio_flags,
struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -1830,7 +1829,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -1884,7 +1883,7 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -2379,8 +2378,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
goto out;
err = pagecache_write_begin(NULL, mapping, size, 0,
- AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
- &page, &fsdata);
+ AOP_FLAG_CONT_EXPAND, &page, &fsdata);
if (err)
goto out;
@@ -2415,9 +2413,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = PAGE_SIZE - zerofrom;
- err = pagecache_write_begin(file, mapping, curpos, len,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
+ err = pagecache_write_begin(file, mapping, curpos, len, 0,
+ &page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
@@ -2449,9 +2446,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = offset - zerofrom;
- err = pagecache_write_begin(file, mapping, curpos, len,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
+ err = pagecache_write_begin(file, mapping, curpos, len, 0,
+ &page, &fsdata);
if (err)
goto out;
zero_user(page, zerofrom, len);
@@ -3095,7 +3091,7 @@ void guard_bio_eod(int op, struct bio *bio)
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- unsigned long bio_flags, struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
struct bio *bio;
@@ -3130,7 +3126,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
- bio->bi_flags |= bio_flags;
/* Take care of bh's that straddle the end of the device */
guard_bio_eod(op, bio);
@@ -3145,16 +3140,9 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
return 0;
}
-int _submit_bh(int op, int op_flags, struct buffer_head *bh,
- unsigned long bio_flags)
+int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
- return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL);
-}
-EXPORT_SYMBOL_GPL(_submit_bh);
-
-int submit_bh(int op, int op_flags, struct buffer_head *bh)
-{
- return submit_bh_wbc(op, op_flags, bh, 0, NULL);
+ return submit_bh_wbc(op, op_flags, bh, NULL);
}
EXPORT_SYMBOL(submit_bh);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 26cc95421cca..18c045e2ead6 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -74,12 +74,9 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
(PAGE_SIZE - 1);
npages = calc_pages_for(align, nbytes);
- pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
- if (!pages) {
- pages = vmalloc(sizeof(*pages) * npages);
- if (!pages)
- return ERR_PTR(-ENOMEM);
- }
+ pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
for (idx = 0; idx < npages; ) {
size_t start;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c681762d76e6..1d3fa90d40b9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1666,6 +1666,7 @@ struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+ struct timespec ts;
if (!req)
return ERR_PTR(-ENOMEM);
@@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
- req->r_stamp = current_fs_time(mdsc->fsc->sb);
+ ktime_get_real_ts(&ts);
+ req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran);
req->r_op = op;
req->r_direct_mode = mode;
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 02b071bf3732..a0b3e7d1be48 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -83,6 +83,9 @@ convert_sfm_char(const __u16 src_char, char *target)
case SFM_COLON:
*target = ':';
break;
+ case SFM_DOUBLEQUOTE:
+ *target = '"';
+ break;
case SFM_ASTERISK:
*target = '*';
break;
@@ -418,6 +421,9 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string)
case ':':
dest_char = cpu_to_le16(SFM_COLON);
break;
+ case '"':
+ dest_char = cpu_to_le16(SFM_DOUBLEQUOTE);
+ break;
case '*':
dest_char = cpu_to_le16(SFM_ASTERISK);
break;
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 3d7298cc0aeb..8a79a34e66b8 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -57,6 +57,7 @@
* not conflict (although almost does) with the mapping above.
*/
+#define SFM_DOUBLEQUOTE ((__u16) 0xF020)
#define SFM_ASTERISK ((__u16) 0xF021)
#define SFM_QUESTION ((__u16) 0xF025)
#define SFM_COLON ((__u16) 0xF022)
@@ -64,8 +65,8 @@
#define SFM_LESSTHAN ((__u16) 0xF023)
#define SFM_PIPE ((__u16) 0xF027)
#define SFM_SLASH ((__u16) 0xF026)
-#define SFM_PERIOD ((__u16) 0xF028)
-#define SFM_SPACE ((__u16) 0xF029)
+#define SFM_SPACE ((__u16) 0xF028)
+#define SFM_PERIOD ((__u16) 0xF029)
/*
* Mapping mechanism to use when one of the seven reserved characters is
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 058ac9b36f04..68abbb0db608 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -478,6 +478,7 @@ find_timestamp(struct cifs_ses *ses)
unsigned char *blobptr;
unsigned char *blobend;
struct ntlmssp2_name *attrptr;
+ struct timespec ts;
if (!ses->auth_key.len || !ses->auth_key.response)
return 0;
@@ -502,7 +503,8 @@ find_timestamp(struct cifs_ses *ses)
blobptr += attrsize; /* advance attr value */
}
- return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ ktime_get_real_ts(&ts);
+ return cpu_to_le64(cifs_UnixTimeToNT(ts));
}
static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d0d11b73b2af..9a1667e0e8d6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -88,6 +88,7 @@ extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
struct workqueue_struct *cifsiod_wq;
+struct workqueue_struct *cifsoplockd_wq;
__u32 cifs_lock_secret;
/*
@@ -1375,9 +1376,16 @@ init_cifs(void)
goto out_clean_proc;
}
+ cifsoplockd_wq = alloc_workqueue("cifsoplockd",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ if (!cifsoplockd_wq) {
+ rc = -ENOMEM;
+ goto out_destroy_cifsiod_wq;
+ }
+
rc = cifs_fscache_register();
if (rc)
- goto out_destroy_wq;
+ goto out_destroy_cifsoplockd_wq;
rc = cifs_init_inodecache();
if (rc)
@@ -1425,7 +1433,9 @@ out_destroy_inodecache:
cifs_destroy_inodecache();
out_unreg_fscache:
cifs_fscache_unregister();
-out_destroy_wq:
+out_destroy_cifsoplockd_wq:
+ destroy_workqueue(cifsoplockd_wq);
+out_destroy_cifsiod_wq:
destroy_workqueue(cifsiod_wq);
out_clean_proc:
cifs_proc_clean();
@@ -1448,6 +1458,7 @@ exit_cifs(void)
cifs_destroy_mids();
cifs_destroy_inodecache();
cifs_fscache_unregister();
+ destroy_workqueue(cifsoplockd_wq);
destroy_workqueue(cifsiod_wq);
cifs_proc_clean();
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 37f5a41cc50c..8be55be70faf 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1115,6 +1115,23 @@ struct cifs_io_parms {
struct cifs_tcon *tcon;
};
+/*
+ * State shared by all requests of one uncached read or write started from
+ * cifs_user_readv() or cifs_user_writev(). It is allocated by
+ * cifs_aio_ctx_alloc() and freed by cifs_aio_ctx_release() when the last
+ * reference is dropped.
+ */
+struct cifs_aio_ctx {
+ /* one reference per rdata/wdata using this ctx, plus the allocator's */
+ struct kref refcount;
+ /* rdata/wdata requests that were sent and are not yet collected */
+ struct list_head list;
+ /* held while requests are sent and while replies are collected */
+ struct mutex aio_mutex;
+ /* completed after collection when no ki_complete callback is called */
+ struct completion done;
+ /* set by setup_aio_ctx_iter(): kvec copy, or bvec iterator over bv */
+ struct iov_iter iter;
+ /* set only for a non-sync kiocb; its ki_complete gets the final rc */
+ struct kiocb *iocb;
+ /* from cifsFileInfo_get(); put in cifs_aio_ctx_release() */
+ struct cifsFileInfo *cfile;
+ /* array built by setup_aio_ctx_iter(); kvfree()d on release */
+ struct bio_vec *bv;
+ /* iocb->ki_pos at submission (stored by cifs_user_writev()) */
+ loff_t pos;
+ /* number of entries filled in bv */
+ unsigned int npages;
+ /* final result: total_len on success, otherwise the error code */
+ ssize_t rc;
+ /* number of bytes that iter covers */
+ unsigned int len;
+ /* bytes written or read, as counted by the collect functions */
+ unsigned int total_len;
+ /* reads from an ITER_IOVEC: dirty each bv page before putting it */
+ bool should_dirty;
+};
+
struct cifs_readdata;
/* asynchronous read support */
@@ -1124,6 +1141,7 @@ struct cifs_readdata {
struct completion done;
struct cifsFileInfo *cfile;
struct address_space *mapping;
+ struct cifs_aio_ctx *ctx;
__u64 offset;
unsigned int bytes;
unsigned int got_bytes;
@@ -1154,6 +1172,7 @@ struct cifs_writedata {
enum writeback_sync_modes sync_mode;
struct work_struct work;
struct cifsFileInfo *cfile;
+ struct cifs_aio_ctx *ctx;
__u64 offset;
pid_t pid;
unsigned int bytes;
@@ -1683,6 +1702,7 @@ void cifs_oplock_break(struct work_struct *work);
extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
+extern struct workqueue_struct *cifsoplockd_wq;
extern __u32 cifs_lock_secret;
extern mempool_t *cifs_mid_poolp;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 97e5d236d265..e49958c3f8bb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -535,4 +535,7 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
struct shash_desc *shash);
enum securityEnum cifs_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
+struct cifs_aio_ctx *cifs_aio_ctx_alloc(void);
+void cifs_aio_ctx_release(struct kref *refcount);
+int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5d21f00ae341..4c01b3f9abf0 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -478,14 +478,14 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
* this requirement.
*/
int val, seconds, remain, result;
- struct timespec ts, utc;
- utc = CURRENT_TIME;
+ struct timespec ts;
+ unsigned long utc = ktime_get_real_seconds();
ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
rsp->SrvTime.Time, 0);
cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
- (int)ts.tv_sec, (int)utc.tv_sec,
- (int)(utc.tv_sec - ts.tv_sec));
- val = (int)(utc.tv_sec - ts.tv_sec);
+ (int)ts.tv_sec, (int)utc,
+ (int)(utc - ts.tv_sec));
+ val = (int)(utc - ts.tv_sec);
seconds = abs(val);
result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
remain = seconds % MIN_TZ_ADJ;
@@ -718,6 +718,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
if (rc)
return rc;
+ if (server->capabilities & CAP_UNICODE)
+ smb->hdr.Flags2 |= SMBFLG2_UNICODE;
+
/* set up echo request */
smb->hdr.Tid = 0xffff;
smb->hdr.WordCount = 1;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9bc0b4d6d065..9365c0cf77ad 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1946,9 +1946,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
if (!got_ip) {
+ int len;
+ const char *slash;
+
/* No ip= option specified? Try to get it from UNC */
- if (!cifs_convert_address(dstaddr, &vol->UNC[2],
- strlen(&vol->UNC[2]))) {
+ /* Use the address part of the UNC. */
+ slash = strchr(&vol->UNC[2], '\\');
+ len = slash - &vol->UNC[2];
+ if (!cifs_convert_address(dstaddr, &vol->UNC[2], len)) {
pr_err("Unable to determine destination address.\n");
goto cifs_parse_mount_err;
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 21d404535739..6ef78ad838e6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2458,11 +2458,14 @@ cifs_uncached_writedata_release(struct kref *refcount)
struct cifs_writedata *wdata = container_of(refcount,
struct cifs_writedata, refcount);
+ kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release);
for (i = 0; i < wdata->nr_pages; i++)
put_page(wdata->pages[i]);
cifs_writedata_release(refcount);
}
+static void collect_uncached_write_data(struct cifs_aio_ctx *ctx);
+
static void
cifs_uncached_writev_complete(struct work_struct *work)
{
@@ -2478,7 +2481,8 @@ cifs_uncached_writev_complete(struct work_struct *work)
spin_unlock(&inode->i_lock);
complete(&wdata->done);
-
+ collect_uncached_write_data(wdata->ctx);
+ /* the below call can possibly free the last ref to aio ctx */
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
@@ -2527,7 +2531,8 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
static int
cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
struct cifsFileInfo *open_file,
- struct cifs_sb_info *cifs_sb, struct list_head *wdata_list)
+ struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
+ struct cifs_aio_ctx *ctx)
{
int rc = 0;
size_t cur_len;
@@ -2595,6 +2600,8 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
wdata->pagesz = PAGE_SIZE;
wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
wdata->credits = credits;
+ wdata->ctx = ctx;
+ kref_get(&ctx->refcount);
if (!wdata->cfile->invalidHandle ||
!(rc = cifs_reopen_file(wdata->cfile, false)))
@@ -2620,81 +2627,61 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
return rc;
}
-ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
{
- struct file *file = iocb->ki_filp;
- ssize_t total_written = 0;
- struct cifsFileInfo *open_file;
+ struct cifs_writedata *wdata, *tmp;
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
- struct cifs_writedata *wdata, *tmp;
- struct list_head wdata_list;
- struct iov_iter saved_from = *from;
+ struct dentry *dentry = ctx->cfile->dentry;
+ unsigned int i;
int rc;
- /*
- * BB - optimize the way when signing is disabled. We can drop this
- * extra memory-to-memory copying and use iovec buffers for constructing
- * write request.
- */
-
- rc = generic_write_checks(iocb, from);
- if (rc <= 0)
- return rc;
-
- INIT_LIST_HEAD(&wdata_list);
- cifs_sb = CIFS_FILE_SB(file);
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
-
- if (!tcon->ses->server->ops->async_writev)
- return -ENOSYS;
+ tcon = tlink_tcon(ctx->cfile->tlink);
+ cifs_sb = CIFS_SB(dentry->d_sb);
- rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from,
- open_file, cifs_sb, &wdata_list);
+ mutex_lock(&ctx->aio_mutex);
- /*
- * If at least one write was successfully sent, then discard any rc
- * value from the later writes. If the other write succeeds, then
- * we'll end up returning whatever was written. If it fails, then
- * we'll get a new rc value from that.
- */
- if (!list_empty(&wdata_list))
- rc = 0;
+ if (list_empty(&ctx->list)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
+ rc = ctx->rc;
/*
* Wait for and collect replies for any successful sends in order of
- * increasing offset. Once an error is hit or we get a fatal signal
- * while waiting, then return without waiting for any more replies.
+ * increasing offset. Once an error is hit, then return without waiting
+ * for any more replies.
*/
restart_loop:
- list_for_each_entry_safe(wdata, tmp, &wdata_list, list) {
+ list_for_each_entry_safe(wdata, tmp, &ctx->list, list) {
if (!rc) {
- /* FIXME: freezable too? */
- rc = wait_for_completion_killable(&wdata->done);
- if (rc)
- rc = -EINTR;
- else if (wdata->result)
+ if (!try_wait_for_completion(&wdata->done)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
+
+ if (wdata->result)
rc = wdata->result;
else
- total_written += wdata->bytes;
+ ctx->total_len += wdata->bytes;
/* resend call if it's a retryable error */
if (rc == -EAGAIN) {
struct list_head tmp_list;
- struct iov_iter tmp_from = saved_from;
+ struct iov_iter tmp_from = ctx->iter;
INIT_LIST_HEAD(&tmp_list);
list_del_init(&wdata->list);
iov_iter_advance(&tmp_from,
- wdata->offset - iocb->ki_pos);
+ wdata->offset - ctx->pos);
rc = cifs_write_from_iter(wdata->offset,
wdata->bytes, &tmp_from,
- open_file, cifs_sb, &tmp_list);
+ ctx->cfile, cifs_sb, &tmp_list,
+ ctx);
- list_splice(&tmp_list, &wdata_list);
+ list_splice(&tmp_list, &ctx->list);
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
@@ -2705,12 +2692,111 @@ restart_loop:
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
+ for (i = 0; i < ctx->npages; i++)
+ put_page(ctx->bv[i].bv_page);
+
+ cifs_stats_bytes_written(tcon, ctx->total_len);
+ set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
+
+ ctx->rc = (rc == 0) ? ctx->total_len : rc;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (ctx->iocb && ctx->iocb->ki_complete)
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ else
+ complete(&ctx->done);
+}
+
+/*
+ * Uncached write of @from to the file behind @iocb, starting at iocb->ki_pos.
+ *
+ * The request is described by a cifs_aio_ctx and sent with
+ * cifs_write_from_iter(); the replies are gathered by
+ * collect_uncached_write_data(), which the write completion work calls.
+ *
+ * For a synchronous kiocb this waits (killably) for the context to complete,
+ * advances iocb->ki_pos and returns the number of bytes written, or the error
+ * when nothing was written (-EINTR if the wait was interrupted). For an
+ * asynchronous kiocb it returns -EIOCBQUEUED once at least one request has
+ * been sent.
+ *
+ * Other returns: the generic_write_checks() result when it is <= 0, -ENOSYS
+ * if the server ops have no async_writev, -ENOMEM if no context can be
+ * allocated, or the error from setup_aio_ctx_iter()/cifs_write_from_iter().
+ */
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t total_written = 0;
+	struct cifsFileInfo *cfile;
+	struct cifs_tcon *tcon;
+	struct cifs_sb_info *cifs_sb;
+	struct cifs_aio_ctx *ctx;
+	struct iov_iter saved_from = *from;
+	unsigned int i;
+	int rc;
+
+	/*
+	 * BB - optimize the way when signing is disabled. We can drop this
+	 * extra memory-to-memory copying and use iovec buffers for constructing
+	 * write request.
+	 */
+
+	rc = generic_write_checks(iocb, from);
+	if (rc <= 0)
+		return rc;
+
+	cifs_sb = CIFS_FILE_SB(file);
+	cfile = file->private_data;
+	tcon = tlink_tcon(cfile->tlink);
+
+	if (!tcon->ses->server->ops->async_writev)
+		return -ENOSYS;
+
+	ctx = cifs_aio_ctx_alloc();
+	if (!ctx)
+		return -ENOMEM;
+
+	/* the ctx keeps its own reference on the open file until release */
+	ctx->cfile = cifsFileInfo_get(cfile);
+
+	/* only an asynchronous kiocb is completed through ki_complete */
+	if (!is_sync_kiocb(iocb))
+		ctx->iocb = iocb;
+
+	ctx->pos = iocb->ki_pos;
+
+	rc = setup_aio_ctx_iter(ctx, from, WRITE);
+	if (rc) {
+		kref_put(&ctx->refcount, cifs_aio_ctx_release);
+		return rc;
+	}
+
+	/* take the lock: the write response handlers can also access ctx */
+	mutex_lock(&ctx->aio_mutex);
+
+	rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &saved_from,
+				  cfile, cifs_sb, &ctx->list, ctx);
+
+	/*
+	 * If at least one write was successfully sent, then discard any rc
+	 * value from the later writes. If the other write succeeds, then
+	 * we'll end up returning whatever was written. If it fails, then
+	 * we'll get a new rc value from that.
+	 */
+	if (!list_empty(&ctx->list))
+		rc = 0;
+
+	mutex_unlock(&ctx->aio_mutex);
+
+	if (rc) {
+		/*
+		 * Nothing was queued, so collect_uncached_write_data() will
+		 * never run for this ctx. Drop the page references taken by
+		 * setup_aio_ctx_iter() here, otherwise they are leaked.
+		 */
+		for (i = 0; i < ctx->npages; i++)
+			put_page(ctx->bv[i].bv_page);
+		kref_put(&ctx->refcount, cifs_aio_ctx_release);
+		return rc;
+	}
+
+	if (!is_sync_kiocb(iocb)) {
+		kref_put(&ctx->refcount, cifs_aio_ctx_release);
+		return -EIOCBQUEUED;
+	}
+
+	rc = wait_for_completion_killable(&ctx->done);
+	if (rc) {
+		/* interrupted: record -EINTR and report what was counted */
+		mutex_lock(&ctx->aio_mutex);
+		ctx->rc = rc = -EINTR;
+		total_written = ctx->total_len;
+		mutex_unlock(&ctx->aio_mutex);
+	} else {
+		rc = ctx->rc;
+		total_written = ctx->total_len;
+	}
+
+	kref_put(&ctx->refcount, cifs_aio_ctx_release);
+
 	if (unlikely(!total_written))
 		return rc;
 	iocb->ki_pos += total_written;
-	set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(file_inode(file))->flags);
-	cifs_stats_bytes_written(tcon, total_written);
 	return total_written;
 }
@@ -2859,6 +2945,7 @@ cifs_uncached_readdata_release(struct kref *refcount)
struct cifs_readdata, refcount);
unsigned int i;
+ kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
for (i = 0; i < rdata->nr_pages; i++) {
put_page(rdata->pages[i]);
rdata->pages[i] = NULL;
@@ -2900,6 +2987,8 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
return remaining ? -EFAULT : 0;
}
+static void collect_uncached_read_data(struct cifs_aio_ctx *ctx);
+
static void
cifs_uncached_readv_complete(struct work_struct *work)
{
@@ -2907,6 +2996,8 @@ cifs_uncached_readv_complete(struct work_struct *work)
struct cifs_readdata, work);
complete(&rdata->done);
+ collect_uncached_read_data(rdata->ctx);
+ /* the below call can possibly free the last ref to aio ctx */
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
@@ -2973,7 +3064,8 @@ cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
static int
cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
- struct cifs_sb_info *cifs_sb, struct list_head *rdata_list)
+ struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
+ struct cifs_aio_ctx *ctx)
{
struct cifs_readdata *rdata;
unsigned int npages, rsize, credits;
@@ -3020,6 +3112,8 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
rdata->read_into_pages = cifs_uncached_read_into_pages;
rdata->copy_into_pages = cifs_uncached_copy_into_pages;
rdata->credits = credits;
+ rdata->ctx = ctx;
+ kref_get(&ctx->refcount);
if (!rdata->cfile->invalidHandle ||
!(rc = cifs_reopen_file(rdata->cfile, true)))
@@ -3042,50 +3136,37 @@ error:
return rc;
}
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+static void
+collect_uncached_read_data(struct cifs_aio_ctx *ctx)
{
- struct file *file = iocb->ki_filp;
- ssize_t rc;
- size_t len;
- ssize_t total_read = 0;
- loff_t offset = iocb->ki_pos;
+ struct cifs_readdata *rdata, *tmp;
+ struct iov_iter *to = &ctx->iter;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
- struct cifsFileInfo *open_file;
- struct cifs_readdata *rdata, *tmp;
- struct list_head rdata_list;
-
- len = iov_iter_count(to);
- if (!len)
- return 0;
-
- INIT_LIST_HEAD(&rdata_list);
- cifs_sb = CIFS_FILE_SB(file);
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
-
- if (!tcon->ses->server->ops->async_readv)
- return -ENOSYS;
+ unsigned int i;
+ int rc;
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cifs_dbg(FYI, "attempting read on write only file instance\n");
+ tcon = tlink_tcon(ctx->cfile->tlink);
+ cifs_sb = CIFS_SB(ctx->cfile->dentry->d_sb);
- rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list);
+ mutex_lock(&ctx->aio_mutex);
- /* if at least one read request send succeeded, then reset rc */
- if (!list_empty(&rdata_list))
- rc = 0;
+ if (list_empty(&ctx->list)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
- len = iov_iter_count(to);
+ rc = ctx->rc;
/* the loop below should proceed in the order of increasing offsets */
again:
- list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+ list_for_each_entry_safe(rdata, tmp, &ctx->list, list) {
if (!rc) {
- /* FIXME: freezable sleep too? */
- rc = wait_for_completion_killable(&rdata->done);
- if (rc)
- rc = -EINTR;
- else if (rdata->result == -EAGAIN) {
+ if (!try_wait_for_completion(&rdata->done)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
+
+ if (rdata->result == -EAGAIN) {
/* resend call if it's a retryable error */
struct list_head tmp_list;
unsigned int got_bytes = rdata->got_bytes;
@@ -3111,9 +3192,9 @@ again:
rdata->offset + got_bytes,
rdata->bytes - got_bytes,
rdata->cfile, cifs_sb,
- &tmp_list);
+ &tmp_list, ctx);
- list_splice(&tmp_list, &rdata_list);
+ list_splice(&tmp_list, &ctx->list);
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
@@ -3131,14 +3212,110 @@ again:
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
- total_read = len - iov_iter_count(to);
+ for (i = 0; i < ctx->npages; i++) {
+ if (ctx->should_dirty)
+ set_page_dirty(ctx->bv[i].bv_page);
+ put_page(ctx->bv[i].bv_page);
+ }
+
+ ctx->total_len = ctx->len - iov_iter_count(to);
- cifs_stats_bytes_read(tcon, total_read);
+ cifs_stats_bytes_read(tcon, ctx->total_len);
/* mask nodata case */
if (rc == -ENODATA)
rc = 0;
+ ctx->rc = (rc == 0) ? ctx->total_len : rc;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (ctx->iocb && ctx->iocb->ki_complete)
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ else
+ complete(&ctx->done);
+}
+
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t rc;
+ size_t len;
+ ssize_t total_read = 0;
+ loff_t offset = iocb->ki_pos;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+ struct cifsFileInfo *cfile;
+ struct cifs_aio_ctx *ctx;
+
+ len = iov_iter_count(to);
+ if (!len)
+ return 0;
+
+ cifs_sb = CIFS_FILE_SB(file);
+ cfile = file->private_data;
+ tcon = tlink_tcon(cfile->tlink);
+
+ if (!tcon->ses->server->ops->async_readv)
+ return -ENOSYS;
+
+ if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+ cifs_dbg(FYI, "attempting read on write only file instance\n");
+
+ ctx = cifs_aio_ctx_alloc();
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->cfile = cifsFileInfo_get(cfile);
+
+ if (!is_sync_kiocb(iocb))
+ ctx->iocb = iocb;
+
+ if (to->type & ITER_IOVEC)
+ ctx->should_dirty = true;
+
+ rc = setup_aio_ctx_iter(ctx, to, READ);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+
+ len = ctx->len;
+
+ /* take the lock: the read response handlers can also access ctx */
+ mutex_lock(&ctx->aio_mutex);
+
+ rc = cifs_send_async_read(offset, len, cfile, cifs_sb, &ctx->list, ctx);
+
+ /* if at least one read request send succeeded, then reset rc */
+ if (!list_empty(&ctx->list))
+ rc = 0;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+
+ if (!is_sync_kiocb(iocb)) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -EIOCBQUEUED;
+ }
+
+ rc = wait_for_completion_killable(&ctx->done);
+ if (rc) {
+ mutex_lock(&ctx->aio_mutex);
+ ctx->rc = rc = -EINTR;
+ total_read = ctx->total_len;
+ mutex_unlock(&ctx->aio_mutex);
+ } else {
+ rc = ctx->rc;
+ total_read = ctx->total_len;
+ }
+
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+
if (total_read) {
iocb->ki_pos += total_read;
return total_read;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b261db34103c..c3b2fa0b2ec8 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -322,9 +322,9 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
fattr->cf_uid = cifs_sb->mnt_uid;
fattr->cf_gid = cifs_sb->mnt_gid;
- fattr->cf_atime = CURRENT_TIME;
- fattr->cf_ctime = CURRENT_TIME;
- fattr->cf_mtime = CURRENT_TIME;
+ ktime_get_real_ts(&fattr->cf_mtime);
+ fattr->cf_mtime = timespec_trunc(fattr->cf_mtime, sb->s_time_gran);
+ fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime;
fattr->cf_nlink = 2;
fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
}
@@ -586,9 +586,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
static void
cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
- struct cifs_sb_info *cifs_sb, bool adjust_tz,
+ struct super_block *sb, bool adjust_tz,
bool symlink)
{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
memset(fattr, 0, sizeof(*fattr));
@@ -598,8 +599,10 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
if (info->LastAccessTime)
fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime);
- else
- fattr->cf_atime = CURRENT_TIME;
+ else {
+ ktime_get_real_ts(&fattr->cf_atime);
+ fattr->cf_atime = timespec_trunc(fattr->cf_atime, sb->s_time_gran);
+ }
fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
@@ -659,7 +662,6 @@ cifs_get_file_info(struct file *filp)
FILE_ALL_INFO find_data;
struct cifs_fattr fattr;
struct inode *inode = file_inode(filp);
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -671,7 +673,7 @@ cifs_get_file_info(struct file *filp)
rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
switch (rc) {
case 0:
- cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false,
+ cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false,
false);
break;
case -EREMOTE:
@@ -753,7 +755,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
}
if (!rc) {
- cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz,
+ cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz,
symlink);
} else if (rc == -EREMOTE) {
cifs_create_dfs_fattr(&fattr, sb);
@@ -1363,9 +1365,9 @@ out_reval:
cifs_inode = CIFS_I(inode);
cifs_inode->time = 0; /* will force revalidate to get info
when needed */
- inode->i_ctime = current_fs_time(sb);
+ inode->i_ctime = current_time(inode);
}
- dir->i_ctime = dir->i_mtime = current_fs_time(sb);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
cifs_inode = CIFS_I(dir);
CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
unlink_out:
@@ -1633,7 +1635,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
cifsInode->time = 0;
d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime =
- current_fs_time(inode->i_sb);
+ current_time(inode);
rmdir_exit:
kfree(full_path);
@@ -1806,7 +1808,7 @@ unlink_target:
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime =
- target_dir->i_mtime = current_fs_time(source_dir->i_sb);
+ target_dir->i_mtime = current_time(source_dir);
cifs_rename_exit:
kfree(info_buf_source);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 7f4bba574930..76fb0917dc8c 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -209,10 +209,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = -EOPNOTSUPP;
break;
case CIFS_IOC_GET_MNT_INFO:
+ if (pSMBFile == NULL)
+ break;
tcon = tlink_tcon(pSMBFile->tlink);
rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
break;
case CIFS_ENUMERATE_SNAPSHOTS:
+ if (pSMBFile == NULL)
+ break;
if (arg == 0) {
rc = -EINVAL;
goto cifs_ioc_exit;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 843787850435..b08531977daa 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -22,6 +22,7 @@
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/mempool.h>
+#include <linux/vmalloc.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -488,7 +489,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&pCifsInode->flags);
- queue_work(cifsiod_wq,
+ queue_work(cifsoplockd_wq,
&netfile->oplock_break);
netfile->oplock_break_cancelled = false;
@@ -741,3 +742,122 @@ parse_DFS_referrals_exit:
}
return rc;
}
+
+/*
+ * Allocate a zeroed cifs_aio_ctx and initialise its reference count,
+ * completion, mutex and request list.
+ *
+ * Returns the new context, or NULL if the allocation fails. The caller drops
+ * its reference with kref_put(&ctx->refcount, cifs_aio_ctx_release).
+ */
+struct cifs_aio_ctx *
+cifs_aio_ctx_alloc(void)
+{
+	struct cifs_aio_ctx *aio = kzalloc(sizeof(*aio), GFP_KERNEL);
+
+	if (aio) {
+		kref_init(&aio->refcount);
+		init_completion(&aio->done);
+		mutex_init(&aio->aio_mutex);
+		INIT_LIST_HEAD(&aio->list);
+	}
+
+	return aio;
+}
+
+/*
+ * kref release callback for a cifs_aio_ctx: puts the context's reference on
+ * its cifsFileInfo, frees the bio_vec array and then the context itself.
+ */
+void
+cifs_aio_ctx_release(struct kref *refcount)
+{
+	struct cifs_aio_ctx *aio;
+
+	aio = container_of(refcount, struct cifs_aio_ctx, refcount);
+
+	cifsFileInfo_put(aio->cfile);
+	kvfree(aio->bv);
+	kfree(aio);
+}
+
+/* largest array, in bytes, that setup_aio_ctx_iter() requests from kmalloc */
+#define CIFS_AIO_KMALLOC_LIMIT (1024 * 1024)
+
+/*
+ * Prepare ctx->iter for an uncached read or write described by @iter.
+ *
+ * A kvec iterator is copied into ctx->iter unchanged. For any other iterator
+ * the pages behind it are obtained with iov_iter_get_pages() and recorded in
+ * a bio_vec array stored in ctx->bv (ctx->npages entries); ctx->iter is then
+ * set up as a bvec iterator over that array. In both cases @iter is advanced
+ * past the bytes that were taken and ctx->len is set to that byte count.
+ *
+ * @rw is OR-ed into the type of the bvec iterator (the callers pass READ or
+ * WRITE).
+ *
+ * Returns 0 on success, or -ENOMEM if the bio_vec array or the temporary page
+ * pointer array cannot be allocated. A failure while gathering pages only
+ * ends the walk early: the function still returns 0 and ctx->len covers the
+ * bytes gathered up to that point.
+ */
+int
+setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
+{
+	ssize_t rc;
+	unsigned int cur_npages;
+	unsigned int npages = 0;
+	unsigned int i;
+	size_t len;
+	size_t count = iov_iter_count(iter);
+	unsigned int saved_len;
+	size_t start;
+	unsigned int max_pages = iov_iter_npages(iter, INT_MAX);
+	struct page **pages = NULL;
+	struct bio_vec *bv = NULL;
+
+	/* a kvec iterator is used as is: copy it and consume the caller's */
+	if (iter->type & ITER_KVEC) {
+		memcpy(&ctx->iter, iter, sizeof(struct iov_iter));
+		ctx->len = count;
+		iov_iter_advance(iter, count);
+		return 0;
+	}
+
+	/* try kmalloc for arrays up to the limit, fall back to vmalloc */
+	if (max_pages * sizeof(struct bio_vec) <= CIFS_AIO_KMALLOC_LIMIT)
+		bv = kmalloc_array(max_pages, sizeof(struct bio_vec),
+				   GFP_KERNEL);
+
+	if (!bv) {
+		bv = vmalloc(max_pages * sizeof(struct bio_vec));
+		if (!bv)
+			return -ENOMEM;
+	}
+
+	if (max_pages * sizeof(struct page *) <= CIFS_AIO_KMALLOC_LIMIT)
+		pages = kmalloc_array(max_pages, sizeof(struct page *),
+				      GFP_KERNEL);
+
+	if (!pages) {
+		pages = vmalloc(max_pages * sizeof(struct page *));
+		/* test the allocation that was just attempted, not bv */
+		if (!pages) {
+			kvfree(bv);
+			return -ENOMEM;
+		}
+	}
+
+	saved_len = count;
+
+	while (count && npages < max_pages) {
+		rc = iov_iter_get_pages(iter, pages, count, max_pages, &start);
+		if (rc < 0) {
+			cifs_dbg(VFS, "couldn't get user pages (rc=%zd)\n", rc);
+			break;
+		}
+
+		if (rc > count) {
+			cifs_dbg(VFS, "get pages rc=%zd more than %zu\n", rc,
+				 count);
+			break;
+		}
+
+		iov_iter_advance(iter, rc);
+		count -= rc;
+		/* include the offset into the first page when counting pages */
+		rc += start;
+		cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE);
+
+		if (npages + cur_npages > max_pages) {
+			cifs_dbg(VFS, "out of vec array capacity (%u vs %u)\n",
+				 npages + cur_npages, max_pages);
+			break;
+		}
+
+		/* only the first page of a batch has a non-zero offset */
+		for (i = 0; i < cur_npages; i++) {
+			len = rc > PAGE_SIZE ? PAGE_SIZE : rc;
+			bv[npages + i].bv_page = pages[i];
+			bv[npages + i].bv_offset = start;
+			bv[npages + i].bv_len = len - start;
+			rc -= len;
+			start = 0;
+		}
+
+		npages += cur_npages;
+	}
+
+	/* the page pointers now live in bv; the temporary array can go */
+	kvfree(pages);
+	ctx->bv = bv;
+	ctx->len = saved_len - count;
+	ctx->npages = npages;
+	iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len);
+	return 0;
+}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index abae6dd2c6b9..cc88f4f0325e 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -980,10 +980,10 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
cifs_dbg(VFS, "illegal hours %d\n", st->Hours);
days = sd->Day;
month = sd->Month;
- if ((days > 31) || (month > 12)) {
+ if (days < 1 || days > 31 || month < 1 || month > 12) {
cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days);
- if (month > 12)
- month = 12;
+ days = clamp(days, 1, 31);
+ month = clamp(month, 1, 12);
}
month -= 1;
days += total_days_of_prev_months[month];
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 1a04b3a5beb1..7b08a1446a7f 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -499,7 +499,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
else
cfile->oplock_break_cancelled = true;
- queue_work(cifsiod_wq, &cfile->oplock_break);
+ queue_work(cifsoplockd_wq, &cfile->oplock_break);
kfree(lw);
return true;
}
@@ -643,7 +643,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&cinode->flags);
spin_unlock(&cfile->file_info_lock);
- queue_work(cifsiod_wq, &cfile->oplock_break);
+ queue_work(cifsoplockd_wq,
+ &cfile->oplock_break);
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 152e37f2ad92..c58691834eb2 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -942,6 +942,7 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
}
if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) {
rc = -ERANGE;
+ kfree(retbuf);
return rc;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index fb0da096c2ce..48ff7703b919 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -633,8 +633,12 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
}
if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
- cifs_dbg(VFS, "invalid size of protocol negotiate response\n");
- return -EIO;
+ cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n",
+ rsplen);
+
+ /* relax check since Mac returns max bufsize allowed on ioctl */
+ if (rsplen > CIFSMaxBufSize)
+ return -EIO;
}
/* check validate negotiate info response matches what we got earlier */
@@ -1854,8 +1858,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
* than one credit. Windows typically sets this smaller, but for some
* ioctls it may be useful to allow server to send more. No point
* limiting what the server can send as long as fits in one credit
+ * Unfortunately, we cannot handle more than CIFS_MAX_MSG_SIZE
+ * (by default; note that it can be overridden to make the max larger)
+ * in responses (except for read responses, which can be bigger).
+ * We may want to bump this limit up.
*/
- req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+ req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize);
if (is_fsctl)
req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 11d087b2b28e..6116d5275a3e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -833,7 +833,7 @@ static int compat_ioctl_preallocate(struct file *file,
*/
#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
-#define COMPATIBLE_IOCTL(cmd) XFORM(cmd),
+#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd),
/* ioctl should not be warned about even if it's not implemented.
Valid reasons to use this:
- It is implemented with ->compat_ioctl on some device, but programs
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 37b49894c762..d1bb02b1ee58 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode,
static const char *lookup_table =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+/* (nbytes * 4) / 3 rounded up: presumably the encoded length of nbytes bytes */
+#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
+
/**
* digest_encode() -
*
@@ -230,11 +232,14 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
int fscrypt_fname_alloc_buffer(const struct inode *inode,
u32 ilen, struct fscrypt_str *crypto_str)
{
- unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
+ u32 olen = fscrypt_fname_encrypted_size(inode, ilen);
+ const u32 max_encoded_len =
+ max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE),
+ 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)));
crypto_str->len = olen;
- if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
- olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
+ olen = max(olen, max_encoded_len);
+
/*
* Allocated buffer can hold one more character to null-terminate the
* string
@@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer);
*
* The caller must have allocated sufficient memory for the @oname string.
*
+ * If the key is available, we'll decrypt the disk name; otherwise, we'll encode
+ * it for presentation. Short names are directly base64-encoded, while long
+ * names are encoded in fscrypt_digested_name format.
+ *
* Return: 0 on success, -errno on failure
*/
int fscrypt_fname_disk_to_usr(struct inode *inode,
@@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
struct fscrypt_str *oname)
{
const struct qstr qname = FSTR_TO_QSTR(iname);
- char buf[24];
+ struct fscrypt_digested_name digested_name;
if (fscrypt_is_dot_dotdot(&qname)) {
oname->name[0] = '.';
@@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode,
if (inode->i_crypt_info)
return fname_decrypt(inode, iname, oname);
- if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) {
+ if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) {
oname->len = digest_encode(iname->name, iname->len,
oname->name);
return 0;
}
if (hash) {
- memcpy(buf, &hash, 4);
- memcpy(buf + 4, &minor_hash, 4);
+ digested_name.hash = hash;
+ digested_name.minor_hash = minor_hash;
} else {
- memset(buf, 0, 8);
+ digested_name.hash = 0;
+ digested_name.minor_hash = 0;
}
- memcpy(buf + 8, iname->name + iname->len - 16, 16);
+ memcpy(digested_name.digest,
+ FSCRYPT_FNAME_DIGEST(iname->name, iname->len),
+ FSCRYPT_FNAME_DIGEST_SIZE);
oname->name[0] = '_';
- oname->len = 1 + digest_encode(buf, 24, oname->name + 1);
+ oname->len = 1 + digest_encode((const char *)&digested_name,
+ sizeof(digested_name), oname->name + 1);
return 0;
}
EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
@@ -336,10 +349,35 @@ int fscrypt_fname_usr_to_disk(struct inode *inode,
}
EXPORT_SYMBOL(fscrypt_fname_usr_to_disk);
+/**
+ * fscrypt_setup_filename() - prepare to search a possibly encrypted directory
+ * @dir: the directory that will be searched
+ * @iname: the user-provided filename being searched for
+ * @lookup: 1 if we're allowed to proceed without the key because it's
+ * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot
+ * proceed without the key because we're going to create the dir_entry.
+ * @fname: the filename information to be filled in
+ *
+ * Given a user-provided filename @iname, this function sets @fname->disk_name
+ * to the name that would be stored in the on-disk directory entry, if possible.
+ * If the directory is unencrypted this is simply @iname. Else, if we have the
+ * directory's encryption key, then @iname is the plaintext, so we encrypt it to
+ * get the disk_name.
+ *
+ * Else, for keyless @lookup operations, @iname is the presented ciphertext, so
+ * we decode it to get either the ciphertext disk_name (for short names) or the
+ * fscrypt_digested_name (for long names). Non-@lookup operations will be
+ * impossible in this case, so we fail them with ENOKEY.
+ *
+ * If successful, fscrypt_free_filename() must be called later to clean up.
+ *
+ * Return: 0 on success, -errno on failure
+ */
int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
int lookup, struct fscrypt_name *fname)
{
- int ret = 0, bigname = 0;
+ int ret;
+ int digested;
memset(fname, 0, sizeof(struct fscrypt_name));
fname->usr_fname = iname;
@@ -373,25 +411,37 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
* We don't have the key and we are doing a lookup; decode the
* user-supplied name
*/
- if (iname->name[0] == '_')
- bigname = 1;
- if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43)))
- return -ENOENT;
+ if (iname->name[0] == '_') {
+ if (iname->len !=
+ 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)))
+ return -ENOENT;
+ digested = 1;
+ } else {
+ if (iname->len >
+ BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE))
+ return -ENOENT;
+ digested = 0;
+ }
- fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
+ fname->crypto_buf.name =
+ kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE,
+ sizeof(struct fscrypt_digested_name)),
+ GFP_KERNEL);
if (fname->crypto_buf.name == NULL)
return -ENOMEM;
- ret = digest_decode(iname->name + bigname, iname->len - bigname,
+ ret = digest_decode(iname->name + digested, iname->len - digested,
fname->crypto_buf.name);
if (ret < 0) {
ret = -ENOENT;
goto errout;
}
fname->crypto_buf.len = ret;
- if (bigname) {
- memcpy(&fname->hash, fname->crypto_buf.name, 4);
- memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4);
+ if (digested) {
+ const struct fscrypt_digested_name *n =
+ (const void *)fname->crypto_buf.name;
+ fname->hash = n->hash;
+ fname->minor_hash = n->minor_hash;
} else {
fname->disk_name.name = fname->crypto_buf.name;
fname->disk_name.len = fname->crypto_buf.len;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index e39696e64494..1e1f8a361b75 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -13,8 +13,6 @@
#include <linux/fscrypt_supp.h>
-#define FS_FNAME_CRYPTO_DIGEST_SIZE 32
-
/* Encryption parameters */
#define FS_XTS_TWEAK_SIZE 16
#define FS_AES_128_ECB_KEY_SIZE 16
@@ -22,10 +20,6 @@
#define FS_AES_256_CBC_KEY_SIZE 32
#define FS_AES_256_CTS_KEY_SIZE 32
#define FS_AES_256_XTS_KEY_SIZE 64
-#define FS_MAX_KEY_SIZE 64
-
-#define FS_KEY_DESC_PREFIX "fscrypt:"
-#define FS_KEY_DESC_PREFIX_SIZE 8
#define FS_KEY_DERIVATION_NONCE_SIZE 16
@@ -51,13 +45,6 @@ struct fscrypt_context {
#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
-/* This is passed in from userspace into the kernel keyring */
-struct fscrypt_key {
- u32 mode;
- u8 raw[FS_MAX_KEY_SIZE];
- u32 size;
-} __packed;
-
/*
* A pointer to this structure is stored in the file system's in-core
* representation of an inode.
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 8cdfddce2b34..179e578b875b 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -183,9 +183,6 @@ int fscrypt_get_encryption_info(struct inode *inode)
if (res)
return res;
- if (!inode->i_sb->s_cop->get_context)
- return -EOPNOTSUPP;
-
res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
if (res < 0) {
if (!fscrypt_dummy_context_enabled(inode) ||
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 4908906d54d5..210976e7a269 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -34,9 +34,6 @@ static int create_encryption_context_from_policy(struct inode *inode,
{
struct fscrypt_context ctx;
- if (!inode->i_sb->s_cop->set_context)
- return -EOPNOTSUPP;
-
ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE);
@@ -87,8 +84,6 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
if (ret == -ENODATA) {
if (!S_ISDIR(inode->i_mode))
ret = -ENOTDIR;
- else if (!inode->i_sb->s_cop->empty_dir)
- ret = -EOPNOTSUPP;
else if (!inode->i_sb->s_cop->empty_dir(inode))
ret = -ENOTEMPTY;
else
@@ -118,8 +113,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
struct fscrypt_policy policy;
int res;
- if (!inode->i_sb->s_cop->get_context ||
- !inode->i_sb->s_cop->is_encrypted(inode))
+ if (!inode->i_sb->s_cop->is_encrypted(inode))
return -ENODATA;
res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
@@ -143,27 +137,61 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
}
EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
+/**
+ * fscrypt_has_permitted_context() - is a file's encryption policy permitted
+ * within its directory?
+ *
+ * @parent: inode for parent directory
+ * @child: inode for file being looked up, opened, or linked into @parent
+ *
+ * Filesystems must call this before permitting access to an inode in a
+ * situation where the parent directory is encrypted (either before allowing
+ * ->lookup() to succeed, or for a regular file before allowing it to be opened)
+ * and before any operation that involves linking an inode into an encrypted
+ * directory, including link, rename, and cross rename. It enforces the
+ * constraint that within a given encrypted directory tree, all files use the
+ * same encryption policy. The pre-access check is needed to detect potentially
+ * malicious offline violations of this constraint, while the link and rename
+ * checks are needed to prevent online violations of this constraint.
+ *
+ * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail
+ * the filesystem operation with EPERM.
+ */
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
- struct fscrypt_info *parent_ci, *child_ci;
+ const struct fscrypt_operations *cops = parent->i_sb->s_cop;
+ const struct fscrypt_info *parent_ci, *child_ci;
+ struct fscrypt_context parent_ctx, child_ctx;
int res;
- if ((parent == NULL) || (child == NULL)) {
- printk(KERN_ERR "parent %p child %p\n", parent, child);
- BUG_ON(1);
- }
-
/* No restrictions on file types which are never encrypted */
if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
!S_ISLNK(child->i_mode))
return 1;
- /* no restrictions if the parent directory is not encrypted */
- if (!parent->i_sb->s_cop->is_encrypted(parent))
+ /* No restrictions if the parent directory is unencrypted */
+ if (!cops->is_encrypted(parent))
return 1;
- /* if the child directory is not encrypted, this is always a problem */
- if (!parent->i_sb->s_cop->is_encrypted(child))
+
+ /* Encrypted directories must not contain unencrypted files */
+ if (!cops->is_encrypted(child))
return 0;
+
+ /*
+ * Both parent and child are encrypted, so verify they use the same
+ * encryption policy. Compare the fscrypt_info structs if the keys are
+ * available, otherwise retrieve and compare the fscrypt_contexts.
+ *
+ * Note that the fscrypt_context retrieval will be required frequently
+ * when accessing an encrypted directory tree without the key.
+ * Performance-wise this is not a big deal because we already don't
+ * really optimize for file access without the key (to the extent that
+ * such access is even possible), given that any attempted access
+ * already causes a fscrypt_context retrieval and keyring search.
+ *
+ * In any case, if an unexpected error occurs, fall back to "forbidden".
+ */
+
res = fscrypt_get_encryption_info(parent);
if (res)
return 0;
@@ -172,17 +200,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
return 0;
parent_ci = parent->i_crypt_info;
child_ci = child->i_crypt_info;
- if (!parent_ci && !child_ci)
- return 1;
- if (!parent_ci || !child_ci)
+
+ if (parent_ci && child_ci) {
+ return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+ (parent_ci->ci_filename_mode ==
+ child_ci->ci_filename_mode) &&
+ (parent_ci->ci_flags == child_ci->ci_flags);
+ }
+
+ res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx));
+ if (res != sizeof(parent_ctx))
return 0;
- return (memcmp(parent_ci->ci_master_key,
- child_ci->ci_master_key,
- FS_KEY_DESCRIPTOR_SIZE) == 0 &&
- (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
- (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
- (parent_ci->ci_flags == child_ci->ci_flags));
+ res = cops->get_context(child, &child_ctx, sizeof(child_ctx));
+ if (res != sizeof(child_ctx))
+ return 0;
+
+ return memcmp(parent_ctx.master_key_descriptor,
+ child_ctx.master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ctx.contents_encryption_mode ==
+ child_ctx.contents_encryption_mode) &&
+ (parent_ctx.filenames_encryption_mode ==
+ child_ctx.filenames_encryption_mode) &&
+ (parent_ctx.flags == child_ctx.flags);
}
EXPORT_SYMBOL(fscrypt_has_permitted_context);
@@ -202,9 +245,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
struct fscrypt_info *ci;
int res;
- if (!parent->i_sb->s_cop->set_context)
- return -EOPNOTSUPP;
-
res = fscrypt_get_encryption_info(parent);
if (res < 0)
return res;
diff --git a/fs/dax.c b/fs/dax.c
index 6433650be833..66d79067eedf 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void)
}
fs_initcall(init_dax_wait_table);
-static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
-{
- struct request_queue *q = bdev->bd_queue;
- long rc = -EIO;
-
- dax->addr = ERR_PTR(-EIO);
- if (blk_queue_enter(q, true) != 0)
- return rc;
-
- rc = bdev_direct_access(bdev, dax);
- if (rc < 0) {
- dax->addr = ERR_PTR(rc);
- blk_queue_exit(q);
- return rc;
- }
- return rc;
-}
-
-static void dax_unmap_atomic(struct block_device *bdev,
- const struct blk_dax_ctl *dax)
-{
- if (IS_ERR(dax->addr))
- return;
- blk_queue_exit(bdev->bd_queue);
-}
-
static int dax_is_pmd_entry(void *entry)
{
return (unsigned long)entry & RADIX_DAX_PMD;
@@ -101,26 +75,6 @@ static int dax_is_empty_entry(void *entry)
return (unsigned long)entry & RADIX_DAX_EMPTY;
}
-struct page *read_dax_sector(struct block_device *bdev, sector_t n)
-{
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- struct blk_dax_ctl dax = {
- .size = PAGE_SIZE,
- .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
- };
- long rc;
-
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- rc = dax_map_atomic(bdev, &dax);
- if (rc < 0)
- return ERR_PTR(rc);
- memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
- dax_unmap_atomic(bdev, &dax);
- return page;
-}
-
/*
* DAX radix tree locking
*/
@@ -555,21 +509,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
static int dax_load_hole(struct address_space *mapping, void **entry,
struct vm_fault *vmf)
{
+ struct inode *inode = mapping->host;
struct page *page;
int ret;
/* Hole page already exists? Return it... */
if (!radix_tree_exceptional_entry(*entry)) {
page = *entry;
- goto out;
+ goto finish_fault;
}
/* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO);
- if (!page)
- return VM_FAULT_OOM;
- out:
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
+finish_fault:
vmf->page = page;
ret = finish_fault(vmf);
vmf->page = NULL;
@@ -577,26 +535,37 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
if (!ret) {
/* Grab reference for PTE that is now referencing the page */
get_page(page);
- return VM_FAULT_NOPAGE;
+ ret = VM_FAULT_NOPAGE;
}
+out:
+ trace_dax_load_hole(inode, vmf, ret);
return ret;
}
-static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
- struct page *to, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
+ sector_t sector, size_t size, struct page *to,
+ unsigned long vaddr)
{
- struct blk_dax_ctl dax = {
- .sector = sector,
- .size = size,
- };
- void *vto;
-
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
+ void *vto, *kaddr;
+ pgoff_t pgoff;
+ pfn_t pfn;
+ long rc;
+ int id;
+
+ rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (rc)
+ return rc;
+
+ id = dax_read_lock();
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
vto = kmap_atomic(to);
- copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
+ copy_user_page(vto, (void __force *)kaddr, vaddr, to);
kunmap_atomic(vto);
- dax_unmap_atomic(bdev, &dax);
+ dax_read_unlock(id);
return 0;
}
@@ -764,12 +733,16 @@ unlock_pte:
}
static int dax_writeback_one(struct block_device *bdev,
- struct address_space *mapping, pgoff_t index, void *entry)
+ struct dax_device *dax_dev, struct address_space *mapping,
+ pgoff_t index, void *entry)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- struct blk_dax_ctl dax;
- void *entry2, **slot;
- int ret = 0;
+ void *entry2, **slot, *kaddr;
+ long ret = 0, id;
+ sector_t sector;
+ pgoff_t pgoff;
+ size_t size;
+ pfn_t pfn;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -818,26 +791,29 @@ static int dax_writeback_one(struct block_device *bdev,
* 'entry'. This allows us to flush for PMD_SIZE and not have to
* worry about partial PMD writebacks.
*/
- dax.sector = dax_radix_sector(entry);
- dax.size = PAGE_SIZE << dax_radix_order(entry);
+ sector = dax_radix_sector(entry);
+ size = PAGE_SIZE << dax_radix_order(entry);
+
+ id = dax_read_lock();
+ ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (ret)
+ goto dax_unlock;
/*
- * We cannot hold tree_lock while calling dax_map_atomic() because it
- * eventually calls cond_resched().
+ * dax_direct_access() may sleep, so cannot hold tree_lock over
+ * its invocation.
*/
- ret = dax_map_atomic(bdev, &dax);
- if (ret < 0) {
- put_locked_mapping_entry(mapping, index, entry);
- return ret;
- }
+ ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
+ if (ret < 0)
+ goto dax_unlock;
- if (WARN_ON_ONCE(ret < dax.size)) {
+ if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
ret = -EIO;
- goto unmap;
+ goto dax_unlock;
}
- dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
- wb_cache_pmem(dax.addr, dax.size);
+ dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
+ wb_cache_pmem(kaddr, size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
@@ -847,8 +823,9 @@ static int dax_writeback_one(struct block_device *bdev,
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
- unmap:
- dax_unmap_atomic(bdev, &dax);
+ trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
+ dax_unlock:
+ dax_read_unlock(id);
put_locked_mapping_entry(mapping, index, entry);
return ret;
@@ -869,6 +846,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
struct inode *inode = mapping->host;
pgoff_t start_index, end_index;
pgoff_t indices[PAGEVEC_SIZE];
+ struct dax_device *dax_dev;
struct pagevec pvec;
bool done = false;
int i, ret = 0;
@@ -879,9 +857,15 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
return 0;
+ dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ if (!dax_dev)
+ return -EIO;
+
start_index = wbc->range_start >> PAGE_SHIFT;
end_index = wbc->range_end >> PAGE_SHIFT;
+ trace_dax_writeback_range(inode, start_index, end_index);
+
tag_pages_for_writeback(mapping, start_index, end_index);
pagevec_init(&pvec, 0);
@@ -899,38 +883,50 @@ int dax_writeback_mapping_range(struct address_space *mapping,
break;
}
- ret = dax_writeback_one(bdev, mapping, indices[i],
- pvec.pages[i]);
+ ret = dax_writeback_one(bdev, dax_dev, mapping,
+ indices[i], pvec.pages[i]);
if (ret < 0)
- return ret;
+ goto out;
}
}
- return 0;
+out:
+ put_dax(dax_dev);
+ trace_dax_writeback_range_done(inode, start_index, end_index);
+ return (ret < 0 ? ret : 0);
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_insert_mapping(struct address_space *mapping,
- struct block_device *bdev, sector_t sector, size_t size,
- void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
+ struct block_device *bdev, struct dax_device *dax_dev,
+ sector_t sector, size_t size, void **entryp,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = vmf->address;
- struct blk_dax_ctl dax = {
- .sector = sector,
- .size = size,
- };
- void *ret;
void *entry = *entryp;
+ void *ret, *kaddr;
+ pgoff_t pgoff;
+ int id, rc;
+ pfn_t pfn;
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
- dax_unmap_atomic(bdev, &dax);
+ rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (rc)
+ return rc;
+
+ id = dax_read_lock();
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
+ dax_read_unlock(id);
- ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
*entryp = ret;
- return vm_insert_mixed(vma, vaddr, dax.pfn);
+ trace_dax_insert_mapping(mapping->host, vmf, ret);
+ return vm_insert_mixed(vma, vaddr, pfn);
}
/**
@@ -941,6 +937,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
void *entry, **slot;
pgoff_t index = vmf->pgoff;
@@ -950,6 +947,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
if (entry)
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
+ trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
@@ -962,6 +960,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf)
*/
finish_mkwrite_fault(vmf);
put_locked_mapping_entry(mapping, index, entry);
+ trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -979,24 +978,34 @@ static bool dax_range_is_aligned(struct block_device *bdev,
return true;
}
-int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
- unsigned int offset, unsigned int length)
+int __dax_zero_page_range(struct block_device *bdev,
+ struct dax_device *dax_dev, sector_t sector,
+ unsigned int offset, unsigned int size)
{
- struct blk_dax_ctl dax = {
- .sector = sector,
- .size = PAGE_SIZE,
- };
-
- if (dax_range_is_aligned(bdev, offset, length)) {
- sector_t start_sector = dax.sector + (offset >> 9);
+ if (dax_range_is_aligned(bdev, offset, size)) {
+ sector_t start_sector = sector + (offset >> 9);
return blkdev_issue_zeroout(bdev, start_sector,
- length >> 9, GFP_NOFS, 0);
+ size >> 9, GFP_NOFS, 0);
} else {
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
- clear_pmem(dax.addr + offset, length);
- dax_unmap_atomic(bdev, &dax);
+ pgoff_t pgoff;
+ long rc, id;
+ void *kaddr;
+ pfn_t pfn;
+
+ rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (rc)
+ return rc;
+
+ id = dax_read_lock();
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
+ &pfn);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
+ clear_pmem(kaddr + offset, size);
+ dax_read_unlock(id);
}
return 0;
}
@@ -1011,9 +1020,12 @@ static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
{
+ struct block_device *bdev = iomap->bdev;
+ struct dax_device *dax_dev = iomap->dax_dev;
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
+ int id;
if (iov_iter_rw(iter) == READ) {
end = min(end, i_size_read(inode));
@@ -1038,34 +1050,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
(end - 1) >> PAGE_SHIFT);
}
+ id = dax_read_lock();
while (pos < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
- struct blk_dax_ctl dax = { 0 };
+ const size_t size = ALIGN(length + offset, PAGE_SIZE);
+ const sector_t sector = dax_iomap_sector(iomap, pos);
ssize_t map_len;
+ pgoff_t pgoff;
+ void *kaddr;
+ pfn_t pfn;
if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}
- dax.sector = dax_iomap_sector(iomap, pos);
- dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
- map_len = dax_map_atomic(iomap->bdev, &dax);
+ ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (ret)
+ break;
+
+ map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
+ &kaddr, &pfn);
if (map_len < 0) {
ret = map_len;
break;
}
- dax.addr += offset;
+ map_len = PFN_PHYS(map_len);
+ kaddr += offset;
map_len -= offset;
if (map_len > end - pos)
map_len = end - pos;
if (iov_iter_rw(iter) == WRITE)
- map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+ map_len = copy_from_iter_pmem(kaddr, map_len, iter);
else
- map_len = copy_to_iter(dax.addr, map_len, iter);
- dax_unmap_atomic(iomap->bdev, &dax);
+ map_len = copy_to_iter(kaddr, map_len, iter);
if (map_len <= 0) {
ret = map_len ? map_len : -EFAULT;
break;
@@ -1075,6 +1095,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
length -= map_len;
done += map_len;
}
+ dax_read_unlock(id);
return done ? done : ret;
}
@@ -1142,13 +1163,16 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
int vmf_ret = 0;
void *entry;
+ trace_dax_pte_fault(inode, vmf, vmf_ret);
/*
* Check whether offset isn't beyond end of file now. Caller is supposed
* to hold locks serializing us with truncate / punch hole so this is
* a reliable test.
*/
- if (pos >= i_size_read(inode))
- return VM_FAULT_SIGBUS;
+ if (pos >= i_size_read(inode)) {
+ vmf_ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;
@@ -1159,8 +1183,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
* that we never have to deal with more than a single extent here.
*/
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
- if (error)
- return dax_fault_return(error);
+ if (error) {
+ vmf_ret = dax_fault_return(error);
+ goto out;
+ }
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
goto finish_iomap;
@@ -1181,8 +1207,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
clear_user_highpage(vmf->cow_page, vaddr);
break;
case IOMAP_MAPPED:
- error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
- vmf->cow_page, vaddr);
+ error = copy_user_dax(iomap.bdev, iomap.dax_dev,
+ sector, PAGE_SIZE, vmf->cow_page, vaddr);
break;
default:
WARN_ON_ONCE(1);
@@ -1207,8 +1233,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
- error = dax_insert_mapping(mapping, iomap.bdev, sector,
- PAGE_SIZE, &entry, vmf->vma, vmf);
+ error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
+ sector, PAGE_SIZE, &entry, vmf->vma, vmf);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
@@ -1244,6 +1270,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
+out:
+ trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
@@ -1258,41 +1286,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
loff_t pos, void **entryp)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ const sector_t sector = dax_iomap_sector(iomap, pos);
+ struct dax_device *dax_dev = iomap->dax_dev;
struct block_device *bdev = iomap->bdev;
struct inode *inode = mapping->host;
- struct blk_dax_ctl dax = {
- .sector = dax_iomap_sector(iomap, pos),
- .size = PMD_SIZE,
- };
- long length = dax_map_atomic(bdev, &dax);
- void *ret = NULL;
-
- if (length < 0) /* dax_map_atomic() failed */
+ const size_t size = PMD_SIZE;
+ void *ret = NULL, *kaddr;
+ long length = 0;
+ pgoff_t pgoff;
+ pfn_t pfn;
+ int id;
+
+ if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
goto fallback;
- if (length < PMD_SIZE)
- goto unmap_fallback;
- if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
- goto unmap_fallback;
- if (!pfn_t_devmap(dax.pfn))
- goto unmap_fallback;
-
- dax_unmap_atomic(bdev, &dax);
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+ id = dax_read_lock();
+ length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+ if (length < 0)
+ goto unlock_fallback;
+ length = PFN_PHYS(length);
+
+ if (length < size)
+ goto unlock_fallback;
+ if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
+ goto unlock_fallback;
+ if (!pfn_t_devmap(pfn))
+ goto unlock_fallback;
+ dax_read_unlock(id);
+
+ ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
goto fallback;
*entryp = ret;
- trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+ trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
- dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
+ pfn, vmf->flags & FAULT_FLAG_WRITE);
- unmap_fallback:
- dax_unmap_atomic(bdev, &dax);
+unlock_fallback:
+ dax_read_unlock(id);
fallback:
- trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
- dax.pfn, ret);
+ trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
return VM_FAULT_FALLBACK;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 95d71eda8142..cddf39777835 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -419,6 +419,8 @@ static void dentry_lru_add(struct dentry *dentry)
{
if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
d_lru_add(dentry);
+ else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
+ dentry->d_flags |= DCACHE_REFERENCED;
}
/**
@@ -779,8 +781,6 @@ repeat:
goto kill_it;
}
- if (!(dentry->d_flags & DCACHE_REFERENCED))
- dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
dentry->d_lockref.count--;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7fd4ec4bb214..e892ae7d89f8 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -199,7 +199,7 @@ static const struct dentry_operations debugfs_dops = {
static int debug_fill_super(struct super_block *sb, void *data, int silent)
{
- static struct tree_descr debug_files[] = {{""}};
+ static const struct tree_descr debug_files[] = {{""}};
struct debugfs_fs_info *fsi;
int err;
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 42f9a0a0c4ca..8eeb694332fe 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -405,8 +405,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
int err;
lock_page(page);
- err = exofs_write_begin(NULL, page->mapping, pos, len,
- AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL);
if (err)
EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
err);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 3a38c1b84e3c..26d77f9f8c12 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -799,6 +799,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
+ struct block_device *bdev;
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
@@ -812,8 +813,13 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return ret;
iomap->flags = 0;
- iomap->bdev = inode->i_sb->s_bdev;
+ bdev = inode->i_sb->s_bdev;
+ iomap->bdev = bdev;
iomap->offset = (u64)first_block << blkbits;
+ if (blk_queue_dax(bdev->bd_queue))
+ iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ else
+ iomap->dax_dev = NULL;
if (ret == 0) {
iomap->type = IOMAP_HOLE;
@@ -835,6 +841,7 @@ static int
ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
+ put_dax(iomap->dax_dev);
if (iomap->type == IOMAP_MAPPED &&
written < length &&
(flags & IOMAP_WRITE))
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 354103f3490c..d9beca1653c5 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -4,11 +4,11 @@
obj-$(CONFIG_EXT4_FS) += ext4.o
-ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
- mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
- xattr_trusted.o inline.o readpage.o sysfs.o
+ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
+ extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
+ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
+ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
+ super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f7b465b4fb69..8e8046104f4d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2356,17 +2356,16 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
-int ext4_insert_dentry(struct inode *dir,
- struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- struct ext4_filename *fname);
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
if (!ext4_has_feature_dir_index(inode->i_sb))
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
-static unsigned char ext4_filetype_table[] = {
+static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
@@ -3050,7 +3049,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
#define S_SHIFT 12
-static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
[S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
new file mode 100644
index 000000000000..b19436098837
--- /dev/null
+++ b/fs/ext4/fsmap.c
@@ -0,0 +1,722 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "ext4.h"
+#include <linux/fsmap.h>
+#include "fsmap.h"
+#include "mballoc.h"
+#include <linux/sort.h>
+#include <linux/list_sort.h>
+#include <trace/events/ext4.h>
+
+/*
+ * Translate an in-core ext4_fsmap record into the userspace fsmap layout.
+ * The physical start and the length are shifted left by the filesystem's
+ * block size bits; the file offset and the three reserved words are zeroed.
+ */
+void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest,
+			      struct ext4_fsmap *src)
+{
+	const unsigned int blkbits = sb->s_blocksize_bits;
+	int i;
+
+	/* Fields that are passed through unchanged. */
+	dest->fmr_device = src->fmr_device;
+	dest->fmr_flags = src->fmr_flags;
+	dest->fmr_owner = src->fmr_owner;
+
+	/* Fields that are scaled by the block size. */
+	dest->fmr_physical = src->fmr_physical << blkbits;
+	dest->fmr_length = src->fmr_length << blkbits;
+
+	/* Fields with no in-core counterpart. */
+	dest->fmr_offset = 0;
+	for (i = 0; i < 3; i++)
+		dest->fmr_reserved[i] = 0;
+}
+
+/*
+ * Translate a userspace fsmap record into the in-core ext4_fsmap form.
+ * The physical start and the length are shifted right by the filesystem's
+ * block size bits.  src->fmr_offset and the reserved words are not copied,
+ * and dest->fmr_list is left untouched.
+ */
+void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest,
+			    struct fsmap *src)
+{
+	const unsigned int blkbits = sb->s_blocksize_bits;
+
+	/* Fields that are scaled by the block size. */
+	dest->fmr_physical = src->fmr_physical >> blkbits;
+	dest->fmr_length = src->fmr_length >> blkbits;
+
+	/* Fields that are passed through unchanged. */
+	dest->fmr_device = src->fmr_device;
+	dest->fmr_owner = src->fmr_owner;
+	dest->fmr_flags = src->fmr_flags;
+}
+
+/*
+ * getfsmap query state.
+ *
+ * ext4_getfsmap() fills in one of these and passes it to every per-device
+ * handler; ext4_getfsmap_helper() advances gfi_next_fsblk and the entry
+ * count in gfi_head as records are processed.
+ */
+struct ext4_getfsmap_info {
+ struct ext4_fsmap_head *gfi_head; /* request header; fmh_entries is updated */
+ ext4_fsmap_format_t gfi_formatter; /* formatting fn */
+ void *gfi_format_arg;/* opaque argument handed to gfi_formatter */
+ ext4_fsblk_t gfi_next_fsblk; /* next fsblock we expect; a record starting later reveals a gap */
+ u32 gfi_dev; /* device id */
+ ext4_group_t gfi_agno; /* bg number, if applicable */
+ struct ext4_fsmap gfi_low; /* low rmap key */
+ struct ext4_fsmap gfi_high; /* high rmap key */
+ struct ext4_fsmap gfi_lastfree; /* free ext at end of last bg; unset while fmr_owner is 0 */
+ struct list_head gfi_meta_list; /* fixed metadata list */
+ bool gfi_last; /* last extent? */
+};
+
+/*
+ * Associate a device with a getfsmap handler.
+ *
+ * ext4_getfsmap() sorts an array of these by gfd_dev and skips any entry
+ * whose gfd_fn is NULL.
+ */
+struct ext4_getfsmap_dev {
+ int (*gfd_fn)(struct super_block *sb,
+ struct ext4_fsmap *keys,
+ struct ext4_getfsmap_info *info); /* per-device query handler */
+ u32 gfd_dev; /* device number as produced by new_encode_dev() */
+};
+
+/*
+ * Compare two getfsmap device handlers by device number; used as the
+ * comparison callback for sort().
+ *
+ * gfd_dev is a u32, so returning the difference of the two values as an
+ * int gives the wrong sign whenever that difference does not fit in an
+ * int.  Compare the values explicitly instead.
+ *
+ * Returns a negative value, zero, or a positive value if @p1 sorts
+ * before, equal to, or after @p2.
+ */
+static int ext4_getfsmap_dev_compare(const void *p1, const void *p2)
+{
+	const struct ext4_getfsmap_dev *d1 = p1;
+	const struct ext4_getfsmap_dev *d2 = p2;
+
+	if (d1->gfd_dev < d2->gfd_dev)
+		return -1;
+	if (d1->gfd_dev > d2->gfd_dev)
+		return 1;
+	return 0;
+}
+
+/*
+ * Return true if @rec starts at a physical block below the one recorded
+ * in the low key of the running query.
+ */
+static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info,
+					     struct ext4_fsmap *rec)
+{
+	const struct ext4_fsmap *low = &info->gfi_low;
+
+	return low->fmr_physical > rec->fmr_physical;
+}
+
+/*
+ * Process one mapping record for getfsmap.
+ *
+ * A record that starts before the low key only advances gfi_next_fsblk.
+ * When fmh_count is zero, the record (and any gap in front of it) is merely
+ * counted in fmh_entries.  Otherwise a gap between gfi_next_fsblk and the
+ * start of the record is reported with owner EXT4_FMR_OWN_UNKNOWN, followed
+ * by the record itself, each through the gfi_formatter callback.  When
+ * gfi_last is set, only the gap is reported and @rec is not emitted.
+ *
+ * Returns EXT4_QUERY_RANGE_CONTINUE to keep going, EXT4_QUERY_RANGE_ABORT
+ * once fmh_entries has reached fmh_count, -EINTR if a fatal signal is
+ * pending, or the non-zero value returned by the formatter.
+ */
+static int ext4_getfsmap_helper(struct super_block *sb,
+ struct ext4_getfsmap_info *info,
+ struct ext4_fsmap *rec)
+{
+ struct ext4_fsmap fmr;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t rec_fsblk = rec->fmr_physical;
+ ext4_group_t agno;
+ ext4_grpblk_t cno;
+ int error;
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ /*
+ * Filter out records that start before our startpoint, if the
+ * caller requested that.  The end of the record is still remembered
+ * in gfi_next_fsblk so that it is not mistaken for a gap later.
+ */
+ if (ext4_getfsmap_rec_before_low_key(info, rec)) {
+ rec_fsblk += rec->fmr_length;
+ if (info->gfi_next_fsblk < rec_fsblk)
+ info->gfi_next_fsblk = rec_fsblk;
+ return EXT4_QUERY_RANGE_CONTINUE;
+ }
+
+ /* Are we just counting mappings?  (fmh_count == 0: nothing is formatted.) */
+ if (info->gfi_head->fmh_count == 0) {
+ if (rec_fsblk > info->gfi_next_fsblk) /* a gap counts as one entry */
+ info->gfi_head->fmh_entries++;
+
+ if (info->gfi_last) /* final call: only the gap is counted */
+ return EXT4_QUERY_RANGE_CONTINUE;
+
+ info->gfi_head->fmh_entries++;
+
+ rec_fsblk += rec->fmr_length;
+ if (info->gfi_next_fsblk < rec_fsblk)
+ info->gfi_next_fsblk = rec_fsblk;
+ return EXT4_QUERY_RANGE_CONTINUE;
+ }
+
+ /*
+ * If the record starts past the last physical block we saw,
+ * then we've found a gap. Report the gap as being owned by
+ * whatever the caller specified is the missing owner.
+ */
+ if (rec_fsblk > info->gfi_next_fsblk) {
+ if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count)
+ return EXT4_QUERY_RANGE_ABORT;
+
+ ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk,
+ &agno, &cno);
+ trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno,
+ EXT4_C2B(sbi, cno),
+ rec_fsblk - info->gfi_next_fsblk,
+ EXT4_FMR_OWN_UNKNOWN);
+
+ fmr.fmr_device = info->gfi_dev;
+ fmr.fmr_physical = info->gfi_next_fsblk;
+ fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN;
+ fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk;
+ fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+ error = info->gfi_formatter(&fmr, info->gfi_format_arg);
+ if (error)
+ return error;
+ info->gfi_head->fmh_entries++;
+ }
+
+ if (info->gfi_last) /* final call: @rec itself is not emitted */
+ goto out;
+
+ /* Fill out the extent we found */
+ if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count)
+ return EXT4_QUERY_RANGE_ABORT;
+
+ ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno);
+ trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno),
+ rec->fmr_length, rec->fmr_owner);
+
+ fmr.fmr_device = info->gfi_dev;
+ fmr.fmr_physical = rec_fsblk;
+ fmr.fmr_owner = rec->fmr_owner;
+ fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+ fmr.fmr_length = rec->fmr_length;
+ error = info->gfi_formatter(&fmr, info->gfi_format_arg);
+ if (error)
+ return error;
+ info->gfi_head->fmh_entries++;
+
+out:
+ rec_fsblk += rec->fmr_length;
+ if (info->gfi_next_fsblk < rec_fsblk)
+ info->gfi_next_fsblk = rec_fsblk;
+ return EXT4_QUERY_RANGE_CONTINUE;
+}
+
+/* Return the first physical block past the end of @fmr. */
+static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr)
+{
+	ext4_fsblk_t end = fmr->fmr_physical + fmr->fmr_length;
+
+	return end;
+}
+
+/*
+ * Transform a blockgroup's free record into a fsmap.
+ *
+ * Passed to ext4_mballoc_query_range() as its callback, with @priv being
+ * the struct ext4_getfsmap_info of the running query.  @start and @len are
+ * converted with EXT4_C2B() and @start is offset by the first block of
+ * group @agno.  Fixed-metadata records from gfi_meta_list that start before
+ * this free extent are emitted first, and a free extent that ends exactly at
+ * the start of the next group is held back in gfi_lastfree so that it can be
+ * merged with a free extent that abuts it.
+ *
+ * Returns 0 to continue, or the non-zero value returned by
+ * ext4_getfsmap_helper().
+ */
+static int ext4_getfsmap_datadev_helper(struct super_block *sb,
+ ext4_group_t agno, ext4_grpblk_t start,
+ ext4_grpblk_t len, void *priv)
+{
+ struct ext4_fsmap irec;
+ struct ext4_getfsmap_info *info = priv;
+ struct ext4_fsmap *p;
+ struct ext4_fsmap *tmp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t fsb;
+ ext4_fsblk_t fslen;
+ int error;
+
+ fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno));
+ fslen = EXT4_C2B(sbi, len);
+
+ /* If the retained free extent record is set (non-zero fmr_owner)... */
+ if (info->gfi_lastfree.fmr_owner) {
+ /* ...and abuts this one, lengthen it and return. */
+ if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) {
+ info->gfi_lastfree.fmr_length += fslen;
+ return 0;
+ }
+
+ /*
+ * There's a gap between the two free extents; emit the
+ * retained extent prior to merging the meta_list.
+ */
+ error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree);
+ if (error)
+ return error;
+ info->gfi_lastfree.fmr_owner = 0; /* mark the retained record as unset */
+ }
+
+ /*
+ * Merge in any relevant extents from the meta_list: entries that end
+ * at or before gfi_next_fsblk are dropped, entries that start before
+ * this free extent are emitted and then dropped, and the rest stay on
+ * the list for a later call.
+ */
+ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) {
+ if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) {
+ list_del(&p->fmr_list);
+ kfree(p);
+ } else if (p->fmr_physical < fsb) {
+ error = ext4_getfsmap_helper(sb, info, p);
+ if (error)
+ return error;
+
+ list_del(&p->fmr_list);
+ kfree(p);
+ }
+ }
+
+ irec.fmr_device = 0;
+ irec.fmr_physical = fsb;
+ irec.fmr_length = fslen;
+ irec.fmr_owner = EXT4_FMR_OWN_FREE;
+ irec.fmr_flags = 0;
+
+ /* If this is a free extent at the end of a bg, buffer it. */
+ if (ext4_fsmap_next_pblk(&irec) ==
+ ext4_group_first_block_no(sb, agno + 1)) {
+ info->gfi_lastfree = irec;
+ return 0;
+ }
+
+ /* Otherwise, emit it */
+ return ext4_getfsmap_helper(sb, info, &irec);
+}
+
+/*
+ * Execute a getfsmap query against the log device.
+ *
+ * Reports a single EXT4_FMR_OWN_LOG record built from the journal's
+ * j_blk_offset and j_maxlen, and only when the low key's physical block
+ * is zero.
+ *
+ * Returns 0 if nothing is reported, otherwise the return value of
+ * ext4_getfsmap_helper().
+ */
+static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys,
+ struct ext4_getfsmap_info *info)
+{
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+ struct ext4_fsmap irec;
+
+ /* Set up search keys */
+ info->gfi_low = keys[0];
+ info->gfi_low.fmr_length = 0;
+
+ memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high));
+
+ trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0,
+ info->gfi_low.fmr_physical,
+ info->gfi_low.fmr_length,
+ info->gfi_low.fmr_owner);
+
+ trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0,
+ info->gfi_high.fmr_physical,
+ info->gfi_high.fmr_length,
+ info->gfi_high.fmr_owner);
+
+ if (keys[0].fmr_physical > 0) /* low key is past block 0: report nothing */
+ return 0;
+
+ /*
+ * Fabricate an rmap entry for the external log device.  fmr_device
+ * and fmr_list are left unset here; ext4_getfsmap_helper() reads only
+ * the physical, length and owner fields of the record it is given.
+ */
+ irec.fmr_physical = journal->j_blk_offset;
+ irec.fmr_length = journal->j_maxlen;
+ irec.fmr_owner = EXT4_FMR_OWN_LOG;
+ irec.fmr_flags = 0;
+
+ return ext4_getfsmap_helper(sb, info, &irec);
+}
+
+/*
+ * Allocate a record describing @len blocks starting at @fsb that belong to
+ * @owner, and append it to @meta_list.
+ *
+ * Returns 0 on success or -ENOMEM if the record cannot be allocated.
+ */
+static inline int ext4_getfsmap_fill(struct list_head *meta_list,
+				     ext4_fsblk_t fsb, ext4_fsblk_t len,
+				     uint64_t owner)
+{
+	struct ext4_fsmap *rec = kmalloc(sizeof(struct ext4_fsmap), GFP_NOFS);
+
+	if (rec == NULL)
+		return -ENOMEM;
+
+	rec->fmr_physical = fsb;
+	rec->fmr_length = len;
+	rec->fmr_owner = owner;
+	rec->fmr_device = 0;
+	rec->fmr_flags = 0;
+	list_add_tail(&rec->fmr_list, meta_list);
+
+	return 0;
+}
+
+/*
+ * Record the fixed metadata found at the start of block group @agno on
+ * @meta_list: the superblock copy (if the group has one), the group
+ * descriptor blocks, and the reserved GDT blocks.
+ *
+ * Returns 0 on success or a negative errno from ext4_getfsmap_fill().
+ * The return type is a signed int because error codes are negative; the
+ * function does not return a block count.
+ */
+static int ext4_getfsmap_find_sb(struct super_block *sb,
+				 ext4_group_t agno,
+				 struct list_head *meta_list)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno);
+	ext4_fsblk_t len;
+	unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
+	unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb);
+	int error;
+
+	/* Record the superblock. */
+	if (ext4_bg_has_super(sb, agno)) {
+		error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS);
+		if (error)
+			return error;
+		fsb++;
+	}
+
+	/* Record the group descriptors; a group without any has nothing more. */
+	len = ext4_bg_num_gdb(sb, agno);
+	if (!len)
+		return 0;
+	error = ext4_getfsmap_fill(meta_list, fsb, len,
+				   EXT4_FMR_OWN_GDT);
+	if (error)
+		return error;
+	fsb += len;
+
+	/* Reserved GDT blocks */
+	if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) {
+		len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+		error = ext4_getfsmap_fill(meta_list, fsb, len,
+					   EXT4_FMR_OWN_RESV_GDT);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/*
+ * list_sort() comparison callback: order two fsmap records by their
+ * starting physical block.  @priv is unused.
+ */
+static int ext4_getfsmap_compare(void *priv,
+				 struct list_head *a,
+				 struct list_head *b)
+{
+	struct ext4_fsmap *left = container_of(a, struct ext4_fsmap, fmr_list);
+	struct ext4_fsmap *right = container_of(b, struct ext4_fsmap, fmr_list);
+
+	if (left->fmr_physical == right->fmr_physical)
+		return 0;
+	return left->fmr_physical < right->fmr_physical ? -1 : 1;
+}
+
+/*
+ * Walk @meta_list and fold each record into the record kept before it when
+ * both have the same owner and the earlier one ends exactly where the later
+ * one begins.  Folded records are unlinked and freed.
+ */
+static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list)
+{
+	struct ext4_fsmap *cur;
+	struct ext4_fsmap *next;
+	struct ext4_fsmap *last = NULL;
+
+	list_for_each_entry_safe(cur, next, meta_list, fmr_list) {
+		if (last && last->fmr_owner == cur->fmr_owner &&
+		    last->fmr_physical + last->fmr_length ==
+		    cur->fmr_physical) {
+			/* Absorb cur into the record that precedes it. */
+			last->fmr_length += cur->fmr_length;
+			list_del(&cur->fmr_list);
+			kfree(cur);
+			continue;
+		}
+
+		last = cur;
+	}
+}
+
+/* Unlink and free every record on @meta_list, leaving the list empty. */
+static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list)
+{
+	while (!list_empty(meta_list)) {
+		struct ext4_fsmap *victim;
+
+		victim = list_first_entry(meta_list, struct ext4_fsmap,
+					  fmr_list);
+		list_del(&victim->fmr_list);
+		kfree(victim);
+	}
+}
+
+/*
+ * Build @meta_list: one record per piece of fixed-location metadata in
+ * every block group (superblock and GDT blocks, block bitmap, inode bitmap,
+ * inode table), sorted by physical block with adjacent same-owner records
+ * merged.
+ *
+ * Returns 0 on success.  On failure the list is emptied and a negative
+ * errno is returned (-EFSCORRUPTED if a group descriptor cannot be read).
+ */
+int ext4_getfsmap_find_fixed_metadata(struct super_block *sb,
+				      struct list_head *meta_list)
+{
+	struct ext4_group_desc *desc;
+	ext4_group_t group;
+	int ret = 0;
+
+	INIT_LIST_HEAD(meta_list);
+
+	/* Collect everything. */
+	for (group = 0; group < EXT4_SB(sb)->s_groups_count; group++) {
+		desc = ext4_get_group_desc(sb, group, NULL);
+		if (!desc) {
+			ret = -EFSCORRUPTED;
+			break;
+		}
+
+		/* Superblock & GDT */
+		ret = ext4_getfsmap_find_sb(sb, group, meta_list);
+		if (ret)
+			break;
+
+		/* Block bitmap */
+		ret = ext4_getfsmap_fill(meta_list,
+					 ext4_block_bitmap(sb, desc), 1,
+					 EXT4_FMR_OWN_BLKBM);
+		if (ret)
+			break;
+
+		/* Inode bitmap */
+		ret = ext4_getfsmap_fill(meta_list,
+					 ext4_inode_bitmap(sb, desc), 1,
+					 EXT4_FMR_OWN_INOBM);
+		if (ret)
+			break;
+
+		/* Inodes */
+		ret = ext4_getfsmap_fill(meta_list,
+					 ext4_inode_table(sb, desc),
+					 EXT4_SB(sb)->s_itb_per_group,
+					 EXT4_FMR_OWN_INODES);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		/* Do not hand a half-built list back to the caller. */
+		ext4_getfsmap_free_fixed_metadata(meta_list);
+		return ret;
+	}
+
+	/* Sort the list, then merge adjacent extents. */
+	list_sort(NULL, meta_list, ext4_getfsmap_compare);
+	ext4_getfsmap_merge_fixed_metadata(meta_list);
+
+	return 0;
+}
+
+/*
+ * Execute a getfsmap query against the buddy bitmaps.
+ *
+ * @keys[0] and @keys[1] are the low and high keys for this device; a high
+ * key at or past the end of the filesystem is clamped in place to the last
+ * block.  Every block group from the one holding the low key to the one
+ * holding the high key is walked with ext4_mballoc_query_range(), with
+ * ext4_getfsmap_datadev_helper() as the callback.
+ *
+ * Returns 0 on success (including when the low key lies past the end of
+ * the filesystem), or the first non-zero value returned while building the
+ * metadata list or by the helpers.  gfi_meta_list is emptied before
+ * returning, except on the early "low key past end" return, which never
+ * builds it.
+ */
+static int ext4_getfsmap_datadev(struct super_block *sb,
+ struct ext4_fsmap *keys,
+ struct ext4_getfsmap_info *info)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t start_fsb;
+ ext4_fsblk_t end_fsb;
+ ext4_fsblk_t eofs;
+ ext4_group_t start_ag;
+ ext4_group_t end_ag;
+ ext4_grpblk_t first_cluster;
+ ext4_grpblk_t last_cluster;
+ int error = 0;
+
+ eofs = ext4_blocks_count(sbi->s_es);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ if (keys[1].fmr_physical >= eofs)
+ keys[1].fmr_physical = eofs - 1;
+ start_fsb = keys[0].fmr_physical;
+ end_fsb = keys[1].fmr_physical;
+
+ /* Determine first and last group to examine based on start and end */
+ ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster);
+ ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster);
+
+ /*
+ * Convert the fsmap low/high keys to bg based keys. Initialize
+ * low to the fsmap low key and max out the high key to the end
+ * of the bg.
+ */
+ info->gfi_low = keys[0];
+ info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster);
+ info->gfi_low.fmr_length = 0;
+
+ memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high));
+
+ /* Assemble a list of all the fixed-location metadata. */
+ error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list);
+ if (error)
+ goto err;
+
+ /* Query each bg */
+ for (info->gfi_agno = start_ag;
+ info->gfi_agno <= end_ag;
+ info->gfi_agno++) {
+ /*
+ * Set the bg high key from the fsmap high key if this
+ * is the last bg that we're querying.
+ */
+ if (info->gfi_agno == end_ag) {
+ info->gfi_high = keys[1];
+ info->gfi_high.fmr_physical = EXT4_C2B(sbi,
+ last_cluster);
+ info->gfi_high.fmr_length = 0;
+ }
+
+ trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno,
+ info->gfi_low.fmr_physical,
+ info->gfi_low.fmr_length,
+ info->gfi_low.fmr_owner);
+
+ trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno,
+ info->gfi_high.fmr_physical,
+ info->gfi_high.fmr_length,
+ info->gfi_high.fmr_owner);
+
+ error = ext4_mballoc_query_range(sb, info->gfi_agno,
+ EXT4_B2C(sbi, info->gfi_low.fmr_physical),
+ EXT4_B2C(sbi, info->gfi_high.fmr_physical),
+ ext4_getfsmap_datadev_helper, info);
+ if (error)
+ goto err;
+
+ /*
+ * Set the bg low key to the start of the bg prior to
+ * moving on to the next bg.
+ */
+ if (info->gfi_agno == start_ag)
+ memset(&info->gfi_low, 0, sizeof(info->gfi_low));
+ }
+
+ /* Do we have a retained free extent? */
+ if (info->gfi_lastfree.fmr_owner) {
+ error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree);
+ if (error)
+ goto err;
+ }
+
+ /*
+ * Report any gaps at the end of the bg.  With gfi_last set, the
+ * zero-length record passed here is not emitted by
+ * ext4_getfsmap_helper(); only a gap in front of it is.
+ */
+ info->gfi_last = true;
+ error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info);
+ if (error)
+ goto err;
+
+err: /* reached on success as well: the metadata list is always released */
+ ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list);
+ return error;
+}
+
+/*
+ * Return true if the device number in @fm is one this filesystem can
+ * answer for: one of the wildcard values 0 and UINT_MAX, the filesystem's
+ * own block device, or its journal block device when there is one.
+ */
+static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
+					  struct ext4_fsmap *fm)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/* Wildcards. */
+	if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX)
+		return true;
+
+	/* The device holding the filesystem. */
+	if (fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
+		return true;
+
+	/* The journal device, if one is attached. */
+	if (!sbi->journal_bdev)
+		return false;
+	return fm->fmr_device == new_encode_dev(sbi->journal_bdev->bd_dev);
+}
+
+/*
+ * Return true only if @low_key sorts strictly before @high_key when the
+ * two are compared by device, then physical block, then owner.  Equal keys
+ * yield false.
+ */
+static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key,
+				     struct ext4_fsmap *high_key)
+{
+	if (low_key->fmr_device != high_key->fmr_device)
+		return low_key->fmr_device < high_key->fmr_device;
+
+	if (low_key->fmr_physical != high_key->fmr_physical)
+		return low_key->fmr_physical < high_key->fmr_physical;
+
+	/* Last tie-breaker; equality here means the keys are identical. */
+	return low_key->fmr_owner < high_key->fmr_owner;
+}
+
+#define EXT4_GETFSMAP_DEVS 2 /* handler slots: the fs block device and the journal device */
+/*
+ * Get filesystem's extents as described in head, and format for
+ * output. Calls formatter to fill the user's buffer until all
+ * extents are mapped, until the passed-in head->fmh_count slots have
+ * been filled, or until the formatter short-circuits the loop, if it
+ * is tracking filled-in extents on its own.
+ *
+ * Returns -EINVAL if head->fmh_iflags has bits outside FMH_IF_VALID, if
+ * either key names a device this filesystem does not recognize, or if
+ * the (bumped) low key does not sort before the high key.  Otherwise
+ * returns 0 or the first non-zero value returned by a device handler,
+ * which can be EXT4_QUERY_RANGE_ABORT.  head->fmh_entries is reset to
+ * zero up front, and head->fmh_oflags is set to FMH_OF_DEV_T once the
+ * handlers have run.
+ *
+ * Key to Confusion
+ * ----------------
+ * There are multiple levels of keys and counters at work here:
+ * _fsmap_head.fmh_keys -- low and high fsmap keys passed in;
+ * these reflect fs-wide block addrs.
+ * dkeys -- fmh_keys used to query each device;
+ * these are fmh_keys but w/ the low key
+ * bumped up by fmr_length.
+ * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this
+ * is how we detect gaps in the fsmap
+ * records and report them.
+ * _getfsmap_info.gfi_low/high -- per-bg low/high keys computed from
+ * dkeys; used to query the free space.
+ */
+int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
+ ext4_fsmap_format_t formatter, void *arg)
+{
+ struct ext4_fsmap dkeys[2]; /* per-dev keys */
+ struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS];
+ struct ext4_getfsmap_info info = {0};
+ int i;
+ int error = 0;
+
+ if (head->fmh_iflags & ~FMH_IF_VALID)
+ return -EINVAL;
+ if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) ||
+ !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ head->fmh_entries = 0;
+
+ /*
+ * Set up our device handlers.  The second slot stays zeroed (NULL
+ * gfd_fn) when there is no journal block device and is skipped in
+ * the loop below.
+ */
+ memset(handlers, 0, sizeof(handlers));
+ handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
+ handlers[0].gfd_fn = ext4_getfsmap_datadev;
+ if (EXT4_SB(sb)->journal_bdev) {
+ handlers[1].gfd_dev = new_encode_dev(
+ EXT4_SB(sb)->journal_bdev->bd_dev);
+ handlers[1].gfd_fn = ext4_getfsmap_logdev;
+ }
+
+ sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev),
+ ext4_getfsmap_dev_compare, NULL);
+
+ /*
+ * To continue where we left off, we allow userspace to use the
+ * last mapping from a previous call as the low key of the next.
+ * This is identified by a non-zero length in the low key. We
+ * have to increment the low key in this scenario to ensure we
+ * don't return the same mapping again, and instead return the
+ * very next mapping.
+ *
+ * Bump the physical offset as there can be no other mapping for
+ * the same physical block range.
+ */
+ dkeys[0] = head->fmh_keys[0];
+ dkeys[0].fmr_physical += dkeys[0].fmr_length;
+ dkeys[0].fmr_owner = 0;
+ dkeys[0].fmr_length = 0;
+ memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap));
+
+ if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical +
+ head->fmh_keys[0].fmr_length;
+ info.gfi_formatter = formatter;
+ info.gfi_format_arg = arg;
+ info.gfi_head = head;
+
+ /* For each device we support... */
+ for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) {
+ /* Is this device within the range the user asked for? */
+ if (!handlers[i].gfd_fn)
+ continue;
+ if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev)
+ continue;
+ if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev)
+ break;
+
+ /*
+ * If this device number matches the high key, we have
+ * to pass the high key to the handler to limit the
+ * query results. If the device number exceeds the
+ * low key, zero out the low key so that we get
+ * everything from the beginning.
+ */
+ if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device)
+ dkeys[1] = head->fmh_keys[1];
+ if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device)
+ memset(&dkeys[0], 0, sizeof(struct ext4_fsmap));
+
+ info.gfi_dev = handlers[i].gfd_dev;
+ info.gfi_last = false;
+ info.gfi_agno = -1;
+ error = handlers[i].gfd_fn(sb, dkeys, &info);
+ if (error)
+ break;
+ info.gfi_next_fsblk = 0; /* the next device starts again from block 0 */
+ }
+
+ head->fmh_oflags = FMH_OF_DEV_T;
+ return error;
+}
diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h
new file mode 100644
index 000000000000..9a2cd367cc66
--- /dev/null
+++ b/fs/ext4/fsmap.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __EXT4_FSMAP_H__
+#define __EXT4_FSMAP_H__
+
+struct fsmap;
+
+/*
+ * internal fsmap representation
+ *
+ * ext4_fsmap_to_internal() and ext4_fsmap_from_internal() shift
+ * fmr_physical and fmr_length by the filesystem's block size bits when
+ * converting from and to the userspace struct fsmap.
+ */
+struct ext4_fsmap {
+ struct list_head fmr_list; /* linkage for lists of records */
+ dev_t fmr_device; /* device id */
+ uint32_t fmr_flags; /* mapping flags */
+ uint64_t fmr_physical; /* device offset of segment, blocks */
+ uint64_t fmr_owner; /* owner id */
+ uint64_t fmr_length; /* length of segment, blocks */
+};
+
+struct ext4_fsmap_head { /* in-core form of a getfsmap request header */
+ uint32_t fmh_iflags; /* control flags */
+ uint32_t fmh_oflags; /* output flags */
+ unsigned int fmh_count; /* # of entries in array incl. input; 0 means only count mappings */
+ unsigned int fmh_entries; /* # of entries filled in (output). */
+
+ struct ext4_fsmap fmh_keys[2]; /* low and high keys */
+};
+
+void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest,
+ struct ext4_fsmap *src);
+void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest,
+ struct fsmap *src);
+
+/* fsmap to userspace formatter - copy to user & advance pointer */
+typedef int (*ext4_fsmap_format_t)(struct ext4_fsmap *, void *);
+
+int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
+ ext4_fsmap_format_t formatter, void *arg);
+
+#define EXT4_QUERY_RANGE_ABORT 1
+#define EXT4_QUERY_RANGE_CONTINUE 0
+
+/* fmr_owner special values for FS_IOC_GETFSMAP; some share w/ XFS */
+#define EXT4_FMR_OWN_FREE FMR_OWN_FREE /* free space */
+#define EXT4_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */
+#define EXT4_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */
+#define EXT4_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */
+#define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */
+#define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */
+#define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */
+#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* block bitmap */
+#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* inode bitmap */
+
+#endif /* __EXT4_FSMAP_H__ */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 17bc043308f3..98ac2f1f23b3 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1098,6 +1098,17 @@ got:
if (err)
goto fail_drop;
+ /*
+ * Since the encryption xattr will always be unique, create it first so
+ * that it's less likely to end up in an external xattr block and
+ * prevent its deduplication.
+ */
+ if (encrypt) {
+ err = fscrypt_inherit_context(dir, inode, handle, true);
+ if (err)
+ goto fail_free_drop;
+ }
+
err = ext4_init_acl(handle, inode, dir);
if (err)
goto fail_free_drop;
@@ -1119,12 +1130,6 @@ got:
ei->i_datasync_tid = handle->h_transaction->t_tid;
}
- if (encrypt) {
- err = fscrypt_inherit_context(dir, inode, handle, true);
- if (err)
- goto fail_free_drop;
- }
-
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_std_error(sb, err);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 375fb1c05d49..d5dea4c293ef 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1034,7 +1034,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
err = ext4_journal_get_write_access(handle, iloc->bh);
if (err)
return err;
- ext4_insert_dentry(dir, inode, de, inline_size, fname);
+ ext4_insert_dentry(inode, de, inline_size, fname);
ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 10b574ab354b..5834c4d76be8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1643,6 +1643,7 @@ struct mpage_da_data {
*/
struct ext4_map_blocks map;
struct ext4_io_submit io_submit; /* IO submission data */
+ unsigned int do_map:1;
};
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
@@ -2179,6 +2180,9 @@ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
/* First block in the extent? */
if (map->m_len == 0) {
+ /* We cannot map unless handle is started... */
+ if (!mpd->do_map)
+ return false;
map->m_lblk = lblk;
map->m_len = 1;
map->m_flags = bh->b_state & BH_FLAGS;
@@ -2231,6 +2235,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
/* Found extent to map? */
if (mpd->map.m_len)
return 0;
+ /* Buffer needs mapping and handle is not started? */
+ if (!mpd->do_map)
+ return 0;
/* Everything mapped so far and we hit EOF */
break;
}
@@ -2747,6 +2754,29 @@ retry:
tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
done = false;
blk_start_plug(&plug);
+
+ /*
+ * First writeback pages that don't need mapping - we can avoid
+ * starting a transaction unnecessarily and also avoid being blocked
+ * in the block layer on device congestion while having transaction
+ * started.
+ */
+ mpd.do_map = 0;
+ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd.io_submit.io_end) {
+ ret = -ENOMEM;
+ goto unplug;
+ }
+ ret = mpage_prepare_extent_to_map(&mpd);
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+ ext4_put_io_end_defer(mpd.io_submit.io_end);
+ mpd.io_submit.io_end = NULL;
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, false);
+ if (ret < 0)
+ goto unplug;
+
while (!done && mpd.first_page <= mpd.last_page) {
/* For each extent of pages we use new io_end */
mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
@@ -2775,8 +2805,10 @@ retry:
wbc->nr_to_write, inode->i_ino, ret);
/* Release allocated io_end */
ext4_put_io_end(mpd.io_submit.io_end);
+ mpd.io_submit.io_end = NULL;
break;
}
+ mpd.do_map = 1;
trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
ret = mpage_prepare_extent_to_map(&mpd);
@@ -2807,6 +2839,7 @@ retry:
if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
ext4_journal_stop(handle);
handle = NULL;
+ mpd.do_map = 0;
}
/* Submit prepared bio */
ext4_io_submit(&mpd.io_submit);
@@ -2824,6 +2857,7 @@ retry:
ext4_journal_stop(handle);
} else
ext4_put_io_end(mpd.io_submit.io_end);
+ mpd.io_submit.io_end = NULL;
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2839,6 +2873,7 @@ retry:
if (ret)
break;
}
+unplug:
blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
@@ -3305,6 +3340,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
+ struct block_device *bdev;
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
unsigned long last_block = (offset + length - 1) >> blkbits;
@@ -3373,7 +3409,12 @@ retry:
}
iomap->flags = 0;
- iomap->bdev = inode->i_sb->s_bdev;
+ bdev = inode->i_sb->s_bdev;
+ iomap->bdev = bdev;
+ if (blk_queue_dax(bdev->bd_queue))
+ iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ else
+ iomap->dax_dev = NULL;
iomap->offset = first_block << blkbits;
if (ret == 0) {
@@ -3406,6 +3447,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
int blkbits = inode->i_blkbits;
bool truncate = false;
+ put_dax(iomap->dax_dev);
if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
return 0;
@@ -5848,6 +5890,11 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
file_update_time(vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
+
+ ret = ext4_convert_inline_data(inode);
+ if (ret)
+ goto out_ret;
+
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 184e74eb3004..0c21e22acd74 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -19,6 +19,9 @@
#include <linux/delay.h>
#include "ext4_jbd2.h"
#include "ext4.h"
+#include <linux/fsmap.h>
+#include "fsmap.h"
+#include <trace/events/ext4.h>
/**
* Swap memory between @a and @b for @len bytes.
@@ -443,7 +446,7 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
return iflags;
}
-int ext4_shutdown(struct super_block *sb, unsigned long arg)
+static int ext4_shutdown(struct super_block *sb, unsigned long arg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
__u32 flags;
@@ -489,6 +492,90 @@ int ext4_shutdown(struct super_block *sb, unsigned long arg)
return 0;
}
+struct getfsmap_info { /* state shared between ext4_ioc_getfsmap() and its formatter */
+ struct super_block *gi_sb; /* filesystem being queried */
+ struct fsmap_head __user *gi_data; /* user buffer whose fmh_recs array is filled */
+ unsigned int gi_idx; /* index of the next fmh_recs slot to write */
+ __u32 gi_last_flags; /* fmr_flags of the most recently formatted record */
+};
+
+/*
+ * Formatter callback for ext4_getfsmap(): convert one in-core record to
+ * the userspace layout and copy it into the next slot of the caller's
+ * fmh_recs array.  The record's flags are remembered in gi_last_flags.
+ *
+ * Returns 0 on success or -EFAULT if the copy to userspace fails.
+ */
+static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv)
+{
+	struct getfsmap_info *state = priv;
+	struct fsmap urec;
+	unsigned int slot;
+
+	trace_ext4_getfsmap_mapping(state->gi_sb, xfm);
+
+	state->gi_last_flags = xfm->fmr_flags;
+	ext4_fsmap_from_internal(state->gi_sb, &urec, xfm);
+
+	/* The slot index advances even if the copy below faults. */
+	slot = state->gi_idx++;
+	if (copy_to_user(&state->gi_data->fmh_recs[slot], &urec,
+			 sizeof(urec)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * FS_IOC_GETFSMAP handler.
+ *
+ * Copies the request header from @arg, validates it, runs ext4_getfsmap()
+ * with ext4_getfsmap_format() writing records straight into the user's
+ * fmh_recs array, marks the final record with FMR_OF_LAST when the query
+ * was not aborted, and copies the updated header back.
+ *
+ * Returns 0 on success, -EFAULT if any user copy fails, -EINVAL for
+ * non-zero reserved fields or unsupported key offsets, or the error
+ * returned by ext4_getfsmap().
+ */
+static int ext4_ioc_getfsmap(struct super_block *sb,
+			     struct fsmap_head __user *arg)
+{
+	struct getfsmap_info fmt = {0};
+	struct ext4_fsmap_head khead = {0};
+	struct fsmap_head uhead;
+	int ret;
+
+	if (copy_from_user(&uhead, arg, sizeof(uhead)))
+		return -EFAULT;
+
+	/* Every reserved field must be zero. */
+	if (memchr_inv(uhead.fmh_reserved, 0, sizeof(uhead.fmh_reserved)))
+		return -EINVAL;
+	if (memchr_inv(uhead.fmh_keys[0].fmr_reserved, 0,
+		       sizeof(uhead.fmh_keys[0].fmr_reserved)))
+		return -EINVAL;
+	if (memchr_inv(uhead.fmh_keys[1].fmr_reserved, 0,
+		       sizeof(uhead.fmh_keys[1].fmr_reserved)))
+		return -EINVAL;
+
+	/*
+	 * ext4 doesn't report file extents at all, so the only valid
+	 * file offsets are the magic ones (all zeroes or all ones).
+	 */
+	if (uhead.fmh_keys[0].fmr_offset)
+		return -EINVAL;
+	if (uhead.fmh_keys[1].fmr_offset != 0 &&
+	    uhead.fmh_keys[1].fmr_offset != -1ULL)
+		return -EINVAL;
+
+	khead.fmh_iflags = uhead.fmh_iflags;
+	khead.fmh_count = uhead.fmh_count;
+	ext4_fsmap_to_internal(sb, &khead.fmh_keys[0], &uhead.fmh_keys[0]);
+	ext4_fsmap_to_internal(sb, &khead.fmh_keys[1], &uhead.fmh_keys[1]);
+
+	trace_ext4_getfsmap_low_key(sb, &khead.fmh_keys[0]);
+	trace_ext4_getfsmap_high_key(sb, &khead.fmh_keys[1]);
+
+	fmt.gi_sb = sb;
+	fmt.gi_data = arg;
+	ret = ext4_getfsmap(sb, &khead, ext4_getfsmap_format, &fmt);
+
+	/* An abort only means the buffer filled up; anything else is fatal. */
+	if (ret && ret != EXT4_QUERY_RANGE_ABORT)
+		return ret;
+
+	/* If we didn't abort, set the "last" flag in the last fmx */
+	if (!ret && fmt.gi_idx) {
+		fmt.gi_last_flags |= FMR_OF_LAST;
+		if (copy_to_user(&fmt.gi_data->fmh_recs[fmt.gi_idx - 1].fmr_flags,
+				 &fmt.gi_last_flags,
+				 sizeof(fmt.gi_last_flags)))
+			return -EFAULT;
+	}
+
+	/* copy back header */
+	uhead.fmh_entries = khead.fmh_entries;
+	uhead.fmh_oflags = khead.fmh_oflags;
+	if (copy_to_user(arg, &uhead, sizeof(uhead)))
+		return -EFAULT;
+
+	return 0;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -499,6 +586,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
switch (cmd) {
+ case FS_IOC_GETFSMAP:
+ return ext4_ioc_getfsmap(sb, (void __user *)arg);
case EXT4_IOC_GETFLAGS:
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
@@ -1007,6 +1096,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_GET_ENCRYPTION_PWSALT:
case EXT4_IOC_GET_ENCRYPTION_POLICY:
case EXT4_IOC_SHUTDOWN:
+ case FS_IOC_GETFSMAP:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 354dc1a894c2..5083bce20ac4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -357,7 +357,7 @@ static struct kmem_cache *ext4_free_data_cachep;
#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
-static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
"ext4_groupinfo_64k", "ext4_groupinfo_128k"
@@ -2393,7 +2393,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
return 0;
size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
- new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+ new_groupinfo = kvzalloc(size, GFP_KERNEL);
if (!new_groupinfo) {
ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
return -ENOMEM;
@@ -5277,3 +5277,52 @@ out:
range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
return ret;
}
+
+/*
+ * Iterate all the free extents in the group.
+ *
+ * Loads the buddy for @group and scans its bd_bitmap for runs of zero
+ * bits between @start and @end. @start is raised to the group's
+ * bb_first_free when that is larger, and @end is clamped to
+ * EXT4_CLUSTERS_PER_GROUP(sb) - 1. For every run found, @formatter is
+ * called with the run's first bit, its length and @priv.
+ *
+ * The group lock is held while the bitmap is searched and is dropped
+ * around each @formatter call.
+ *
+ * Returns the error from ext4_mb_load_buddy(), the first nonzero value
+ * returned by @formatter, or 0.
+ */
+int
+ext4_mballoc_query_range(
+ struct super_block *sb,
+ ext4_group_t group,
+ ext4_grpblk_t start,
+ ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn formatter,
+ void *priv)
+{
+ void *bitmap;
+ ext4_grpblk_t next;
+ struct ext4_buddy e4b;
+ int error;
+
+ error = ext4_mb_load_buddy(sb, group, &e4b);
+ if (error)
+ return error;
+ bitmap = e4b.bd_bitmap;
+
+ ext4_lock_group(sb, group);
+
+ start = (e4b.bd_info->bb_first_free > start) ?
+ e4b.bd_info->bb_first_free : start;
+ if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
+ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+
+ while (start <= end) {
+ start = mb_find_next_zero_bit(bitmap, end + 1, start);
+ if (start > end)
+ break; /* no zero bit left inside [start, end] */
+ next = mb_find_next_bit(bitmap, end + 1, start);
+
+ ext4_unlock_group(sb, group);
+ error = formatter(sb, group, start, next - start, priv);
+ if (error)
+ goto out_unload; /* lock was already dropped above, so skip the unlock */
+ ext4_lock_group(sb, group);
+
+ start = next + 1; /* resume the search just past where mb_find_next_bit() stopped */
+ }
+
+ ext4_unlock_group(sb, group);
+out_unload:
+ ext4_mb_unload_buddy(&e4b);
+
+ return error;
+}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 1aba469f8220..2bed62084a8c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -199,4 +199,21 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
return ext4_group_first_block_no(sb, fex->fe_group) +
(fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
}
+
+typedef int (*ext4_mballoc_query_range_fn)( /* callback type taken by ext4_mballoc_query_range() */
+ struct super_block *sb,
+ ext4_group_t agno,
+ ext4_grpblk_t start, /* the walker passes the first zero bit of a run here */
+ ext4_grpblk_t len, /* ... and the length of that run (next - start) here */
+ void *priv); /* a nonzero return stops the walk and is handed back to its caller */
+
+int
+ext4_mballoc_query_range( /* defined in fs/ext4/mballoc.c */
+ struct super_block *sb,
+ ext4_group_t agno, /* this parameter is named "group" in the definition */
+ ext4_grpblk_t start,
+ ext4_grpblk_t end, /* clamped by the definition to EXT4_CLUSTERS_PER_GROUP(sb) - 1 */
+ ext4_mballoc_query_range_fn formatter,
+ void *priv); /* passed through unchanged to @formatter */
+
#endif
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 07e5e1405771..b81f7d46f344 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1237,37 +1237,24 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
}
/*
- * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
+ * Test whether a directory entry matches the filename being searched for.
*
- * `len <= EXT4_NAME_LEN' is guaranteed by caller.
- * `de != NULL' is guaranteed by caller.
+ * Return: %true if the directory entry matches, otherwise %false.
*/
-static inline int ext4_match(struct ext4_filename *fname,
- struct ext4_dir_entry_2 *de)
+static inline bool ext4_match(const struct ext4_filename *fname,
+ const struct ext4_dir_entry_2 *de)
{
- const void *name = fname_name(fname);
- u32 len = fname_len(fname);
+ struct fscrypt_name f;
if (!de->inode)
- return 0;
+ return false;
+ f.usr_fname = fname->usr_fname;
+ f.disk_name = fname->disk_name;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (unlikely(!name)) {
- if (fname->usr_fname->name[0] == '_') {
- int ret;
- if (de->name_len < 16)
- return 0;
- ret = memcmp(de->name + de->name_len - 16,
- fname->crypto_buf.name + 8, 16);
- return (ret == 0) ? 1 : 0;
- }
- name = fname->crypto_buf.name;
- len = fname->crypto_buf.len;
- }
+ f.crypto_buf = fname->crypto_buf;
#endif
- if (de->name_len != len)
- return 0;
- return (memcmp(de->name, name, len) == 0) ? 1 : 0;
+ return fscrypt_match_name(&f, de->name, de->name_len);
}
/*
@@ -1281,48 +1268,31 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
struct ext4_dir_entry_2 * de;
char * dlimit;
int de_len;
- int res;
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
while ((char *) de < dlimit) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit) {
- res = ext4_match(fname, de);
- if (res < 0) {
- res = -1;
- goto return_result;
- }
- if (res > 0) {
- /* found a match - just to be sure, do
- * a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh,
- bh->b_data,
- bh->b_size, offset)) {
- res = -1;
- goto return_result;
- }
- *res_dir = de;
- res = 1;
- goto return_result;
- }
-
+ if ((char *) de + de->name_len <= dlimit &&
+ ext4_match(fname, de)) {
+ /* found a match - just to be sure, do
+ * a full check */
+ if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
+ bh->b_size, offset))
+ return -1;
+ *res_dir = de;
+ return 1;
}
/* prevent looping on a bad block */
de_len = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
- if (de_len <= 0) {
- res = -1;
- goto return_result;
- }
+ if (de_len <= 0)
+ return -1;
offset += de_len;
de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
}
-
- res = 0;
-return_result:
- return res;
+ return 0;
}
static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
@@ -1616,16 +1586,9 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
- int nokey = ext4_encrypted_inode(inode) &&
- !fscrypt_has_encryption_key(inode);
- if (nokey) {
- iput(inode);
- return ERR_PTR(-ENOKEY);
- }
ext4_warning(inode->i_sb,
"Inconsistent encryption contexts: %lu/%lu",
- (unsigned long) dir->i_ino,
- (unsigned long) inode->i_ino);
+ dir->i_ino, inode->i_ino);
iput(inode);
return ERR_PTR(-EPERM);
}
@@ -1833,24 +1796,15 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
int nlen, rlen;
unsigned int offset = 0;
char *top;
- int res;
de = (struct ext4_dir_entry_2 *)buf;
top = buf + buf_size - reclen;
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
- buf, buf_size, offset)) {
- res = -EFSCORRUPTED;
- goto return_result;
- }
- /* Provide crypto context and crypto buffer to ext4 match */
- res = ext4_match(fname, de);
- if (res < 0)
- goto return_result;
- if (res > 0) {
- res = -EEXIST;
- goto return_result;
- }
+ buf, buf_size, offset))
+ return -EFSCORRUPTED;
+ if (ext4_match(fname, de))
+ return -EEXIST;
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if ((de->inode ? rlen - nlen : rlen) >= reclen)
@@ -1858,22 +1812,17 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
offset += rlen;
}
-
if ((char *) de > top)
- res = -ENOSPC;
- else {
- *dest_de = de;
- res = 0;
- }
-return_result:
- return res;
+ return -ENOSPC;
+
+ *dest_de = de;
+ return 0;
}
-int ext4_insert_dentry(struct inode *dir,
- struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- struct ext4_filename *fname)
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname)
{
int nlen, rlen;
@@ -1892,7 +1841,6 @@ int ext4_insert_dentry(struct inode *dir,
ext4_set_de_type(inode->i_sb, de, inode->i_mode);
de->name_len = fname_len(fname);
memcpy(de->name, fname_name(fname), fname_len(fname));
- return 0;
}
/*
@@ -1928,11 +1876,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err;
}
- /* By now the buffer is marked for journaling. Due to crypto operations,
- * the following function call may fail */
- err = ext4_insert_dentry(dir, inode, de, blocksize, fname);
- if (err < 0)
- return err;
+ /* By now the buffer is marked for journaling */
+ ext4_insert_dentry(inode, de, blocksize, fname);
/*
* XXX shouldn't update any times until successful
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 208241b06662..1a82138ba739 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -297,8 +297,17 @@ static void ext4_end_bio(struct bio *bio)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
+ char b[BDEVNAME_SIZE];
- BUG_ON(!io_end);
+ if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
+ bdevname(bio->bi_bdev, b),
+ (long long) bio->bi_iter.bi_sector,
+ (unsigned) bio_sectors(bio),
+ bio->bi_error)) {
+ ext4_finish_bio(bio);
+ bio_put(bio);
+ return;
+ }
bio->bi_end_io = NULL;
if (bio->bi_error) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a9c72e39a4ee..c90edf09b0c3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -49,6 +49,7 @@
#include "xattr.h"
#include "acl.h"
#include "mballoc.h"
+#include "fsmap.h"
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
@@ -1230,7 +1231,7 @@ static const struct fscrypt_operations ext4_cryptops = {
#endif
#ifdef CONFIG_QUOTA
-static char *quotatypes[] = INITQFNAMES;
+static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
static int ext4_write_dquot(struct dquot *dquot);
@@ -1443,7 +1444,8 @@ static ext4_fsblk_t get_sb_block(void **data)
}
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
-static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+static const char deprecated_msg[] =
+ "Mount option \"%s\" will be removed by %s\n"
"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
#ifdef CONFIG_QUOTA
@@ -2153,7 +2155,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
return 0;
size = roundup_pow_of_two(size * sizeof(struct flex_groups));
- new_groups = ext4_kvzalloc(size, GFP_KERNEL);
+ new_groups = kvzalloc(size, GFP_KERNEL);
if (!new_groups) {
ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
size / (int) sizeof(struct flex_groups));
@@ -3887,7 +3889,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
}
- sbi->s_group_desc = ext4_kvmalloc(db_count *
+ sbi->s_group_desc = kvmalloc(db_count *
sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
@@ -3898,6 +3900,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
bgl_lock_init(sbi->s_blockgroup_lock);
+ /* Pre-read the descriptors into the buffer cache */
+ for (i = 0; i < db_count; i++) {
+ block = descriptor_loc(sb, logical_sb_block, i);
+ sb_breadahead(sb, block);
+ }
+
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
@@ -4650,7 +4658,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
if (sync) {
unlock_buffer(sbh);
error = __sync_dirty_buffer(sbh,
- test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
+ REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
if (error)
return error;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 42145be5c6b4..d74dc5f81a04 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -34,7 +34,7 @@ typedef enum {
ptr_ext4_super_block_offset,
} attr_ptr_t;
-static const char *proc_dirname = "fs/ext4";
+static const char proc_dirname[] = "fs/ext4";
static struct proc_dir_entry *ext4_proc_root;
struct ext4_attr {
@@ -375,7 +375,7 @@ static const struct file_operations ext4_seq_##name##_fops = { \
PROC_FILE_SHOW_DEFN(es_shrinker_info);
PROC_FILE_SHOW_DEFN(options);
-static struct ext4_proc_files {
+static const struct ext4_proc_files {
const char *name;
const struct file_operations *fops;
} proc_files[] = {
@@ -388,7 +388,7 @@ static struct ext4_proc_files {
int ext4_register_sysfs(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_proc_files *p;
+ const struct ext4_proc_files *p;
int err;
sbi->s_kobj.kset = &ext4_kset;
@@ -412,7 +412,7 @@ int ext4_register_sysfs(struct super_block *sb)
void ext4_unregister_sysfs(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_proc_files *p;
+ const struct ext4_proc_files *p;
if (sbi->s_proc) {
for (p = proc_files; p->name; p++)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 996e7900d4c8..8fb7ce14e6eb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -78,10 +78,8 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *,
struct mb_cache_entry **);
static void ext4_xattr_rehash(struct ext4_xattr_header *,
struct ext4_xattr_entry *);
-static int ext4_xattr_list(struct dentry *dentry, char *buffer,
- size_t buffer_size);
-static const struct xattr_handler *ext4_xattr_handler_map[] = {
+static const struct xattr_handler * const ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
@@ -163,20 +161,9 @@ ext4_xattr_handler(int name_index)
return handler;
}
-/*
- * Inode operation listxattr()
- *
- * d_inode(dentry)->i_mutex: don't care
- */
-ssize_t
-ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
- return ext4_xattr_list(dentry, buffer, size);
-}
-
static int
-ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
- void *value_start)
+ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
+ void *value_start)
{
struct ext4_xattr_entry *e = entry;
@@ -230,8 +217,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
return -EFSCORRUPTED;
if (!ext4_xattr_block_csum_verify(inode, bh))
return -EFSBADCRC;
- error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
- bh->b_data);
+ error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size,
+ bh->b_data);
if (!error)
set_buffer_verified(bh);
return error;
@@ -246,7 +233,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
if (end - (void *)header < sizeof(*header) + sizeof(u32) ||
(header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
goto errout;
- error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
+ error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header));
errout:
if (error)
__ext4_error_inode(inode, function, line, 0,
@@ -257,20 +244,9 @@ errout:
#define xattr_check_inode(inode, header, end) \
__xattr_check_inode((inode), (header), (end), __func__, __LINE__)
-static inline int
-ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
-{
- size_t value_size = le32_to_cpu(entry->e_value_size);
-
- if (entry->e_value_block != 0 || value_size > size ||
- le16_to_cpu(entry->e_value_offs) + value_size > size)
- return -EFSCORRUPTED;
- return 0;
-}
-
static int
ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
- const char *name, size_t size, int sorted)
+ const char *name, int sorted)
{
struct ext4_xattr_entry *entry;
size_t name_len;
@@ -290,8 +266,6 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
break;
}
*pentry = entry;
- if (!cmp && ext4_xattr_check_entry(entry, size))
- return -EFSCORRUPTED;
return cmp ? -ENODATA : 0;
}
@@ -319,7 +293,6 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
if (ext4_xattr_check_block(inode, bh)) {
-bad_block:
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
error = -EFSCORRUPTED;
@@ -327,9 +300,7 @@ bad_block:
}
ext4_xattr_cache_insert(ext4_mb_cache, bh);
entry = BFIRST(bh);
- error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
- if (error == -EFSCORRUPTED)
- goto bad_block;
+ error = ext4_xattr_find_entry(&entry, name_index, name, 1);
if (error)
goto cleanup;
size = le32_to_cpu(entry->e_value_size);
@@ -366,13 +337,12 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- entry = IFIRST(header);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
error = xattr_check_inode(inode, header, end);
if (error)
goto cleanup;
- error = ext4_xattr_find_entry(&entry, name_index, name,
- end - (void *)entry, 0);
+ entry = IFIRST(header);
+ error = ext4_xattr_find_entry(&entry, name_index, name, 0);
if (error)
goto cleanup;
size = le32_to_cpu(entry->e_value_size);
@@ -519,7 +489,9 @@ cleanup:
}
/*
- * ext4_xattr_list()
+ * Inode operation listxattr()
+ *
+ * d_inode(dentry)->i_rwsem: don't care
*
* Copy a list of attribute names into the buffer
* provided, or compute the buffer size required.
@@ -528,8 +500,8 @@ cleanup:
* Returns a negative error number on failure, or the number of bytes
* used / required on success.
*/
-static int
-ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+ssize_t
+ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
int ret, ret2;
@@ -804,7 +776,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
bs->s.end = bs->bh->b_data + bs->bh->b_size;
bs->s.here = bs->s.first;
error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
- i->name, bs->bh->b_size, 1);
+ i->name, 1);
if (error && error != -ENODATA)
goto cleanup;
bs->s.not_found = error;
@@ -1076,8 +1048,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
return error;
/* Find the named attribute. */
error = ext4_xattr_find_entry(&is->s.here, i->name_index,
- i->name, is->s.end -
- (void *)is->s.base, 0);
+ i->name, 0);
if (error && error != -ENODATA)
return error;
is->s.not_found = error;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0339daf4ca02..ea9c317b5916 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -275,10 +275,11 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
goto skip_write;
- trace_f2fs_writepages(mapping->host, wbc, META);
+ /* if locking failed, cp will flush dirty pages instead */
+ if (!mutex_trylock(&sbi->cp_mutex))
+ goto skip_write;
- /* if mounting is failed, skip writing node pages */
- mutex_lock(&sbi->cp_mutex);
+ trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
written = sync_meta_pages(sbi, META, wbc->nr_to_write);
mutex_unlock(&sbi->cp_mutex);
@@ -567,7 +568,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
if (ni.blk_addr != NULL_ADDR) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_msg(sbi->sb, KERN_WARNING,
- "%s: orphan failed (ino=%x), run fsck to fix.",
+ "%s: orphan failed (ino=%x) by kernel, retry mount.",
__func__, ino);
return -EIO;
}
@@ -677,7 +678,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
*cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
- if (crc_offset >= blk_size) {
+ if (crc_offset > (blk_size - sizeof(__le32))) {
f2fs_msg(sbi->sb, KERN_WARNING,
"invalid crc_offset: %zu", crc_offset);
return -EINVAL;
@@ -816,7 +817,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type)
return;
set_inode_flag(inode, flag);
- list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]);
+ if (!f2fs_is_volatile_file(inode))
+ list_add_tail(&F2FS_I(inode)->dirty_list,
+ &sbi->inode_list[type]);
stat_inc_dirty_inode(sbi, type);
}
@@ -941,6 +944,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
return 0;
}
+static void __prepare_cp_block(struct f2fs_sb_info *sbi)
+{ /*
+ * Store the current valid block, node and inode counts and the next
+ * free nid into the checkpoint returned by F2FS_CKPT(sbi), each
+ * converted to little-endian.
+ */
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ nid_t last_nid = nm_i->next_scan_nid;
+
+ next_free_nid(sbi, &last_nid); /* last_nid is passed by pointer - presumably updated in place; confirm in node.c */
+ ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+ ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
+ ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
+ ckpt->next_free_nid = cpu_to_le32(last_nid);
+}
+
/*
* Freeze all the FS-operations for checkpoint.
*/
@@ -964,21 +980,26 @@ retry_flush_dents:
err = sync_dirty_inodes(sbi, DIR_INODE);
if (err)
goto out;
+ cond_resched();
goto retry_flush_dents;
}
+ /*
+ * POR: we should ensure that there are no dirty node pages
+ * until finishing nat/sit flush. inode->i_blocks can be updated.
+ */
+ down_write(&sbi->node_change);
+
if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
+ up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
err = f2fs_sync_inode_meta(sbi);
if (err)
goto out;
+ cond_resched();
goto retry_flush_dents;
}
- /*
- * POR: we should ensure that there are no dirty node pages
- * until finishing nat/sit flush.
- */
retry_flush_nodes:
down_write(&sbi->node_write);
@@ -986,11 +1007,20 @@ retry_flush_nodes:
up_write(&sbi->node_write);
err = sync_node_pages(sbi, &wbc);
if (err) {
+ up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
goto out;
}
+ cond_resched();
goto retry_flush_nodes;
}
+
+ /*
+ * sbi->node_change is used only for AIO write_begin path which produces
+ * dirty node blocks and some checkpoint values by block allocation.
+ */
+ __prepare_cp_block(sbi);
+ up_write(&sbi->node_change);
out:
blk_finish_plug(&plug);
return err;
@@ -1024,16 +1054,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
spin_lock(&sbi->cp_lock);
- if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count >
+ if ((cpc->reason & CP_UMOUNT) &&
+ le32_to_cpu(ckpt->cp_pack_total_block_count) >
sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
disable_nat_bits(sbi, false);
- if (cpc->reason == CP_UMOUNT)
+ if (cpc->reason & CP_TRIMMED)
+ __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
+
+ if (cpc->reason & CP_UMOUNT)
__set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
else
__clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
- if (cpc->reason == CP_FASTBOOT)
+ if (cpc->reason & CP_FASTBOOT)
__set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
else
__clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
@@ -1057,7 +1091,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
- nid_t last_nid = nm_i->next_scan_nid;
block_t start_blk;
unsigned int data_sum_blocks, orphan_blocks;
__u32 crc32 = 0;
@@ -1074,14 +1107,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
return -EIO;
}
- next_free_nid(sbi, &last_nid);
-
/*
* modify checkpoint
* version number is already updated
*/
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
- ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
ckpt->cur_node_segno[i] =
@@ -1100,10 +1130,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
}
- ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
- ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
- ckpt->next_free_nid = cpu_to_le32(last_nid);
-
/* 2 cp + n data seg summary + orphan inode blocks */
data_sum_blocks = npages_for_summary_flush(sbi, false);
spin_lock(&sbi->cp_lock);
@@ -1143,7 +1169,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* write nat bits */
if (enabled_nat_bits(sbi, cpc)) {
__u64 cp_ver = cur_cp_version(ckpt);
- unsigned int i;
block_t blk;
cp_ver |= ((__u64)crc32 << 32);
@@ -1250,8 +1275,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
mutex_lock(&sbi->cp_mutex);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
- (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
- (cpc->reason == CP_DISCARD && !sbi->discard_blks)))
+ ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
+ ((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
goto out;
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
@@ -1273,7 +1298,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_flush_merged_bios(sbi);
/* this is the case of multiple fstrims without any changes */
- if (cpc->reason == CP_DISCARD) {
+ if (cpc->reason & CP_DISCARD) {
if (!exist_trim_candidates(sbi, cpc)) {
unblock_operations(sbi);
goto out;
@@ -1311,7 +1336,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
- if (cpc->reason == CP_RECOVERY)
+ if (cpc->reason & CP_RECOVERY)
f2fs_msg(sbi->sb, KERN_NOTICE,
"checkpoint: version = %llx", ckpt_ver);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 1602b4bccae6..7c0f6bdf817d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -309,7 +309,7 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
io->fio.op = REQ_OP_WRITE;
- io->fio.op_flags = REQ_META | REQ_PRIO;
+ io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC;
if (!test_opt(sbi, NOBARRIER))
io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
}
@@ -341,7 +341,7 @@ void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi)
/*
* Fill the locked page with data located in the block address.
- * Return unlocked page.
+ * A caller needs to unlock the page on failure.
*/
int f2fs_submit_page_bio(struct f2fs_io_info *fio)
{
@@ -362,6 +362,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
bio_set_op_attrs(bio, fio->op, fio->op_flags);
__submit_bio(fio->sbi, bio, fio->type);
+
+ if (!is_read_io(fio->op))
+ inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page));
return 0;
}
@@ -787,6 +790,21 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
return err;
}
+static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
+{ /*
+ * Acquire (@lock true) or release (@lock false) the lock that matches
+ * the mapping mode in @flag: sbi->node_change taken for read when
+ * @flag is F2FS_GET_BLOCK_PRE_AIO, f2fs_lock_op()/f2fs_unlock_op()
+ * for every other value.
+ */
+ if (flag == F2FS_GET_BLOCK_PRE_AIO) {
+ if (lock)
+ down_read(&sbi->node_change);
+ else
+ up_read(&sbi->node_change);
+ } else {
+ if (lock)
+ f2fs_lock_op(sbi);
+ else
+ f2fs_unlock_op(sbi);
+ }
+}
+
/*
* f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
* f2fs_map_blocks structure.
@@ -829,7 +847,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
next_dnode:
if (create)
- f2fs_lock_op(sbi);
+ __do_map_lock(sbi, flag, true);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -939,7 +957,7 @@ skip:
f2fs_put_dnode(&dn);
if (create) {
- f2fs_unlock_op(sbi);
+ __do_map_lock(sbi, flag, false);
f2fs_balance_fs(sbi, dn.node_changed);
}
goto next_dnode;
@@ -948,7 +966,7 @@ sync_out:
f2fs_put_dnode(&dn);
unlock_out:
if (create) {
- f2fs_unlock_op(sbi);
+ __do_map_lock(sbi, flag, false);
f2fs_balance_fs(sbi, dn.node_changed);
}
out:
@@ -1151,9 +1169,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
- prefetchw(&page->flags);
if (pages) {
page = list_last_entry(pages, struct page, lru);
+
+ prefetchw(&page->flags);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping,
page->index,
@@ -1283,17 +1302,83 @@ static int f2fs_read_data_pages(struct file *file,
return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages);
}
+static int encrypt_one_page(struct f2fs_io_info *fio)
+{ /*
+ * Fill fio->encrypted_page from fio->page via fscrypt_encrypt_page()
+ * when the page belongs to an encrypted regular file; for any other
+ * inode nothing is done.
+ *
+ * Returns 0 on success or when no encryption is needed, otherwise the
+ * error encoded in fscrypt_encrypt_page()'s return value.
+ */
+ struct inode *inode = fio->page->mapping->host;
+ gfp_t gfp_flags = GFP_NOFS;
+
+ if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode))
+ return 0;
+
+ /* wait for GCed encrypted page writeback */
+ f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr);
+
+retry_encrypt:
+ fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
+ PAGE_SIZE, 0, fio->page->index, gfp_flags);
+ if (!IS_ERR(fio->encrypted_page))
+ return 0;
+
+ /* flush pending IOs and wait for a while in the ENOMEM case;
+ * the retry then runs with __GFP_NOFAIL added to the gfp flags */
+ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
+ f2fs_flush_merged_bios(fio->sbi);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ gfp_flags |= __GFP_NOFAIL;
+ goto retry_encrypt;
+ }
+ return PTR_ERR(fio->encrypted_page); /* fio->encrypted_page still holds the error pointer here */
+}
+
+static inline bool need_inplace_update(struct f2fs_io_info *fio)
+{ /*
+ * Decide whether fio->page may be written in place. Directories,
+ * atomic files, cold data pages and atomic-written pages always get
+ * false; everything else is decided by need_inplace_update_policy().
+ */
+ struct inode *inode = fio->page->mapping->host;
+
+ if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
+ return false;
+ if (is_cold_data(fio->page))
+ return false;
+ if (IS_ATOMIC_WRITTEN_PAGE(fio->page))
+ return false;
+
+ return need_inplace_update_policy(inode, fio);
+}
+
+static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio)
+{ /* true unless fio->old_blkaddr is one of the NEW_ADDR / NULL_ADDR markers */
+ if (fio->old_blkaddr == NEW_ADDR)
+ return false;
+ if (fio->old_blkaddr == NULL_ADDR)
+ return false;
+ return true;
+}
+
int do_write_data_page(struct f2fs_io_info *fio)
{
struct page *page = fio->page;
struct inode *inode = page->mapping->host;
struct dnode_of_data dn;
+ struct extent_info ei = {0,0,0};
+ bool ipu_force = false;
int err = 0;
set_new_dnode(&dn, inode, NULL, NULL, 0);
+ if (need_inplace_update(fio) &&
+ f2fs_lookup_extent_cache(inode, page->index, &ei)) {
+ fio->old_blkaddr = ei.blk + page->index - ei.fofs;
+
+ if (valid_ipu_blkaddr(fio)) {
+ ipu_force = true;
+ fio->need_lock = false;
+ goto got_it;
+ }
+ }
+
+ if (fio->need_lock)
+ f2fs_lock_op(fio->sbi);
+
err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
if (err)
- return err;
+ goto out;
fio->old_blkaddr = dn.data_blkaddr;
@@ -1302,31 +1387,10 @@ int do_write_data_page(struct f2fs_io_info *fio)
ClearPageUptodate(page);
goto out_writepage;
}
-
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
- gfp_t gfp_flags = GFP_NOFS;
-
- /* wait for GCed encrypted page writeback */
- f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
- fio->old_blkaddr);
-retry_encrypt:
- fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
- PAGE_SIZE, 0,
- fio->page->index,
- gfp_flags);
- if (IS_ERR(fio->encrypted_page)) {
- err = PTR_ERR(fio->encrypted_page);
- if (err == -ENOMEM) {
- /* flush pending ios and wait for a while */
- f2fs_flush_merged_bios(F2FS_I_SB(inode));
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- gfp_flags |= __GFP_NOFAIL;
- err = 0;
- goto retry_encrypt;
- }
- goto out_writepage;
- }
- }
+got_it:
+ err = encrypt_one_page(fio);
+ if (err)
+ goto out_writepage;
set_page_writeback(page);
@@ -1334,22 +1398,27 @@ retry_encrypt:
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (unlikely(fio->old_blkaddr != NEW_ADDR &&
- !is_cold_data(page) &&
- !IS_ATOMIC_WRITTEN_PAGE(page) &&
- need_inplace_update(inode))) {
- rewrite_data_page(fio);
+ if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) {
+ f2fs_put_dnode(&dn);
+ if (fio->need_lock)
+ f2fs_unlock_op(fio->sbi);
+ err = rewrite_data_page(fio);
+ trace_f2fs_do_write_data_page(fio->page, IPU);
set_inode_flag(inode, FI_UPDATE_WRITE);
- trace_f2fs_do_write_data_page(page, IPU);
- } else {
- write_data_page(&dn, fio);
- trace_f2fs_do_write_data_page(page, OPU);
- set_inode_flag(inode, FI_APPEND_WRITE);
- if (page->index == 0)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
+ return err;
}
+
+ /* LFS mode write path */
+ write_data_page(&dn, fio);
+ trace_f2fs_do_write_data_page(page, OPU);
+ set_inode_flag(inode, FI_APPEND_WRITE);
+ if (page->index == 0)
+ set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
out_writepage:
f2fs_put_dnode(&dn);
+out:
+ if (fio->need_lock)
+ f2fs_unlock_op(fio->sbi);
return err;
}
@@ -1370,9 +1439,11 @@ static int __write_data_page(struct page *page, bool *submitted,
.type = DATA,
.op = REQ_OP_WRITE,
.op_flags = wbc_to_write_flags(wbc),
+ .old_blkaddr = NULL_ADDR,
.page = page,
.encrypted_page = NULL,
.submitted = false,
+ .need_lock = true,
};
trace_f2fs_writepage(page, DATA);
@@ -1408,6 +1479,7 @@ write:
/* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
+ fio.need_lock = false;
err = do_write_data_page(&fio);
goto done;
}
@@ -1416,6 +1488,8 @@ write:
need_balance_fs = true;
else if (has_not_enough_free_secs(sbi, 0, 0))
goto redirty_out;
+ else
+ set_inode_flag(inode, FI_HOT_DATA);
err = -EAGAIN;
if (f2fs_has_inline_data(inode)) {
@@ -1423,12 +1497,12 @@ write:
if (!err)
goto out;
}
- f2fs_lock_op(sbi);
+
if (err == -EAGAIN)
err = do_write_data_page(&fio);
if (F2FS_I(inode)->last_disk_size < psize)
F2FS_I(inode)->last_disk_size = psize;
- f2fs_unlock_op(sbi);
+
done:
if (err && err != -ENOENT)
goto redirty_out;
@@ -1441,12 +1515,14 @@ out:
if (wbc->for_reclaim) {
f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index,
DATA, WRITE);
+ clear_inode_flag(inode, FI_HOT_DATA);
remove_dirty_inode(inode);
submitted = NULL;
}
unlock_page(page);
- f2fs_balance_fs(sbi, need_balance_fs);
+ if (!S_ISDIR(inode->i_mode))
+ f2fs_balance_fs(sbi, need_balance_fs);
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_submit_merged_bio(sbi, DATA, WRITE);
@@ -1495,6 +1571,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
pagevec_init(&pvec, 0);
+ if (get_dirty_pages(mapping->host) <=
+ SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
+ set_inode_flag(mapping->host, FI_HOT_DATA);
+ else
+ clear_inode_flag(mapping->host, FI_HOT_DATA);
+
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
@@ -1580,8 +1662,10 @@ continue_unlock:
last_idx = page->index;
}
- if (--wbc->nr_to_write <= 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
+ /* give priority to WB_SYNC_ALL threads over WB_SYNC_NONE writeback */
+ if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) ||
+ --wbc->nr_to_write <= 0) &&
+ wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
@@ -1637,9 +1721,18 @@ static int f2fs_write_data_pages(struct address_space *mapping,
trace_f2fs_writepages(mapping->host, wbc, DATA);
+ /* to avoid splitting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ atomic_inc(&sbi->wb_sync_req);
+ else if (atomic_read(&sbi->wb_sync_req))
+ goto skip_write;
+
blk_start_plug(&plug);
ret = f2fs_write_cache_pages(mapping, wbc);
blk_finish_plug(&plug);
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ atomic_dec(&sbi->wb_sync_req);
/*
* if some pages were truncated, we cannot guarantee its mapping->host
* to detect pending bios.
@@ -1687,7 +1780,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
if (f2fs_has_inline_data(inode) ||
(pos & PAGE_MASK) >= i_size_read(inode)) {
- f2fs_lock_op(sbi);
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
locked = true;
}
restart:
@@ -1723,7 +1816,8 @@ restart:
err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err || dn.data_blkaddr == NULL_ADDR) {
f2fs_put_dnode(&dn);
- f2fs_lock_op(sbi);
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
+ true);
locked = true;
goto restart;
}
@@ -1737,7 +1831,7 @@ out:
f2fs_put_dnode(&dn);
unlock_out:
if (locked)
- f2fs_unlock_op(sbi);
+ __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
return err;
}
@@ -1951,7 +2045,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
/* This is atomic written page, keep Private */
if (IS_ATOMIC_WRITTEN_PAGE(page))
- return;
+ return drop_inmem_page(inode, page);
set_page_private(page, 0);
ClearPagePrivate(page);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index ee2d0a485fc3..87f449845f5f 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -51,15 +51,26 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
si->aw_cnt = atomic_read(&sbi->aw_cnt);
+ si->vw_cnt = atomic_read(&sbi->vw_cnt);
si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt);
+ si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt);
si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA);
- if (SM_I(sbi) && SM_I(sbi)->fcc_info)
- si->nr_flush =
- atomic_read(&SM_I(sbi)->fcc_info->submit_flush);
- if (SM_I(sbi) && SM_I(sbi)->dcc_info)
- si->nr_discard =
- atomic_read(&SM_I(sbi)->dcc_info->submit_discard);
+ if (SM_I(sbi) && SM_I(sbi)->fcc_info) {
+ si->nr_flushed =
+ atomic_read(&SM_I(sbi)->fcc_info->issued_flush);
+ si->nr_flushing =
+ atomic_read(&SM_I(sbi)->fcc_info->issing_flush);
+ }
+ if (SM_I(sbi) && SM_I(sbi)->dcc_info) {
+ si->nr_discarded =
+ atomic_read(&SM_I(sbi)->dcc_info->issued_discard);
+ si->nr_discarding =
+ atomic_read(&SM_I(sbi)->dcc_info->issing_discard);
+ si->nr_discard_cmd =
+ atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
+ si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
+ }
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@@ -86,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->sits = MAIN_SEGS(sbi);
si->dirty_sits = SIT_I(sbi)->dirty_sentries;
si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST];
+ si->avail_nids = NM_I(sbi)->available_nids;
si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST];
si->bg_gc = sbi->bg_gc;
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
@@ -99,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
si->curseg[i] = curseg->segno;
- si->cursec[i] = curseg->segno / sbi->segs_per_sec;
- si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
+ si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno);
+ si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]);
}
for (i = 0; i < 2; i++) {
@@ -124,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
bimodal = 0;
total_vblocks = 0;
- blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
+ blks_per_sec = BLKS_PER_SEC(sbi);
hblks_per_sec = blks_per_sec / 2;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
- vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ vblocks = get_valid_blocks(sbi, segno, true);
dist = abs(vblocks - hblks_per_sec);
bimodal += dist * dist;
@@ -156,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
if (si->base_mem)
goto get_cache;
- si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
+ /* build stat */
+ si->base_mem = sizeof(struct f2fs_stat_info);
+
+ /* build superblock */
+ si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
si->base_mem += 2 * sizeof(struct f2fs_inode_info);
si->base_mem += sizeof(*sbi->ckpt);
si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE;
@@ -208,8 +224,11 @@ get_cache:
/* build merge flush thread */
if (SM_I(sbi)->fcc_info)
si->cache_mem += sizeof(struct flush_cmd_control);
- if (SM_I(sbi)->dcc_info)
+ if (SM_I(sbi)->dcc_info) {
si->cache_mem += sizeof(struct discard_cmd_control);
+ si->cache_mem += sizeof(struct discard_cmd) *
+ atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
+ }
/* free nids */
si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] +
@@ -330,11 +349,16 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
- seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n",
+ seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), "
+ "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
si->nr_wb_cp_data, si->nr_wb_data,
- si->nr_flush, si->nr_discard);
- seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n",
- si->inmem_pages, si->aw_cnt, si->max_aw_cnt);
+ si->nr_flushing, si->nr_flushed,
+ si->nr_discarding, si->nr_discarded,
+ si->nr_discard_cmd, si->undiscard_blks);
+ seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), "
+ "volatile IO: %4d (Max. %4d)\n",
+ si->inmem_pages, si->aw_cnt, si->max_aw_cnt,
+ si->vw_cnt, si->max_vw_cnt);
seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n",
@@ -347,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_imeta);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
- seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n",
- si->free_nids, si->alloc_nids);
+ seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
+ si->free_nids, si->avail_nids, si->alloc_nids);
seq_puts(s, "\nDistribution of User Blocks:");
seq_puts(s, " [ valid | invalid | free ]\n");
seq_puts(s, " [");
@@ -434,7 +458,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->inplace_count, 0);
atomic_set(&sbi->aw_cnt, 0);
+ atomic_set(&sbi->vw_cnt, 0);
atomic_set(&sbi->max_aw_cnt, 0);
+ atomic_set(&sbi->max_vw_cnt, 0);
mutex_lock(&f2fs_stat_mutex);
list_add_tail(&si->stat_list, &f2fs_stat_list);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8d5c62b07b28..94756f55a97e 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
de = find_target_dentry(fname, namehash, max_slots, &d);
if (de)
*res_page = dentry_page;
@@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
struct f2fs_dir_entry *de;
unsigned long bit_pos = 0;
int max_len = 0;
- struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
- struct fscrypt_str *name = &fname->disk_name;
if (max_slots)
*max_slots = 0;
@@ -130,17 +128,9 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
continue;
}
- /* encrypted case */
- de_name.name = d->filename[bit_pos];
- de_name.len = le16_to_cpu(de->name_len);
-
- /* show encrypted name */
- if (fname->hash) {
- if (de->hash_code == cpu_to_le32(fname->hash))
- goto found;
- } else if (de_name.len == name->len &&
- de->hash_code == namehash &&
- !memcmp(de_name.name, name->name, name->len))
+ if (de->hash_code == namehash &&
+ fscrypt_match_name(fname, d->filename[bit_pos],
+ le16_to_cpu(de->name_len)))
goto found;
if (max_slots && max_len > *max_slots)
@@ -170,12 +160,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
struct f2fs_dir_entry *de = NULL;
bool room = false;
int max_slots;
- f2fs_hash_t namehash;
-
- if(fname->hash)
- namehash = cpu_to_le32(fname->hash);
- else
- namehash = f2fs_dentry_hash(&name);
+ f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname);
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
@@ -207,13 +192,9 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
f2fs_put_page(dentry_page, 0);
}
- /* This is to increase the speed of f2fs_create */
- if (!de && room) {
- F2FS_I(dir)->task = current;
- if (F2FS_I(dir)->chash != namehash) {
- F2FS_I(dir)->chash = namehash;
- F2FS_I(dir)->clevel = level;
- }
+ if (!de && room && F2FS_I(dir)->chash != namehash) {
+ F2FS_I(dir)->chash = namehash;
+ F2FS_I(dir)->clevel = level;
}
return de;
@@ -254,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
break;
}
out:
+ /* This is to increase the speed of f2fs_create */
+ if (!de)
+ F2FS_I(dir)->task = current;
return de;
}
@@ -337,24 +321,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
set_page_dirty(ipage);
}
-int update_dent_inode(struct inode *inode, struct inode *to,
- const struct qstr *name)
-{
- struct page *page;
-
- if (file_enc_name(to))
- return 0;
-
- page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(page))
- return PTR_ERR(page);
-
- init_dent_inode(name, page);
- f2fs_put_page(page, 1);
-
- return 0;
-}
-
void do_make_empty_dir(struct inode *inode, struct inode *parent,
struct f2fs_dentry_ptr *d)
{
@@ -384,7 +350,7 @@ static int make_empty_dir(struct inode *inode,
dentry_blk = kmap_atomic(dentry_page);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
do_make_empty_dir(inode, parent, &d);
kunmap_atomic(dentry_blk);
@@ -438,8 +404,11 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
set_cold_node(inode, page);
}
- if (new_name)
+ if (new_name) {
init_dent_inode(new_name, page);
+ if (f2fs_encrypted_inode(dir))
+ file_set_enc_name(inode);
+ }
/*
* This file should be checkpointed during fsync.
@@ -542,7 +511,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
level = 0;
slots = GET_DENTRY_SLOTS(new_name->len);
- dentry_hash = f2fs_dentry_hash(new_name);
+ dentry_hash = f2fs_dentry_hash(new_name, NULL);
current_depth = F2FS_I(dir)->i_current_depth;
if (F2FS_I(dir)->chash == dentry_hash) {
@@ -599,11 +568,9 @@ add_dentry:
err = PTR_ERR(page);
goto fail;
}
- if (f2fs_encrypted_inode(dir))
- file_set_enc_name(inode);
}
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+ make_dentry_ptr_block(NULL, &d, dentry_blk);
f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos);
set_page_dirty(dentry_page);
@@ -911,7 +878,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
dentry_blk = kmap(dentry_page);
- make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
+ make_dentry_ptr_block(inode, &d, dentry_blk);
err = f2fs_fill_dentries(ctx, &d,
n * NR_DENTRY_IN_BLOCK, &fstr);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index c6934f014e0f..2f98d7039701 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -18,6 +18,179 @@
#include "node.h"
#include <trace/events/f2fs.h>
+static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re,
+ unsigned int ofs)
+{
+ if (cached_re) {
+ if (cached_re->ofs <= ofs &&
+ cached_re->ofs + cached_re->len > ofs) {
+ return cached_re;
+ }
+ }
+ return NULL;
+}
+
+static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root,
+ unsigned int ofs)
+{
+ struct rb_node *node = root->rb_node;
+ struct rb_entry *re;
+
+ while (node) {
+ re = rb_entry(node, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ node = node->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ node = node->rb_right;
+ else
+ return re;
+ }
+ return NULL;
+}
+
+struct rb_entry *__lookup_rb_tree(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs)
+{
+ struct rb_entry *re;
+
+ re = __lookup_rb_tree_fast(cached_re, ofs);
+ if (!re)
+ return __lookup_rb_tree_slow(root, ofs);
+
+ return re;
+}
+
+struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
+ struct rb_root *root, struct rb_node **parent,
+ unsigned int ofs)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_entry *re;
+
+ while (*p) {
+ *parent = *p;
+ re = rb_entry(*parent, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ p = &(*p)->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ p = &(*p)->rb_right;
+ else
+ f2fs_bug_on(sbi, 1);
+ }
+
+ return p;
+}
+
+/*
+ * lookup rb entry in position of @ofs in rb-tree,
+ * if hit, return the entry, otherwise, return NULL
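+ * @prev_entry: entry before ofs
+ * @next_entry: entry after ofs
+ * @insert_p: insert point for new entry at ofs
+ * in order to simplify the insertion after.
+ * @insert_parent: parent node of the insert point
+ * @force: on a hit, look up both neighbors even if ofs is not at
+ * the first or last offset of the hit entry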
+ * tree must stay unchanged between lookup and insertion.
+ */
+struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root,
+ struct rb_entry *cached_re,
+ unsigned int ofs,
+ struct rb_entry **prev_entry,
+ struct rb_entry **next_entry,
+ struct rb_node ***insert_p,
+ struct rb_node **insert_parent,
+ bool force)
+{
+ struct rb_node **pnode = &root->rb_node;
+ struct rb_node *parent = NULL, *tmp_node;
+ struct rb_entry *re = cached_re;
+
+ *insert_p = NULL;
+ *insert_parent = NULL;
+ *prev_entry = NULL;
+ *next_entry = NULL;
+
+ if (RB_EMPTY_ROOT(root))
+ return NULL;
+
+ if (re) {
+ if (re->ofs <= ofs && re->ofs + re->len > ofs)
+ goto lookup_neighbors;
+ }
+
+ while (*pnode) {
+ parent = *pnode;
+ re = rb_entry(*pnode, struct rb_entry, rb_node);
+
+ if (ofs < re->ofs)
+ pnode = &(*pnode)->rb_left;
+ else if (ofs >= re->ofs + re->len)
+ pnode = &(*pnode)->rb_right;
+ else
+ goto lookup_neighbors;
+ }
+
+ *insert_p = pnode;
+ *insert_parent = parent;
+
+ re = rb_entry(parent, struct rb_entry, rb_node);
+ tmp_node = parent;
+ if (parent && ofs > re->ofs)
+ tmp_node = rb_next(parent);
+ *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+
+ tmp_node = parent;
+ if (parent && ofs < re->ofs)
+ tmp_node = rb_prev(parent);
+ *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ return NULL;
+
+lookup_neighbors:
+ if (ofs == re->ofs || force) {
+ /* lookup prev node for merging backward later */
+ tmp_node = rb_prev(&re->rb_node);
+ *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ }
+ if (ofs == re->ofs + re->len - 1 || force) {
+ /* lookup next node for merging frontward later */
+ tmp_node = rb_next(&re->rb_node);
+ *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node);
+ }
+ return re;
+}
+
+bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi,
+ struct rb_root *root)
+{
+#ifdef CONFIG_F2FS_CHECK_FS
+ struct rb_node *cur = rb_first(root), *next;
+ struct rb_entry *cur_re, *next_re;
+
+ if (!cur)
+ return true;
+
+ while (cur) {
+ next = rb_next(cur);
+ if (!next)
+ return true;
+
+ cur_re = rb_entry(cur, struct rb_entry, rb_node);
+ next_re = rb_entry(next, struct rb_entry, rb_node);
+
+ if (cur_re->ofs + cur_re->len > next_re->ofs) {
+ f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, "
+ "cur(%u, %u) next(%u, %u)",
+ cur_re->ofs, cur_re->len,
+ next_re->ofs, next_re->len);
+ return false;
+ }
+
+ cur = next;
+ }
+#endif
+ return true;
+}
+
static struct kmem_cache *extent_tree_slab;
static struct kmem_cache *extent_node_slab;
@@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
return et;
}
-static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et, unsigned int fofs)
-{
- struct rb_node *node = et->root.rb_node;
- struct extent_node *en = et->cached_en;
-
- if (en) {
- struct extent_info *cei = &en->ei;
-
- if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) {
- stat_inc_cached_node_hit(sbi);
- return en;
- }
- }
-
- while (node) {
- en = rb_entry(node, struct extent_node, rb_node);
-
- if (fofs < en->ei.fofs) {
- node = node->rb_left;
- } else if (fofs >= en->ei.fofs + en->ei.len) {
- node = node->rb_right;
- } else {
- stat_inc_rbtree_node_hit(sbi);
- return en;
- }
- }
- return NULL;
-}
-
static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei)
{
@@ -237,17 +380,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
goto out;
}
- en = __lookup_extent_tree(sbi, et, pgofs);
- if (en) {
- *ei = en->ei;
- spin_lock(&sbi->extent_lock);
- if (!list_empty(&en->list)) {
- list_move_tail(&en->list, &sbi->extent_list);
- et->cached_en = en;
- }
- spin_unlock(&sbi->extent_lock);
- ret = true;
+ en = (struct extent_node *)__lookup_rb_tree(&et->root,
+ (struct rb_entry *)et->cached_en, pgofs);
+ if (!en)
+ goto out;
+
+ if (en == et->cached_en)
+ stat_inc_cached_node_hit(sbi);
+ else
+ stat_inc_rbtree_node_hit(sbi);
+
+ *ei = en->ei;
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list)) {
+ list_move_tail(&en->list, &sbi->extent_list);
+ et->cached_en = en;
}
+ spin_unlock(&sbi->extent_lock);
+ ret = true;
out:
stat_inc_total_hit(sbi);
read_unlock(&et->lock);
@@ -256,83 +406,6 @@ out:
return ret;
}
-
-/*
- * lookup extent at @fofs, if hit, return the extent
- * if not, return NULL and
- * @prev_ex: extent before fofs
- * @next_ex: extent after fofs
- * @insert_p: insert point for new extent at fofs
- * in order to simpfy the insertion after.
- * tree must stay unchanged between lookup and insertion.
- */
-static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et,
- unsigned int fofs,
- struct extent_node **prev_ex,
- struct extent_node **next_ex,
- struct rb_node ***insert_p,
- struct rb_node **insert_parent)
-{
- struct rb_node **pnode = &et->root.rb_node;
- struct rb_node *parent = NULL, *tmp_node;
- struct extent_node *en = et->cached_en;
-
- *insert_p = NULL;
- *insert_parent = NULL;
- *prev_ex = NULL;
- *next_ex = NULL;
-
- if (RB_EMPTY_ROOT(&et->root))
- return NULL;
-
- if (en) {
- struct extent_info *cei = &en->ei;
-
- if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
- goto lookup_neighbors;
- }
-
- while (*pnode) {
- parent = *pnode;
- en = rb_entry(*pnode, struct extent_node, rb_node);
-
- if (fofs < en->ei.fofs)
- pnode = &(*pnode)->rb_left;
- else if (fofs >= en->ei.fofs + en->ei.len)
- pnode = &(*pnode)->rb_right;
- else
- goto lookup_neighbors;
- }
-
- *insert_p = pnode;
- *insert_parent = parent;
-
- en = rb_entry(parent, struct extent_node, rb_node);
- tmp_node = parent;
- if (parent && fofs > en->ei.fofs)
- tmp_node = rb_next(parent);
- *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
-
- tmp_node = parent;
- if (parent && fofs < en->ei.fofs)
- tmp_node = rb_prev(parent);
- *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
- return NULL;
-
-lookup_neighbors:
- if (fofs == en->ei.fofs) {
- /* lookup prev node for merging backward later */
- tmp_node = rb_prev(&en->rb_node);
- *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
- }
- if (fofs == en->ei.fofs + en->ei.len - 1) {
- /* lookup next node for merging frontward later */
- tmp_node = rb_next(&en->rb_node);
- *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node);
- }
- return en;
-}
-
static struct extent_node *__try_merge_extent_node(struct inode *inode,
struct extent_tree *et, struct extent_info *ei,
struct extent_node *prev_ex,
@@ -387,17 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode,
goto do_insert;
}
- while (*p) {
- parent = *p;
- en = rb_entry(parent, struct extent_node, rb_node);
-
- if (ei->fofs < en->ei.fofs)
- p = &(*p)->rb_left;
- else if (ei->fofs >= en->ei.fofs + en->ei.len)
- p = &(*p)->rb_right;
- else
- f2fs_bug_on(sbi, 1);
- }
+ p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs);
do_insert:
en = __attach_extent_node(sbi, et, ei, parent, p);
if (!en)
@@ -447,8 +510,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
__drop_largest_extent(inode, fofs, len);
/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
- en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
- &insert_p, &insert_parent);
+ en = (struct extent_node *)__lookup_rb_tree_ret(&et->root,
+ (struct rb_entry *)et->cached_en, fofs,
+ (struct rb_entry **)&prev_en,
+ (struct rb_entry **)&next_en,
+ &insert_p, &insert_parent, false);
if (!en)
en = next_en;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 0a6e115562f6..2185c7a040a1 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -50,6 +50,7 @@ enum {
FAULT_BLOCK,
FAULT_DIR_DEPTH,
FAULT_EVICT_INODE,
+ FAULT_TRUNCATE,
FAULT_IO,
FAULT_CHECKPOINT,
FAULT_MAX,
@@ -62,7 +63,7 @@ struct f2fs_fault_info {
};
extern char *fault_name[FAULT_MAX];
-#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type)))
+#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
#endif
/*
@@ -88,9 +89,9 @@ extern char *fault_name[FAULT_MAX];
#define F2FS_MOUNT_ADAPTIVE 0x00020000
#define F2FS_MOUNT_LFS 0x00040000
-#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
-#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
-#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option)
+#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
+#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
+#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option)
#define ver_after(a, b) (typecheck(unsigned long long, a) && \
typecheck(unsigned long long, b) && \
@@ -124,22 +125,20 @@ enum {
SIT_BITMAP
};
-enum {
- CP_UMOUNT,
- CP_FASTBOOT,
- CP_SYNC,
- CP_RECOVERY,
- CP_DISCARD,
-};
+#define CP_UMOUNT 0x00000001
+#define CP_FASTBOOT 0x00000002
+#define CP_SYNC 0x00000004
+#define CP_RECOVERY 0x00000008
+#define CP_DISCARD 0x00000010
+#define CP_TRIMMED 0x00000020
#define DEF_BATCHED_TRIM_SECTIONS 2048
#define BATCHED_TRIM_SEGMENTS(sbi) \
- (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+ (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections))
#define BATCHED_TRIM_BLOCKS(sbi) \
(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
-#define MAX_DISCARD_BLOCKS(sbi) \
- ((1 << (sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec)
-#define DISCARD_ISSUE_RATE 8
+#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi)
+#define DISCARD_ISSUE_RATE 8
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
@@ -181,37 +180,63 @@ struct inode_entry {
struct inode *inode; /* vfs inode pointer */
};
-/* for the list of blockaddresses to be discarded */
+/* for the bitmap indicating blocks to be discarded */
struct discard_entry {
struct list_head list; /* list head */
- block_t blkaddr; /* block address to be discarded */
- int len; /* # of consecutive blocks of the discard */
+ block_t start_blkaddr; /* start blockaddr of current segment */
+ unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */
};
+/* max number of discard pending lists */
+#define MAX_PLIST_NUM 512
+#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \
+ (MAX_PLIST_NUM - 1) : (blk_num - 1))
+
enum {
D_PREP,
D_SUBMIT,
D_DONE,
};
+struct discard_info {
+ block_t lstart; /* logical start address */
+ block_t len; /* length */
+ block_t start; /* actual start address in dev */
+};
+
struct discard_cmd {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ union {
+ struct {
+ block_t lstart; /* logical start address */
+ block_t len; /* length */
+ block_t start; /* actual start address in dev */
+ };
+ struct discard_info di; /* discard info */
+
+ };
struct list_head list; /* command list */
struct completion wait; /* compleation */
- block_t lstart; /* logical start address */
- block_t len; /* length */
- struct bio *bio; /* bio */
- int state; /* state */
+ struct block_device *bdev; /* bdev */
+ unsigned short ref; /* reference count */
+ unsigned char state; /* state */
+ int error; /* bio error */
};
struct discard_cmd_control {
struct task_struct *f2fs_issue_discard; /* discard thread */
- struct list_head discard_entry_list; /* 4KB discard entry list */
- int nr_discards; /* # of discards in the list */
- struct list_head discard_cmd_list; /* discard cmd list */
+ struct list_head entry_list; /* 4KB discard entry list */
+ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */
+ struct list_head wait_list; /* store on-flushing entries */
wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */
struct mutex cmd_lock;
- int max_discards; /* max. discards to be issued */
- atomic_t submit_discard; /* # of issued discard */
+ unsigned int nr_discards; /* # of discards in the list */
+ unsigned int max_discards; /* max. discards to be issued */
+ unsigned int undiscard_blks; /* # of undiscard blocks */
+ atomic_t issued_discard; /* # of issued discard */
+ atomic_t issing_discard; /* # of issing discard */
+ atomic_t discard_cmd_cnt; /* # of cached cmd count */
+ struct rb_root root; /* root of discard rb-tree */
};
/* for the list of fsync inodes, used only during recovery */
@@ -222,13 +247,13 @@ struct fsync_inode_entry {
block_t last_dentry; /* block address locating the last dentry */
};
-#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats))
-#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits))
+#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats))
+#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits))
-#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne)
-#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid)
-#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se)
-#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno)
+#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne)
+#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid)
+#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se)
+#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno)
#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
@@ -270,11 +295,14 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
-#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6)
+#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32)
#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
-#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
+#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \
+ struct f2fs_defragment)
#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \
struct f2fs_move_range)
+#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \
+ struct f2fs_flush_device)
#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
@@ -311,6 +339,11 @@ struct f2fs_move_range {
u64 len; /* size to move */
};
+struct f2fs_flush_device {
+ u32 dev_num; /* device number to flush */
+ u32 segments; /* # of segments to flush */
+};
+
/*
* For INODE and NODE manager
*/
@@ -323,26 +356,24 @@ struct f2fs_dentry_ptr {
int max;
};
-static inline void make_dentry_ptr(struct inode *inode,
- struct f2fs_dentry_ptr *d, void *src, int type)
+static inline void make_dentry_ptr_block(struct inode *inode,
+ struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t)
{
d->inode = inode;
+ d->max = NR_DENTRY_IN_BLOCK;
+ d->bitmap = &t->dentry_bitmap;
+ d->dentry = t->dentry;
+ d->filename = t->filename;
+}
- if (type == 1) {
- struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src;
-
- d->max = NR_DENTRY_IN_BLOCK;
- d->bitmap = &t->dentry_bitmap;
- d->dentry = t->dentry;
- d->filename = t->filename;
- } else {
- struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src;
-
- d->max = NR_INLINE_DENTRY;
- d->bitmap = &t->dentry_bitmap;
- d->dentry = t->dentry;
- d->filename = t->filename;
- }
+static inline void make_dentry_ptr_inline(struct inode *inode,
+ struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t)
+{
+ d->inode = inode;
+ d->max = NR_INLINE_DENTRY;
+ d->bitmap = &t->dentry_bitmap;
+ d->dentry = t->dentry;
+ d->filename = t->filename;
}
/*
@@ -374,16 +405,30 @@ enum {
/* number of extent info in extent cache we try to shrink */
#define EXTENT_CACHE_SHRINK_NUMBER 128
+struct rb_entry {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ unsigned int ofs; /* start offset of the entry */
+ unsigned int len; /* length of the entry */
+};
+
struct extent_info {
unsigned int fofs; /* start offset in a file */
- u32 blk; /* start block address of the extent */
unsigned int len; /* length of the extent */
+ u32 blk; /* start block address of the extent */
};
struct extent_node {
- struct rb_node rb_node; /* rb node located in rb-tree */
+ struct rb_node rb_node;
+ union {
+ struct {
+ unsigned int fofs;
+ unsigned int len;
+ u32 blk;
+ };
+ struct extent_info ei; /* extent info */
+
+ };
struct list_head list; /* node in global extent list of sbi */
- struct extent_info ei; /* extent info */
struct extent_tree *et; /* extent tree pointer */
};
@@ -500,6 +545,24 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
ei->len = len;
}
+static inline bool __is_discard_mergeable(struct discard_info *back, /* true when @back ends exactly where @front begins */
+ struct discard_info *front)
+{
+ return back->lstart + back->len == front->lstart; /* first address past @back must equal the start of @front */
+}
+
+static inline bool __is_discard_back_mergeable(struct discard_info *cur, /* true when @cur starts exactly where @back ends */
+ struct discard_info *back)
+{
+ return __is_discard_mergeable(back, cur); /* @back is passed as the earlier range, @cur as the later one */
+}
+
+static inline bool __is_discard_front_mergeable(struct discard_info *cur, /* true when @front starts exactly where @cur ends */
+ struct discard_info *front)
+{
+ return __is_discard_mergeable(cur, front); /* @cur is passed as the earlier range, @front as the later one */
+}
+
static inline bool __is_extent_mergeable(struct extent_info *back,
struct extent_info *front)
{
@@ -562,7 +625,6 @@ struct f2fs_nm_info {
unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE];
unsigned char *nat_block_bitmap;
unsigned short *free_nid_count; /* free nid count of NAT block */
- spinlock_t free_nid_lock; /* protect updating of nid count */
/* for checkpoint */
char *nat_bitmap; /* NAT bitmap pointer */
@@ -641,7 +703,8 @@ struct flush_cmd {
struct flush_cmd_control {
struct task_struct *f2fs_issue_flush; /* flush thread */
wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
- atomic_t submit_flush; /* # of issued flushes */
+ atomic_t issued_flush; /* # of issued flushes */
+ atomic_t issing_flush; /* # of issuing flushes (field name keeps its original spelling) */
struct llist_head issue_list; /* list for command issue */
struct llist_node *dispatch_list; /* list for command dispatch */
};
@@ -672,6 +735,7 @@ struct f2fs_sm_info {
unsigned int ipu_policy; /* in-place-update policy */
unsigned int min_ipu_util; /* in-place-update threshold */
unsigned int min_fsync_blocks; /* threshold for fsync */
+ unsigned int min_hot_blocks; /* threshold for hot block allocation */
/* for flush command control */
struct flush_cmd_control *fcc_info;
@@ -722,6 +786,7 @@ enum page_type {
META_FLUSH,
INMEM, /* the below types are used by tracepoints only. */
INMEM_DROP,
+ INMEM_INVALIDATE,
INMEM_REVOKE,
IPU,
OPU,
@@ -737,9 +802,10 @@ struct f2fs_io_info {
struct page *page; /* page to be written */
struct page *encrypted_page; /* encrypted page */
bool submitted; /* indicate IO submission */
+ bool need_lock; /* indicate we need to lock cp_rwsem */
};
-#define is_read_io(rw) (rw == READ)
+#define is_read_io(rw) ((rw) == READ)
struct f2fs_bio_info {
struct f2fs_sb_info *sbi; /* f2fs superblock */
struct bio *bio; /* bios to merge */
@@ -827,6 +893,7 @@ struct f2fs_sb_info {
struct mutex cp_mutex; /* checkpoint procedure lock */
struct rw_semaphore cp_rwsem; /* blocking FS operations */
struct rw_semaphore node_write; /* locking node writes */
+ struct rw_semaphore node_change; /* locking node change */
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
@@ -879,6 +946,9 @@ struct f2fs_sb_info {
/* # of allocated blocks */
struct percpu_counter alloc_valid_block_count;
+ /* writeback control */
+ atomic_t wb_sync_req; /* count # of WB_SYNC threads */
+
/* valid inode count */
struct percpu_counter total_valid_inode_count;
@@ -912,11 +982,12 @@ struct f2fs_sb_info {
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
atomic_t aw_cnt; /* # of atomic writes */
+ atomic_t vw_cnt; /* # of volatile writes */
atomic_t max_aw_cnt; /* max # of atomic writes */
+ atomic_t max_vw_cnt; /* max # of volatile writes */
int bg_gc; /* background gc calls */
unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
#endif
- unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
/* For sysfs suppport */
@@ -971,8 +1042,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
* and the return value is in kbytes. s is of struct f2fs_sb_info.
*/
#define BD_PART_WRITTEN(s) \
-(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \
- s->sectors_written_start) >> 1)
+(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \
+ (s)->sectors_written_start) >> 1)
static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
{
@@ -1193,7 +1264,7 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
{
bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
- return (cpc) ? (cpc->reason == CP_UMOUNT) && set : set;
+ return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set;
}
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
@@ -1229,7 +1300,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
static inline bool __remain_node_summaries(int reason)
{
- return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+ return (reason & (CP_UMOUNT | CP_FASTBOOT));
}
static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
@@ -1707,6 +1778,7 @@ enum {
FI_DO_DEFRAG, /* indicate defragment is running */
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
+ FI_HOT_DATA, /* indicate file is hot */
};
static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -1869,12 +1941,6 @@ static inline int f2fs_has_inline_data(struct inode *inode)
return is_inode_flag_set(inode, FI_INLINE_DATA);
}
-static inline void f2fs_clear_inline_inode(struct inode *inode)
-{
- clear_inode_flag(inode, FI_INLINE_DATA);
- clear_inode_flag(inode, FI_DATA_EXIST);
-}
-
static inline int f2fs_exist_data(struct inode *inode)
{
return is_inode_flag_set(inode, FI_DATA_EXIST);
@@ -2005,36 +2071,10 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
return kmalloc(size, flags);
}
-static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kmalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags, PAGE_KERNEL);
- return ret;
-}
-
-static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kzalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
- return ret;
-}
-
#define get_inode_mode(i) \
((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
-/* get offset of first page in next direct node */
-#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \
- ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \
- (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \
- ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode))
-
/*
* file.c
*/
@@ -2096,8 +2136,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
struct page **page);
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode);
-int update_dent_inode(struct inode *inode, struct inode *to,
- const struct qstr *name);
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
const struct qstr *name, f2fs_hash_t name_hash,
unsigned int bit_pos);
@@ -2133,7 +2171,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi);
/*
* hash.c
*/
-f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info);
+f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info,
+ struct fscrypt_name *fname);
/*
* node.c
@@ -2184,6 +2223,7 @@ void destroy_node_manager_caches(void);
*/
void register_inmem_page(struct inode *inode, struct page *page);
void drop_inmem_pages(struct inode *inode);
+void drop_inmem_page(struct inode *inode, struct page *page);
int commit_inmem_pages(struct inode *inode);
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
@@ -2193,7 +2233,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free);
void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new);
-void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr);
+void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void release_discard_addrs(struct f2fs_sb_info *sbi);
int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
@@ -2205,7 +2245,7 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr);
void write_meta_page(struct f2fs_sb_info *sbi, struct page *page);
void write_node_page(unsigned int nid, struct f2fs_io_info *fio);
void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio);
-void rewrite_data_page(struct f2fs_io_info *fio);
+int rewrite_data_page(struct f2fs_io_info *fio);
void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
block_t old_blkaddr, block_t new_blkaddr,
bool recover_curseg, bool recover_newaddr);
@@ -2310,7 +2350,8 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
int start_gc_thread(struct f2fs_sb_info *sbi);
void stop_gc_thread(struct f2fs_sb_info *sbi);
block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background);
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
+ unsigned int segno);
void build_gc_manager(struct f2fs_sb_info *sbi);
/*
@@ -2334,11 +2375,15 @@ struct f2fs_stat_info {
int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
int inmem_pages;
unsigned int ndirty_dirs, ndirty_files, ndirty_all;
- int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids;
+ int nats, dirty_nats, sits, dirty_sits;
+ int free_nids, avail_nids, alloc_nids;
int total_count, utilization;
- int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard;
+ int bg_gc, nr_wb_cp_data, nr_wb_data;
+ int nr_flushing, nr_flushed, nr_discarding, nr_discarded;
+ int nr_discard_cmd;
+ unsigned int undiscard_blks;
int inline_xattr, inline_inode, inline_dir, append, update, orphans;
- int aw_cnt, max_aw_cnt;
+ int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
@@ -2421,11 +2466,22 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
if (cur > max) \
atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
} while (0)
+#define stat_inc_volatile_write(inode) \
+ (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) /* atomically increment the vw_cnt counter reached through F2FS_I_SB(inode) */
+#define stat_dec_volatile_write(inode) \
+ (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) /* atomically decrement the vw_cnt counter reached through F2FS_I_SB(inode) */
+#define stat_update_max_volatile_write(inode) \
+ do { \
+ int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \
+ int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \
+ if (cur > max) \
+ atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \
+ } while (0) /* raise max_vw_cnt to the current vw_cnt when that is larger. NOTE(review): read, compare and set are separate calls, so concurrent updaters can interleave - presumably acceptable for a statistic, confirm */
#define stat_inc_seg_count(sbi, type, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
- (si)->tot_segs++; \
- if (type == SUM_TYPE_DATA) { \
+ si->tot_segs++; \
+ if ((type) == SUM_TYPE_DATA) { \
si->data_segs++; \
si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \
} else { \
@@ -2435,14 +2491,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
} while (0)
#define stat_inc_tot_blk_count(si, blks) \
- (si->tot_blks += (blks))
+ ((si)->tot_blks += (blks))
#define stat_inc_data_blk_count(sbi, blks, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->data_blks += (blks); \
- si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \
+ si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \
} while (0)
#define stat_inc_node_blk_count(sbi, blks, gc_type) \
@@ -2450,7 +2506,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->node_blks += (blks); \
- si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \
+ si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \
} while (0)
int f2fs_build_stats(struct f2fs_sb_info *sbi);
@@ -2458,32 +2514,35 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi);
int __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
-#define stat_inc_cp_count(si)
-#define stat_inc_bg_cp_count(si)
-#define stat_inc_call_count(si)
-#define stat_inc_bggc_count(si)
-#define stat_inc_dirty_inode(sbi, type)
-#define stat_dec_dirty_inode(sbi, type)
-#define stat_inc_total_hit(sb)
-#define stat_inc_rbtree_node_hit(sb)
-#define stat_inc_largest_node_hit(sbi)
-#define stat_inc_cached_node_hit(sbi)
-#define stat_inc_inline_xattr(inode)
-#define stat_dec_inline_xattr(inode)
-#define stat_inc_inline_inode(inode)
-#define stat_dec_inline_inode(inode)
-#define stat_inc_inline_dir(inode)
-#define stat_dec_inline_dir(inode)
-#define stat_inc_atomic_write(inode)
-#define stat_dec_atomic_write(inode)
-#define stat_update_max_atomic_write(inode)
-#define stat_inc_seg_type(sbi, curseg)
-#define stat_inc_block_count(sbi, curseg)
-#define stat_inc_inplace_blocks(sbi)
-#define stat_inc_seg_count(sbi, type, gc_type)
-#define stat_inc_tot_blk_count(si, blks)
-#define stat_inc_data_blk_count(sbi, blks, gc_type)
-#define stat_inc_node_blk_count(sbi, blks, gc_type)
+#define stat_inc_cp_count(si) do { } while (0) /* this and the following stat_* macros are the #else-branch stubs: each expands to an empty do/while so a call site remains one complete statement */
+#define stat_inc_bg_cp_count(si) do { } while (0)
+#define stat_inc_call_count(si) do { } while (0)
+#define stat_inc_bggc_count(si) do { } while (0)
+#define stat_inc_dirty_inode(sbi, type) do { } while (0)
+#define stat_dec_dirty_inode(sbi, type) do { } while (0)
+#define stat_inc_total_hit(sb) do { } while (0)
+#define stat_inc_rbtree_node_hit(sb) do { } while (0)
+#define stat_inc_largest_node_hit(sbi) do { } while (0)
+#define stat_inc_cached_node_hit(sbi) do { } while (0)
+#define stat_inc_inline_xattr(inode) do { } while (0)
+#define stat_dec_inline_xattr(inode) do { } while (0)
+#define stat_inc_inline_inode(inode) do { } while (0)
+#define stat_dec_inline_inode(inode) do { } while (0)
+#define stat_inc_inline_dir(inode) do { } while (0)
+#define stat_dec_inline_dir(inode) do { } while (0)
+#define stat_inc_atomic_write(inode) do { } while (0)
+#define stat_dec_atomic_write(inode) do { } while (0)
+#define stat_update_max_atomic_write(inode) do { } while (0)
+#define stat_inc_volatile_write(inode) do { } while (0)
+#define stat_dec_volatile_write(inode) do { } while (0)
+#define stat_update_max_volatile_write(inode) do { } while (0)
+#define stat_inc_seg_type(sbi, curseg) do { } while (0)
+#define stat_inc_block_count(sbi, curseg) do { } while (0)
+#define stat_inc_inplace_blocks(sbi) do { } while (0)
+#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0)
+#define stat_inc_tot_blk_count(si, blks) do { } while (0)
+#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0)
+#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0)
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
@@ -2509,7 +2568,7 @@ extern struct kmem_cache *inode_entry_slab;
bool f2fs_may_inline_data(struct inode *inode);
bool f2fs_may_inline_dentry(struct inode *inode);
void read_inline_data(struct page *page, struct page *ipage);
-bool truncate_inline_inode(struct page *ipage, u64 from);
+void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from);
int f2fs_read_inline_data(struct inode *inode, struct page *page);
int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
int f2fs_convert_inline_inode(struct inode *inode);
@@ -2544,6 +2603,18 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
/*
* extent_cache.c
*/
+struct rb_entry *__lookup_rb_tree(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs);
+struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi,
+ struct rb_root *root, struct rb_node **parent,
+ unsigned int ofs);
+struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root,
+ struct rb_entry *cached_re, unsigned int ofs,
+ struct rb_entry **prev_entry, struct rb_entry **next_entry,
+ struct rb_node ***insert_p, struct rb_node **insert_parent,
+ bool force);
+bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi,
+ struct rb_root *root);
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext);
void f2fs_drop_extent_tree(struct inode *inode);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5f7317875a67..61af721329fa 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -116,11 +116,6 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
if (!dentry)
return 0;
- if (update_dent_inode(inode, inode, &dentry->d_name)) {
- dput(dentry);
- return 0;
- }
-
*pino = parent_ino(dentry);
dput(dentry);
return 1;
@@ -528,7 +523,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
page = get_lock_data_page(inode, index, true);
if (IS_ERR(page))
- return 0;
+ return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
truncate_out:
f2fs_wait_on_page_writeback(page, DATA, true);
zero_user(page, offset, PAGE_SIZE - offset);
@@ -566,9 +561,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
}
if (f2fs_has_inline_data(inode)) {
- truncate_inline_inode(ipage, from);
- if (from == 0)
- clear_inode_flag(inode, FI_DATA_EXIST);
+ truncate_inline_inode(inode, ipage, from);
f2fs_put_page(ipage, 1);
truncate_page = true;
goto out;
@@ -617,6 +610,12 @@ int f2fs_truncate(struct inode *inode)
trace_f2fs_truncate(inode);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) {
+ f2fs_show_injection_info(FAULT_TRUNCATE);
+ return -EIO;
+ }
+#endif
/* we should check inline_data size */
if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
@@ -1012,11 +1011,11 @@ static int __exchange_data_block(struct inode *src_inode,
while (len) {
olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len);
- src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL);
+ src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL);
if (!src_blkaddr)
return -ENOMEM;
- do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL);
+ do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL);
if (!do_replace) {
kvfree(src_blkaddr);
return -ENOMEM;
@@ -1188,8 +1187,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret)
return ret;
- if (offset + len > new_size)
- new_size = offset + len;
new_size = max_t(loff_t, new_size, offset + len);
} else {
if (off_start) {
@@ -1257,8 +1254,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
int ret = 0;
new_size = i_size_read(inode) + len;
- if (new_size > inode->i_sb->s_maxbytes)
- return -EFBIG;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ return ret;
if (offset >= i_size_read(inode))
return -EINVAL;
@@ -1428,6 +1426,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
clear_inode_flag(inode, FI_VOLATILE_FILE);
+ stat_dec_volatile_write(inode);
set_inode_flag(inode, FI_DROP_CACHE);
filemap_fdatawrite(inode->i_mapping);
clear_inode_flag(inode, FI_DROP_CACHE);
@@ -1474,10 +1473,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
if (ret)
return ret;
- flags = f2fs_mask_flags(inode->i_mode, flags);
-
inode_lock(inode);
+ flags = f2fs_mask_flags(inode->i_mode, flags);
+
oldflags = fi->i_flags;
if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -1491,10 +1490,11 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
flags = flags & FS_FL_USER_MODIFIABLE;
flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
fi->i_flags = flags;
- inode_unlock(inode);
inode->i_ctime = current_time(inode);
f2fs_set_inode_flags(inode);
+
+ inode_unlock(inode);
out:
mnt_drop_write_file(filp);
return ret;
@@ -1515,6 +1515,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -1529,20 +1532,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
goto out;
set_inode_flag(inode, FI_ATOMIC_FILE);
+ set_inode_flag(inode, FI_HOT_DATA);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
if (!get_dirty_pages(inode))
- goto out;
+ goto inc_stat;
f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
"Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
- if (ret)
+ if (ret) {
clear_inode_flag(inode, FI_ATOMIC_FILE);
-out:
+ goto out;
+ }
+
+inc_stat:
stat_inc_atomic_write(inode);
stat_update_max_atomic_write(inode);
+out:
inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
@@ -1592,6 +1600,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
ret = mnt_want_write_file(filp);
if (ret)
return ret;
@@ -1605,6 +1616,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (ret)
goto out;
+ stat_inc_volatile_write(inode);
+ stat_update_max_volatile_write(inode);
+
set_inode_flag(inode, FI_VOLATILE_FILE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
out:
@@ -1660,6 +1674,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
clear_inode_flag(inode, FI_VOLATILE_FILE);
+ stat_dec_volatile_write(inode);
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
}
@@ -1841,7 +1856,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
mutex_lock(&sbi->gc_mutex);
}
- ret = f2fs_gc(sbi, sync, true);
+ ret = f2fs_gc(sbi, sync, true, NULL_SEGNO);
out:
mnt_drop_write_file(filp);
return ret;
@@ -1879,13 +1894,12 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
pgoff_t pg_start, pg_end;
unsigned int blk_per_seg = sbi->blocks_per_seg;
unsigned int total = 0, sec_num;
- unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg;
block_t blk_end = 0;
bool fragmented = false;
int err;
/* if in-place-update policy is enabled, don't waste time here */
- if (need_inplace_update(inode))
+ if (need_inplace_update_policy(inode, NULL))
return -EINVAL;
pg_start = range->start >> PAGE_SHIFT;
@@ -1943,7 +1957,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
map.m_lblk = pg_start;
map.m_len = pg_end - pg_start;
- sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
+ sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi);
/*
* make sure there are enough free section for LFS allocation, this can
@@ -2020,42 +2034,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode))
return -EINVAL;
- err = mnt_want_write_file(filp);
- if (err)
- return err;
-
- if (f2fs_readonly(sbi->sb)) {
- err = -EROFS;
- goto out;
- }
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
- sizeof(range))) {
- err = -EFAULT;
- goto out;
- }
+ sizeof(range)))
+ return -EFAULT;
/* verify alignment of offset & size */
- if (range.start & (F2FS_BLKSIZE - 1) ||
- range.len & (F2FS_BLKSIZE - 1)) {
- err = -EINVAL;
- goto out;
- }
+ if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1))
+ return -EINVAL;
+
+ if (unlikely((range.start + range.len) >> PAGE_SHIFT >
+ sbi->max_file_blocks))
+ return -EINVAL;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
err = f2fs_defragment_range(sbi, filp, &range);
+ mnt_drop_write_file(filp);
+
f2fs_update_time(sbi, REQ_TIME);
if (err < 0)
- goto out;
+ return err;
if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
sizeof(range)))
- err = -EFAULT;
-out:
- mnt_drop_write_file(filp);
- return err;
+ return -EFAULT;
+
+ return 0;
}
static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
@@ -2189,6 +2201,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg)
range.pos_out, range.len);
mnt_drop_write_file(filp);
+ if (err)
+ goto err_out;
if (copy_to_user((struct f2fs_move_range __user *)arg,
&range, sizeof(range)))
@@ -2198,6 +2212,69 @@ err_out:
return err;
}
+static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) /* ioctl handler: calls f2fs_gc() on consecutive segments of one device; @arg is a user pointer to struct f2fs_flush_device */
+{ /* returns 0, or -EPERM / -EROFS / -EFAULT / -EINVAL / -EBUSY, or an error from mnt_want_write_file() or f2fs_gc() */
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct sit_info *sm = SIT_I(sbi);
+ unsigned int start_segno = 0, end_segno = 0;
+ unsigned int dev_start_segno = 0, dev_end_segno = 0;
+ struct f2fs_flush_device range;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg,
+ sizeof(range)))
+ return -EFAULT;
+
+ if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || /* needs more than one device, and dev_num must be below s_ndevs - 1, so the highest-numbered device is rejected too */
+ sbi->segs_per_sec != 1) {
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "Can't flush %u in %d for segs_per_sec %u != 1\n", /* NOTE(review): other f2fs_msg() calls seen nearby pass no trailing newline - check whether this one is redundant */
+ range.dev_num, sbi->s_ndevs,
+ sbi->segs_per_sec);
+ return -EINVAL;
+ }
+
+ ret = mnt_want_write_file(filp); /* paired with mnt_drop_write_file() at out: */
+ if (ret)
+ return ret;
+
+ if (range.dev_num != 0) /* guards only the next statement; for device 0 dev_start_segno keeps its initial 0 */
+ dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk);
+ dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk);
+
+ start_segno = sm->last_victim[FLUSH_DEVICE]; /* resume point; this function only reads the slot */
+ if (start_segno < dev_start_segno || start_segno >= dev_end_segno)
+ start_segno = dev_start_segno; /* stored position lies outside this device: restart at its first segment */
+ end_segno = min(start_segno + range.segments, dev_end_segno); /* NOTE(review): the sum could wrap for a very large range.segments - confirm its type and range */
+
+ while (start_segno < end_segno) {
+ if (!mutex_trylock(&sbi->gc_mutex)) { /* never unlocked in this function; presumably f2fs_gc() releases it - confirm */
+ ret = -EBUSY;
+ goto out;
+ }
+ sm->last_victim[GC_CB] = end_segno + 1; /* these three search positions are moved just past the range being flushed */
+ sm->last_victim[GC_GREEDY] = end_segno + 1;
+ sm->last_victim[ALLOC_NEXT] = end_segno + 1;
+ ret = f2fs_gc(sbi, true, true, start_segno); /* sync = true, background = true, explicit segment number */
+ if (ret == -EAGAIN)
+ ret = 0; /* -EAGAIN is not treated as a failure; move on to the next segment */
+ else if (ret < 0)
+ break;
+ start_segno++;
+ }
+out:
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -2235,6 +2312,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_defragment(filp, arg);
case F2FS_IOC_MOVE_RANGE:
return f2fs_ioc_move_range(filp, arg);
+ case F2FS_IOC_FLUSH_DEVICE:
+ return f2fs_ioc_flush_device(filp, arg);
default:
return -ENOTTY;
}
@@ -2302,8 +2381,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_GARBAGE_COLLECT:
case F2FS_IOC_WRITE_CHECKPOINT:
case F2FS_IOC_DEFRAGMENT:
- break;
case F2FS_IOC_MOVE_RANGE:
+ case F2FS_IOC_FLUSH_DEVICE:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 418fd9881646..026522107ca3 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -84,7 +84,7 @@ static int gc_thread_func(void *data)
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true))
+ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
wait_ms = gc_th->no_gc_sleep_time;
trace_f2fs_background_gc(sbi->sb, wait_ms,
@@ -172,7 +172,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
if (gc_type != FG_GC && p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
- p->offset = sbi->last_victim[p->gc_mode];
+ /* let's select beginning hot/small space first */
+ if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
+ p->offset = 0;
+ else
+ p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
}
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
@@ -182,7 +186,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
if (p->alloc_mode == SSR)
return sbi->blocks_per_seg;
if (p->gc_mode == GC_GREEDY)
- return sbi->blocks_per_seg * p->ofs_unit;
+ return 2 * sbi->blocks_per_seg * p->ofs_unit;
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else /* No other gc_mode */
@@ -207,7 +211,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
continue;
clear_bit(secno, dirty_i->victim_secmap);
- return secno * sbi->segs_per_sec;
+ return GET_SEG_FROM_SEC(sbi, secno);
}
return NULL_SEGNO;
}
@@ -215,8 +219,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- unsigned int secno = GET_SECNO(sbi, segno);
- unsigned int start = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
unsigned long long mtime = 0;
unsigned int vblocks;
unsigned char age = 0;
@@ -225,7 +229,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
for (i = 0; i < sbi->segs_per_sec; i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
- vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ vblocks = get_valid_blocks(sbi, segno, true);
mtime = div_u64(mtime, sbi->segs_per_sec);
vblocks = div_u64(vblocks, sbi->segs_per_sec);
@@ -248,7 +252,7 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi,
unsigned int segno)
{
unsigned int valid_blocks =
- get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ get_valid_blocks(sbi, segno, true);
return IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
valid_blocks * 2 : valid_blocks;
@@ -291,6 +295,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int *result, int gc_type, int type, char alloc_mode)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct sit_info *sm = SIT_I(sbi);
struct victim_sel_policy p;
unsigned int secno, last_victim;
unsigned int last_segment = MAIN_SEGS(sbi);
@@ -304,10 +309,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_segno = NULL_SEGNO;
p.min_cost = get_max_cost(sbi, &p);
+ if (*result != NULL_SEGNO) {
+ if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
+ get_valid_blocks(sbi, *result, false) &&
+ !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
+ p.min_segno = *result;
+ goto out;
+ }
+
if (p.max_search == 0)
goto out;
- last_victim = sbi->last_victim[p.gc_mode];
+ last_victim = sm->last_victim[p.gc_mode];
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -320,9 +333,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
if (segno >= last_segment) {
- if (sbi->last_victim[p.gc_mode]) {
- last_segment = sbi->last_victim[p.gc_mode];
- sbi->last_victim[p.gc_mode] = 0;
+ if (sm->last_victim[p.gc_mode]) {
+ last_segment =
+ sm->last_victim[p.gc_mode];
+ sm->last_victim[p.gc_mode] = 0;
p.offset = 0;
continue;
}
@@ -339,7 +353,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
nsearched++;
}
- secno = GET_SECNO(sbi, segno);
+ secno = GET_SEC_FROM_SEG(sbi, segno);
if (sec_usage_check(sbi, secno))
goto next;
@@ -357,17 +371,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
}
next:
if (nsearched >= p.max_search) {
- if (!sbi->last_victim[p.gc_mode] && segno <= last_victim)
- sbi->last_victim[p.gc_mode] = last_victim + 1;
+ if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
+ sm->last_victim[p.gc_mode] = last_victim + 1;
else
- sbi->last_victim[p.gc_mode] = segno + 1;
+ sm->last_victim[p.gc_mode] = segno + 1;
+ sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
break;
}
}
if (p.min_segno != NULL_SEGNO) {
got_it:
if (p.alloc_mode == LFS) {
- secno = GET_SECNO(sbi, p.min_segno);
+ secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
if (gc_type == FG_GC)
sbi->cur_victim_sec = secno;
else
@@ -550,8 +565,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
get_node_info(sbi, nid, dni);
if (sum->version != dni->version) {
- f2fs_put_page(node_page, 1);
- return false;
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "%s: valid data with mismatched node version.",
+ __func__);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
}
*nofs = ofs_of_node(node_page);
@@ -697,8 +714,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
.type = DATA,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC,
+ .old_blkaddr = NULL_ADDR,
.page = page,
.encrypted_page = NULL,
+ .need_lock = true,
};
bool is_dirty = PageDirty(page);
int err;
@@ -890,7 +909,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
GET_SUM_BLOCK(sbi, segno));
f2fs_put_page(sum_page, 0);
- if (get_valid_blocks(sbi, segno, 1) == 0 ||
+ if (get_valid_blocks(sbi, segno, false) == 0 ||
!PageUptodate(sum_page) ||
unlikely(f2fs_cp_error(sbi)))
goto next;
@@ -905,7 +924,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
* - mutex_lock(sentry_lock) - change_curseg()
* - lock_page(sum_page)
*/
-
if (type == SUM_TYPE_NODE)
gc_node_segment(sbi, sum->entries, segno, gc_type);
else
@@ -924,7 +942,7 @@ next:
blk_finish_plug(&plug);
if (gc_type == FG_GC &&
- get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0)
+ get_valid_blocks(sbi, start_segno, true) == 0)
sec_freed = 1;
stat_inc_call_count(sbi->stat_info);
@@ -932,13 +950,14 @@ next:
return sec_freed;
}
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
+ bool background, unsigned int segno)
{
- unsigned int segno;
int gc_type = sync ? FG_GC : BG_GC;
int sec_freed = 0;
int ret = -EINVAL;
struct cp_control cpc;
+ unsigned int init_segno = segno;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
.iroot = RADIX_TREE_INIT(GFP_NOFS),
@@ -959,9 +978,11 @@ gc_more:
* threshold, we can make them free by checkpoint. Then, we
* secure free segments which doesn't need fggc any more.
*/
- ret = write_checkpoint(sbi, &cpc);
- if (ret)
- goto stop;
+ if (prefree_segments(sbi)) {
+ ret = write_checkpoint(sbi, &cpc);
+ if (ret)
+ goto stop;
+ }
if (has_not_enough_free_secs(sbi, 0, 0))
gc_type = FG_GC;
}
@@ -981,13 +1002,17 @@ gc_more:
sbi->cur_victim_sec = NULL_SEGNO;
if (!sync) {
- if (has_not_enough_free_secs(sbi, sec_freed, 0))
+ if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
+ segno = NULL_SEGNO;
goto gc_more;
+ }
if (gc_type == FG_GC)
ret = write_checkpoint(sbi, &cpc);
}
stop:
+ SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
+ SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&gc_list);
@@ -999,7 +1024,7 @@ stop:
void build_gc_manager(struct f2fs_sb_info *sbi)
{
- u64 main_count, resv_count, ovp_count, blocks_per_sec;
+ u64 main_count, resv_count, ovp_count;
DIRTY_I(sbi)->v_ops = &default_v_ops;
@@ -1007,8 +1032,12 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg;
resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg;
ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
- blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec;
- sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec,
- (main_count - resv_count));
+ sbi->fggc_threshold = div64_u64((main_count - ovp_count) *
+ BLKS_PER_SEC(sbi), (main_count - resv_count));
+
+ /* give warm/cold data area from slower device */
+ if (sbi->s_ndevs && sbi->segs_per_sec == 1)
+ SIT_I(sbi)->last_victim[ALLOC_NEXT] =
+ GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
}
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 71b7206c431e..eb2e031ea887 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len,
*buf++ = pad;
}
-f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
+f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info,
+ struct fscrypt_name *fname)
{
__u32 hash;
f2fs_hash_t f2fs_hash;
@@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
const unsigned char *name = name_info->name;
size_t len = name_info->len;
+ /* encrypted bigname case */
+ if (fname && !fname->disk_name.name)
+ return cpu_to_le32(fname->hash);
+
if (is_dot_dotdot(name_info))
return 0;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index e32a9e527968..e4c527c4e7d0 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -63,19 +63,21 @@ void read_inline_data(struct page *page, struct page *ipage)
SetPageUptodate(page);
}
-bool truncate_inline_inode(struct page *ipage, u64 from)
+/*
+ * Zero the inline data held in @ipage from byte offset @from up to
+ * MAX_INLINE_DATA and mark the page dirty. Nothing is done when @from is at
+ * or past MAX_INLINE_DATA. If the whole area is wiped (@from == 0), the
+ * FI_DATA_EXIST flag is cleared on @inode as well.
+ */
+void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from)
{
void *addr;
if (from >= MAX_INLINE_DATA)
- return false;
+ return;
addr = inline_data_addr(ipage);
f2fs_wait_on_page_writeback(ipage, NODE, true);
memset(addr + from, 0, MAX_INLINE_DATA - from);
set_page_dirty(ipage);
- return true;
+
+ if (from == 0)
+ clear_inode_flag(inode, FI_DATA_EXIST);
}
int f2fs_read_inline_data(struct inode *inode, struct page *page)
@@ -135,6 +137,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
/* write data page to try to make data consistent */
set_page_writeback(page);
fio.old_blkaddr = dn->data_blkaddr;
+ set_inode_flag(dn->inode, FI_HOT_DATA);
write_data_page(dn, &fio);
f2fs_wait_on_page_writeback(page, DATA, true);
if (dirty) {
@@ -146,11 +149,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
set_inode_flag(dn->inode, FI_APPEND_WRITE);
/* clear inline data and flag after data writeback */
- truncate_inline_inode(dn->inode_page, 0);
+ truncate_inline_inode(dn->inode, dn->inode_page, 0);
clear_inline_node(dn->inode_page);
clear_out:
stat_dec_inline_inode(dn->inode);
- f2fs_clear_inline_inode(dn->inode);
+ clear_inode_flag(dn->inode, FI_INLINE_DATA);
f2fs_put_dnode(dn);
return 0;
}
@@ -267,9 +270,8 @@ process_inline:
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- if (!truncate_inline_inode(ipage, 0))
- return false;
- f2fs_clear_inline_inode(inode);
+ truncate_inline_inode(inode, ipage, 0);
+ clear_inode_flag(inode, FI_INLINE_DATA);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
if (truncate_blocks(inode, 0, false))
@@ -296,11 +298,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
return NULL;
}
- namehash = f2fs_dentry_hash(&name);
+ namehash = f2fs_dentry_hash(&name, fname);
inline_dentry = inline_data_addr(ipage);
- make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+ make_dentry_ptr_inline(NULL, &d, inline_dentry);
de = find_target_dentry(fname, namehash, NULL, &d);
unlock_page(ipage);
if (de)
@@ -319,7 +321,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
dentry_blk = inline_data_addr(ipage);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
+ make_dentry_ptr_inline(NULL, &d, dentry_blk);
do_make_empty_dir(inode, parent, &d);
set_page_dirty(ipage);
@@ -380,7 +382,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
set_page_dirty(page);
/* clear inline dir and flag after data writeback */
- truncate_inline_inode(ipage, 0);
+ truncate_inline_inode(dir, ipage, 0);
stat_dec_inline_dir(dir);
clear_inode_flag(dir, FI_INLINE_DENTRY);
@@ -400,7 +402,7 @@ static int f2fs_add_inline_entries(struct inode *dir,
unsigned long bit_pos = 0;
int err = 0;
- make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+ make_dentry_ptr_inline(NULL, &d, inline_dentry);
while (bit_pos < d.max) {
struct f2fs_dir_entry *de;
@@ -455,7 +457,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
}
memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA);
- truncate_inline_inode(ipage, 0);
+ truncate_inline_inode(dir, ipage, 0);
unlock_page(ipage);
@@ -527,14 +529,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
err = PTR_ERR(page);
goto fail;
}
- if (f2fs_encrypted_inode(dir))
- file_set_enc_name(inode);
}
f2fs_wait_on_page_writeback(ipage, NODE, true);
- name_hash = f2fs_dentry_hash(new_name);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
+ name_hash = f2fs_dentry_hash(new_name, NULL);
+ make_dentry_ptr_inline(NULL, &d, dentry_blk);
f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
set_page_dirty(ipage);
@@ -623,7 +623,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
inline_dentry = inline_data_addr(ipage);
- make_dentry_ptr(inode, &d, (void *)inline_dentry, 2);
+ make_dentry_ptr_inline(inode, &d, inline_dentry);
err = f2fs_fill_dentries(ctx, &d, 0, fstr);
if (!err)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 24bb8213d974..518f49643092 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -316,7 +316,6 @@ retry:
} else if (err != -ENOENT) {
f2fs_stop_checkpoint(sbi, false);
}
- f2fs_inode_synced(inode);
return 0;
}
ret = update_inode(inode, node_page);
@@ -339,7 +338,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
* We need to balance fs here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
*/
- if (update_inode_page(inode) && wbc && wbc->nr_to_write)
+ update_inode_page(inode);
+ if (wbc && wbc->nr_to_write)
f2fs_balance_fs(sbi, true);
return 0;
}
@@ -372,13 +372,6 @@ void f2fs_evict_inode(struct inode *inode)
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
- f2fs_show_injection_info(FAULT_EVICT_INODE);
- goto no_delete;
- }
-#endif
-
remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
@@ -389,6 +382,12 @@ retry:
if (F2FS_HAS_BLOCKS(inode))
err = f2fs_truncate(inode);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
+ f2fs_show_injection_info(FAULT_EVICT_INODE);
+ err = -EIO;
+ }
+#endif
if (!err) {
f2fs_lock_op(sbi);
err = remove_inode_page(inode);
@@ -411,7 +410,10 @@ no_delete:
stat_dec_inline_dir(inode);
stat_dec_inline_inode(inode);
- invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
+ /* ino == 0 if f2fs_new_inode() failed */
+ if (inode->i_ino)
+ invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
+ inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
if (inode->i_nlink) {
@@ -448,6 +450,7 @@ void handle_failed_inode(struct inode *inode)
* in a panic when flushing dirty inodes in gdirty_list.
*/
update_inode_page(inode);
+ f2fs_inode_synced(inode);
/* don't make bad inode, since it becomes a regular file. */
unlock_new_inode(inode);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 98f00a3a7f50..c31b40e5f9cf 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -148,8 +148,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -163,6 +161,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out:
handle_failed_inode(inode);
@@ -324,9 +324,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
if (f2fs_encrypted_inode(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
- bool nokey = f2fs_encrypted_inode(inode) &&
- !fscrypt_has_encryption_key(inode);
- err = nokey ? -ENOKEY : -EPERM;
+ f2fs_msg(inode->i_sb, KERN_WARNING,
+ "Inconsistent encryption contexts: %lu/%lu",
+ dir->i_ino, inode->i_ino);
+ err = -EPERM;
goto err_out;
}
return d_splice_alias(inode, dentry);
@@ -423,8 +424,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -487,6 +486,8 @@ err_out:
}
kfree(sd);
+
+ f2fs_balance_fs(sbi, true);
return err;
out:
handle_failed_inode(inode);
@@ -508,8 +509,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_mapping->a_ops = &f2fs_dblock_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
- f2fs_balance_fs(sbi, true);
-
set_inode_flag(inode, FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
@@ -524,6 +523,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out_fail:
@@ -554,8 +555,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &f2fs_special_inode_operations;
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -569,6 +568,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ f2fs_balance_fs(sbi, true);
return 0;
out:
handle_failed_inode(inode);
@@ -595,8 +596,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
}
- f2fs_balance_fs(sbi, true);
-
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err)
@@ -622,6 +621,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
/* link_count was changed by d_tmpfile as well. */
f2fs_unlock_op(sbi);
unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi, true);
return 0;
release_out:
@@ -720,13 +721,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto put_out_dir;
- err = update_dent_inode(old_inode, new_inode,
- &new_dentry->d_name);
- if (err) {
- release_orphan_inode(sbi);
- goto put_out_dir;
- }
-
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
new_inode->i_ctime = current_time(new_inode);
@@ -779,8 +773,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
down_write(&F2FS_I(old_inode)->i_sem);
file_lost_pino(old_inode);
- if (new_inode && file_enc_name(new_inode))
- file_set_enc_name(old_inode);
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
@@ -908,8 +900,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
old_nlink = old_dir_entry ? -1 : 1;
new_nlink = -old_nlink;
err = -EMLINK;
- if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) ||
- (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX))
+ if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) ||
+ (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX))
goto out_new_dir;
}
@@ -917,18 +909,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_lock_op(sbi);
- err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name);
- if (err)
- goto out_unlock;
- if (file_enc_name(new_inode))
- file_set_enc_name(old_inode);
-
- err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name);
- if (err)
- goto out_undo;
- if (file_enc_name(old_inode))
- file_set_enc_name(new_inode);
-
/* update ".." directory entry info of old dentry */
if (old_dir_entry)
f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
@@ -972,14 +952,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
f2fs_sync_fs(sbi->sb, 1);
return 0;
-out_undo:
- /*
- * Still we may fail to recover name info of f2fs_inode here
- * Drop it, once its name is set as encrypted
- */
- update_dent_inode(old_inode, old_inode, &old_dentry->d_name);
-out_unlock:
- f2fs_unlock_op(sbi);
out_new_dir:
if (new_dir_entry) {
f2fs_dentry_kunmap(new_inode, new_dir_page);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 481aa8dc79f4..4547c5c5cd98 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -22,7 +22,7 @@
#include "trace.h"
#include <trace/events/f2fs.h>
-#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
+/*
+ * Test whether @nmi->build_lock is currently held. The expansion must use
+ * the macro argument; naming a caller-local "nm_i" here would silently
+ * ignore whatever was passed in.
+ */
+#define on_build_free_nids(nmi) mutex_is_locked(&(nmi)->build_lock)
static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
@@ -63,8 +63,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
int i;
for (i = 0; i <= UPDATE_INO; i++)
- mem_size += (sbi->im[i].ino_num *
- sizeof(struct ino_entry)) >> PAGE_SHIFT;
+ mem_size += sbi->im[i].ino_num *
+ sizeof(struct ino_entry);
+ mem_size >>= PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else if (type == EXTENT_CACHE) {
mem_size = (atomic_read(&sbi->total_ext_tree) *
@@ -177,18 +178,12 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
}
+/*
+ * Mark @ne clean: move it to the tail of nm_i->nat_entries, clear its
+ * IS_DIRTY flag, and decrement both @set->entry_cnt and
+ * nm_i->dirty_nat_cnt. The owning nat_entry_set is passed in by the caller
+ * rather than being looked up in nm_i->nat_set_root.
+ */
static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
- struct nat_entry *ne)
+ struct nat_entry_set *set, struct nat_entry *ne)
{
- nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
- struct nat_entry_set *head;
-
- head = radix_tree_lookup(&nm_i->nat_set_root, set);
- if (head) {
- list_move_tail(&ne->list, &nm_i->nat_entries);
- set_nat_flag(ne, IS_DIRTY, false);
- head->entry_cnt--;
- nm_i->dirty_nat_cnt--;
- }
+ list_move_tail(&ne->list, &nm_i->nat_entries);
+ set_nat_flag(ne, IS_DIRTY, false);
+ set->entry_cnt--;
+ nm_i->dirty_nat_cnt--;
}
static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
@@ -381,6 +376,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
struct page *page = NULL;
struct f2fs_nat_entry ne;
struct nat_entry *e;
+ pgoff_t index;
int i;
ni->nid = nid;
@@ -406,17 +402,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
node_info_from_raw_nat(ni, &ne);
}
up_read(&curseg->journal_rwsem);
- if (i >= 0)
+ if (i >= 0) {
+ up_read(&nm_i->nat_tree_lock);
goto cache;
+ }
/* Fill node_info from nat page */
- page = get_current_nat_page(sbi, start_nid);
+ index = current_nat_addr(sbi, nid);
+ up_read(&nm_i->nat_tree_lock);
+
+ page = get_meta_page(sbi, index);
nat_blk = (struct f2fs_nat_block *)page_address(page);
ne = nat_blk->entries[nid - start_nid];
node_info_from_raw_nat(ni, &ne);
f2fs_put_page(page, 1);
cache:
- up_read(&nm_i->nat_tree_lock);
/* cache nat entry */
down_write(&nm_i->nat_tree_lock);
cache_nat_entry(sbi, nid, &ne);
@@ -1463,6 +1463,9 @@ continue_unlock:
f2fs_wait_on_page_writeback(page, NODE, true);
BUG_ON(PageWriteback(page));
+ set_fsync_mark(page, 0);
+ set_dentry_mark(page, 0);
+
if (!atomic || page == last_page) {
set_fsync_mark(page, 1);
if (IS_INODE(page)) {
@@ -1766,40 +1769,67 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi,
+/*
+ * Try to put @nid on the free nid list.
+ *
+ * A free_nid entry is allocated up front and released again at the "err"
+ * label whenever err is non-zero, i.e. on every path where the entry was
+ * not successfully inserted. With @build set, the NAT cache and the free
+ * nid list are consulted under nid_list_lock before inserting.
+ *
+ * Returns false when @nid is 0 or radix_tree_preload() fails. With @build
+ * set it also returns false when the cached NAT entry is not checkpointed
+ * or has a block address other than NULL_ADDR, or when an entry for @nid
+ * already exists in a state other than NID_NEW. Returns true otherwise,
+ * including when the insertion itself reports an error.
+ */
static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct free_nid *i;
+ struct free_nid *i, *e;
struct nat_entry *ne;
- int err;
+ int err = -EINVAL; /* non-zero by default so early exits free 'i' */
+ bool ret = false;
/* 0 nid should not be used */
if (unlikely(nid == 0))
return false;
- if (build) {
- /* do not add allocated nids */
- ne = __lookup_nat_cache(nm_i, nid);
- if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
- nat_get_blkaddr(ne) != NULL_ADDR))
- return false;
- }
-
i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
i->nid = nid;
i->state = NID_NEW;
- if (radix_tree_preload(GFP_NOFS)) {
- kmem_cache_free(free_nid_slab, i);
- return true;
- }
+ if (radix_tree_preload(GFP_NOFS))
+ goto err;
spin_lock(&nm_i->nid_list_lock);
+
+ if (build) {
+ /*
+ * Thread A Thread B
+ * - f2fs_create
+ * - f2fs_new_inode
+ * - alloc_nid
+ * - __insert_nid_to_list(ALLOC_NID_LIST)
+ * - f2fs_balance_fs_bg
+ * - build_free_nids
+ * - __build_free_nids
+ * - scan_nat_page
+ * - add_free_nid
+ * - __lookup_nat_cache
+ * - f2fs_add_link
+ * - init_inode_metadata
+ * - new_inode_page
+ * - new_node_page
+ * - set_node_addr
+ * - alloc_nid_done
+ * - __remove_nid_from_list(ALLOC_NID_LIST)
+ * - __insert_nid_to_list(FREE_NID_LIST)
+ */
+ ne = __lookup_nat_cache(nm_i, nid);
+ if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
+ nat_get_blkaddr(ne) != NULL_ADDR))
+ goto err_out;
+
+ e = __lookup_free_nid_list(nm_i, nid);
+ if (e) {
+ if (e->state == NID_NEW)
+ ret = true;
+ goto err_out;
+ }
+ }
+ ret = true;
err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true);
+err_out:
spin_unlock(&nm_i->nid_list_lock);
radix_tree_preload_end();
- if (err) {
+err:
+ if (err)
kmem_cache_free(free_nid_slab, i);
- return true;
- }
- return true;
+ return ret;
}
static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
@@ -1821,7 +1851,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
}
static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
- bool set, bool build, bool locked)
+ bool set, bool build)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
@@ -1835,14 +1865,10 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
else
__clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
- if (!locked)
- spin_lock(&nm_i->free_nid_lock);
if (set)
nm_i->free_nid_count[nat_ofs]++;
else if (!build)
nm_i->free_nid_count[nat_ofs]--;
- if (!locked)
- spin_unlock(&nm_i->free_nid_lock);
}
static void scan_nat_page(struct f2fs_sb_info *sbi,
@@ -1871,7 +1897,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
if (blk_addr == NULL_ADDR)
freed = add_free_nid(sbi, start_nid, true);
- update_free_nid_bitmap(sbi, start_nid, freed, true, false);
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ update_free_nid_bitmap(sbi, start_nid, freed, true);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
}
}
@@ -1927,6 +1955,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
int i = 0;
nid_t nid = nm_i->next_scan_nid;
+ if (unlikely(nid >= nm_i->max_nid))
+ nid = 0;
+
/* Enough entries */
if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK)
return;
@@ -2026,7 +2057,7 @@ retry:
__insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false);
nm_i->available_nids--;
- update_free_nid_bitmap(sbi, *nid, false, false, false);
+ update_free_nid_bitmap(sbi, *nid, false, false);
spin_unlock(&nm_i->nid_list_lock);
return true;
@@ -2082,7 +2113,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
nm_i->available_nids++;
- update_free_nid_bitmap(sbi, nid, true, false, false);
+ update_free_nid_bitmap(sbi, nid, true, false);
spin_unlock(&nm_i->nid_list_lock);
@@ -2407,16 +2438,16 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
}
raw_nat_from_node_info(raw_ne, &ne->ni);
nat_reset_flag(ne);
- __clear_nat_cache_dirty(NM_I(sbi), ne);
+ __clear_nat_cache_dirty(NM_I(sbi), set, ne);
if (nat_get_blkaddr(ne) == NULL_ADDR) {
add_free_nid(sbi, nid, false);
spin_lock(&NM_I(sbi)->nid_list_lock);
NM_I(sbi)->available_nids++;
- update_free_nid_bitmap(sbi, nid, true, false, false);
+ update_free_nid_bitmap(sbi, nid, true, false);
spin_unlock(&NM_I(sbi)->nid_list_lock);
} else {
spin_lock(&NM_I(sbi)->nid_list_lock);
- update_free_nid_bitmap(sbi, nid, false, false, false);
+ update_free_nid_bitmap(sbi, nid, false, false);
spin_unlock(&NM_I(sbi)->nid_list_lock);
}
}
@@ -2428,10 +2459,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
f2fs_put_page(page, 1);
}
- f2fs_bug_on(sbi, set->entry_cnt);
-
- radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
- kmem_cache_free(nat_entry_set_slab, set);
+ /* Allow dirty nats by node block allocation in write_begin */
+ if (!set->entry_cnt) {
+ radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+ kmem_cache_free(nat_entry_set_slab, set);
+ }
}
/*
@@ -2476,8 +2508,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
__flush_nat_entry_set(sbi, set, cpc);
up_write(&nm_i->nat_tree_lock);
-
- f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
+ /* Allow dirty nats by node block allocation in write_begin */
}
static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
@@ -2541,10 +2572,10 @@ inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
nid = i * NAT_ENTRY_PER_BLOCK;
last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK;
- spin_lock(&nm_i->free_nid_lock);
+ spin_lock(&NM_I(sbi)->nid_list_lock);
for (; nid < last_nid; nid++)
- update_free_nid_bitmap(sbi, nid, true, true, true);
- spin_unlock(&nm_i->free_nid_lock);
+ update_free_nid_bitmap(sbi, nid, true, true);
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
}
for (i = 0; i < nm_i->nat_blocks; i++) {
@@ -2621,23 +2652,20 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks *
+ nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks *
NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL);
if (!nm_i->free_nid_bitmap)
return -ENOMEM;
- nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8,
+ nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8,
GFP_KERNEL);
if (!nm_i->nat_block_bitmap)
return -ENOMEM;
- nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks *
+ nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks *
sizeof(unsigned short), GFP_KERNEL);
if (!nm_i->free_nid_count)
return -ENOMEM;
-
- spin_lock_init(&nm_i->free_nid_lock);
-
return 0;
}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 2f9603fa85a5..558048e33cf9 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -9,10 +9,10 @@
* published by the Free Software Foundation.
*/
/* start node id of a node block dedicated to the given node id */
-#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
+#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
/* node block offset on the NAT area dedicated to the given start node id */
-#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
+#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK)
/* # of pages to perform synchronous readahead before building free nids */
#define FREE_NID_PAGES 8
@@ -62,16 +62,16 @@ struct nat_entry {
struct node_info ni; /* in-memory node information */
};
-#define nat_get_nid(nat) (nat->ni.nid)
-#define nat_set_nid(nat, n) (nat->ni.nid = n)
-#define nat_get_blkaddr(nat) (nat->ni.blk_addr)
-#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b)
-#define nat_get_ino(nat) (nat->ni.ino)
-#define nat_set_ino(nat, i) (nat->ni.ino = i)
-#define nat_get_version(nat) (nat->ni.version)
-#define nat_set_version(nat, v) (nat->ni.version = v)
+/*
+ * Accessors for the node_info (->ni) embedded in a nat_entry. Both the
+ * entry argument and the assigned value are parenthesized so that
+ * arbitrary expressions can be passed.
+ */
+#define nat_get_nid(nat) ((nat)->ni.nid)
+#define nat_set_nid(nat, n) ((nat)->ni.nid = (n))
+#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr)
+#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b))
+#define nat_get_ino(nat) ((nat)->ni.ino)
+#define nat_set_ino(nat, i) ((nat)->ni.ino = (i))
+#define nat_get_version(nat) ((nat)->ni.version)
+#define nat_set_version(nat, v) ((nat)->ni.version = (v))
-#define inc_node_version(version) (++version)
+/* pre-increments @version in place, so the argument must be an lvalue */
+#define inc_node_version(version) (++(version))
static inline void copy_node_info(struct node_info *dst,
struct node_info *src)
@@ -200,13 +200,16 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
struct f2fs_nm_info *nm_i = NM_I(sbi);
pgoff_t block_off;
pgoff_t block_addr;
- int seg_off;
+ /*
+ * block_off = segment_off * 512 + off_in_segment
+ * OLD = (segment_off * 512) * 2 + off_in_segment
+ * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment
+ */
block_off = NAT_BLOCK_OFFSET(start);
- seg_off = block_off >> sbi->log_blocks_per_seg;
block_addr = (pgoff_t)(nm_i->nat_blkaddr +
- (seg_off << sbi->log_blocks_per_seg << 1) +
+ (block_off << 1) -
(block_off & (sbi->blocks_per_seg - 1)));
if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d025aa83fb5b..907d6b7dde6a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -198,7 +198,8 @@ static void recover_inode(struct inode *inode, struct page *page)
ino_of_node(page), name);
}
-static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
+static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
+ bool check_only)
{
struct curseg_info *curseg;
struct page *page = NULL;
@@ -225,7 +226,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
entry = get_fsync_inode(head, ino_of_node(page));
if (!entry) {
- if (IS_INODE(page) && is_dent_dnode(page)) {
+ if (!check_only &&
+ IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
break;
@@ -569,7 +571,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
mutex_lock(&sbi->cp_mutex);
/* step #1: find fsynced inode numbers */
- err = find_fsync_dnodes(sbi, &inode_list);
+ err = find_fsync_dnodes(sbi, &inode_list, check_only);
if (err || list_empty(&inode_list))
goto out;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 29ef7088c558..96845854e7ee 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -250,6 +250,36 @@ void drop_inmem_pages(struct inode *inode)
stat_dec_atomic_write(inode);
}
+/*
+ * Drop a single atomic-written @page from @inode's in-memory page list:
+ * unlink and free its inmem_pages entry, decrement the F2FS_INMEM_PAGES
+ * count, clear the page's uptodate/private state and release the page.
+ *
+ * If @page is unexpectedly not on the list, f2fs_bug_on() reports it and
+ * the function returns without touching the list or the page.
+ */
+void drop_inmem_page(struct inode *inode, struct page *page)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct list_head *head = &fi->inmem_pages;
+	struct inmem_pages *cur;
+	struct inmem_pages *found = NULL;
+
+	f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page));
+
+	mutex_lock(&fi->inmem_lock);
+	/*
+	 * list_for_each_entry() never leaves its cursor NULL: when the loop
+	 * runs to completion the cursor aliases the list head itself. Record
+	 * a match in a separate pointer instead of inspecting the cursor.
+	 */
+	list_for_each_entry(cur, head, list) {
+		if (cur->page == page) {
+			found = cur;
+			break;
+		}
+	}
+
+	f2fs_bug_on(sbi, !found);
+	if (!found) {
+		/* never unlink or free the list head by mistake */
+		mutex_unlock(&fi->inmem_lock);
+		return;
+	}
+	list_del(&found->list);
+	mutex_unlock(&fi->inmem_lock);
+
+	dec_page_count(sbi, F2FS_INMEM_PAGES);
+	kmem_cache_free(inmem_entry_slab, found);
+
+	ClearPageUptodate(page);
+	set_page_private(page, 0);
+	ClearPagePrivate(page);
+	f2fs_put_page(page, 0);
+
+	trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE);
+}
+
+
static int __commit_inmem_pages(struct inode *inode,
struct list_head *revoke_list)
{
@@ -261,7 +291,6 @@ static int __commit_inmem_pages(struct inode *inode,
.type = DATA,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC | REQ_PRIO,
- .encrypted_page = NULL,
};
pgoff_t last_idx = ULONG_MAX;
int err = 0;
@@ -281,6 +310,9 @@ static int __commit_inmem_pages(struct inode *inode,
}
fio.page = page;
+ fio.old_blkaddr = NULL_ADDR;
+ fio.encrypted_page = NULL;
+ fio.need_lock = false,
err = do_write_data_page(&fio);
if (err) {
unlock_page(page);
@@ -358,11 +390,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
}
#endif
- if (!need)
- return;
-
/* balance_fs_bg is able to be pending */
- if (excess_cached_nats(sbi))
+ if (need && excess_cached_nats(sbi))
f2fs_balance_fs_bg(sbi);
/*
@@ -371,7 +400,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
*/
if (has_not_enough_free_secs(sbi, 0, 0)) {
mutex_lock(&sbi->gc_mutex);
- f2fs_gc(sbi, false, false);
+ f2fs_gc(sbi, false, false, NULL_SEGNO);
}
}
@@ -390,7 +419,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
else
build_free_nids(sbi, false, false);
- if (!is_idle(sbi))
+ if (!is_idle(sbi) && !excess_dirty_nats(sbi))
return;
/* checkpoint is the only way to shrink partial cached entries */
@@ -411,32 +440,34 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
}
}
-static int __submit_flush_wait(struct block_device *bdev)
+/*
+ * Submit a bio obtained from f2fs_bio_alloc(0) to @bdev with
+ * REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH and wait for it with
+ * submit_bio_wait(). The f2fs_issue_flush trace event is emitted afterwards
+ * with the NOBARRIER/FLUSH_MERGE mount option state of @sbi and the result.
+ * Returns the value reported by submit_bio_wait().
+ */
+static int __submit_flush_wait(struct f2fs_sb_info *sbi,
+ struct block_device *bdev)
{
struct bio *bio = f2fs_bio_alloc(0);
int ret;
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
bio->bi_bdev = bdev;
ret = submit_bio_wait(bio);
bio_put(bio);
+
+ trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
+ test_opt(sbi, FLUSH_MERGE), ret);
return ret;
}
+/*
+ * Flush sbi->sb->s_bdev first; if that succeeds and sbi->s_ndevs is
+ * non-zero, flush FDEV(1) .. FDEV(s_ndevs - 1) in order. Stops at the first
+ * non-zero result and returns it; returns 0 when every flush succeeded.
+ */
static int submit_flush_wait(struct f2fs_sb_info *sbi)
{
- int ret = __submit_flush_wait(sbi->sb->s_bdev);
+ int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev);
int i;
- if (sbi->s_ndevs && !ret) {
- for (i = 1; i < sbi->s_ndevs; i++) {
- trace_f2fs_issue_flush(FDEV(i).bdev,
- test_opt(sbi, NOBARRIER),
- test_opt(sbi, FLUSH_MERGE));
- ret = __submit_flush_wait(FDEV(i).bdev);
- if (ret)
- break;
- }
+ if (!sbi->s_ndevs || ret)
+ return ret;
+
+ for (i = 1; i < sbi->s_ndevs; i++) {
+ ret = __submit_flush_wait(sbi, FDEV(i).bdev);
+ if (ret)
+ break;
}
return ret;
}
@@ -458,6 +489,8 @@ repeat:
fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
ret = submit_flush_wait(sbi);
+ atomic_inc(&fcc->issued_flush);
+
llist_for_each_entry_safe(cmd, next,
fcc->dispatch_list, llnode) {
cmd->ret = ret;
@@ -475,25 +508,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
{
struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
struct flush_cmd cmd;
+ int ret;
if (test_opt(sbi, NOBARRIER))
return 0;
- if (!test_opt(sbi, FLUSH_MERGE))
- return submit_flush_wait(sbi);
-
- if (!atomic_read(&fcc->submit_flush)) {
- int ret;
+ if (!test_opt(sbi, FLUSH_MERGE)) {
+ ret = submit_flush_wait(sbi);
+ atomic_inc(&fcc->issued_flush);
+ return ret;
+ }
- atomic_inc(&fcc->submit_flush);
+ if (!atomic_read(&fcc->issing_flush)) {
+ atomic_inc(&fcc->issing_flush);
ret = submit_flush_wait(sbi);
- atomic_dec(&fcc->submit_flush);
+ atomic_dec(&fcc->issing_flush);
+
+ atomic_inc(&fcc->issued_flush);
return ret;
}
init_completion(&cmd.wait);
- atomic_inc(&fcc->submit_flush);
+ atomic_inc(&fcc->issing_flush);
llist_add(&cmd.llnode, &fcc->issue_list);
if (!fcc->dispatch_list)
@@ -501,10 +538,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
if (fcc->f2fs_issue_flush) {
wait_for_completion(&cmd.wait);
- atomic_dec(&fcc->submit_flush);
+ atomic_dec(&fcc->issing_flush);
} else {
llist_del_all(&fcc->issue_list);
- atomic_set(&fcc->submit_flush, 0);
+ atomic_set(&fcc->issing_flush, 0);
}
return cmd.ret;
@@ -524,7 +561,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
if (!fcc)
return -ENOMEM;
- atomic_set(&fcc->submit_flush, 0);
+ atomic_set(&fcc->issued_flush, 0);
+ atomic_set(&fcc->issing_flush, 0);
init_waitqueue_head(&fcc->flush_wait_queue);
init_llist_head(&fcc->issue_list);
SM_I(sbi)->fcc_info = fcc;
@@ -597,8 +635,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
dirty_i->nr_dirty[t]--;
- if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
- clear_bit(GET_SECNO(sbi, segno),
+ if (get_valid_blocks(sbi, segno, true) == 0)
+ clear_bit(GET_SEC_FROM_SEG(sbi, segno),
dirty_i->victim_secmap);
}
}
@@ -618,7 +656,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
mutex_lock(&dirty_i->seglist_lock);
- valid_blocks = get_valid_blocks(sbi, segno, 0);
+ valid_blocks = get_valid_blocks(sbi, segno, false);
if (valid_blocks == 0) {
__locate_dirty_segment(sbi, segno, PRE);
@@ -633,162 +671,407 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
mutex_unlock(&dirty_i->seglist_lock);
}
-static void __add_discard_cmd(struct f2fs_sb_info *sbi,
- struct bio *bio, block_t lstart, block_t len)
+static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
- struct list_head *cmd_list = &(dcc->discard_cmd_list);
+ struct list_head *pend_list;
struct discard_cmd *dc;
+ f2fs_bug_on(sbi, !len);
+
+ pend_list = &dcc->pend_list[plist_idx(len)];
+
dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS);
INIT_LIST_HEAD(&dc->list);
- dc->bio = bio;
- bio->bi_private = dc;
+ dc->bdev = bdev;
dc->lstart = lstart;
+ dc->start = start;
dc->len = len;
+ dc->ref = 0;
dc->state = D_PREP;
+ dc->error = 0;
init_completion(&dc->wait);
+ list_add_tail(&dc->list, pend_list);
+ atomic_inc(&dcc->discard_cmd_cnt);
+ dcc->undiscard_blks += len;
- mutex_lock(&dcc->cmd_lock);
- list_add_tail(&dc->list, cmd_list);
- mutex_unlock(&dcc->cmd_lock);
+ return dc;
}
-static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc)
+static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len,
+ struct rb_node *parent, struct rb_node **p)
{
- int err = dc->bio->bi_error;
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *dc;
- if (dc->state == D_DONE)
- atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard));
+ dc = __create_discard_cmd(sbi, bdev, lstart, start, len);
- if (err == -EOPNOTSUPP)
- err = 0;
+ rb_link_node(&dc->rb_node, parent, p);
+ rb_insert_color(&dc->rb_node, &dcc->root);
+
+ return dc;
+}
+
+static void __detach_discard_cmd(struct discard_cmd_control *dcc,
+ struct discard_cmd *dc)
+{
+ if (dc->state == D_DONE)
+ atomic_dec(&dcc->issing_discard);
- if (err)
- f2fs_msg(sbi->sb, KERN_INFO,
- "Issue discard failed, ret: %d", err);
- bio_put(dc->bio);
list_del(&dc->list);
+ rb_erase(&dc->rb_node, &dcc->root);
+ dcc->undiscard_blks -= dc->len;
+
kmem_cache_free(discard_cmd_slab, dc);
+
+ atomic_dec(&dcc->discard_cmd_cnt);
}
-/* This should be covered by global mutex, &sit_i->sentry_lock */
-void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
+static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
- struct list_head *wait_list = &(dcc->discard_cmd_list);
- struct discard_cmd *dc, *tmp;
- struct blk_plug plug;
- mutex_lock(&dcc->cmd_lock);
+ if (dc->error == -EOPNOTSUPP)
+ dc->error = 0;
- blk_start_plug(&plug);
+ if (dc->error)
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Issue discard failed, ret: %d", dc->error);
+ __detach_discard_cmd(dcc, dc);
+}
- list_for_each_entry_safe(dc, tmp, wait_list, list) {
+static void f2fs_submit_discard_endio(struct bio *bio)
+{
+ struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
- if (blkaddr == NULL_ADDR) {
- if (dc->state == D_PREP) {
- dc->state = D_SUBMIT;
- submit_bio(dc->bio);
- atomic_inc(&dcc->submit_discard);
- }
- continue;
+ dc->error = bio->bi_error;
+ dc->state = D_DONE;
+ complete(&dc->wait);
+ bio_put(bio);
+}
+
+/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
+static void __submit_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct bio *bio = NULL;
+
+ if (dc->state != D_PREP)
+ return;
+
+ trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len);
+
+ dc->error = __blkdev_issue_discard(dc->bdev,
+ SECTOR_FROM_BLOCK(dc->start),
+ SECTOR_FROM_BLOCK(dc->len),
+ GFP_NOFS, 0, &bio);
+ if (!dc->error) {
+ /* should keep before submission to avoid D_DONE right away */
+ dc->state = D_SUBMIT;
+ atomic_inc(&dcc->issued_discard);
+ atomic_inc(&dcc->issing_discard);
+ if (bio) {
+ bio->bi_private = dc;
+ bio->bi_end_io = f2fs_submit_discard_endio;
+ bio->bi_opf |= REQ_SYNC;
+ submit_bio(bio);
+ list_move_tail(&dc->list, &dcc->wait_list);
}
+ } else {
+ __remove_discard_cmd(sbi, dc);
+ }
+}
- if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) {
- if (dc->state == D_SUBMIT)
- wait_for_completion_io(&dc->wait);
- else
- __remove_discard_cmd(sbi, dc);
+static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len,
+ struct rb_node **insert_p,
+ struct rb_node *insert_parent)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct rb_node **p = &dcc->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct discard_cmd *dc = NULL;
+
+ if (insert_p && insert_parent) {
+ parent = insert_parent;
+ p = insert_p;
+ goto do_insert;
+ }
+
+ p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart);
+do_insert:
+ dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p);
+ if (!dc)
+ return NULL;
+
+ return dc;
+}
+
+static void __relocate_discard_cmd(struct discard_cmd_control *dcc,
+ struct discard_cmd *dc)
+{
+ list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]);
+}
+
+static void __punch_discard_cmd(struct f2fs_sb_info *sbi,
+ struct discard_cmd *dc, block_t blkaddr)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_info di = dc->di;
+ bool modified = false;
+
+ if (dc->state == D_DONE || dc->len == 1) {
+ __remove_discard_cmd(sbi, dc);
+ return;
+ }
+
+ dcc->undiscard_blks -= di.len;
+
+ if (blkaddr > di.lstart) {
+ dc->len = blkaddr - dc->lstart;
+ dcc->undiscard_blks += dc->len;
+ __relocate_discard_cmd(dcc, dc);
+ f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root));
+ modified = true;
+ }
+
+ if (blkaddr < di.lstart + di.len - 1) {
+ if (modified) {
+ __insert_discard_tree(sbi, dc->bdev, blkaddr + 1,
+ di.start + blkaddr + 1 - di.lstart,
+ di.lstart + di.len - 1 - blkaddr,
+ NULL, NULL);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ } else {
+ dc->lstart++;
+ dc->len--;
+ dc->start++;
+ dcc->undiscard_blks += dc->len;
+ __relocate_discard_cmd(dcc, dc);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
}
}
- blk_finish_plug(&plug);
+}
- /* this comes from f2fs_put_super */
- if (blkaddr == NULL_ADDR) {
- list_for_each_entry_safe(dc, tmp, wait_list, list) {
- wait_for_completion_io(&dc->wait);
- __remove_discard_cmd(sbi, dc);
+static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t lstart,
+ block_t start, block_t len)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
+ struct discard_cmd *dc;
+ struct discard_info di = {0};
+ struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ block_t end = lstart + len;
+
+ mutex_lock(&dcc->cmd_lock);
+
+ dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root,
+ NULL, lstart,
+ (struct rb_entry **)&prev_dc,
+ (struct rb_entry **)&next_dc,
+ &insert_p, &insert_parent, true);
+ if (dc)
+ prev_dc = dc;
+
+ if (!prev_dc) {
+ di.lstart = lstart;
+ di.len = next_dc ? next_dc->lstart - lstart : len;
+ di.len = min(di.len, len);
+ di.start = start;
+ }
+
+ while (1) {
+ struct rb_node *node;
+ bool merged = false;
+ struct discard_cmd *tdc = NULL;
+
+ if (prev_dc) {
+ di.lstart = prev_dc->lstart + prev_dc->len;
+ if (di.lstart < lstart)
+ di.lstart = lstart;
+ if (di.lstart >= end)
+ break;
+
+ if (!next_dc || next_dc->lstart > end)
+ di.len = end - di.lstart;
+ else
+ di.len = next_dc->lstart - di.lstart;
+ di.start = start + di.lstart - lstart;
+ }
+
+ if (!di.len)
+ goto next;
+
+ if (prev_dc && prev_dc->state == D_PREP &&
+ prev_dc->bdev == bdev &&
+ __is_discard_back_mergeable(&di, &prev_dc->di)) {
+ prev_dc->di.len += di.len;
+ dcc->undiscard_blks += di.len;
+ __relocate_discard_cmd(dcc, prev_dc);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ di = prev_dc->di;
+ tdc = prev_dc;
+ merged = true;
+ }
+
+ if (next_dc && next_dc->state == D_PREP &&
+ next_dc->bdev == bdev &&
+ __is_discard_front_mergeable(&di, &next_dc->di)) {
+ next_dc->di.lstart = di.lstart;
+ next_dc->di.len += di.len;
+ next_dc->di.start = di.start;
+ dcc->undiscard_blks += di.len;
+ __relocate_discard_cmd(dcc, next_dc);
+ if (tdc)
+ __remove_discard_cmd(sbi, tdc);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
+ merged = true;
+ }
+
+ if (!merged) {
+ __insert_discard_tree(sbi, bdev, di.lstart, di.start,
+ di.len, NULL, NULL);
+ f2fs_bug_on(sbi,
+ !__check_rb_tree_consistence(sbi, &dcc->root));
}
+ next:
+ prev_dc = next_dc;
+ if (!prev_dc)
+ break;
+
+ node = rb_next(&prev_dc->rb_node);
+ next_dc = rb_entry_safe(node, struct discard_cmd, rb_node);
}
+
mutex_unlock(&dcc->cmd_lock);
}
-static void f2fs_submit_discard_endio(struct bio *bio)
+static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
{
- struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
+ block_t lblkstart = blkstart;
- complete(&dc->wait);
- dc->state = D_DONE;
+ trace_f2fs_queue_discard(bdev, blkstart, blklen);
+
+ if (sbi->s_ndevs) {
+ int devi = f2fs_target_device_index(sbi, blkstart);
+
+ blkstart -= FDEV(devi).start_blk;
+ }
+ __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen);
+ return 0;
}
-static int issue_discard_thread(void *data)
+static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond)
{
- struct f2fs_sb_info *sbi = data;
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
- wait_queue_head_t *q = &dcc->discard_wait_queue;
- struct list_head *cmd_list = &dcc->discard_cmd_list;
+ struct list_head *pend_list;
struct discard_cmd *dc, *tmp;
struct blk_plug plug;
- int iter = 0;
-repeat:
- if (kthread_should_stop())
- return 0;
+ int i, iter = 0;
+ mutex_lock(&dcc->cmd_lock);
blk_start_plug(&plug);
+ for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ pend_list = &dcc->pend_list[i];
+ list_for_each_entry_safe(dc, tmp, pend_list, list) {
+ f2fs_bug_on(sbi, dc->state != D_PREP);
+
+ if (!issue_cond || is_idle(sbi))
+ __submit_discard_cmd(sbi, dc);
+ if (issue_cond && iter++ > DISCARD_ISSUE_RATE)
+ goto out;
+ }
+ }
+out:
+ blk_finish_plug(&plug);
+ mutex_unlock(&dcc->cmd_lock);
+}
+
+static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *wait_list = &(dcc->wait_list);
+ struct discard_cmd *dc, *tmp;
mutex_lock(&dcc->cmd_lock);
- list_for_each_entry_safe(dc, tmp, cmd_list, list) {
- if (dc->state == D_PREP) {
- dc->state = D_SUBMIT;
- submit_bio(dc->bio);
- atomic_inc(&dcc->submit_discard);
- if (iter++ > DISCARD_ISSUE_RATE)
- break;
- } else if (dc->state == D_DONE) {
+ list_for_each_entry_safe(dc, tmp, wait_list, list) {
+ if (!wait_cond || dc->state == D_DONE) {
+ if (dc->ref)
+ continue;
+ wait_for_completion_io(&dc->wait);
__remove_discard_cmd(sbi, dc);
}
}
mutex_unlock(&dcc->cmd_lock);
+}
- blk_finish_plug(&plug);
+/* This should be covered by global mutex, &sit_i->sentry_lock */
+void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct discard_cmd *dc;
+ bool need_wait = false;
- iter = 0;
- congestion_wait(BLK_RW_SYNC, HZ/50);
+ mutex_lock(&dcc->cmd_lock);
+ dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr);
+ if (dc) {
+ if (dc->state == D_PREP) {
+ __punch_discard_cmd(sbi, dc, blkaddr);
+ } else {
+ dc->ref++;
+ need_wait = true;
+ }
+ }
+ mutex_unlock(&dcc->cmd_lock);
- wait_event_interruptible(*q,
- kthread_should_stop() || !list_empty(&dcc->discard_cmd_list));
- goto repeat;
+ if (need_wait) {
+ wait_for_completion_io(&dc->wait);
+ mutex_lock(&dcc->cmd_lock);
+ f2fs_bug_on(sbi, dc->state != D_DONE);
+ dc->ref--;
+ if (!dc->ref)
+ __remove_discard_cmd(sbi, dc);
+ mutex_unlock(&dcc->cmd_lock);
+ }
}
-
-/* this function is copied from blkdev_issue_discard from block/blk-lib.c */
-static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
- struct block_device *bdev, block_t blkstart, block_t blklen)
+/* This comes from f2fs_put_super */
+void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
{
- struct bio *bio = NULL;
- block_t lblkstart = blkstart;
- int err;
+ __issue_discard_cmd(sbi, false);
+ __wait_discard_cmd(sbi, false);
+}
- trace_f2fs_issue_discard(bdev, blkstart, blklen);
+static int issue_discard_thread(void *data)
+{
+ struct f2fs_sb_info *sbi = data;
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ wait_queue_head_t *q = &dcc->discard_wait_queue;
+repeat:
+ if (kthread_should_stop())
+ return 0;
- if (sbi->s_ndevs) {
- int devi = f2fs_target_device_index(sbi, blkstart);
+ __issue_discard_cmd(sbi, true);
+ __wait_discard_cmd(sbi, true);
- blkstart -= FDEV(devi).start_blk;
- }
- err = __blkdev_issue_discard(bdev,
- SECTOR_FROM_BLOCK(blkstart),
- SECTOR_FROM_BLOCK(blklen),
- GFP_NOFS, 0, &bio);
- if (!err && bio) {
- bio->bi_end_io = f2fs_submit_discard_endio;
- bio->bi_opf |= REQ_SYNC;
+ congestion_wait(BLK_RW_SYNC, HZ/50);
- __add_discard_cmd(sbi, bio, lblkstart, blklen);
- wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue);
- }
- return err;
+ wait_event_interruptible(*q, kthread_should_stop() ||
+ atomic_read(&dcc->discard_cmd_cnt));
+ goto repeat;
}
#ifdef CONFIG_BLK_DEV_ZONED
@@ -796,6 +1079,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
struct block_device *bdev, block_t blkstart, block_t blklen)
{
sector_t sector, nr_sects;
+ block_t lblkstart = blkstart;
int devi = 0;
if (sbi->s_ndevs) {
@@ -813,7 +1097,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
case BLK_ZONE_TYPE_CONVENTIONAL:
if (!blk_queue_discard(bdev_get_queue(bdev)))
return 0;
- return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+ return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
case BLK_ZONE_TYPE_SEQWRITE_REQ:
case BLK_ZONE_TYPE_SEQWRITE_PREF:
sector = SECTOR_FROM_BLOCK(blkstart);
@@ -845,7 +1129,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi,
bdev_zoned_model(bdev) != BLK_ZONED_NONE)
return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
#endif
- return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+ return __queue_discard_cmd(sbi, bdev, blkstart, blklen);
}
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
@@ -888,32 +1172,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
return err;
}
-static void __add_discard_entry(struct f2fs_sb_info *sbi,
- struct cp_control *cpc, struct seg_entry *se,
- unsigned int start, unsigned int end)
-{
- struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list;
- struct discard_entry *new, *last;
-
- if (!list_empty(head)) {
- last = list_last_entry(head, struct discard_entry, list);
- if (START_BLOCK(sbi, cpc->trim_start) + start ==
- last->blkaddr + last->len &&
- last->len < MAX_DISCARD_BLOCKS(sbi)) {
- last->len += end - start;
- goto done;
- }
- }
-
- new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
- INIT_LIST_HEAD(&new->list);
- new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
- new->len = end - start;
- list_add_tail(&new->list, head);
-done:
- SM_I(sbi)->dcc_info->nr_discards += end - start;
-}
-
static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
bool check_only)
{
@@ -925,7 +1183,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
unsigned long *discard_map = (unsigned long *)se->discard_map;
unsigned long *dmap = SIT_I(sbi)->tmp_map;
unsigned int start = 0, end = -1;
- bool force = (cpc->reason == CP_DISCARD);
+ bool force = (cpc->reason & CP_DISCARD);
+ struct discard_entry *de = NULL;
+ struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
int i;
if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi))
@@ -957,14 +1217,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
if (check_only)
return true;
- __add_discard_entry(sbi, cpc, se, start, end);
+ if (!de) {
+ de = f2fs_kmem_cache_alloc(discard_entry_slab,
+ GFP_F2FS_ZERO);
+ de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start);
+ list_add_tail(&de->list, head);
+ }
+
+ for (i = start; i < end; i++)
+ __set_bit_le(i, (void *)de->discard_map);
+
+ SM_I(sbi)->dcc_info->nr_discards += end - start;
}
return false;
}
void release_discard_addrs(struct f2fs_sb_info *sbi)
{
- struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list);
+ struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
struct discard_entry *entry, *this;
/* drop caches */
@@ -990,13 +1260,13 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
- struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list);
+ struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
struct discard_entry *entry, *this;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
unsigned int start = 0, end = -1;
unsigned int secno, start_segno;
- bool force = (cpc->reason == CP_DISCARD);
+ bool force = (cpc->reason & CP_DISCARD);
mutex_lock(&dirty_i->seglist_lock);
@@ -1026,10 +1296,10 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
continue;
}
next:
- secno = GET_SECNO(sbi, start);
- start_segno = secno * sbi->segs_per_sec;
+ secno = GET_SEC_FROM_SEG(sbi, start);
+ start_segno = GET_SEG_FROM_SEC(sbi, secno);
if (!IS_CURSEC(sbi, secno) &&
- !get_valid_blocks(sbi, start, sbi->segs_per_sec))
+ !get_valid_blocks(sbi, start, true))
f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
sbi->segs_per_sec << sbi->log_blocks_per_seg);
@@ -1043,22 +1313,46 @@ next:
/* send small discards */
list_for_each_entry_safe(entry, this, head, list) {
- if (force && entry->len < cpc->trim_minlen)
- goto skip;
- f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
- cpc->trimmed += entry->len;
+ unsigned int cur_pos = 0, next_pos, len, total_len = 0;
+ bool is_valid = test_bit_le(0, entry->discard_map);
+
+find_next:
+ if (is_valid) {
+ next_pos = find_next_zero_bit_le(entry->discard_map,
+ sbi->blocks_per_seg, cur_pos);
+ len = next_pos - cur_pos;
+
+ if (force && len < cpc->trim_minlen)
+ goto skip;
+
+ f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos,
+ len);
+ cpc->trimmed += len;
+ total_len += len;
+ } else {
+ next_pos = find_next_bit_le(entry->discard_map,
+ sbi->blocks_per_seg, cur_pos);
+ }
skip:
+ cur_pos = next_pos;
+ is_valid = !is_valid;
+
+ if (cur_pos < sbi->blocks_per_seg)
+ goto find_next;
+
list_del(&entry->list);
- SM_I(sbi)->dcc_info->nr_discards -= entry->len;
+ SM_I(sbi)->dcc_info->nr_discards -= total_len;
kmem_cache_free(discard_entry_slab, entry);
}
+
+ wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue);
}
static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
struct discard_cmd_control *dcc;
- int err = 0;
+ int err = 0, i;
if (SM_I(sbi)->dcc_info) {
dcc = SM_I(sbi)->dcc_info;
@@ -1069,12 +1363,18 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
if (!dcc)
return -ENOMEM;
- INIT_LIST_HEAD(&dcc->discard_entry_list);
- INIT_LIST_HEAD(&dcc->discard_cmd_list);
+ INIT_LIST_HEAD(&dcc->entry_list);
+ for (i = 0; i < MAX_PLIST_NUM; i++)
+ INIT_LIST_HEAD(&dcc->pend_list[i]);
+ INIT_LIST_HEAD(&dcc->wait_list);
mutex_init(&dcc->cmd_lock);
- atomic_set(&dcc->submit_discard, 0);
+ atomic_set(&dcc->issued_discard, 0);
+ atomic_set(&dcc->issing_discard, 0);
+ atomic_set(&dcc->discard_cmd_cnt, 0);
dcc->nr_discards = 0;
- dcc->max_discards = 0;
+ dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+ dcc->undiscard_blks = 0;
+ dcc->root = RB_ROOT;
init_waitqueue_head(&dcc->discard_wait_queue);
SM_I(sbi)->dcc_info = dcc;
@@ -1091,20 +1391,22 @@ init_thread:
return err;
}
-static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free)
+static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
- if (dcc && dcc->f2fs_issue_discard) {
+ if (!dcc)
+ return;
+
+ if (dcc->f2fs_issue_discard) {
struct task_struct *discard_thread = dcc->f2fs_issue_discard;
dcc->f2fs_issue_discard = NULL;
kthread_stop(discard_thread);
}
- if (free) {
- kfree(dcc);
- SM_I(sbi)->dcc_info = NULL;
- }
+
+ kfree(dcc);
+ SM_I(sbi)->dcc_info = NULL;
}
static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
@@ -1345,6 +1647,17 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
f2fs_put_page(page, 1);
}
+static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned int segno = curseg->segno + 1;
+ struct free_segmap_info *free_i = FREE_I(sbi);
+
+ if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
+ return !test_bit(segno, free_i->free_segmap);
+ return 0;
+}
+
/*
* Find a new segment from the free segments bitmap to right order
* This function should be returned with success, otherwise BUG
@@ -1355,8 +1668,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno, secno, zoneno;
unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
- unsigned int hint = *newseg / sbi->segs_per_sec;
- unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
+ unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
+ unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
unsigned int left_start = hint;
bool init = true;
int go_left = 0;
@@ -1366,8 +1679,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
segno = find_next_zero_bit(free_i->free_segmap,
- (hint + 1) * sbi->segs_per_sec, *newseg + 1);
- if (segno < (hint + 1) * sbi->segs_per_sec)
+ GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1);
+ if (segno < GET_SEG_FROM_SEC(sbi, hint + 1))
goto got_it;
}
find_other_zone:
@@ -1398,8 +1711,8 @@ find_other_zone:
secno = left_start;
skip_left:
hint = secno;
- segno = secno * sbi->segs_per_sec;
- zoneno = secno / sbi->secs_per_zone;
+ segno = GET_SEG_FROM_SEC(sbi, secno);
+ zoneno = GET_ZONE_FROM_SEC(sbi, secno);
/* give up on finding another zone */
if (!init)
@@ -1443,7 +1756,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
struct summary_footer *sum_footer;
curseg->segno = curseg->next_segno;
- curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
+ curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
curseg->next_blkoff = 0;
curseg->next_segno = NULL_SEGNO;
@@ -1456,6 +1769,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
__set_sit_entry_type(sbi, type, curseg->segno, modified);
}
+static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
+{
+ /* if segs_per_sec is large than 1, we need to keep original policy. */
+ if (sbi->segs_per_sec != 1)
+ return CURSEG_I(sbi, type)->segno;
+
+ if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
+ return 0;
+
+ if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
+ return SIT_I(sbi)->last_victim[ALLOC_NEXT];
+ return CURSEG_I(sbi, type)->segno;
+}
+
/*
* Allocate a current working segment.
* This function always allocates a free segment in LFS manner.
@@ -1474,6 +1801,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
if (test_opt(sbi, NOHEAP))
dir = ALLOC_RIGHT;
+ segno = __get_next_segno(sbi, type);
get_new_segment(sbi, &segno, new_sec, dir);
curseg->next_segno = segno;
reset_curseg(sbi, type, 1);
@@ -1549,12 +1877,15 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops;
+ unsigned segno = NULL_SEGNO;
int i, cnt;
bool reversed = false;
/* need_SSR() already forces to do this */
- if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR))
+ if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) {
+ curseg->next_segno = segno;
return 1;
+ }
/* For node segments, let's do SSR more intensively */
if (IS_NODESEG(type)) {
@@ -1578,9 +1909,10 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
for (; cnt-- > 0; reversed ? i-- : i++) {
if (i == type)
continue;
- if (v_ops->get_victim(sbi, &(curseg)->next_segno,
- BG_GC, i, SSR))
+ if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) {
+ curseg->next_segno = segno;
return 1;
+ }
}
return 0;
}
@@ -1592,17 +1924,21 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
int type, bool force)
{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+
if (force)
new_curseg(sbi, type, true);
else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
type == CURSEG_WARM_NODE)
new_curseg(sbi, type, false);
+ else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
+ new_curseg(sbi, type, false);
else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
change_curseg(sbi, type, true);
else
new_curseg(sbi, type, false);
- stat_inc_seg_type(sbi, CURSEG_I(sbi, type));
+ stat_inc_seg_type(sbi, curseg);
}
void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -1734,18 +2070,16 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
if (p_type == DATA) {
struct inode *inode = page->mapping->host;
- if (S_ISDIR(inode->i_mode))
- return CURSEG_HOT_DATA;
- else if (is_cold_data(page) || file_is_cold(inode))
+ if (is_cold_data(page) || file_is_cold(inode))
return CURSEG_COLD_DATA;
- else
- return CURSEG_WARM_DATA;
+ if (is_inode_flag_set(inode, FI_HOT_DATA))
+ return CURSEG_HOT_DATA;
+ return CURSEG_WARM_DATA;
} else {
if (IS_DNODE(page))
return is_cold_node(page) ? CURSEG_WARM_NODE :
CURSEG_HOT_NODE;
- else
- return CURSEG_COLD_NODE;
+ return CURSEG_COLD_NODE;
}
}
@@ -1788,15 +2122,14 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
stat_inc_block_count(sbi, curseg);
+ if (!__has_curseg_space(sbi, type))
+ sit_i->s_ops->allocate_segment(sbi, type, false);
/*
- * SIT information should be updated before segment allocation,
- * since SSR needs latest valid block information.
+ * SIT information should be updated after segment allocation,
+ * since we need to keep dirty segments precisely under SSR.
*/
refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
- if (!__has_curseg_space(sbi, type))
- sit_i->s_ops->allocate_segment(sbi, type, false);
-
mutex_unlock(&sit_i->sentry_lock);
if (page && IS_NODESEG(type))
@@ -1868,11 +2201,11 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
}
-void rewrite_data_page(struct f2fs_io_info *fio)
+int rewrite_data_page(struct f2fs_io_info *fio)
{
fio->new_blkaddr = fio->old_blkaddr;
stat_inc_inplace_blocks(fio->sbi);
- f2fs_submit_page_mbio(fio);
+ return f2fs_submit_page_bio(fio);
}
void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
@@ -2437,7 +2770,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
se = get_seg_entry(sbi, segno);
/* add discard candidates */
- if (cpc->reason != CP_DISCARD) {
+ if (!(cpc->reason & CP_DISCARD)) {
cpc->trim_start = segno;
add_discard_addrs(sbi, cpc, false);
}
@@ -2473,7 +2806,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_bug_on(sbi, !list_empty(head));
f2fs_bug_on(sbi, sit_i->dirty_sentries);
out:
- if (cpc->reason == CP_DISCARD) {
+ if (cpc->reason & CP_DISCARD) {
__u64 trim_start = cpc->trim_start;
for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
@@ -2501,13 +2834,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
SM_I(sbi)->sit_info = sit_i;
- sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) *
+ sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) *
sizeof(struct seg_entry), GFP_KERNEL);
if (!sit_i->sentries)
return -ENOMEM;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
@@ -2540,7 +2873,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) *
+ sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) *
sizeof(struct sec_entry), GFP_KERNEL);
if (!sit_i->sec_entries)
return -ENOMEM;
@@ -2573,7 +2906,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
sit_i->dirty_sentries = 0;
sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
- sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
+ sit_i->mounted_time = ktime_get_real_seconds();
mutex_init(&sit_i->sentry_lock);
return 0;
}
@@ -2591,12 +2924,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = free_i;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL);
+ free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL);
+ free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -2672,10 +3005,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
/* build discard map only one time */
if (f2fs_discard_en(sbi)) {
- memcpy(se->discard_map, se->cur_valid_map,
- SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks += sbi->blocks_per_seg -
- se->valid_blocks;
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff,
+ SIT_VBLOCK_MAP_SIZE);
+ } else {
+ memcpy(se->discard_map,
+ se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks +=
+ sbi->blocks_per_seg -
+ se->valid_blocks;
+ }
}
if (sbi->segs_per_sec > 1)
@@ -2699,10 +3039,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
seg_info_from_raw_sit(se, &sit);
if (f2fs_discard_en(sbi)) {
- memcpy(se->discard_map, se->cur_valid_map,
- SIT_VBLOCK_MAP_SIZE);
- sbi->discard_blks += old_valid_blocks -
- se->valid_blocks;
+ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+ memset(se->discard_map, 0xff,
+ SIT_VBLOCK_MAP_SIZE);
+ } else {
+ memcpy(se->discard_map, se->cur_valid_map,
+ SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks += old_valid_blocks -
+ se->valid_blocks;
+ }
}
if (sbi->segs_per_sec > 1)
@@ -2746,7 +3091,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
if (segno >= MAIN_SEGS(sbi))
break;
offset = segno + 1;
- valid_blocks = get_valid_blocks(sbi, segno, 0);
+ valid_blocks = get_valid_blocks(sbi, segno, false);
if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
continue;
if (valid_blocks > sbi->blocks_per_seg) {
@@ -2764,7 +3109,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
return -ENOMEM;
return 0;
@@ -2786,7 +3131,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
- dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->dirty_segmap[i])
return -ENOMEM;
}
@@ -2852,6 +3197,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
+ sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
@@ -2988,7 +3334,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
if (!sm_info)
return;
destroy_flush_cmd_control(sbi, true);
- destroy_discard_cmd_control(sbi, true);
+ destroy_discard_cmd_control(sbi);
destroy_dirty_segmap(sbi);
destroy_curseg(sbi);
destroy_free_segmap(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5e8ad4280a50..010f336a7573 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -21,78 +21,84 @@
#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
/* L: Logical segment # in volume, R: Relative segment # in main area */
-#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
-#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
+#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno)
+#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno)
-#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA)
-#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE)
+#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA)
+#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE)
#define IS_CURSEG(sbi, seg) \
- ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
- (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
+ ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
#define IS_CURSEC(sbi, secno) \
- ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
- sbi->segs_per_sec) || \
- (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
- sbi->segs_per_sec)) \
+ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
+ (sbi)->segs_per_sec) || \
+ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
+ (sbi)->segs_per_sec)) \
#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr)
#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr)
#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
-#define MAIN_SECS(sbi) (sbi->total_sections)
+#define MAIN_SECS(sbi) ((sbi)->total_sections)
#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count)
-#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
+#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg)
#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
-#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \
- sbi->log_blocks_per_seg))
+#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \
+ (sbi)->log_blocks_per_seg))
#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
- (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
+ (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg))
#define NEXT_FREE_BLKADDR(sbi, curseg) \
- (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
+ (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)
#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
- (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1))
#define GET_SEGNO(sbi, blk_addr) \
- (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \
+ ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \
NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
-#define GET_SECNO(sbi, segno) \
- ((segno) / sbi->segs_per_sec)
-#define GET_ZONENO_FROM_SEGNO(sbi, segno) \
- ((segno / sbi->segs_per_sec) / sbi->secs_per_zone)
+#define BLKS_PER_SEC(sbi) \
+ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
+#define GET_SEC_FROM_SEG(sbi, segno) \
+ ((segno) / (sbi)->segs_per_sec)
+#define GET_SEG_FROM_SEC(sbi, secno) \
+ ((secno) * (sbi)->segs_per_sec)
+#define GET_ZONE_FROM_SEC(sbi, secno) \
+ ((secno) / (sbi)->secs_per_zone)
+#define GET_ZONE_FROM_SEG(sbi, segno) \
+ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno))
#define GET_SUM_BLOCK(sbi, segno) \
- ((sbi->sm_info->ssa_blkaddr) + segno)
+ ((sbi)->sm_info->ssa_blkaddr + (segno))
#define GET_SUM_TYPE(footer) ((footer)->entry_type)
-#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type)
+#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))
#define SIT_ENTRY_OFFSET(sit_i, segno) \
- (segno % sit_i->sents_per_block)
+ ((segno) % (sit_i)->sents_per_block)
#define SIT_BLOCK_OFFSET(segno) \
- (segno / SIT_ENTRY_PER_BLOCK)
+ ((segno) / SIT_ENTRY_PER_BLOCK)
#define START_SEGNO(segno) \
(SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
#define SIT_BLK_CNT(sbi) \
@@ -103,7 +109,7 @@
#define SECTOR_FROM_BLOCK(blk_addr) \
(((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sectors) \
- (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
+ ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK)
/*
* indicate a block allocation direction: RIGHT and LEFT.
@@ -132,7 +138,10 @@ enum {
*/
enum {
GC_CB = 0,
- GC_GREEDY
+ GC_GREEDY,
+ ALLOC_NEXT,
+ FLUSH_DEVICE,
+ MAX_GC_POLICY,
};
/*
@@ -227,6 +236,8 @@ struct sit_info {
unsigned long long mounted_time; /* mount time */
unsigned long long min_mtime; /* min. modification time */
unsigned long long max_mtime; /* max. modification time */
+
+ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */
};
struct free_segmap_info {
@@ -303,17 +314,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
- return &sit_i->sec_entries[GET_SECNO(sbi, segno)];
+ return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)];
}
static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
- unsigned int segno, int section)
+ unsigned int segno, bool use_section)
{
/*
* In order to get # of valid blocks in a section instantly from many
* segments, f2fs manages two counting structures separately.
*/
- if (section > 1)
+ if (use_section && sbi->segs_per_sec > 1)
return get_sec_entry(sbi, segno)->valid_blocks;
else
return get_seg_entry(sbi, segno)->valid_blocks;
@@ -358,8 +369,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
- unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
spin_lock(&free_i->segmap_lock);
@@ -379,7 +390,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
set_bit(segno, free_i->free_segmap);
free_i->free_segments--;
if (!test_and_set_bit(secno, free_i->free_secmap))
@@ -390,8 +402,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
- unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
spin_lock(&free_i->segmap_lock);
@@ -412,7 +424,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
unsigned int segno)
{
struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int secno = segno / sbi->segs_per_sec;
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+
spin_lock(&free_i->segmap_lock);
if (!test_and_set_bit(segno, free_i->free_segmap)) {
free_i->free_segments--;
@@ -477,12 +490,12 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi)
static inline int overprovision_sections(struct f2fs_sb_info *sbi)
{
- return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
+ return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi));
}
static inline int reserved_sections(struct f2fs_sb_info *sbi)
{
- return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
+ return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
}
static inline bool need_SSR(struct f2fs_sb_info *sbi)
@@ -495,7 +508,7 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
return false;
return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
- reserved_sections(sbi) + 1);
+ 2 * reserved_sections(sbi));
}
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
@@ -540,6 +553,7 @@ static inline int utilization(struct f2fs_sb_info *sbi)
*/
#define DEF_MIN_IPU_UTIL 70
#define DEF_MIN_FSYNC_BLOCKS 8
+#define DEF_MIN_HOT_BLOCKS 16
enum {
F2FS_IPU_FORCE,
@@ -547,17 +561,15 @@ enum {
F2FS_IPU_UTIL,
F2FS_IPU_SSR_UTIL,
F2FS_IPU_FSYNC,
+ F2FS_IPU_ASYNC,
};
-static inline bool need_inplace_update(struct inode *inode)
+static inline bool need_inplace_update_policy(struct inode *inode,
+ struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int policy = SM_I(sbi)->ipu_policy;
- /* IPU can be done only for the user data */
- if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
- return false;
-
if (test_opt(sbi, LFS))
return false;
@@ -572,6 +584,15 @@ static inline bool need_inplace_update(struct inode *inode)
utilization(sbi) > SM_I(sbi)->min_ipu_util)
return true;
+ /*
+ * IPU for rewrite async pages
+ */
+ if (policy & (0x1 << F2FS_IPU_ASYNC) &&
+ fio && fio->op == REQ_OP_WRITE &&
+ !(fio->op_flags & REQ_SYNC) &&
+ !f2fs_encrypted_inode(inode))
+ return true;
+
/* this is only set during fdatasync */
if (policy & (0x1 << F2FS_IPU_FSYNC) &&
is_inode_flag_set(inode, FI_NEED_IPU))
@@ -691,8 +712,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
{
struct sit_info *sit_i = SIT_I(sbi);
- return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec -
- sit_i->mounted_time;
+ time64_t now = ktime_get_real_seconds();
+
+ return sit_i->elapsed_time + now - sit_i->mounted_time;
}
static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
@@ -719,7 +741,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi,
unsigned int secno)
{
- if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >=
+ if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >=
sbi->fggc_threshold)
return true;
return false;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 96fe8ed73100..83355ec4a92c 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = {
[FAULT_BLOCK] = "no more block",
[FAULT_DIR_DEPTH] = "too big dir depth",
[FAULT_EVICT_INODE] = "evict_inode fail",
+ [FAULT_TRUNCATE] = "truncate fail",
[FAULT_IO] = "IO error",
[FAULT_CHECKPOINT] = "checkpoint error",
};
@@ -82,6 +83,7 @@ enum {
Opt_discard,
Opt_nodiscard,
Opt_noheap,
+ Opt_heap,
Opt_user_xattr,
Opt_nouser_xattr,
Opt_acl,
@@ -116,6 +118,7 @@ static match_table_t f2fs_tokens = {
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_noheap, "no_heap"},
+ {Opt_heap, "heap"},
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
@@ -293,6 +296,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
@@ -318,6 +322,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
ATTR_LIST(min_fsync_blocks),
+ ATTR_LIST(min_hot_blocks),
ATTR_LIST(max_victim_search),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
@@ -436,6 +441,9 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_noheap:
set_opt(sbi, NOHEAP);
break;
+ case Opt_heap:
+ clear_opt(sbi, NOHEAP);
+ break;
#ifdef CONFIG_F2FS_FS_XATTR
case Opt_user_xattr:
set_opt(sbi, XATTR_USER);
@@ -787,7 +795,14 @@ static void f2fs_put_super(struct super_block *sb)
}
/* be sure to wait for any on-going discard commands */
- f2fs_wait_discard_bio(sbi, NULL_ADDR);
+ f2fs_wait_discard_bios(sbi);
+
+ if (!sbi->discard_blks) {
+ struct cp_control cpc = {
+ .reason = CP_UMOUNT | CP_TRIMMED,
+ };
+ write_checkpoint(sbi, &cpc);
+ }
/* write_checkpoint can update stat informaion */
f2fs_destroy_stats(sbi);
@@ -913,7 +928,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
if (test_opt(sbi, DISCARD))
seq_puts(seq, ",discard");
if (test_opt(sbi, NOHEAP))
- seq_puts(seq, ",no_heap_alloc");
+ seq_puts(seq, ",no_heap");
+ else
+ seq_puts(seq, ",heap");
#ifdef CONFIG_F2FS_FS_XATTR
if (test_opt(sbi, XATTR_USER))
seq_puts(seq, ",user_xattr");
@@ -986,7 +1003,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
if ((i % 10) == 0)
seq_printf(seq, "%-10d", i);
seq_printf(seq, "%d|%-3u", se->type,
- get_valid_blocks(sbi, i, 1));
+ get_valid_blocks(sbi, i, false));
if ((i % 10) == 9 || i == (total_segs - 1))
seq_putc(seq, '\n');
else
@@ -1012,7 +1029,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset)
seq_printf(seq, "%-10d", i);
seq_printf(seq, "%d|%-3u|", se->type,
- get_valid_blocks(sbi, i, 1));
+ get_valid_blocks(sbi, i, false));
for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
seq_printf(seq, " %.2x", se->cur_valid_map[j]);
seq_putc(seq, '\n');
@@ -1046,6 +1063,7 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
set_opt(sbi, EXTENT_CACHE);
+ set_opt(sbi, NOHEAP);
sbi->sb->s_flags |= MS_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
if (f2fs_sb_mounted_blkzoned(sbi->sb)) {
@@ -1307,7 +1325,7 @@ static int __f2fs_commit_super(struct buffer_head *bh,
unlock_buffer(bh);
/* it's rare case, we can do fua all the time */
- return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA);
+ return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
}
static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
@@ -1483,6 +1501,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return 1;
}
+ if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid segment count (%u)",
+ le32_to_cpu(raw_super->segment_count));
+ return 1;
+ }
+
/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
if (sanity_check_area_boundary(sbi, bh))
return 1;
@@ -1555,6 +1580,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
for (i = 0; i < NR_COUNT_TYPE; i++)
atomic_set(&sbi->nr_pages[i], 0);
+ atomic_set(&sbi->wb_sync_req, 0);
+
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
mutex_init(&sbi->wio_mutex[NODE]);
@@ -1917,6 +1944,7 @@ try_onemore:
mutex_init(&sbi->gc_mutex);
mutex_init(&sbi->cp_mutex);
init_rwsem(&sbi->node_write);
+ init_rwsem(&sbi->node_change);
/* disallow all the data/node/meta page writes */
set_sbi_flag(sbi, SBI_POR_DOING);
@@ -2022,6 +2050,10 @@ try_onemore:
f2fs_join_shrinker(sbi);
+ err = f2fs_build_stats(sbi);
+ if (err)
+ goto free_nm;
+
/* if there are nt orphan nodes free them */
err = recover_orphan_inodes(sbi);
if (err)
@@ -2046,10 +2078,6 @@ try_onemore:
goto free_root_inode;
}
- err = f2fs_build_stats(sbi);
- if (err)
- goto free_root_inode;
-
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -2143,7 +2171,6 @@ free_proc:
remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry(sb->s_id, f2fs_proc_root);
}
- f2fs_destroy_stats(sbi);
free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
@@ -2161,6 +2188,7 @@ free_node_inode:
truncate_inode_pages_final(META_MAPPING(sbi));
iput(sbi->node_inode);
mutex_unlock(&sbi->umount_mutex);
+ f2fs_destroy_stats(sbi);
free_nm:
destroy_node_manager(sbi);
free_sm:
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 73b4e1d1912a..bccbbf2616d2 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page)
pid_t pid = task_pid_nr(current);
void *p;
- page->private = pid;
+ set_page_private(page, (unsigned long)pid);
if (radix_tree_preload(GFP_NOFS))
return;
@@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
results[ret] = iter.index;
- if (++ret == PIDVEC_SIZE)
+ if (++ret == max_items)
break;
}
return ret;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 7298a4488f7f..832c5110abab 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -250,15 +250,13 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
void *cur_addr, *txattr_addr, *last_addr = NULL;
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0;
- unsigned int inline_size = 0;
+ unsigned int inline_size = inline_xattr_size(inode);
int err = 0;
- inline_size = inline_xattr_size(inode);
-
if (!size && !inline_size)
return -ENODATA;
- txattr_addr = kzalloc(inline_size + size + sizeof(__u32),
+ txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE,
GFP_F2FS_ZERO);
if (!txattr_addr)
return -ENOMEM;
@@ -328,13 +326,14 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_xattr_header *header;
- size_t size = PAGE_SIZE, inline_size = 0;
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+ unsigned int size = VALID_XATTR_BLOCK_SIZE;
+ unsigned int inline_size = inline_xattr_size(inode);
void *txattr_addr;
int err;
- inline_size = inline_xattr_size(inode);
-
- txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
+ txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE,
+ GFP_F2FS_ZERO);
if (!txattr_addr)
return -ENOMEM;
@@ -358,19 +357,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
}
/* read from xattr node block */
- if (F2FS_I(inode)->i_xattr_nid) {
+ if (xnid) {
struct page *xpage;
void *xattr_addr;
/* The inode already has an extended attribute block. */
- xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
+ xpage = get_node_page(sbi, xnid);
if (IS_ERR(xpage)) {
err = PTR_ERR(xpage);
goto fail;
}
xattr_addr = page_address(xpage);
- memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE);
+ memcpy(txattr_addr + inline_size, xattr_addr, size);
f2fs_put_page(xpage, 1);
}
@@ -392,14 +391,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
void *txattr_addr, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- size_t inline_size = 0;
+ size_t inline_size = inline_xattr_size(inode);
void *xattr_addr;
struct page *xpage;
nid_t new_nid = 0;
int err;
- inline_size = inline_xattr_size(inode);
-
if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid)
if (!alloc_nid(sbi, &new_nid))
return -ENOSPC;
@@ -454,7 +451,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
}
xattr_addr = page_address(xpage);
- memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE);
+ memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE);
set_page_dirty(xpage);
f2fs_put_page(xpage, 1);
@@ -546,7 +543,9 @@ static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry,
const void *value, size_t size)
{
void *pval = entry->e_name + entry->e_name_len;
- return (entry->e_value_size == size) && !memcmp(pval, value, size);
+
+ return (le16_to_cpu(entry->e_value_size) == size) &&
+ !memcmp(pval, value, size);
}
static int __f2fs_setxattr(struct inode *inode, int index,
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index d5a94928c116..dbcd1d16e669 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -58,10 +58,10 @@ struct f2fs_xattr_entry {
#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1))
#define XATTR_ROUND (3)
-#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND)
+#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND)
#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
- entry->e_name_len + le16_to_cpu(entry->e_value_size)))
+ (entry)->e_name_len + le16_to_cpu((entry)->e_value_size)))
#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\
ENTRY_SIZE(entry)))
@@ -72,8 +72,8 @@ struct f2fs_xattr_entry {
for (entry = XATTR_FIRST_ENTRY(addr);\
!IS_XATTR_LAST_ENTRY(entry);\
entry = XATTR_NEXT_ENTRY(entry))
-#define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
-#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32))
+#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
+#define XATTR_PADDING_SIZE (sizeof(__u32))
#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \
VALID_XATTR_BLOCK_SIZE)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8bd81c2e89b2..f4e7267d117f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -899,16 +899,10 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
- O_RDONLY | O_WRONLY | O_RDWR |
- O_CREAT | O_EXCL | O_NOCTTY |
- O_TRUNC | O_APPEND | /* O_NONBLOCK | */
- __O_SYNC | O_DSYNC | FASYNC |
- O_DIRECT | O_LARGEFILE | O_DIRECTORY |
- O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
- __FMODE_EXEC | O_PATH | __O_TMPFILE |
- __FMODE_NONOTIFY
- ));
+ BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
+ HWEIGHT32(
+ (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
+ __FMODE_EXEC | __FMODE_NONOTIFY));
fasync_cache = kmem_cache_create("fasync_cache",
sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
diff --git a/fs/file.c b/fs/file.c
index ad6f094f2eff..1c2972e3a405 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -42,7 +42,7 @@ static void *alloc_fdmem(size_t size)
if (data != NULL)
return data;
}
- return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL);
}
static void __free_fdtable(struct fdtable *fdt)
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 6e22748b0704..b9ea99c5b5b3 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -292,7 +292,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
{
- struct tree_descr empty_descr = {""};
+ static const struct tree_descr empty_descr = {""};
struct fuse_conn *fc;
int err;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 01b97c012c6e..4d810be532dd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -38,11 +38,6 @@ struct metapath {
__u16 mp_list[GFS2_MAX_META_HEIGHT];
};
-struct strip_mine {
- int sm_first;
- unsigned int sm_height;
-};
-
/**
* gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
* @ip: the inode
@@ -253,6 +248,19 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
}
/**
+ * metaptr1 - Return the first possible metadata pointer in a metapath buffer
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ */
+static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
+{
+ struct buffer_head *bh = mp->mp_bh[height];
+ if (height == 0)
+ return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
+ return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
+}
+
+/**
* metapointer - Return pointer to start of metadata in a buffer
* @height: The metadata height (0 = dinode)
* @mp: The metapath
@@ -264,10 +272,8 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
{
- struct buffer_head *bh = mp->mp_bh[height];
- unsigned int head_size = (height > 0) ?
- sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
- return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
+ __be64 *p = metaptr1(height, mp);
+ return p + mp->mp_list[height];
}
static void gfs2_metapath_ra(struct gfs2_glock *gl,
@@ -296,6 +302,23 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
}
/**
+ * lookup_mp_height - helper function for lookup_metapath
+ * @ip: the inode
+ * @mp: the metapath
+ * @h: the height which needs looking up
+ */
+static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
+{
+ __be64 *ptr = metapointer(h, mp);
+ u64 dblock = be64_to_cpu(*ptr);
+
+ if (!dblock)
+ return h + 1;
+
+ return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
+}
+
+/**
* lookup_metapath - Walk the metadata tree to a specific point
* @ip: The inode
* @mp: The metapath
@@ -316,17 +339,10 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
{
unsigned int end_of_metadata = ip->i_height - 1;
unsigned int x;
- __be64 *ptr;
- u64 dblock;
int ret;
for (x = 0; x < end_of_metadata; x++) {
- ptr = metapointer(x, mp);
- dblock = be64_to_cpu(*ptr);
- if (!dblock)
- return x + 1;
-
- ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]);
+ ret = lookup_mp_height(ip, mp, x);
if (ret)
return ret;
}
@@ -334,6 +350,35 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
return ip->i_height;
}
+/**
+ * fillup_metapath - fill up buffers for the metadata path to a specific height
+ * @ip: The inode
+ * @mp: The metapath
+ * @h: The height to which it should be mapped
+ *
+ * Similar to lookup_metapath, but does lookups for a range of heights
+ *
+ * Returns: error or height of metadata tree
+ */
+
+static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
+{
+ unsigned int start_h = h - 1;
+ int ret;
+
+ if (h) {
+ /* find the first buffer we need to look up. */
+ while (start_h > 0 && mp->mp_bh[start_h] == NULL)
+ start_h--;
+ for (; start_h < h; start_h++) {
+ ret = lookup_mp_height(ip, mp, start_h);
+ if (ret)
+ return ret;
+ }
+ }
+ return ip->i_height;
+}
+
static inline void release_metapath(struct metapath *mp)
{
int i;
@@ -422,6 +467,13 @@ enum alloc_state {
/* ALLOC_UNSTUFF = 3, TBD and rather complicated */
};
+static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
+{
+ if (hgt)
+ return sdp->sd_inptrs;
+ return sdp->sd_diptrs;
+}
+
/**
* gfs2_bmap_alloc - Build a metadata tree of the requested height
* @inode: The GFS2 inode
@@ -620,7 +672,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
BUG_ON(maxlen == 0);
- memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+ memset(&mp, 0, sizeof(mp));
bmap_lock(ip, create);
clear_buffer_mapped(bh_map);
clear_buffer_new(bh_map);
@@ -702,252 +754,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
}
/**
- * do_strip - Look for a layer a particular layer of the file and strip it off
- * @ip: the inode
- * @dibh: the dinode buffer
- * @bh: A buffer of pointers
- * @top: The first pointer in the buffer
- * @bottom: One more than the last pointer
- * @height: the height this buffer is at
- * @sm: a pointer to a struct strip_mine
- *
- * Returns: errno
- */
-
-static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
- struct buffer_head *bh, __be64 *top, __be64 *bottom,
- unsigned int height, struct strip_mine *sm)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_rgrp_list rlist;
- struct gfs2_trans *tr;
- u64 bn, bstart;
- u32 blen, btotal;
- __be64 *p;
- unsigned int rg_blocks = 0;
- int metadata;
- unsigned int revokes = 0;
- int x;
- int error;
- int jblocks_rqsted;
-
- error = gfs2_rindex_update(sdp);
- if (error)
- return error;
-
- if (!*top)
- sm->sm_first = 0;
-
- if (height != sm->sm_height)
- return 0;
-
- if (sm->sm_first) {
- top++;
- sm->sm_first = 0;
- }
-
- metadata = (height != ip->i_height - 1);
- if (metadata)
- revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
- else if (ip->i_depth)
- revokes = sdp->sd_inptrs;
-
- memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
- bstart = 0;
- blen = 0;
-
- for (p = top; p < bottom; p++) {
- if (!*p)
- continue;
-
- bn = be64_to_cpu(*p);
-
- if (bstart + blen == bn)
- blen++;
- else {
- if (bstart)
- gfs2_rlist_add(ip, &rlist, bstart);
-
- bstart = bn;
- blen = 1;
- }
- }
-
- if (bstart)
- gfs2_rlist_add(ip, &rlist, bstart);
- else
- goto out; /* Nothing to do */
-
- gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
-
- for (x = 0; x < rlist.rl_rgrps; x++) {
- struct gfs2_rgrpd *rgd;
- rgd = rlist.rl_ghs[x].gh_gl->gl_object;
- rg_blocks += rgd->rd_length;
- }
-
- error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
- if (error)
- goto out_rlist;
-
- if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
- gfs2_rs_deltree(&ip->i_res);
-
-restart:
- jblocks_rqsted = rg_blocks + RES_DINODE +
- RES_INDIRECT + RES_STATFS + RES_QUOTA +
- gfs2_struct2blk(sdp, revokes, sizeof(u64));
- if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2))
- jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2);
- error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
- if (error)
- goto out_rg_gunlock;
-
- tr = current->journal_info;
- down_write(&ip->i_rw_mutex);
-
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_trans_add_meta(ip->i_gl, bh);
-
- bstart = 0;
- blen = 0;
- btotal = 0;
-
- for (p = top; p < bottom; p++) {
- if (!*p)
- continue;
-
- /* check for max reasonable journal transaction blocks */
- if (tr->tr_num_buf_new + RES_STATFS +
- RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
- if (rg_blocks >= tr->tr_num_buf_new)
- rg_blocks -= tr->tr_num_buf_new;
- else
- rg_blocks = 0;
- break;
- }
-
- bn = be64_to_cpu(*p);
-
- if (bstart + blen == bn)
- blen++;
- else {
- if (bstart) {
- __gfs2_free_blocks(ip, bstart, blen, metadata);
- btotal += blen;
- }
-
- bstart = bn;
- blen = 1;
- }
-
- *p = 0;
- gfs2_add_inode_blocks(&ip->i_inode, -1);
- }
- if (p == bottom)
- rg_blocks = 0;
-
- if (bstart) {
- __gfs2_free_blocks(ip, bstart, blen, metadata);
- btotal += blen;
- }
-
- gfs2_statfs_change(sdp, 0, +btotal, 0);
- gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
- ip->i_inode.i_gid);
-
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
-
- gfs2_dinode_out(ip, dibh->b_data);
-
- up_write(&ip->i_rw_mutex);
-
- gfs2_trans_end(sdp);
-
- if (rg_blocks)
- goto restart;
-
-out_rg_gunlock:
- gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
-out_rlist:
- gfs2_rlist_free(&rlist);
-out:
- return error;
-}
-
-/**
- * recursive_scan - recursively scan through the end of a file
- * @ip: the inode
- * @dibh: the dinode buffer
- * @mp: the path through the metadata to the point to start
- * @height: the height the recursion is at
- * @block: the indirect block to look at
- * @first: 1 if this is the first block
- * @sm: data opaque to this function to pass to @bc
- *
- * When this is first called @height and @block should be zero and
- * @first should be 1.
- *
- * Returns: errno
- */
-
-static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
- struct metapath *mp, unsigned int height,
- u64 block, int first, struct strip_mine *sm)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct buffer_head *bh = NULL;
- __be64 *top, *bottom;
- u64 bn;
- int error;
- int mh_size = sizeof(struct gfs2_meta_header);
-
- if (!height) {
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error)
- return error;
- dibh = bh;
-
- top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
- bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
- } else {
- error = gfs2_meta_indirect_buffer(ip, height, block, &bh);
- if (error)
- return error;
-
- top = (__be64 *)(bh->b_data + mh_size) +
- (first ? mp->mp_list[height] : 0);
-
- bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
- }
-
- error = do_strip(ip, dibh, bh, top, bottom, height, sm);
- if (error)
- goto out;
-
- if (height < ip->i_height - 1) {
-
- gfs2_metapath_ra(ip->i_gl, bh, top);
-
- for (; top < bottom; top++, first = 0) {
- if (!*top)
- continue;
-
- bn = be64_to_cpu(*top);
-
- error = recursive_scan(ip, dibh, mp, height + 1, bn,
- first, sm);
- if (error)
- break;
- }
- }
-out:
- brelse(bh);
- return error;
-}
-
-
-/**
* gfs2_block_truncate_page - Deal with zeroing out data for truncate
*
* This is partly borrowed from ext3.
@@ -1106,41 +912,406 @@ out:
return error;
}
-static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+/**
+ * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
+ * @ip: inode
+ * @rd_gh: holder of resource group glock
+ * @mp: current metapath fully populated with buffers
+ * @btotal: place to keep count of total blocks freed
+ * @hgt: height we're processing
+ * @preserve1: true if the first pointer at this height must be kept, not freed
+ *
+ * We sweep a metadata buffer (provided by the metapath) for blocks we need to
+ * free, and free them all. However, we do it one rgrp at a time. If this
+ * block has references to multiple rgrps, we break it into individual
+ * transactions. This allows other processes to use the rgrps while we're
+ * focused on a single one, for better concurrency / performance.
+ * At every transaction boundary, we rewrite the inode into the journal.
+ * That way the bitmaps are kept consistent with the inode and we can recover
+ * if we're interrupted by power-outages.
+ *
+ * Returns: 0, or return code if an error occurred.
+ * *btotal has the total number of blocks freed
+ */
+static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
+ const struct metapath *mp, u32 *btotal, int hgt,
+ bool preserve1)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int height = ip->i_height;
- u64 lblock;
- struct metapath mp;
- int error;
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_trans *tr;
+ struct buffer_head *bh = mp->mp_bh[hgt];
+ __be64 *top, *bottom, *p;
+ int blks_outside_rgrp;
+ u64 bn, bstart, isize_blks;
+ s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
+ int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
+ int ret = 0;
+ bool buf_in_tr = false; /* buffer was added to transaction */
+
+ if (gfs2_metatype_check(sdp, bh,
+ (hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
+ return -EIO;
+
+more_rgrps:
+ blks_outside_rgrp = 0;
+ bstart = 0;
+ blen = 0;
+ top = metapointer(hgt, mp); /* first ptr from metapath */
+ /* If we're keeping some data at the truncation point, we've got to
+ preserve the metadata tree by adding 1 to the starting metapath. */
+ if (preserve1)
+ top++;
+
+ bottom = (__be64 *)(bh->b_data + bh->b_size);
+
+ for (p = top; p < bottom; p++) {
+ if (!*p)
+ continue;
+ bn = be64_to_cpu(*p);
+ if (gfs2_holder_initialized(rd_gh)) {
+ rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
+ gfs2_assert_withdraw(sdp,
+ gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
+ } else {
+ rgd = gfs2_blk2rgrpd(sdp, bn, false);
+ ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+ 0, rd_gh);
+ if (ret)
+ goto out;
+
+ /* Must be done with the rgrp glock held: */
+ if (gfs2_rs_active(&ip->i_res) &&
+ rgd == ip->i_res.rs_rbm.rgd)
+ gfs2_rs_deltree(&ip->i_res);
+ }
+
+ if (!rgrp_contains_block(rgd, bn)) {
+ blks_outside_rgrp++;
+ continue;
+ }
+
+ /* The size of our transactions will be unknown until we
+ actually process all the metadata blocks that relate to
+ the rgrp. So we estimate. We know it can't be more than
+ the dinode's i_blocks and we don't want to exceed the
+ journal flush threshold, sd_log_thresh2. */
+ if (current->journal_info == NULL) {
+ unsigned int jblocks_rqsted, revokes;
+
+ jblocks_rqsted = rgd->rd_length + RES_DINODE +
+ RES_INDIRECT;
+ isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
+ if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
+ jblocks_rqsted +=
+ atomic_read(&sdp->sd_log_thresh2);
+ else
+ jblocks_rqsted += isize_blks;
+ revokes = jblocks_rqsted;
+ if (meta)
+ revokes += hptrs(sdp, hgt);
+ else if (ip->i_depth)
+ revokes += sdp->sd_inptrs;
+ ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
+ if (ret)
+ goto out_unlock;
+ down_write(&ip->i_rw_mutex);
+ }
+ /* check if we will exceed the transaction blocks requested */
+ tr = current->journal_info;
+ if (tr->tr_num_buf_new + RES_STATFS +
+ RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
+ /* We set blks_outside_rgrp to ensure the loop will
+ be repeated for the same rgrp, but with a new
+ transaction. */
+ blks_outside_rgrp++;
+ /* This next part is tricky. If the buffer was added
+ to the transaction, we've already set some block
+ pointers to 0, so we better follow through and free
+ them, or we will introduce corruption (so break).
+ This may be impossible, or at least rare, but I
+ decided to cover the case regardless.
+
+ If the buffer was not added to the transaction
+ (this call), doing so would exceed our transaction
+ size, so we need to end the transaction and start a
+ new one (so goto). */
+
+ if (buf_in_tr)
+ break;
+ goto out_unlock;
+ }
+
+ gfs2_trans_add_meta(ip->i_gl, bh);
+ buf_in_tr = true;
+ *p = 0;
+ if (bstart + blen == bn) {
+ blen++;
+ continue;
+ }
+ if (bstart) {
+ __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+ (*btotal) += blen;
+ gfs2_add_inode_blocks(&ip->i_inode, -blen);
+ }
+ bstart = bn;
+ blen = 1;
+ }
+ if (bstart) {
+ __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+ (*btotal) += blen;
+ gfs2_add_inode_blocks(&ip->i_inode, -blen);
+ }
+out_unlock:
+ if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
+ outside the rgrp we just processed,
+ do it all over again. */
+ if (current->journal_info) {
+ struct buffer_head *dibh = mp->mp_bh[0];
+
+ /* Every transaction boundary, we rewrite the dinode
+ to keep its di_blocks current in case of failure. */
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime =
+ current_time(&ip->i_inode);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_dinode_out(ip, dibh->b_data);
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ }
+ gfs2_glock_dq_uninit(rd_gh);
+ cond_resched();
+ goto more_rgrps;
+ }
+out:
+ return ret;
+}
+
+/**
+ * find_nonnull_ptr - find a non-null pointer given a metapath and height
+ * @sdp: the gfs2_sbd, used to look up the pointer count for height @h
+ * @mp: starting metapath, valid (with buffers) out to @h; mp_list[h] is advanced
+ * @h: desired height to search
+ *
+ * Returns: true if a non-null pointer was found in the metapath buffer
+ * false if all remaining pointers are NULL in the buffer
+ */
+static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
+ unsigned int h)
+{
+ __be64 *ptr;
+ unsigned int ptrs = hptrs(sdp, h) - 1;
+
+ while (true) {
+ ptr = metapointer(h, mp);
+ if (*ptr) /* if we have a non-null pointer */
+ return true;
+
+ if (mp->mp_list[h] < ptrs)
+ mp->mp_list[h]++;
+ else
+ return false; /* no more pointers in this buffer */
+ }
+}
+
+enum dealloc_states {
+ DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */
+ DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */
+ DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */
+ DEALLOC_DONE = 3, /* process complete */
+};
- if (!size)
+/**
+ * trunc_dealloc - truncate a file down to a desired size
+ * @ip: inode to truncate
+ * @newsize: The desired size of the file
+ *
+ * This function truncates a file to newsize. It works from the
+ * bottom up, and from the right to the left. In other words, it strips off
+ * the highest layer (data) before stripping any of the metadata. Doing it
+ * this way is best in case the operation is interrupted by power failure, etc.
+ * The dinode is rewritten in every transaction to guarantee integrity.
+ */
+static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct metapath mp;
+ struct buffer_head *dibh, *bh;
+ struct gfs2_holder rd_gh;
+ u64 lblock;
+ __u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
+ unsigned int strip_h = ip->i_height - 1;
+ u32 btotal = 0;
+ int ret, state;
+ int mp_h; /* metapath buffers are read in to this height */
+ sector_t last_ra = 0;
+ u64 prev_bnr = 0;
+ bool preserve1; /* need to preserve the first meta pointer? */
+
+ if (!newsize)
lblock = 0;
else
- lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
+ lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
+ memset(&mp, 0, sizeof(mp));
find_metapath(sdp, lblock, &mp, ip->i_height);
- error = gfs2_rindex_update(sdp);
- if (error)
- return error;
- error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
- if (error)
- return error;
+ memcpy(&nbof, &mp.mp_list, sizeof(nbof));
+
+ ret = gfs2_meta_inode_buffer(ip, &dibh);
+ if (ret)
+ return ret;
- while (height--) {
- struct strip_mine sm;
- sm.sm_first = !!size;
- sm.sm_height = height;
+ mp.mp_bh[0] = dibh;
+ ret = lookup_metapath(ip, &mp);
+ if (ret == ip->i_height)
+ state = DEALLOC_MP_FULL; /* We have a complete metapath */
+ else
+ state = DEALLOC_FILL_MP; /* deal with partial metapath */
- error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
- if (error)
+ ret = gfs2_rindex_update(sdp);
+ if (ret)
+ goto out_metapath;
+
+ ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+ if (ret)
+ goto out_metapath;
+ gfs2_holder_mark_uninitialized(&rd_gh);
+
+ mp_h = strip_h;
+
+ while (state != DEALLOC_DONE) {
+ switch (state) {
+ /* Truncate a full metapath at the given strip height.
+ * Note that strip_h == mp_h in order to be in this state. */
+ case DEALLOC_MP_FULL:
+ if (mp_h > 0) { /* issue read-ahead on metadata */
+ __be64 *top;
+
+ bh = mp.mp_bh[mp_h - 1];
+ if (bh->b_blocknr != last_ra) {
+ last_ra = bh->b_blocknr;
+ top = metaptr1(mp_h - 1, &mp);
+ gfs2_metapath_ra(ip->i_gl, bh, top);
+ }
+ }
+ /* If we're truncating to a non-zero size and the mp is
+ at the beginning of file for the strip height, we
+ need to preserve the first metadata pointer. */
+ preserve1 = (newsize &&
+ (mp.mp_list[mp_h] == nbof[mp_h]));
+ bh = mp.mp_bh[mp_h];
+ gfs2_assert_withdraw(sdp, bh);
+ if (gfs2_assert_withdraw(sdp,
+ prev_bnr != bh->b_blocknr)) {
+ printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
+ "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
+ sdp->sd_fsname,
+ (unsigned long long)ip->i_no_addr,
+ prev_bnr, ip->i_height, strip_h, mp_h);
+ }
+ prev_bnr = bh->b_blocknr;
+ ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
+ mp_h, preserve1);
+ /* If we hit an error or just swept dinode buffer,
+ just exit. */
+ if (ret || !mp_h) {
+ state = DEALLOC_DONE;
+ break;
+ }
+ state = DEALLOC_MP_LOWER;
+ break;
+
+ /* lower the metapath strip height */
+ case DEALLOC_MP_LOWER:
+ /* We're done with the current buffer, so release it,
+ unless it's the dinode buffer. Then back up to the
+ previous pointer. */
+ if (mp_h) {
+ brelse(mp.mp_bh[mp_h]);
+ mp.mp_bh[mp_h] = NULL;
+ }
+ /* If we can't get any lower in height, we've stripped
+ off all we can. Next step is to back up and start
+ stripping the previous level of metadata. */
+ if (mp_h == 0) {
+ strip_h--;
+ memcpy(&mp.mp_list, &nbof, sizeof(nbof));
+ mp_h = strip_h;
+ state = DEALLOC_FILL_MP;
+ break;
+ }
+ mp.mp_list[mp_h] = 0;
+ mp_h--; /* search one metadata height down */
+ if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
+ break; /* loop around in the same state */
+ mp.mp_list[mp_h]++;
+ /* Here we've found a part of the metapath that is not
+ * allocated. We need to search at that height for the
+ * next non-null pointer. */
+ if (find_nonnull_ptr(sdp, &mp, mp_h)) {
+ state = DEALLOC_FILL_MP;
+ mp_h++;
+ }
+ /* No more non-null pointers at this height. Back up
+ to the previous height and try again. */
+ break; /* loop around in the same state */
+
+ /* Fill the metapath with buffers to the given height. */
+ case DEALLOC_FILL_MP:
+ /* Fill the buffers out to the current height. */
+ ret = fillup_metapath(ip, &mp, mp_h);
+ if (ret < 0)
+ goto out;
+
+ /* If buffers found for the entire strip height */
+ if ((ret == ip->i_height) && (mp_h == strip_h)) {
+ state = DEALLOC_MP_FULL;
+ break;
+ }
+ if (ret < ip->i_height) /* We have a partial height */
+ mp_h = ret - 1;
+
+ /* If we find a non-null block pointer, crawl a bit
+ higher up in the metapath and try again, otherwise
+ we need to look lower for a new starting point. */
+ if (find_nonnull_ptr(sdp, &mp, mp_h))
+ mp_h++;
+ else
+ state = DEALLOC_MP_LOWER;
break;
+ }
}
- gfs2_quota_unhold(ip);
+ if (btotal) {
+ if (current->journal_info == NULL) {
+ ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
+ RES_QUOTA, 0);
+ if (ret)
+ goto out;
+ down_write(&ip->i_rw_mutex);
+ }
+ gfs2_statfs_change(sdp, 0, +btotal, 0);
+ gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
+ ip->i_inode.i_gid);
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_dinode_out(ip, dibh->b_data);
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ }
- return error;
+out:
+ if (gfs2_holder_initialized(&rd_gh))
+ gfs2_glock_dq_uninit(&rd_gh);
+ if (current->journal_info) {
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ cond_resched();
+ }
+ gfs2_quota_unhold(ip);
+out_metapath:
+ release_metapath(&mp);
+ return ret;
}
static int trunc_end(struct gfs2_inode *ip)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fe2a59c6a9a..c2062a108d19 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -911,11 +911,15 @@ out_qunlock:
static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int ret;
- if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+ /* fallocate is needed by gfs2_grow to reserve space in the rindex */
+ if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex)
return -EOPNOTSUPP;
inode_lock(inode);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ec0848fcca02..959a19ced4d5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(lru_lock);
static struct rhashtable_params ht_parms = {
.nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
- .key_len = sizeof(struct lm_lockname),
+ .key_len = offsetofend(struct lm_lockname, ln_type),
.key_offset = offsetof(struct gfs2_glock, gl_name),
.head_offset = offsetof(struct gfs2_glock, gl_node),
};
@@ -449,6 +449,9 @@ __acquires(&gl->gl_lockref.lock)
unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
+ target != LM_ST_UNLOCKED)
+ return;
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY);
GLOCK_BUG_ON(gl, gl->gl_state == target);
@@ -484,7 +487,8 @@ __acquires(&gl->gl_lockref.lock)
}
else if (ret) {
pr_err("lm_lock ret %d\n", ret);
- GLOCK_BUG_ON(gl, 1);
+ GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN,
+ &sdp->sd_flags));
}
} else { /* lock_nolock */
finish_xmote(gl, target);
@@ -653,10 +657,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
struct lm_lockname name = { .ln_number = number,
.ln_type = glops->go_type,
.ln_sbd = sdp };
- struct gfs2_glock *gl, *tmp = NULL;
+ struct gfs2_glock *gl, *tmp;
struct address_space *mapping;
struct kmem_cache *cachep;
- int ret, tries = 0;
+ int ret = 0;
rcu_read_lock();
gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
@@ -721,35 +725,32 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
}
again:
- ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node,
- ht_parms);
- if (ret == 0) {
+ rcu_read_lock();
+ tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node,
+ ht_parms);
+ if (!tmp) {
*glp = gl;
- return 0;
+ goto out;
}
-
- if (ret == -EEXIST) {
- ret = 0;
- rcu_read_lock();
- tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
- if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
- if (++tries < 100) {
- rcu_read_unlock();
- cond_resched();
- goto again;
- }
- tmp = NULL;
- ret = -ENOMEM;
- }
- rcu_read_unlock();
- } else {
- WARN_ON_ONCE(ret);
+ if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ goto out_free;
}
+ if (lockref_get_not_dead(&tmp->gl_lockref)) {
+ *glp = tmp;
+ goto out_free;
+ }
+ rcu_read_unlock();
+ cond_resched();
+ goto again;
+
+out_free:
kfree(gl->gl_lksb.sb_lvbptr);
kmem_cache_free(cachep, gl);
atomic_dec(&sdp->sd_glock_disposal);
- *glp = tmp;
+out:
+ rcu_read_unlock();
return ret;
}
@@ -1918,10 +1919,10 @@ static const struct seq_operations gfs2_sbstats_seq_ops = {
#define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL)
-static int gfs2_glocks_open(struct inode *inode, struct file *file)
+static int __gfs2_glocks_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
{
- int ret = seq_open_private(file, &gfs2_glock_seq_ops,
- sizeof(struct gfs2_glock_iter));
+ int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter));
if (ret == 0) {
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
@@ -1932,11 +1933,16 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
gi->gl = NULL;
- ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
+ rhashtable_walk_enter(&gl_hash_table, &gi->hti);
}
return ret;
}
+static int gfs2_glocks_open(struct inode *inode, struct file *file)
+{
+ return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops);
+}
+
static int gfs2_glocks_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
@@ -1949,20 +1955,7 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file)
static int gfs2_glstats_open(struct inode *inode, struct file *file)
{
- int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
- sizeof(struct gfs2_glock_iter));
- if (ret == 0) {
- struct seq_file *seq = file->private_data;
- struct gfs2_glock_iter *gi = seq->private;
- gi->sdp = inode->i_private;
- gi->last_pos = 0;
- seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
- if (seq->buf)
- seq->size = GFS2_SEQ_GOODSIZE;
- gi->gl = NULL;
- ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
- }
- return ret;
+ return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops);
}
static int gfs2_sbstats_open(struct inode *inode, struct file *file)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 511e1ed7e2de..b7cf65d13561 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -203,11 +203,15 @@ enum {
DFL_DLM_RECOVERY = 6,
};
+/*
+ * We are using struct lm_lockname as an rhashtable key. Avoid holes within
+ * the struct; padding at the end is fine.
+ */
struct lm_lockname {
- struct gfs2_sbd *ln_sbd;
u64 ln_number;
+ struct gfs2_sbd *ln_sbd;
unsigned int ln_type;
-} __packed __aligned(sizeof(int));
+};
#define lm_name_equal(name1, name2) \
(((name1)->ln_number == (name2)->ln_number) && \
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e279c3ce27be..9f605ea4810c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -202,8 +202,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
fail_refresh:
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
ip->i_iopen_gh.gh_gl->gl_object = NULL;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_put:
if (io_gl)
gfs2_glock_put(io_gl);
@@ -667,6 +666,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
ip->i_height = 0;
ip->i_depth = 0;
ip->i_entries = 0;
+ ip->i_no_addr = 0; /* Temporarily zero until real addr is assigned */
switch(mode & S_IFMT) {
case S_IFREG:
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 86ccc0159393..83c9909ff14a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -483,13 +483,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
}
}
-static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
-{
- u64 first = rgd->rd_data0;
- u64 last = first + rgd->rd_data;
- return first <= block && block < last;
-}
-
/**
* gfs2_blk2rgrpd - Find resource group for a given data/meta block number
* @sdp: The GFS2 superblock
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 66b51cf66dfa..e90478e2f545 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
return rs && !RB_EMPTY_NODE(&rs->rs_node);
}
+static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
+{
+ u64 first = rgd->rd_data0;
+ u64 last = first + rgd->rd_data;
+ return first <= block && block < last;
+}
+
extern void check_and_update_goal(struct gfs2_inode *ip);
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 361796a84fce..29b0473f6e74 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -793,7 +793,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
return;
-
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ return;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (ret) {
@@ -1538,8 +1539,7 @@ static void gfs2_evict_inode(struct inode *inode)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (unlikely(error)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
goto out;
}
@@ -1617,7 +1617,7 @@ out_unlock:
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_glock_dq(&ip->i_iopen_gh);
}
gfs2_holder_uninit(&ip->i_iopen_gh);
}
@@ -1639,8 +1639,7 @@ out:
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
ip->i_iopen_gh.gh_gl->gl_object = NULL;
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
}
}
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index e33a0d36a93e..5d0182654580 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -485,8 +485,8 @@ void hfs_file_truncate(struct inode *inode)
/* XXX: Can use generic_cont_expand? */
size = inode->i_size - 1;
- res = pagecache_write_begin(NULL, mapping, size+1, 0,
- AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+ res = pagecache_write_begin(NULL, mapping, size+1, 0, 0,
+ &page, &fsdata);
if (!res) {
res = pagecache_write_end(NULL, mapping, size+1, 0, 0,
page, fsdata);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index feca524ce2a5..a3eb640b4f8f 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -545,9 +545,8 @@ void hfsplus_file_truncate(struct inode *inode)
void *fsdata;
loff_t size = inode->i_size;
- res = pagecache_write_begin(NULL, mapping, size, 0,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
+ res = pagecache_write_begin(NULL, mapping, size, 0, 0,
+ &page, &fsdata);
if (res)
return;
res = pagecache_write_end(NULL, mapping, size,
diff --git a/fs/inode.c b/fs/inode.c
index 131b2bcebc48..db5914783a71 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -119,7 +119,7 @@ static int no_open(struct inode *inode, struct file *file)
}
/**
- * inode_init_always - perform inode structure intialisation
+ * inode_init_always - perform inode structure initialisation
* @sb: superblock inode belongs to
* @inode: inode to initialise
*
@@ -402,6 +402,8 @@ static void inode_lru_list_add(struct inode *inode)
{
if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
+ else
+ inode->i_state |= I_REFERENCED;
}
/*
@@ -1489,7 +1491,6 @@ static void iput_final(struct inode *inode)
drop = generic_drop_inode(inode);
if (!drop && (sb->s_flags & MS_ACTIVE)) {
- inode->i_state |= I_REFERENCED;
inode_add_lru(inode);
spin_unlock(&inode->i_lock);
return;
diff --git a/fs/internal.h b/fs/internal.h
index 076751d90ba2..9676fe11c093 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -126,8 +126,6 @@ static inline bool atime_needs_update_rcu(const struct path *path,
return __atime_needs_update(path, inode, true);
}
-extern bool atime_needs_update_rcu(const struct path *, struct inode *);
-
/*
* fs-writeback.c
*/
diff --git a/fs/iomap.c b/fs/iomap.c
index 1c25ae30500e..4b10892967a5 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -158,12 +158,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ssize_t written = 0;
unsigned int flags = AOP_FLAG_NOFS;
- /*
- * Copies from kernel address space cannot fail (NFSD is a big user).
- */
- if (!iter_is_iovec(i))
- flags |= AOP_FLAG_UNINTERRUPTIBLE;
-
do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
@@ -291,8 +285,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
return PTR_ERR(rpage);
status = iomap_write_begin(inode, pos, bytes,
- AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
- &page, iomap);
+ AOP_FLAG_NOFS, &page, iomap);
put_page(rpage);
if (unlikely(status))
return status;
@@ -343,8 +336,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
struct page *page;
int status;
- status = iomap_write_begin(inode, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
+ status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
+ iomap);
if (status)
return status;
@@ -360,7 +353,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
sector_t sector = iomap->blkno +
(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
- return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+ return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
+ offset, bytes);
}
static loff_t
@@ -909,6 +903,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
break;
}
pos += ret;
+
+ if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+ break;
} while ((count = iov_iter_count(iter)) > 0);
blk_finish_plug(&plug);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c43fe83ee708..5a0245e36240 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -700,8 +700,21 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
{
int err = 0;
- jbd2_might_wait_for_commit(journal);
read_lock(&journal->j_state_lock);
+#ifdef CONFIG_PROVE_LOCKING
+ /*
+ * Some callers make sure transaction is already committing and in that
+ * case we cannot block on open handles anymore. So don't warn in that
+ * case.
+ */
+ if (tid_gt(tid, journal->j_commit_sequence) &&
+ (!journal->j_committing_transaction ||
+ journal->j_committing_transaction->t_tid != tid)) {
+ read_unlock(&journal->j_state_lock);
+ jbd2_might_wait_for_commit(journal);
+ read_lock(&journal->j_state_lock);
+ }
+#endif
#ifdef CONFIG_JBD2_DEBUG
if (!tid_geq(journal->j_commit_request, tid)) {
printk(KERN_ERR
@@ -922,7 +935,8 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
* space and if we lose sb update during power failure we'd replay
* old transaction with possibly newly overwritten data.
*/
- ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
+ ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
+ REQ_SYNC | REQ_FUA);
if (ret)
goto out;
@@ -1323,7 +1337,7 @@ static int journal_reset(journal_t *journal)
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
journal->j_tail,
- REQ_FUA);
+ REQ_SYNC | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
return jbd2_journal_start_thread(journal);
@@ -1463,7 +1477,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
sb->s_errno = cpu_to_be32(journal->j_errno);
read_unlock(&journal->j_state_lock);
- jbd2_write_superblock(journal, REQ_FUA);
+ jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
@@ -1730,7 +1744,7 @@ int jbd2_journal_destroy(journal_t *journal)
write_unlock(&journal->j_state_lock);
jbd2_mark_journal_empty(journal,
- REQ_PREFLUSH | REQ_FUA);
+ REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
} else
err = -EIO;
@@ -1989,7 +2003,7 @@ int jbd2_journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
- jbd2_mark_journal_empty(journal, REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
@@ -2035,7 +2049,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
if (write) {
/* Lock to make assertions happy... */
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal, REQ_FUA);
+ jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
diff --git a/fs/libfs.c b/fs/libfs.c
index a8b62e5d43a9..a04395334bb1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -507,7 +507,7 @@ EXPORT_SYMBOL(simple_write_end);
* to pass it an appropriate max_reserved value to avoid collisions.
*/
int simple_fill_super(struct super_block *s, unsigned long magic,
- struct tree_descr *files)
+ const struct tree_descr *files)
{
struct inode *inode;
struct dentry *root;
diff --git a/fs/namei.c b/fs/namei.c
index 9a7f8bd748d8..7286f87ce863 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4766,7 +4766,7 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
struct page *page;
void *fsdata;
int err;
- unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
+ unsigned int flags = 0;
if (nofs)
flags |= AOP_FLAG_NOFS;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9dc65d7ae754..7b38fedb7e03 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -139,7 +139,7 @@ struct nfs_mount_request {
};
struct nfs_mount_info {
- int (*fill_super)(struct super_block *, struct nfs_mount_info *);
+ void (*fill_super)(struct super_block *, struct nfs_mount_info *);
int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *);
struct nfs_parsed_mount_data *parsed;
struct nfs_clone_mount *cloned;
@@ -407,7 +407,7 @@ struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *
struct dentry * nfs_xdev_mount_common(struct file_system_type *, int,
const char *, struct nfs_mount_info *);
void nfs_kill_super(struct super_block *);
-int nfs_fill_super(struct super_block *, struct nfs_mount_info *);
+void nfs_fill_super(struct super_block *, struct nfs_mount_info *);
extern struct rpc_stat nfs_rpcstat;
@@ -458,7 +458,7 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
/* super.c */
-int nfs_clone_super(struct super_block *, struct nfs_mount_info *);
+void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
void nfs_umount_begin(struct super_block *);
int nfs_statfs(struct dentry *, struct kstatfs *);
int nfs_show_options(struct seq_file *, struct dentry *);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index dc69314d455e..2f3822a4a7d5 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2321,11 +2321,10 @@ inline void nfs_initialise_sb(struct super_block *sb)
/*
* Finish setting up an NFS2/3 superblock
*/
-int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
{
struct nfs_parsed_mount_data *data = mount_info->parsed;
struct nfs_server *server = NFS_SB(sb);
- int ret;
sb->s_blocksize_bits = 0;
sb->s_blocksize = 0;
@@ -2343,21 +2342,13 @@ int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
}
nfs_initialise_sb(sb);
-
- ret = super_setup_bdi_name(sb, "%u:%u", MAJOR(server->s_dev),
- MINOR(server->s_dev));
- if (ret)
- return ret;
- sb->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
- return 0;
-
}
EXPORT_SYMBOL_GPL(nfs_fill_super);
/*
* Finish setting up a cloned NFS2/3/4 superblock
*/
-int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
{
const struct super_block *old_sb = mount_info->cloned->sb;
struct nfs_server *server = NFS_SB(sb);
@@ -2377,10 +2368,6 @@ int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
}
nfs_initialise_sb(sb);
-
- sb->s_bdi = bdi_get(old_sb->s_bdi);
-
- return 0;
}
static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
@@ -2600,14 +2587,19 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
nfs_free_server(server);
server = NULL;
} else {
+ error = super_setup_bdi_name(s, "%u:%u", MAJOR(server->s_dev),
+ MINOR(server->s_dev));
+ if (error) {
+ mntroot = ERR_PTR(error);
+ goto error_splat_super;
+ }
+ s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
server->super = s;
}
if (!s->s_root) {
/* initial superblock/root creation */
- error = mount_info->fill_super(s, mount_info);
- if (error)
- goto error_splat_super;
+ mount_info->fill_super(s, mount_info);
nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 8bf8f667a8cf..6493df6b1bd5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1146,7 +1146,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
{
- static struct tree_descr nfsd_files[] = {
+ static const struct tree_descr nfsd_files[] = {
[NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
[NFSD_Export_features] = {"export_features",
&export_features_operations, S_IRUGO},
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 323f492e0822..f3db56e83dd2 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -196,9 +196,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task,
{
struct ns_common *ns;
int res = -ENOENT;
+ const char *name;
ns = ns_ops->get(task);
if (ns) {
- res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum);
+ name = ns_ops->real_ns_name ? : ns_ops->name;
+ res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
ns_ops->put(ns);
}
return res;
diff --git a/fs/open.c b/fs/open.c
index 4d23f729dcc6..373787afd638 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -900,6 +900,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
+ /*
+ * Clear out all open flags we don't know about so that we don't report
+ * them in fcntl(F_GETFL) or similar interfaces.
+ */
+ flags &= VALID_OPEN_FLAGS;
+
if (flags & (O_CREAT | __O_TMPFILE))
op->mode = (mode & S_IALLUGO) | S_IFREG;
else
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index e1534c9bab16..c19f0787c9c6 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -180,6 +180,10 @@ static ssize_t orangefs_devreq_read(struct file *file,
return -EINVAL;
}
+ /* Check for an empty list before locking. */
+ if (list_empty(&orangefs_request_list))
+ return -EAGAIN;
+
restart:
/* Get next op (if any) from top of list. */
spin_lock(&orangefs_request_list_lock);
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index 284373a57a08..d327cbd17756 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -1,396 +1,404 @@
/*
- * (C) 2001 Clemson University and The University of Chicago
- *
- * See COPYING in top-level directory.
+ * Copyright 2017 Omnibond Systems, L.L.C.
*/
#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
+struct orangefs_dir_part {
+ struct orangefs_dir_part *next;
+ size_t len;
+};
+
+struct orangefs_dir {
+ __u64 token;
+ struct orangefs_dir_part *part;
+ loff_t end;
+ int error;
+};
+
+#define PART_SHIFT (24)
+#define PART_SIZE (1<<24)
+#define PART_MASK (~(PART_SIZE - 1))
+
/*
- * decode routine used by kmod to deal with the blob sent from
- * userspace for readdirs. The blob contains zero or more of these
- * sub-blobs:
- * __u32 - represents length of the character string that follows.
- * string - between 1 and ORANGEFS_NAME_MAX bytes long.
- * padding - (if needed) to cause the __u32 plus the string to be
- * eight byte aligned.
- * khandle - sizeof(khandle) bytes.
+ * There can be up to 512 directory entries. Each entry is encoded as
+ * follows:
+ * 4 bytes: string size (n)
+ * n bytes: string
+ * 1 byte: trailing zero
+ * padding to 8 bytes
+ * 16 bytes: khandle
+ * padding to 8 bytes
+ *
+ * The trailer_buf starts with a struct orangefs_readdir_response_s
+ * which must be skipped to get to the directory data.
+ *
+ * The data which is received from the userspace daemon is termed a
+ * part and is stored in a linked list in case more than one part is
+ * needed for a large directory.
+ *
+ * The position pointer (ctx->pos) encodes the part and offset at which
+ * to begin reading. Bits above PART_SHIFT encode the part and bits
+ * below PART_SHIFT encode the offset. Parts are stored in a linked
+ * list which grows as data is received from the server. The overhead
+ * associated with managing the list is presumed to be small compared to
+ * the overhead of communicating with the server.
+ *
+ * As data is received from the server, it is placed at the end of the
+ * part list. Data is parsed from the current position as it is needed.
+ * When data is determined to be corrupt, it is either because the
+ * userspace component has sent back corrupt data or because the file
+ * pointer has been moved to an invalid location. Since the two cannot
+ * be differentiated, return EIO.
+ *
+ * Part zero is synthesized to contain `.' and `..'. Part one is the
+ * first part of the part list.
*/
-static long decode_dirents(char *ptr, size_t size,
- struct orangefs_readdir_response_s *readdir)
+
+static int do_readdir(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry,
+ struct orangefs_kernel_op_s *op)
{
- int i;
- struct orangefs_readdir_response_s *rd =
- (struct orangefs_readdir_response_s *) ptr;
- char *buf = ptr;
- int khandle_size = sizeof(struct orangefs_khandle);
- size_t offset = offsetof(struct orangefs_readdir_response_s,
- dirent_array);
- /* 8 reflects eight byte alignment */
- int smallest_blob = khandle_size + 8;
- __u32 len;
- int aligned_len;
- int sizeof_u32 = sizeof(__u32);
- long ret;
-
- gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size);
-
- /* size is = offset on empty dirs, > offset on non-empty dirs... */
- if (size < offset) {
- gossip_err("%s: size:%zu: offset:%zu:\n",
- __func__,
- size,
- offset);
- ret = -EINVAL;
- goto out;
- }
+ struct orangefs_readdir_response_s *resp;
+ int bufi, r;
- if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) {
- gossip_err("%s: size:%zu: dirent_outcount:%d:\n",
- __func__,
- size,
- readdir->orangefs_dirent_outcount);
- ret = -EINVAL;
- goto out;
- }
+ /*
+ * Despite the badly named field, readdir does not use shared
+ * memory. However, there are a limited number of readdir
+ * slots, which must be allocated here. This flag simply tells
+ * the op scheduler to return the op here for retry.
+ */
+ op->uses_shared_memory = 1;
+ op->upcall.req.readdir.refn = oi->refn;
+ op->upcall.req.readdir.token = od->token;
+ op->upcall.req.readdir.max_dirent_count =
+ ORANGEFS_MAX_DIRENT_COUNT_READDIR;
- readdir->token = rd->token;
- readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount;
- readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount,
- sizeof(*readdir->dirent_array),
- GFP_KERNEL);
- if (readdir->dirent_array == NULL) {
- gossip_err("%s: kcalloc failed.\n", __func__);
- ret = -ENOMEM;
- goto out;
+again:
+ bufi = orangefs_readdir_index_get();
+ if (bufi < 0) {
+ od->error = bufi;
+ return bufi;
}
- buf += offset;
- size -= offset;
-
- for (i = 0; i < readdir->orangefs_dirent_outcount; i++) {
- if (size < smallest_blob) {
- gossip_err("%s: size:%zu: smallest_blob:%d:\n",
- __func__,
- size,
- smallest_blob);
- ret = -EINVAL;
- goto free;
- }
+ op->upcall.req.readdir.buf_index = bufi;
- len = *(__u32 *)buf;
- if ((len < 1) || (len > ORANGEFS_NAME_MAX)) {
- gossip_err("%s: len:%d:\n", __func__, len);
- ret = -EINVAL;
- goto free;
- }
+ r = service_operation(op, "orangefs_readdir",
+ get_interruptible_flag(dentry->d_inode));
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: size:%zu: len:%d:\n",
- __func__,
- size,
- len);
+ orangefs_readdir_index_put(bufi);
- readdir->dirent_array[i].d_name = buf + sizeof_u32;
- readdir->dirent_array[i].d_length = len;
+ if (op_state_purged(op)) {
+ if (r == -EAGAIN) {
+ vfree(op->downcall.trailer_buf);
+ goto again;
+ } else if (r == -EIO) {
+ vfree(op->downcall.trailer_buf);
+ od->error = r;
+ return r;
+ }
+ }
- /*
- * Calculate "aligned" length of this string and its
- * associated __u32 descriptor.
- */
- aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7;
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: aligned_len:%d:\n",
- __func__,
- aligned_len);
+ if (r < 0) {
+ vfree(op->downcall.trailer_buf);
+ od->error = r;
+ return r;
+ } else if (op->downcall.status) {
+ vfree(op->downcall.trailer_buf);
+ od->error = op->downcall.status;
+ return op->downcall.status;
+ }
- /*
- * The end of the blob should coincide with the end
- * of the last sub-blob.
- */
- if (size < aligned_len + khandle_size) {
- gossip_err("%s: ran off the end of the blob.\n",
- __func__);
- ret = -EINVAL;
- goto free;
- }
- size -= aligned_len + khandle_size;
+ /*
+ * The maximum size is size per entry times the 512 entries plus
+ * the header. This is well under the limit.
+ */
+ if (op->downcall.trailer_size > PART_SIZE) {
+ vfree(op->downcall.trailer_buf);
+ od->error = -EIO;
+ return -EIO;
+ }
- buf += aligned_len;
+ resp = (struct orangefs_readdir_response_s *)
+ op->downcall.trailer_buf;
+ od->token = resp->token;
+ return 0;
+}
- readdir->dirent_array[i].khandle =
- *(struct orangefs_khandle *) buf;
- buf += khandle_size;
+static int parse_readdir(struct orangefs_dir *od,
+ struct orangefs_kernel_op_s *op)
+{
+ struct orangefs_dir_part *part, *new;
+ size_t count;
+
+ count = 1;
+ part = od->part;
+ while (part) {
+ count++;
+ if (part->next)
+ part = part->next;
+ else
+ break;
}
- ret = buf - ptr;
- gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret);
- goto out;
-free:
- kfree(readdir->dirent_array);
- readdir->dirent_array = NULL;
+ new = (void *)op->downcall.trailer_buf;
+ new->next = NULL;
+ new->len = op->downcall.trailer_size -
+ sizeof(struct orangefs_readdir_response_s);
+ if (!od->part)
+ od->part = new;
+ else
+ part->next = new;
+ count++;
+ od->end = count << PART_SHIFT;
-out:
- return ret;
+ return 0;
}
-/*
- * Read directory entries from an instance of an open directory.
- */
-static int orangefs_readdir(struct file *file, struct dir_context *ctx)
+static int orangefs_dir_more(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry)
{
- int ret = 0;
- int buffer_index;
- /*
- * ptoken supports Orangefs' distributed directory logic, added
- * in 2.9.2.
- */
- __u64 *ptoken = file->private_data;
- __u64 pos = 0;
- ino_t ino = 0;
- struct dentry *dentry = file->f_path.dentry;
- struct orangefs_kernel_op_s *new_op = NULL;
- struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
- struct orangefs_readdir_response_s readdir_response;
- void *dents_buf;
- int i = 0;
- int len = 0;
- ino_t current_ino = 0;
- char *current_entry = NULL;
- long bytes_decoded;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: ctx->pos:%lld, ptoken = %llu\n",
- __func__,
- lld(ctx->pos),
- llu(*ptoken));
-
- pos = (__u64) ctx->pos;
-
- /* are we done? */
- if (pos == ORANGEFS_READDIR_END) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Skipping to termination path\n");
- return 0;
+ struct orangefs_kernel_op_s *op;
+ int r;
+
+ op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+ if (!op) {
+ od->error = -ENOMEM;
+ return -ENOMEM;
+ }
+ r = do_readdir(oi, od, dentry, op);
+ if (r) {
+ od->error = r;
+ goto out;
+ }
+ r = parse_readdir(od, op);
+ if (r) {
+ od->error = r;
+ goto out;
}
- gossip_debug(GOSSIP_DIR_DEBUG,
- "orangefs_readdir called on %pd (pos=%llu)\n",
- dentry, llu(pos));
+ od->error = 0;
+out:
+ op_release(op);
+ return od->error;
+}
- memset(&readdir_response, 0, sizeof(readdir_response));
+static int fill_from_part(struct orangefs_dir_part *part,
+ struct dir_context *ctx)
+{
+ const int offset = sizeof(struct orangefs_readdir_response_s);
+ struct orangefs_khandle *khandle;
+ __u32 *len, padlen;
+ loff_t i;
+ char *s;
+ i = ctx->pos & ~PART_MASK;
- new_op = op_alloc(ORANGEFS_VFS_OP_READDIR);
- if (!new_op)
- return -ENOMEM;
+ /* The file offset from userspace is too large. */
+ if (i > part->len)
+ return 1;
/*
- * Only the indices are shared. No memory is actually shared, but the
- * mechanism is used.
+ * If the seek pointer is positioned just before an entry it
+ * should find the next entry.
*/
- new_op->uses_shared_memory = 1;
- new_op->upcall.req.readdir.refn = orangefs_inode->refn;
- new_op->upcall.req.readdir.max_dirent_count =
- ORANGEFS_MAX_DIRENT_COUNT_READDIR;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: upcall.req.readdir.refn.khandle: %pU\n",
- __func__,
- &new_op->upcall.req.readdir.refn.khandle);
+ if (i % 8)
+ i = i + (8 - i%8)%8;
- new_op->upcall.req.readdir.token = *ptoken;
-
-get_new_buffer_index:
- buffer_index = orangefs_readdir_index_get();
- if (buffer_index < 0) {
- ret = buffer_index;
- gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n",
- ret);
- goto out_free_op;
+ while (i < part->len) {
+ if (part->len < i + sizeof *len)
+ break;
+ len = (void *)part + offset + i;
+ /*
+ * len is the size of the string itself. padlen is the
+ * total size of the encoded string.
+ */
+ padlen = (sizeof *len + *len + 1) +
+ (8 - (sizeof *len + *len + 1)%8)%8;
+ if (part->len < i + padlen + sizeof *khandle)
+ goto next;
+ s = (void *)part + offset + i + sizeof *len;
+ if (s[*len] != 0)
+ goto next;
+ khandle = (void *)part + offset + i + padlen;
+ if (!dir_emit(ctx, s, *len,
+ orangefs_khandle_to_ino(khandle),
+ DT_UNKNOWN))
+ return 0;
+ i += padlen + sizeof *khandle;
+ i = i + (8 - i%8)%8;
+ BUG_ON(i > part->len);
+ ctx->pos = (ctx->pos & PART_MASK) | i;
+ continue;
+next:
+ i += 8;
}
- new_op->upcall.req.readdir.buf_index = buffer_index;
-
- ret = service_operation(new_op,
- "orangefs_readdir",
- get_interruptible_flag(dentry->d_inode));
+ return 1;
+}
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Readdir downcall status is %d. ret:%d\n",
- new_op->downcall.status,
- ret);
+static int orangefs_dir_fill(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry,
+ struct dir_context *ctx)
+{
+ struct orangefs_dir_part *part;
+ size_t count;
- orangefs_readdir_index_put(buffer_index);
+ count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1;
- if (ret == -EAGAIN && op_state_purged(new_op)) {
- /* Client-core indices are invalid after it restarted. */
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: Getting new buffer_index for retry of readdir..\n",
- __func__);
- goto get_new_buffer_index;
+ part = od->part;
+ while (part->next && count) {
+ count--;
+ part = part->next;
}
-
- if (ret == -EIO && op_state_purged(new_op)) {
- gossip_err("%s: Client is down. Aborting readdir call.\n",
- __func__);
- goto out_free_op;
+ /* This means the userspace file offset is invalid. */
+ if (count) {
+ od->error = -EIO;
+ return -EIO;
}
- if (ret < 0 || new_op->downcall.status != 0) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Readdir request failed. Status:%d\n",
- new_op->downcall.status);
- if (ret >= 0)
- ret = new_op->downcall.status;
- goto out_free_op;
+ while (part && part->len) {
+ int r;
+ r = fill_from_part(part, ctx);
+ if (r < 0) {
+ od->error = r;
+ return r;
+ } else if (r == 0) {
+ /* Userspace buffer is full. */
+ break;
+ } else {
+ /*
+ * The part ran out of data. Move to the next
+ * part. */
+ ctx->pos = (ctx->pos & PART_MASK) +
+ (1 << PART_SHIFT);
+ part = part->next;
+ }
}
+ return 0;
+}
- dents_buf = new_op->downcall.trailer_buf;
- if (dents_buf == NULL) {
- gossip_err("Invalid NULL buffer in readdir response\n");
- ret = -ENOMEM;
- goto out_free_op;
+static loff_t orangefs_dir_llseek(struct file *file, loff_t offset,
+ int whence)
+{
+ struct orangefs_dir *od = file->private_data;
+ /*
+ * Delete the stored data so userspace sees new directory
+ * entries.
+ */
+ if (!whence && offset < od->end) {
+ struct orangefs_dir_part *part = od->part;
+ while (part) {
+ struct orangefs_dir_part *next = part->next;
+ vfree(part);
+ part = next;
+ }
+ od->token = ORANGEFS_ITERATE_START;
+ od->part = NULL;
+ od->end = 1 << PART_SHIFT;
}
+ return default_llseek(file, offset, whence);
+}
- bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
- &readdir_response);
- if (bytes_decoded < 0) {
- ret = bytes_decoded;
- gossip_err("Could not decode readdir from buffer %d\n", ret);
- goto out_vfree;
- }
+static int orangefs_dir_iterate(struct file *file,
+ struct dir_context *ctx)
+{
+ struct orangefs_inode_s *oi;
+ struct orangefs_dir *od;
+ struct dentry *dentry;
+ int r;
- if (bytes_decoded != new_op->downcall.trailer_size) {
- gossip_err("orangefs_readdir: # bytes decoded (%ld) "
- "!= trailer size (%ld)\n",
- bytes_decoded,
- (long)new_op->downcall.trailer_size);
- ret = -EINVAL;
- goto out_destroy_handle;
- }
+ dentry = file->f_path.dentry;
+ oi = ORANGEFS_I(dentry->d_inode);
+ od = file->private_data;
- /*
- * orangefs doesn't actually store dot and dot-dot, but
- * we need to have them represented.
- */
- if (pos == 0) {
- ino = get_ino_from_khandle(dentry->d_inode);
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: calling dir_emit of \".\" with pos = %llu\n",
- __func__,
- llu(pos));
- ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
- pos += 1;
- }
+ if (od->error)
+ return od->error;
- if (pos == 1) {
- ino = get_parent_ino_from_dentry(dentry);
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: calling dir_emit of \"..\" with pos = %llu\n",
- __func__,
- llu(pos));
- ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
- pos += 1;
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
+ return 0;
+ ctx->pos++;
+ }
+ if (ctx->pos == 1) {
+ if (!dir_emit_dotdot(file, ctx))
+ return 0;
+ ctx->pos = 1 << PART_SHIFT;
}
/*
- * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around
- * to prevent "finding" dot and dot-dot on any iteration
- * other than the first.
+ * The seek position is in the first synthesized part but is not
+ * valid.
*/
- if (ctx->pos == ORANGEFS_ITERATE_NEXT)
- ctx->pos = 0;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: dirent_outcount:%d:\n",
- __func__,
- readdir_response.orangefs_dirent_outcount);
- for (i = ctx->pos;
- i < readdir_response.orangefs_dirent_outcount;
- i++) {
- len = readdir_response.dirent_array[i].d_length;
- current_entry = readdir_response.dirent_array[i].d_name;
- current_ino = orangefs_khandle_to_ino(
- &readdir_response.dirent_array[i].khandle);
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "calling dir_emit for %s with len %d"
- ", ctx->pos %ld\n",
- current_entry,
- len,
- (unsigned long)ctx->pos);
- /*
- * type is unknown. We don't return object type
- * in the dirent_array. This leaves getdents
- * clueless about type.
- */
- ret =
- dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
- if (!ret)
- break;
- ctx->pos++;
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: ctx->pos:%lld\n",
- __func__,
- lld(ctx->pos));
+ if ((ctx->pos & PART_MASK) == 0)
+ return -EIO;
- }
+ r = 0;
/*
- * we ran all the way through the last batch, set up for
- * getting another batch...
+ * Must read more if the user has sought past what has been read
+ * so far. Stop a user who has sought past the end.
*/
- if (ret) {
- *ptoken = readdir_response.token;
- ctx->pos = ORANGEFS_ITERATE_NEXT;
+ while (od->token != ORANGEFS_ITERATE_END &&
+ ctx->pos > od->end) {
+ r = orangefs_dir_more(oi, od, dentry);
+ if (r)
+ return r;
+ }
+ if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end)
+ return -EIO;
+
+ /* Then try to fill if there's any left in the buffer. */
+ if (ctx->pos < od->end) {
+ r = orangefs_dir_fill(oi, od, dentry, ctx);
+ if (r)
+ return r;
}
- /*
- * Did we hit the end of the directory?
- */
- if (readdir_response.token == ORANGEFS_READDIR_END) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
- ctx->pos = ORANGEFS_READDIR_END;
+ /* Finally get some more and try to fill. */
+ if (od->token != ORANGEFS_ITERATE_END) {
+ r = orangefs_dir_more(oi, od, dentry);
+ if (r)
+ return r;
+ r = orangefs_dir_fill(oi, od, dentry, ctx);
}
-out_destroy_handle:
- /* kfree(NULL) is safe */
- kfree(readdir_response.dirent_array);
-out_vfree:
- gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
- vfree(dents_buf);
-out_free_op:
- op_release(new_op);
- gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
- return ret;
+ return r;
}
static int orangefs_dir_open(struct inode *inode, struct file *file)
{
- __u64 *ptoken;
-
- file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL);
+ struct orangefs_dir *od;
+ file->private_data = kmalloc(sizeof(struct orangefs_dir),
+ GFP_KERNEL);
if (!file->private_data)
return -ENOMEM;
-
- ptoken = file->private_data;
- *ptoken = ORANGEFS_READDIR_START;
+ od = file->private_data;
+ od->token = ORANGEFS_ITERATE_START;
+ od->part = NULL;
+ od->end = 1 << PART_SHIFT;
+ od->error = 0;
return 0;
}
static int orangefs_dir_release(struct inode *inode, struct file *file)
{
+ struct orangefs_dir *od = file->private_data;
+ struct orangefs_dir_part *part = od->part;
orangefs_flush_inode(inode);
- kfree(file->private_data);
+ while (part) {
+ struct orangefs_dir_part *next = part->next;
+ vfree(part);
+ part = next;
+ }
+ kfree(od);
return 0;
}
-/** ORANGEFS implementation of VFS directory operations */
const struct file_operations orangefs_dir_operations = {
+ .llseek = orangefs_dir_llseek,
.read = generic_read_dir,
- .iterate = orangefs_readdir,
+ .iterate = orangefs_dir_iterate,
.open = orangefs_dir_open,
- .release = orangefs_dir_release,
+ .release = orangefs_dir_release
};
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
index 3b8923f8bf21..163001c95501 100644
--- a/fs/orangefs/downcall.h
+++ b/fs/orangefs/downcall.h
@@ -40,16 +40,6 @@ struct orangefs_mkdir_response {
struct orangefs_object_kref refn;
};
-/*
- * duplication of some system interface structures so that I don't have
- * to allocate extra memory
- */
-struct orangefs_dirent {
- char *d_name;
- int d_length;
- struct orangefs_khandle khandle;
-};
-
struct orangefs_statfs_response {
__s64 block_size;
__s64 blocks_total;
@@ -131,12 +121,16 @@ struct orangefs_downcall_s {
} resp;
};
+/*
+ * The readdir response comes in the trailer. It is followed by the
+ * directory entries as described in dir.c.
+ */
+
struct orangefs_readdir_response_s {
__u64 token;
__u64 directory_version;
__u32 __pad2;
__u32 orangefs_dirent_outcount;
- struct orangefs_dirent *dirent_array;
};
#endif /* __DOWNCALL_H */
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 1cd37ebc4f25..28f38d813ad2 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -474,7 +474,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
/* Make sure generic_write_checks sees an up to date inode size. */
if (file->f_flags & O_APPEND) {
- rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+ rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
+ STATX_SIZE);
if (rc == -ESTALE)
rc = -EIO;
if (rc) {
@@ -692,7 +693,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
* NOTE: We are only interested in file size here,
* so we set mask accordingly.
*/
- ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+ ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
+ STATX_SIZE);
if (ret == -ESTALE)
ret = -EIO;
if (ret) {
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index a304bf34b212..9428ea0aac16 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -161,7 +161,7 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
iattr->ia_size);
/* Ensure that we have a up to date size, so we know if it changed. */
- ret = orangefs_inode_getattr(inode, 0, 1);
+ ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE);
if (ret == -ESTALE)
ret = -EIO;
if (ret) {
@@ -218,8 +218,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
if (ret)
goto out;
- if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(inode)) {
+ if (iattr->ia_valid & ATTR_SIZE) {
ret = orangefs_setattr_size(inode, iattr);
if (ret)
goto out;
@@ -256,13 +255,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
"orangefs_getattr: called on %pd\n",
path->dentry);
- ret = orangefs_inode_getattr(inode, 0, 0);
+ ret = orangefs_inode_getattr(inode, 0, 0, request_mask);
if (ret == 0) {
generic_fillattr(inode, stat);
/* override block size reported to stat */
orangefs_inode = ORANGEFS_I(inode);
stat->blksize = orangefs_inode->blksize;
+
+ if (request_mask & STATX_SIZE)
+ stat->result_mask = STATX_BASIC_STATS;
+ else
+ stat->result_mask = STATX_BASIC_STATS &
+ ~STATX_SIZE;
}
return ret;
}
@@ -277,7 +282,7 @@ int orangefs_permission(struct inode *inode, int mask)
gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
/* Make sure the permission (and other common attrs) are up to date. */
- ret = orangefs_inode_getattr(inode, 0, 0);
+ ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE);
if (ret < 0)
return ret;
@@ -375,7 +380,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref
if (!inode || !(inode->i_state & I_NEW))
return inode;
- error = orangefs_inode_getattr(inode, 1, 1);
+ error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
if (error) {
iget_failed(inode);
return ERR_PTR(error);
@@ -420,7 +425,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */
- error = orangefs_inode_getattr(inode, 1, 1);
+ error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
if (error)
goto out_iput;
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index a290ff6ec756..478e88bd7f9d 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -74,6 +74,7 @@ static int orangefs_create(struct inode *dir,
unlock_new_inode(inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+ ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
gossip_debug(GOSSIP_NAME_DEBUG,
"%s: dentry instantiated for %pd\n",
@@ -193,8 +194,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- ORANGEFS_I(inode)->getattr_time = jiffies - 1;
-
gossip_debug(GOSSIP_NAME_DEBUG,
"%s:%s:%d "
"Found good inode [%lu] with count [%d]\n",
@@ -324,6 +323,7 @@ static int orangefs_symlink(struct inode *dir,
unlock_new_inode(inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+ ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Symlink) %pU -> %pd\n",
@@ -388,6 +388,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
unlock_new_inode(inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+ ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Directory) %pU -> %pd\n",
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 791912da97d7..716ed337f166 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -440,6 +440,9 @@ static ssize_t orangefs_debug_write(struct file *file,
"orangefs_debug_write: %pD\n",
file);
+ if (count == 0)
+ return 0;
+
/*
* Thwart users who try to jamb a ridiculous number
* of bytes into the debug file...
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
index f380f9ed1b28..efe08c763e56 100644
--- a/fs/orangefs/orangefs-dev-proto.h
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -52,12 +52,7 @@
*/
#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000800
-/*
- * The maximum number of directory entries in a single request is 96.
- * XXX: Why can this not be higher. The client-side code can handle up to 512.
- * XXX: What happens if we expect more than the client can return?
- */
-#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96
+#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 512
#include "upcall.h"
#include "downcall.h"
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 8afac46fcc87..ea0ce507a6ab 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -215,6 +215,7 @@ struct orangefs_inode_s {
unsigned long pinode_flags;
unsigned long getattr_time;
+ u32 getattr_mask;
};
#define P_ATIME_FLAG 0
@@ -340,11 +341,6 @@ static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode)
return &(ORANGEFS_I(inode)->refn.khandle);
}
-static inline __s32 get_fsid_from_ino(struct inode *inode)
-{
- return ORANGEFS_I(inode)->refn.fs_id;
-}
-
static inline ino_t get_ino_from_khandle(struct inode *inode)
{
struct orangefs_khandle *khandle;
@@ -500,7 +496,8 @@ int orangefs_inode_setxattr(struct inode *inode,
size_t size,
int flags);
-int orangefs_inode_getattr(struct inode *inode, int new, int bypass);
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
+ u32 request_mask);
int orangefs_inode_check_changed(struct inode *inode);
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 9b96b99539d6..aab6f1842963 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -251,7 +251,8 @@ static int orangefs_inode_is_stale(struct inode *inode, int new,
return 0;
}
-int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
+ u32 request_mask)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
@@ -262,7 +263,13 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
get_khandle_from_ino(inode));
if (!new && !bypass) {
- if (time_before(jiffies, orangefs_inode->getattr_time))
+ /*
+ * Must have all the attributes in the mask and be within cache
+ * time.
+ */
+ if ((request_mask & orangefs_inode->getattr_mask) ==
+ request_mask &&
+ time_before(jiffies, orangefs_inode->getattr_time))
return 0;
}
@@ -270,7 +277,15 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
if (!new_op)
return -ENOMEM;
new_op->upcall.req.getattr.refn = orangefs_inode->refn;
- new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
+ /*
+ * Size is the hardest attribute to get. The incremental cost of any
+ * other attribute is essentially zero.
+ */
+ if (request_mask & STATX_SIZE || new)
+ new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
+ else
+ new_op->upcall.req.getattr.mask =
+ ORANGEFS_ATTR_SYS_ALL_NOHINT & ~ORANGEFS_ATTR_SYS_SIZE;
ret = service_operation(new_op, __func__,
get_interruptible_flag(inode));
@@ -291,25 +306,29 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
case S_IFREG:
inode->i_flags = orangefs_inode_flags(&new_op->
downcall.resp.getattr.attributes);
- inode_size = (loff_t)new_op->
- downcall.resp.getattr.attributes.size;
- rounded_up_size =
- (inode_size + (4096 - (inode_size % 4096)));
- inode->i_size = inode_size;
- orangefs_inode->blksize =
- new_op->downcall.resp.getattr.attributes.blksize;
- spin_lock(&inode->i_lock);
- inode->i_bytes = inode_size;
- inode->i_blocks =
- (unsigned long)(rounded_up_size / 512);
- spin_unlock(&inode->i_lock);
+ if (request_mask & STATX_SIZE || new) {
+ inode_size = (loff_t)new_op->
+ downcall.resp.getattr.attributes.size;
+ rounded_up_size =
+ (inode_size + (4096 - (inode_size % 4096)));
+ inode->i_size = inode_size;
+ orangefs_inode->blksize =
+ new_op->downcall.resp.getattr.attributes.blksize;
+ spin_lock(&inode->i_lock);
+ inode->i_bytes = inode_size;
+ inode->i_blocks =
+ (unsigned long)(rounded_up_size / 512);
+ spin_unlock(&inode->i_lock);
+ }
break;
case S_IFDIR:
- inode->i_size = PAGE_SIZE;
- orangefs_inode->blksize = i_blocksize(inode);
- spin_lock(&inode->i_lock);
- inode_set_bytes(inode, inode->i_size);
- spin_unlock(&inode->i_lock);
+ if (request_mask & STATX_SIZE || new) {
+ inode->i_size = PAGE_SIZE;
+ orangefs_inode->blksize = i_blocksize(inode);
+ spin_lock(&inode->i_lock);
+ inode_set_bytes(inode, inode->i_size);
+ spin_unlock(&inode->i_lock);
+ }
set_nlink(inode, 1);
break;
case S_IFLNK:
@@ -349,6 +368,10 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
orangefs_inode->getattr_time = jiffies +
orangefs_getattr_timeout_msecs*HZ/1000;
+ if (request_mask & STATX_SIZE || new)
+ orangefs_inode->getattr_mask = STATX_BASIC_STATS;
+ else
+ orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE;
ret = 0;
out:
op_release(new_op);
@@ -500,41 +523,6 @@ int orangefs_flush_inode(struct inode *inode)
return ret;
}
-int orangefs_unmount_sb(struct super_block *sb)
-{
- int ret = -EINVAL;
- struct orangefs_kernel_op_s *new_op = NULL;
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "orangefs_unmount_sb called on sb %p\n",
- sb);
-
- new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
- if (!new_op)
- return -ENOMEM;
- new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id;
- new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id;
- strncpy(new_op->upcall.req.fs_umount.orangefs_config_server,
- ORANGEFS_SB(sb)->devname,
- ORANGEFS_MAX_SERVER_ADDR_LEN);
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "Attempting ORANGEFS Unmount via host %s\n",
- new_op->upcall.req.fs_umount.orangefs_config_server);
-
- ret = service_operation(new_op, "orangefs_fs_umount", 0);
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "orangefs_unmount: got return value of %d\n", ret);
- if (ret)
- sb = ERR_PTR(ret);
- else
- ORANGEFS_SB(sb)->mount_pending = 1;
-
- op_release(new_op);
- return ret;
-}
-
void orangefs_make_bad_inode(struct inode *inode)
{
if (is_root_handle(inode)) {
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 971307ad69be..48bcc1bbe415 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -138,13 +138,8 @@ typedef __s64 ORANGEFS_offset;
#define ORANGEFS_G_SGID (1 << 10)
#define ORANGEFS_U_SUID (1 << 11)
-/* definition taken from stdint.h */
-#define INT32_MAX (2147483647)
-#define ORANGEFS_ITERATE_START (INT32_MAX - 1)
-#define ORANGEFS_ITERATE_END (INT32_MAX - 2)
-#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3)
-#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START
-#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END
+#define ORANGEFS_ITERATE_START 2147483646
+#define ORANGEFS_ITERATE_END 2147483645
#define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL
#define ORANGEFS_APPEND_FL FS_APPEND_FL
#define ORANGEFS_NOATIME_FL FS_NOATIME_FL
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 629d8c917fa6..5c7c273e17ec 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -376,6 +376,25 @@ static const struct export_operations orangefs_export_ops = {
.fh_to_dentry = orangefs_fh_to_dentry,
};
+static int orangefs_unmount(int id, __s32 fs_id, const char *devname)
+{
+ struct orangefs_kernel_op_s *op;
+ int r;
+ op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
+ if (!op)
+ return -ENOMEM;
+ op->upcall.req.fs_umount.id = id;
+ op->upcall.req.fs_umount.fs_id = fs_id;
+ strncpy(op->upcall.req.fs_umount.orangefs_config_server,
+ devname, ORANGEFS_MAX_SERVER_ADDR_LEN);
+ r = service_operation(op, "orangefs_fs_umount", 0);
+ /* Not much to do about an error here. */
+ if (r)
+ gossip_err("orangefs_unmount: service_operation %d\n", r);
+ op_release(op);
+ return r;
+}
+
static int orangefs_fill_sb(struct super_block *sb,
struct orangefs_fs_mount_response *fs_mount,
void *data, int silent)
@@ -484,6 +503,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
if (IS_ERR(sb)) {
d = ERR_CAST(sb);
+ orangefs_unmount(new_op->downcall.resp.fs_mount.id,
+ new_op->downcall.resp.fs_mount.fs_id, devname);
goto free_op;
}
@@ -539,6 +560,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
free_sb_and_op:
/* Will call orangefs_kill_sb with sb not in list. */
ORANGEFS_SB(sb)->no_list = 1;
+ /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */
deactivate_locked_super(sb);
free_op:
gossip_err("orangefs_mount: mount request failed with %d\n", ret);
@@ -554,6 +576,7 @@ free_op:
void orangefs_kill_sb(struct super_block *sb)
{
+ int r;
gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n");
/* provided sb cleanup */
@@ -563,7 +586,10 @@ void orangefs_kill_sb(struct super_block *sb)
* issue the unmount to userspace to tell it to remove the
* dynamic mount info it has for this superblock
*/
- orangefs_unmount_sb(sb);
+ r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id,
+ ORANGEFS_SB(sb)->devname);
+ if (!r)
+ ORANGEFS_SB(sb)->mount_pending = 1;
if (!ORANGEFS_SB(sb)->no_list) {
/* remove the sb from our list of orangefs specific sb's */
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index abcfa3fa9992..61e2ca7fec55 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -124,7 +124,14 @@ retry_servicing:
gossip_debug(GOSSIP_WAIT_DEBUG,
"%s:client core is NOT in service.\n",
__func__);
- timeout = op_timeout_secs * HZ;
+ /*
+ * Don't wait for the userspace component to return if
+ * the filesystem is being umounted anyway.
+ */
+ if (op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT)
+ timeout = 0;
+ else
+ timeout = op_timeout_secs * HZ;
}
spin_unlock(&orangefs_request_list_lock);
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 74a81b1daaac..237c9c04dc3b 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -76,11 +76,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
- gossip_err("Invalid key length (%d)\n",
- (int)strlen(name));
+ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
return -EINVAL;
- }
fsuid = from_kuid(&init_user_ns, current_fsuid());
fsgid = from_kgid(&init_user_ns, current_fsgid());
@@ -172,6 +169,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name,
struct orangefs_kernel_op_s *new_op = NULL;
int ret = -ENOMEM;
+ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
+ return -EINVAL;
+
down_write(&orangefs_inode->xattr_sem);
new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR);
if (!new_op)
@@ -231,23 +231,13 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name,
"%s: name %s, buffer_size %zd\n",
__func__, name, size);
- if (size >= ORANGEFS_MAX_XATTR_VALUELEN ||
- flags < 0) {
- gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n",
- (int)size,
- flags);
+ if (size > ORANGEFS_MAX_XATTR_VALUELEN)
+ return -EINVAL;
+ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
return -EINVAL;
- }
internal_flag = convert_to_internal_xattr_flags(flags);
- if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
- gossip_err
- ("orangefs_inode_setxattr: bogus key size (%d)\n",
- (int)(strlen(name)));
- return -EINVAL;
- }
-
/* This is equivalent to a removexattr */
if (size == 0 && value == NULL) {
gossip_debug(GOSSIP_XATTR_DEBUG,
@@ -358,7 +348,7 @@ try_again:
returned_count = new_op->downcall.resp.listxattr.returned_count;
if (returned_count < 0 ||
- returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) {
+ returned_count > ORANGEFS_MAX_XATTR_LISTLEN) {
gossip_err("%s: impossible value for returned_count:%d:\n",
__func__,
returned_count);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9e3ac5c11780..45f6bf68fff3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -821,10 +821,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- /* Maybe we should limit FOLL_FORCE to actual ptrace users? */
- flags = FOLL_FORCE;
- if (write)
- flags |= FOLL_WRITE;
+ flags = write ? FOLL_WRITE : 0;
while (count > 0) {
int this_len = min_t(int, count, PAGE_SIZE);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index ee27feb34cf4..9425c0d97262 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -472,6 +472,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
ent->data = NULL;
ent->proc_fops = NULL;
ent->proc_iops = NULL;
+ parent->nlink++;
if (proc_register(parent, ent) < 0) {
kfree(ent);
parent->nlink--;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2cc7a8030275..e250910cffc8 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
struct proc_inode *ei;
struct inode *inode;
- ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 766f0c637ad1..3803b24ca220 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = {
#endif
#ifdef CONFIG_PID_NS
&pidns_operations,
+ &pidns_for_children_operations,
#endif
#ifdef CONFIG_USER_NS
&userns_operations,
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d04ea4349909..67985a7233c2 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -408,10 +408,6 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr
*pentry = entry;
}
-void register_sysctl_root(struct ctl_table_root *root)
-{
-}
-
/*
* sysctl_perm does NOT grant the superuser all rights automatically, because
* some sysctl variables are readonly even to root.
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index aca73dd73906..e3c558d1b78c 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -724,18 +724,18 @@ static void errcatch_print_vi(struct virtual_item *vi)
}
static struct item_operations errcatch_ops = {
- errcatch_bytes_number,
- errcatch_decrement_key,
- errcatch_is_left_mergeable,
- errcatch_print_item,
- errcatch_check_item,
-
- errcatch_create_vi,
- errcatch_check_left,
- errcatch_check_right,
- errcatch_part_size,
- errcatch_unit_num,
- errcatch_print_vi
+ .bytes_number = errcatch_bytes_number,
+ .decrement_key = errcatch_decrement_key,
+ .is_left_mergeable = errcatch_is_left_mergeable,
+ .print_item = errcatch_print_item,
+ .check_item = errcatch_check_item,
+
+ .create_vi = errcatch_create_vi,
+ .check_left = errcatch_check_left,
+ .check_right = errcatch_check_right,
+ .part_size = errcatch_part_size,
+ .unit_num = errcatch_unit_num,
+ .print_vi = errcatch_print_vi
};
#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
diff --git a/fs/select.c b/fs/select.c
index bd4b2ccfd346..d6c652a31e99 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -633,10 +633,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
goto out_nofds;
alloc_size = 6 * size;
- bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN);
- if (!bits && alloc_size > PAGE_SIZE)
- bits = vmalloc(alloc_size);
-
+ bits = kvmalloc(alloc_size, GFP_KERNEL);
if (!bits)
goto out_nofds;
}
diff --git a/fs/seq_file.c b/fs/seq_file.c
index ca69fb99e41a..dc7c2be963ed 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -25,21 +25,7 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
- void *buf;
- gfp_t gfp = GFP_KERNEL;
-
- /*
- * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
- * it's better to fall back to vmalloc() than to kill things. For small
- * allocations, just use GFP_KERNEL which will oom kill, thus no need
- * for vmalloc fallback.
- */
- if (size > PAGE_SIZE)
- gfp |= __GFP_NORETRY | __GFP_NOWARN;
- buf = kmalloc(size, gfp);
- if (!buf && size > PAGE_SIZE)
- buf = vmalloc(size);
- return buf;
+ return kvmalloc(size, GFP_KERNEL);
}
/**
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 21d36d284735..328e89c2cf83 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -266,7 +266,7 @@ static const struct super_operations tracefs_super_operations = {
static int trace_fill_super(struct super_block *sb, void *data, int silent)
{
- static struct tree_descr trace_files[] = {{""}};
+ static const struct tree_descr trace_files[] = {{""}};
struct tracefs_fs_info *fsi;
int err;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index b777bddaa1dd..566079d9b402 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -121,7 +121,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
inode_init_owner(inode, dir, mode);
inode->i_mtime = inode->i_atime = inode->i_ctime =
- ubifs_current_time(inode);
+ current_time(inode);
inode->i_mapping->nrpages = 0;
switch (mode & S_IFMT) {
@@ -285,6 +285,15 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
goto out_dent;
}
+ if (ubifs_crypt_is_encrypted(dir) &&
+ (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
+ !fscrypt_has_permitted_context(dir, inode)) {
+ ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu",
+ dir->i_ino, inode->i_ino);
+ err = -EPERM;
+ goto out_inode;
+ }
+
done:
kfree(dent);
fscrypt_free_filename(&nm);
@@ -295,6 +304,8 @@ done:
d_add(dentry, inode);
return NULL;
+out_inode:
+ iput(inode);
out_dent:
kfree(dent);
out_fname:
@@ -755,7 +766,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
inc_nlink(inode);
ihold(inode);
- inode->i_ctime = ubifs_current_time(inode);
+ inode->i_ctime = current_time(inode);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
@@ -830,7 +841,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
}
lock_2_inodes(dir, inode);
- inode->i_ctime = ubifs_current_time(dir);
+ inode->i_ctime = current_time(dir);
drop_nlink(inode);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
@@ -934,7 +945,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
}
lock_2_inodes(dir, inode);
- inode->i_ctime = ubifs_current_time(dir);
+ inode->i_ctime = current_time(dir);
clear_nlink(inode);
drop_nlink(dir);
dir->i_size -= sz_change;
@@ -1411,7 +1422,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the @i_ctime for inodes on a
* rename.
*/
- time = ubifs_current_time(old_dir);
+ time = current_time(old_dir);
old_inode->i_ctime = time;
/* We must adjust parent link count when renaming directories */
@@ -1584,7 +1595,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
lock_4_inodes(old_dir, new_dir, NULL, NULL);
- time = ubifs_current_time(old_dir);
+ time = current_time(old_dir);
fst_inode->i_ctime = time;
snd_inode->i_ctime = time;
old_dir->i_mtime = old_dir->i_ctime = time;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index d9ae86f96df7..2cda3d67e2d0 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1196,7 +1196,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
/* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
@@ -1243,7 +1243,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
if (attr->ia_valid & ATTR_SIZE) {
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
/* 'truncate_setsize()' changed @i_size, update @ui_size */
ui->ui_size = inode->i_size;
}
@@ -1420,7 +1420,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time,
*/
static int update_mctime(struct inode *inode)
{
- struct timespec now = ubifs_current_time(inode);
+ struct timespec now = current_time(inode);
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1434,7 +1434,7 @@ static int update_mctime(struct inode *inode)
return err;
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
@@ -1511,7 +1511,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct ubifs_info *c = inode->i_sb->s_fs_info;
- struct timespec now = ubifs_current_time(inode);
+ struct timespec now = current_time(inode);
struct ubifs_budget_req req = { .new_page = 1 };
int err, update_time;
@@ -1579,7 +1579,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index da519ba205f6..12b9eb5005ff 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -126,7 +126,7 @@ static int setflags(struct inode *inode, int flags)
ui->flags = ioctl2ubifs(flags);
ubifs_set_inode_flags(inode);
- inode->i_ctime = ubifs_current_time(inode);
+ inode->i_ctime = current_time(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 8ece6ca58c0b..caf83d68fb38 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -225,16 +225,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,
}
/**
- * ubifs_current_time - round current time to time granularity.
- * @inode: inode
- */
-static inline struct timespec ubifs_current_time(struct inode *inode)
-{
- return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
- current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
-}
-
-/**
* ubifs_tnc_lookup - look up a file-system node.
* @c: UBIFS file-system description object
* @key: node key to lookup
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 7f1ead29e727..8c25081a5109 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -84,6 +84,8 @@ static int create_default_filesystem(struct ubifs_info *c)
int min_leb_cnt = UBIFS_MIN_LEB_CNT;
long long tmp64, main_bytes;
__le64 tmp_le64;
+ __le32 tmp_le32;
+ struct timespec ts;
/* Some functions called from here depend on the @c->key_len filed */
c->key_len = UBIFS_SK_LEN;
@@ -298,13 +300,17 @@ static int create_default_filesystem(struct ubifs_info *c)
ino->ch.node_type = UBIFS_INO_NODE;
ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
ino->nlink = cpu_to_le32(2);
- tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
+
+ ktime_get_real_ts(&ts);
+ ts = timespec_trunc(ts, DEFAULT_TIME_GRAN);
+ tmp_le64 = cpu_to_le64(ts.tv_sec);
ino->atime_sec = tmp_le64;
ino->ctime_sec = tmp_le64;
ino->mtime_sec = tmp_le64;
- ino->atime_nsec = 0;
- ino->ctime_nsec = 0;
- ino->mtime_nsec = 0;
+ tmp_le32 = cpu_to_le32(ts.tv_nsec);
+ ino->atime_nsec = tmp_le32;
+ ino->ctime_nsec = tmp_le32;
+ ino->mtime_nsec = tmp_le32;
ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);
ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index efe00fcb8b75..3e53fdbf7997 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -152,7 +152,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
ui->data_len = size;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = ubifs_current_time(host);
+ host->i_ctime = current_time(host);
host_ui->xattr_cnt += 1;
host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -234,7 +234,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
mutex_unlock(&ui->ui_mutex);
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = ubifs_current_time(host);
+ host->i_ctime = current_time(host);
host_ui->xattr_size -= CALC_XATTR_BYTES(old_size);
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -488,7 +488,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
return err;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = ubifs_current_time(host);
+ host->i_ctime = current_time(host);
host_ui->xattr_cnt -= 1;
host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 9774555b3721..d1dd8cc33179 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -176,6 +176,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
struct inode * inode;
+ struct timespec64 ts;
unsigned cg, bit, i, j, start;
struct ufs_inode_info *ufsi;
int err = -ENOSPC;
@@ -323,8 +324,9 @@ cg_found:
lock_buffer(bh);
ufs2_inode = (struct ufs2_inode *)bh->b_data;
ufs2_inode += ufs_inotofsbo(inode->i_ino);
- ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec);
- ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec);
+ ktime_get_real_ts64(&ts);
+ ufs2_inode->ui_birthtime = cpu_to_fs64(sb, ts.tv_sec);
+ ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec);
mark_buffer_dirty(bh);
unlock_buffer(bh);
if (sb->s_flags & MS_SYNCHRONOUS)
diff --git a/fs/xattr.c b/fs/xattr.c
index 7e3317cf4045..464c94bf65f9 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -431,12 +431,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
if (size) {
if (size > XATTR_SIZE_MAX)
return -E2BIG;
- kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
- if (!kvalue) {
- kvalue = vmalloc(size);
- if (!kvalue)
- return -ENOMEM;
- }
+ kvalue = kvmalloc(size, GFP_KERNEL);
+ if (!kvalue)
+ return -ENOMEM;
if (copy_from_user(kvalue, value, size)) {
error = -EFAULT;
goto out;
@@ -528,12 +525,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
if (size) {
if (size > XATTR_SIZE_MAX)
size = XATTR_SIZE_MAX;
- kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
- if (!kvalue) {
- kvalue = vmalloc(size);
- if (!kvalue)
- return -ENOMEM;
- }
+ kvalue = kvzalloc(size, GFP_KERNEL);
+ if (!kvalue)
+ return -ENOMEM;
}
error = vfs_getxattr(d, kname, kvalue, size);
@@ -611,12 +605,9 @@ listxattr(struct dentry *d, char __user *list, size_t size)
if (size) {
if (size > XATTR_LIST_MAX)
size = XATTR_LIST_MAX;
- klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL);
- if (!klist) {
- klist = vmalloc(size);
- if (!klist)
- return -ENOMEM;
- }
+ klist = kvmalloc(size, GFP_KERNEL);
+ if (!klist)
+ return -ENOMEM;
}
error = vfs_listxattr(d, klist, size);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 26ef1958b65b..5c90f82b8f6b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -79,6 +79,7 @@ xfs-y += xfs_aops.o \
xfs_extent_busy.o \
xfs_file.o \
xfs_filestream.o \
+ xfs_fsmap.o \
xfs_fsops.o \
xfs_globals.o \
xfs_icache.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 780fc8986dab..393b6849aeb3 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -67,7 +67,7 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
nofs_flag = memalloc_nofs_save();
lflags = kmem_flags_convert(flags);
- ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+ ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL);
if (flags & KM_NOFS)
memalloc_nofs_restore(nofs_flag);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 369adcc18c02..7486401ccbd3 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2868,3 +2868,60 @@ err:
xfs_trans_brelse(tp, agbp);
return error;
}
+
+struct xfs_alloc_query_range_info {
+ xfs_alloc_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_alloc_query_range_helper(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_alloc_query_range_info *query = priv;
+ struct xfs_alloc_rec_incore irec;
+
+ irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
+ irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all free space within a given range of blocks. */
+int
+xfs_alloc_query_range(
+ struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *low_rec,
+ struct xfs_alloc_rec_incore *high_rec,
+ xfs_alloc_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
+ struct xfs_alloc_query_range_info query;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ low_brec.a = *low_rec;
+ high_brec.a = *high_rec;
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_alloc_query_range_helper, &query);
+}
+
+/* Find all free space records. */
+int
+xfs_alloc_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_alloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_alloc_query_range_info query;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 2a8d0fa6fbbe..77d9c27330ab 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -219,4 +219,16 @@ int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
+typedef int (*xfs_alloc_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *rec,
+ void *priv);
+
+int xfs_alloc_query_range(struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *low_rec,
+ struct xfs_alloc_rec_incore *high_rec,
+ xfs_alloc_query_range_fn fn, void *priv);
+int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
+ void *priv);
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index efb467b10a71..e1fcfe7f0a9a 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -205,19 +205,37 @@ xfs_allocbt_init_key_from_rec(
union xfs_btree_key *key,
union xfs_btree_rec *rec)
{
- ASSERT(rec->alloc.ar_startblock != 0);
-
key->alloc.ar_startblock = rec->alloc.ar_startblock;
key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
}
STATIC void
+xfs_bnobt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->alloc.ar_startblock);
+ x += be32_to_cpu(rec->alloc.ar_blockcount) - 1;
+ key->alloc.ar_startblock = cpu_to_be32(x);
+ key->alloc.ar_blockcount = 0;
+}
+
+STATIC void
+xfs_cntbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+ key->alloc.ar_startblock = 0;
+}
+
+STATIC void
xfs_allocbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
{
- ASSERT(cur->bc_rec.a.ar_startblock != 0);
-
rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
}
@@ -236,18 +254,24 @@ xfs_allocbt_init_ptr_from_cur(
}
STATIC __int64_t
-xfs_allocbt_key_diff(
+xfs_bnobt_key_diff(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
{
xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
xfs_alloc_key_t *kp = &key->alloc;
- __int64_t diff;
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- return (__int64_t)be32_to_cpu(kp->ar_startblock) -
- rec->ar_startblock;
- }
+ return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+STATIC __int64_t
+xfs_cntbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
+ xfs_alloc_key_t *kp = &key->alloc;
+ __int64_t diff;
diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
if (diff)
@@ -256,6 +280,33 @@ xfs_allocbt_key_diff(
return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
+/*
+ * Signed difference (k1 - k2) between two bnobt keys, comparing the start
+ * block only.  Both values are widened to 64 bits before subtracting.
+ */
+STATIC __int64_t
+xfs_bnobt_diff_two_keys(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	__int64_t		bno1 = be32_to_cpu(k1->alloc.ar_startblock);
+	__int64_t		bno2 = be32_to_cpu(k2->alloc.ar_startblock);
+
+	return bno1 - bno2;
+}
+
+/*
+ * Signed difference (k1 - k2) between two cntbt keys: compare the block
+ * counts first and fall back to the start blocks when the counts are equal.
+ *
+ * be32_to_cpu() yields an unsigned 32-bit value, so each subtraction must be
+ * widened to __int64_t first; otherwise "k1 < k2" wraps around to a large
+ * positive number instead of producing a negative result.
+ */
+STATIC __int64_t
+xfs_cntbt_diff_two_keys(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	__int64_t		diff;
+
+	diff = (__int64_t)be32_to_cpu(k1->alloc.ar_blockcount) -
+			  be32_to_cpu(k2->alloc.ar_blockcount);
+	if (diff)
+		return diff;
+
+	return (__int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
+			  be32_to_cpu(k2->alloc.ar_startblock);
+}
+
static bool
xfs_allocbt_verify(
struct xfs_buf *bp)
@@ -346,44 +397,54 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = {
#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
-xfs_allocbt_keys_inorder(
+xfs_bnobt_keys_inorder(
struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
union xfs_btree_key *k2)
{
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- return be32_to_cpu(k1->alloc.ar_startblock) <
- be32_to_cpu(k2->alloc.ar_startblock);
- } else {
- return be32_to_cpu(k1->alloc.ar_blockcount) <
- be32_to_cpu(k2->alloc.ar_blockcount) ||
- (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
- be32_to_cpu(k1->alloc.ar_startblock) <
- be32_to_cpu(k2->alloc.ar_startblock));
- }
+ return be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock);
}
STATIC int
-xfs_allocbt_recs_inorder(
+xfs_bnobt_recs_inorder(
struct xfs_btree_cur *cur,
union xfs_btree_rec *r1,
union xfs_btree_rec *r2)
{
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- return be32_to_cpu(r1->alloc.ar_startblock) +
- be32_to_cpu(r1->alloc.ar_blockcount) <=
- be32_to_cpu(r2->alloc.ar_startblock);
- } else {
- return be32_to_cpu(r1->alloc.ar_blockcount) <
- be32_to_cpu(r2->alloc.ar_blockcount) ||
- (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
- be32_to_cpu(r1->alloc.ar_startblock) <
- be32_to_cpu(r2->alloc.ar_startblock));
- }
+ return be32_to_cpu(r1->alloc.ar_startblock) +
+ be32_to_cpu(r1->alloc.ar_blockcount) <=
+ be32_to_cpu(r2->alloc.ar_startblock);
+}
+
+/*
+ * Return nonzero when k1 sorts strictly before k2 in the cntbt: smaller block
+ * count first, then smaller start block when the counts are equal.
+ */
+STATIC int
+xfs_cntbt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	if (be32_to_cpu(k1->alloc.ar_blockcount) <
+	    be32_to_cpu(k2->alloc.ar_blockcount))
+		return 1;
+	if (k1->alloc.ar_blockcount != k2->alloc.ar_blockcount)
+		return 0;
+	return be32_to_cpu(k1->alloc.ar_startblock) <
+	       be32_to_cpu(k2->alloc.ar_startblock);
}
-#endif /* DEBUG */
-static const struct xfs_btree_ops xfs_allocbt_ops = {
+/*
+ * Return nonzero when r1 sorts strictly before r2 in the cntbt: smaller block
+ * count first, then smaller start block when the counts are equal.
+ */
+STATIC int
+xfs_cntbt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
+{
+	if (be32_to_cpu(r1->alloc.ar_blockcount) <
+	    be32_to_cpu(r2->alloc.ar_blockcount))
+		return 1;
+	if (r1->alloc.ar_blockcount != r2->alloc.ar_blockcount)
+		return 0;
+	return be32_to_cpu(r1->alloc.ar_startblock) <
+	       be32_to_cpu(r2->alloc.ar_startblock);
+}
+#endif /* DEBUG */
+
+static const struct xfs_btree_ops xfs_bnobt_ops = {
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
@@ -395,13 +456,39 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.get_minrecs = xfs_allocbt_get_minrecs,
.get_maxrecs = xfs_allocbt_get_maxrecs,
.init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
- .key_diff = xfs_allocbt_key_diff,
+ .key_diff = xfs_bnobt_key_diff,
.buf_ops = &xfs_allocbt_buf_ops,
+ .diff_two_keys = xfs_bnobt_diff_two_keys,
#if defined(DEBUG) || defined(XFS_WARN)
- .keys_inorder = xfs_allocbt_keys_inorder,
- .recs_inorder = xfs_allocbt_recs_inorder,
+ .keys_inorder = xfs_bnobt_keys_inorder,
+ .recs_inorder = xfs_bnobt_recs_inorder,
+#endif
+};
+
+/*
+ * Btree operations that xfs_allocbt_init_cursor() installs for XFS_BTNUM_CNT
+ * cursors.  The generic xfs_allocbt_* callbacks are the same ones used by
+ * xfs_bnobt_ops; only the high key, key comparison and ordering callbacks
+ * are the xfs_cntbt_* variants.
+ */
+static const struct xfs_btree_ops xfs_cntbt_ops = {
+ .rec_len = sizeof(xfs_alloc_rec_t),
+ .key_len = sizeof(xfs_alloc_key_t),
+
+ .dup_cursor = xfs_allocbt_dup_cursor,
+ .set_root = xfs_allocbt_set_root,
+ .alloc_block = xfs_allocbt_alloc_block,
+ .free_block = xfs_allocbt_free_block,
+ .update_lastrec = xfs_allocbt_update_lastrec,
+ .get_minrecs = xfs_allocbt_get_minrecs,
+ .get_maxrecs = xfs_allocbt_get_maxrecs,
+ .init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
+ .key_diff = xfs_cntbt_key_diff,
+ .buf_ops = &xfs_allocbt_buf_ops,
+ .diff_two_keys = xfs_cntbt_diff_two_keys,
+ /* ordering checks are only built for DEBUG / XFS_WARN kernels */
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_cntbt_keys_inorder,
+ .recs_inorder = xfs_cntbt_recs_inorder,
#endif
};
@@ -427,16 +514,15 @@ xfs_allocbt_init_cursor(
cur->bc_mp = mp;
cur->bc_btnum = btnum;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
- cur->bc_ops = &xfs_allocbt_ops;
- if (btnum == XFS_BTNUM_BNO)
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
- else
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
if (btnum == XFS_BTNUM_CNT) {
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
+ cur->bc_ops = &xfs_cntbt_ops;
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
} else {
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+ cur->bc_ops = &xfs_bnobt_ops;
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9bd104f32908..f02eb7673392 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -764,7 +764,6 @@ xfs_bmap_extents_to_btree(
args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
} else if (dfops->dop_low) {
args.type = XFS_ALLOCTYPE_START_BNO;
-try_another_ag:
args.fsbno = *firstblock;
} else {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -779,20 +778,6 @@ try_another_ag:
return error;
}
- /*
- * During a CoW operation, the allocation and bmbt updates occur in
- * different transactions. The mapping code tries to put new bmbt
- * blocks near extents being mapped, but the only way to guarantee this
- * is if the alloc and the mapping happen in a single transaction that
- * has a block reservation. That isn't the case here, so if we run out
- * of space we'll try again with another AG.
- */
- if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
- args.fsbno == NULLFSBLOCK &&
- args.type == XFS_ALLOCTYPE_NEAR_BNO) {
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- goto try_another_ag;
- }
if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
xfs_iroot_realloc(ip, -1, whichfork);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -925,7 +910,6 @@ xfs_bmap_local_to_extents(
* file currently fits in an inode.
*/
if (*firstblock == NULLFSBLOCK) {
-try_another_ag:
args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
@@ -938,19 +922,6 @@ try_another_ag:
if (error)
goto done;
- /*
- * During a CoW operation, the allocation and bmbt updates occur in
- * different transactions. The mapping code tries to put new bmbt
- * blocks near extents being mapped, but the only way to guarantee this
- * is if the alloc and the mapping happen in a single transaction that
- * has a block reservation. That isn't the case here, so if we run out
- * of space we'll try again with another AG.
- */
- if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) &&
- args.fsbno == NULLFSBLOCK &&
- args.type == XFS_ALLOCTYPE_NEAR_BNO) {
- goto try_another_ag;
- }
/* Can't fail, the space was reserved. */
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
@@ -1260,7 +1231,6 @@ xfs_bmap_read_extents(
xfs_fsblock_t bno; /* block # of "block" */
xfs_buf_t *bp; /* buffer for "block" */
int error; /* error return value */
- xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
xfs_extnum_t i, j; /* index into the extents list */
xfs_ifork_t *ifp; /* fork structure */
int level; /* btree level, for checking */
@@ -1271,8 +1241,6 @@ xfs_bmap_read_extents(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
- exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
- XFS_EXTFMT_INODE(ip);
block = ifp->if_broot;
/*
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
@@ -1340,18 +1308,9 @@ xfs_bmap_read_extents(
xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
trp->l0 = be64_to_cpu(frp->l0);
trp->l1 = be64_to_cpu(frp->l1);
- }
- if (exntf == XFS_EXTFMT_NOSTATE) {
- /*
- * Check all attribute bmap btree records and
- * any "older" data bmap btree records for a
- * set bit in the "extent flag" position.
- */
- if (unlikely(xfs_check_nostate_extents(ifp,
- start, num_recs))) {
+ if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) {
XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
+ XFS_ERRLEVEL_LOW, mp);
goto error0;
}
}
@@ -2879,27 +2838,30 @@ xfs_bmap_add_extent_hole_delay(
*/
STATIC int /* error */
xfs_bmap_add_extent_hole_real(
- struct xfs_bmalloca *bma,
- int whichfork)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_extnum_t *idx,
+ struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new,
+ xfs_fsblock_t *first,
+ struct xfs_defer_ops *dfops,
+ int *logflagsp)
{
- struct xfs_bmbt_irec *new = &bma->got;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_cur *cur = *curp;
int error; /* error return value */
int i; /* temp state */
- xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
int state; /* state bits, accessed thru macros */
- struct xfs_mount *mp;
- mp = bma->ip->i_mount;
- ifp = XFS_IFORK_PTR(bma->ip, whichfork);
-
- ASSERT(bma->idx >= 0);
- ASSERT(bma->idx <= xfs_iext_count(ifp));
+ ASSERT(*idx >= 0);
+ ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!bma->cur ||
- !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2912,9 +2874,9 @@ xfs_bmap_add_extent_hole_real(
/*
* Check and set flags if this segment has a left neighbor.
*/
- if (bma->idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -2923,9 +2885,9 @@ xfs_bmap_add_extent_hole_real(
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (bma->idx < xfs_iext_count(ifp)) {
+ if (*idx < xfs_iext_count(ifp)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -2962,36 +2924,36 @@ xfs_bmap_add_extent_hole_real(
* left and on the right.
* Merge all three into a single extent record.
*/
- --bma->idx;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
left.br_blockcount + new->br_blockcount +
right.br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
- XFS_IFORK_NEXT_SET(bma->ip, whichfork,
- XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
- if (bma->cur == NULL) {
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+ error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
right.br_startblock, right.br_blockcount,
&i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_btree_delete(bma->cur, &i);
+ error = xfs_btree_delete(cur, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_btree_decrement(bma->cur, 0, &i);
+ error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, left.br_startoff,
+ error = xfs_bmbt_update(cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
new->br_blockcount +
@@ -3008,23 +2970,23 @@ xfs_bmap_add_extent_hole_real(
* on the left.
* Merge the new allocation with the left neighbor.
*/
- --bma->idx;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
left.br_blockcount + new->br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- if (bma->cur == NULL) {
+ if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+ error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
left.br_startblock, left.br_blockcount,
&i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, left.br_startoff,
+ error = xfs_bmbt_update(cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
new->br_blockcount,
@@ -3040,25 +3002,25 @@ xfs_bmap_add_extent_hole_real(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
new->br_startoff, new->br_startblock,
new->br_blockcount + right.br_blockcount,
right.br_state);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- if (bma->cur == NULL) {
+ if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur,
+ error = xfs_bmbt_lookup_eq(cur,
right.br_startoff,
right.br_startblock,
right.br_blockcount, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, new->br_startoff,
+ error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock,
new->br_blockcount +
right.br_blockcount,
@@ -3074,22 +3036,22 @@ xfs_bmap_add_extent_hole_real(
* real allocation.
* Insert a new entry.
*/
- xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
- XFS_IFORK_NEXT_SET(bma->ip, whichfork,
- XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
- if (bma->cur == NULL) {
+ xfs_iext_insert(ip, *idx, 1, new, state);
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur,
+ error = xfs_bmbt_lookup_eq(cur,
new->br_startoff,
new->br_startblock,
new->br_blockcount, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = new->br_state;
- error = xfs_btree_insert(bma->cur, &i);
+ cur->bc_rec.b.br_state = new->br_state;
+ error = xfs_btree_insert(cur, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -3098,30 +3060,30 @@ xfs_bmap_add_extent_hole_real(
}
/* add reverse mapping */
- error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
+ error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new);
if (error)
goto done;
/* convert to a btree if necessary */
- if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
- ASSERT(bma->cur == NULL);
- error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->dfops, &bma->cur,
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, curp,
0, &tmp_logflags, whichfork);
- bma->logflags |= tmp_logflags;
+ *logflagsp |= tmp_logflags;
+ cur = *curp;
if (error)
goto done;
}
/* clear out the allocated field, done with it now in any case. */
- if (bma->cur)
- bma->cur->bc_private.b.allocated = 0;
+ if (cur)
+ cur->bc_private.b.allocated = 0;
- xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+ xfs_bmap_check_leaf_extents(cur, ip, whichfork);
done:
- bma->logflags |= rval;
+ *logflagsp |= rval;
return error;
}
@@ -3853,60 +3815,6 @@ xfs_bmap_btalloc(
}
/*
- * For a remap operation, just "allocate" an extent at the address that the
- * caller passed in, and ensure that the AGFL is the right size. The caller
- * will then map the "allocated" extent into the file somewhere.
- */
-STATIC int
-xfs_bmap_remap_alloc(
- struct xfs_bmalloca *ap)
-{
- struct xfs_trans *tp = ap->tp;
- struct xfs_mount *mp = tp->t_mountp;
- xfs_agblock_t bno;
- struct xfs_alloc_arg args;
- int error;
-
- /*
- * validate that the block number is legal - the enables us to detect
- * and handle a silent filesystem corruption rather than crashing.
- */
- memset(&args, 0, sizeof(struct xfs_alloc_arg));
- args.tp = ap->tp;
- args.mp = ap->tp->t_mountp;
- bno = *ap->firstblock;
- args.agno = XFS_FSB_TO_AGNO(mp, bno);
- args.agbno = XFS_FSB_TO_AGBNO(mp, bno);
- if (args.agno >= mp->m_sb.sb_agcount ||
- args.agbno >= mp->m_sb.sb_agblocks)
- return -EFSCORRUPTED;
-
- /* "Allocate" the extent from the range we passed in. */
- trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length);
- ap->blkno = bno;
- ap->ip->i_d.di_nblocks += ap->length;
- xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
-
- /* Fix the freelist, like a real allocator does. */
- args.datatype = ap->datatype;
- args.pag = xfs_perag_get(args.mp, args.agno);
- ASSERT(args.pag);
-
- /*
- * The freelist fixing code will decline the allocation if
- * the size and shape of the free space doesn't allow for
- * allocating the extent and updating all the metadata that
- * happens during an allocation. We're remapping, not
- * allocating, so skip that check by pretending to be freeing.
- */
- error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
- xfs_perag_put(args.pag);
- if (error)
- trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_);
- return error;
-}
-
-/*
* xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
* It figures out where to ask the underlying allocator to put the new extent.
*/
@@ -3914,8 +3822,6 @@ STATIC int
xfs_bmap_alloc(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
- if (ap->flags & XFS_BMAPI_REMAP)
- return xfs_bmap_remap_alloc(ap);
if (XFS_IS_REALTIME_INODE(ap->ip) &&
xfs_alloc_is_userdata(ap->datatype))
return xfs_bmap_rtalloc(ap);
@@ -4386,7 +4292,9 @@ xfs_bmapi_allocate(
if (bma->wasdel)
error = xfs_bmap_add_extent_delay_real(bma, whichfork);
else
- error = xfs_bmap_add_extent_hole_real(bma, whichfork);
+ error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
+ whichfork, &bma->idx, &bma->cur, &bma->got,
+ bma->firstblock, bma->dfops, &bma->logflags);
bma->logflags |= tmp_logflags;
if (error)
@@ -4549,9 +4457,7 @@ xfs_bmapi_write(
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
- ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
- ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
+ ASSERT(!(flags & XFS_BMAPI_REMAP));
/* zeroing is for currently only for data extents, not metadata */
ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4635,13 +4541,8 @@ xfs_bmapi_write(
} else {
need_alloc = true;
}
- } else {
- /*
- * Make sure we only reflink into a hole.
- */
- ASSERT(!(flags & XFS_BMAPI_REMAP));
- if (isnullstartblock(bma.got.br_startblock))
- wasdelay = true;
+ } else if (isnullstartblock(bma.got.br_startblock)) {
+ wasdelay = true;
}
/*
@@ -4770,6 +4671,93 @@ error0:
return error;
}
+/*
+ * Map the blocks starting at @startblock into the data fork of @ip at file
+ * offset @bno for @len blocks, as a single XFS_EXT_NORM extent.  The extent
+ * is added through xfs_bmap_add_extent_hole_real(); that the target range is
+ * a hole is only asserted, not enforced.
+ *
+ * The inode must be locked XFS_ILOCK_EXCL (asserted).  Returns 0 or a
+ * negative errno: -EFSCORRUPTED when the data fork is in neither extents nor
+ * btree format, -EIO when the filesystem has been shut down, or whatever
+ * error the helpers called below report.
+ */
+static int
+xfs_bmapi_remap(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ xfs_fsblock_t startblock,
+ struct xfs_defer_ops *dfops)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_btree_cur *cur = NULL;
+ xfs_fsblock_t firstblock = NULLFSBLOCK;
+ struct xfs_bmbt_irec got;
+ xfs_extnum_t idx;
+ int logflags = 0, error;
+
+ ASSERT(len > 0);
+ ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ /* only extents- and btree-format data forks can be remapped into */
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /* read the extent list in first if XFS_IFEXTENTS is not yet set */
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) {
+ /* make sure we only reflink into a hole. */
+ ASSERT(got.br_startoff > bno);
+ ASSERT(got.br_startoff - bno >= len);
+ }
+
+ /* account the newly mapped blocks to the inode and log the core */
+ ip->i_d.di_nblocks += len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ /* a fork with a btree root needs a bmbt cursor for the insert */
+ if (ifp->if_flags & XFS_IFBROOT) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+ cur->bc_private.b.firstblock = firstblock;
+ cur->bc_private.b.dfops = dfops;
+ cur->bc_private.b.flags = 0;
+ }
+
+ /* got is reused from here on to describe the extent being mapped */
+ got.br_startoff = bno;
+ got.br_startblock = startblock;
+ got.br_blockcount = len;
+ got.br_state = XFS_EXT_NORM;
+
+ error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur,
+ &got, &firstblock, dfops, &logflags);
+ if (error)
+ goto error0;
+
+ /* convert the fork back to extents format when the helper asks for it */
+ if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) {
+ int tmp_logflags = 0;
+
+ error = xfs_bmap_btree_to_extents(tp, ip, cur,
+ &tmp_logflags, XFS_DATA_FORK);
+ logflags |= tmp_logflags;
+ }
+
+ /* reached on success as well as on error */
+error0:
+ /* drop the log flag that does not apply to the current fork format */
+ if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~XFS_ILOG_DEXT;
+ else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+ logflags &= ~XFS_ILOG_DBROOT;
+
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (cur) {
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ }
+ return error;
+}
+
/*
* When a delalloc extent is split (e.g., due to a hole punch), the original
* indlen reservation must be shared across the two new extents that are left
@@ -4887,7 +4875,7 @@ xfs_bmap_del_extent_delay(
ASSERT(got_endoff >= del_endoff);
if (isrt) {
- int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
+ uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
do_div(rtexts, mp->m_sb.sb_rextsize);
xfs_mod_frextents(mp, rtexts);
@@ -6488,27 +6476,15 @@ xfs_bmap_finish_one(
xfs_filblks_t blockcount,
xfs_exntst_t state)
{
- struct xfs_bmbt_irec bmap;
- int nimaps = 1;
- xfs_fsblock_t firstfsb;
- int flags = XFS_BMAPI_REMAP;
- int done;
- int error = 0;
-
- bmap.br_startblock = startblock;
- bmap.br_startoff = startoff;
- bmap.br_blockcount = blockcount;
- bmap.br_state = state;
+ int error = 0, done;
trace_xfs_bmap_deferred(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
ip->i_ino, whichfork, startoff, blockcount, state);
- if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+ if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
return -EFSCORRUPTED;
- if (whichfork == XFS_ATTR_FORK)
- flags |= XFS_BMAPI_ATTRFORK;
if (XFS_TEST_ERROR(false, tp->t_mountp,
XFS_ERRTAG_BMAP_FINISH_ONE,
@@ -6517,16 +6493,12 @@ xfs_bmap_finish_one(
switch (type) {
case XFS_BMAP_MAP:
- firstfsb = bmap.br_startblock;
- error = xfs_bmapi_write(tp, ip, bmap.br_startoff,
- bmap.br_blockcount, flags, &firstfsb,
- bmap.br_blockcount, &bmap, &nimaps,
- dfops);
+ error = xfs_bmapi_remap(tp, ip, startoff, blockcount,
+ startblock, dfops);
break;
case XFS_BMAP_UNMAP:
- error = xfs_bunmapi(tp, ip, bmap.br_startoff,
- bmap.br_blockcount, flags, 1, &firstfsb,
- dfops, &done);
+ error = xfs_bunmapi(tp, ip, startoff, blockcount,
+ XFS_BMAPI_REMAP, 1, &startblock, dfops, &done);
ASSERT(done);
break;
default:
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index cdef87db5262..c35a14fa1527 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -172,6 +172,18 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
/*
+ * Report whether the extent is a real, allocated extent.  The answer is false
+ * for a delayed allocation, an unwritten extent or a hole.
+ */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+	if (irec->br_state == XFS_EXT_UNWRITTEN)
+		return false;
+	if (irec->br_startblock == HOLESTARTBLOCK ||
+	    irec->br_startblock == DELAYSTARTBLOCK)
+		return false;
+	return !isnullstartblock(irec->br_startblock);
+}
+
+/*
* This macro is used to determine how many extents will be shifted
* in one write transaction. We could require two splits,
* an extent move on the first and an extent merge on the second,
@@ -232,8 +244,6 @@ int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *del);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx,
struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del);
-int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
- xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fd55db479385..6cba69aff077 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -366,32 +366,6 @@ xfs_bmbt_to_bmdr(
memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
}
-/*
- * Check extent records, which have just been read, for
- * any bit in the extent flag field. ASSERT on debug
- * kernels, as this condition should not occur.
- * Return an error condition (1) if any flags found,
- * otherwise return 0.
- */
-
-int
-xfs_check_nostate_extents(
- xfs_ifork_t *ifp,
- xfs_extnum_t idx,
- xfs_extnum_t num)
-{
- for (; num > 0; num--, idx++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
- if ((ep->l0 >>
- (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
- ASSERT(0);
- return 1;
- }
- }
- return 0;
-}
-
-
STATIC struct xfs_btree_cur *
xfs_bmbt_dup_cursor(
struct xfs_btree_cur *cur)
@@ -448,7 +422,6 @@ xfs_bmbt_alloc_block(
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
args.type = XFS_ALLOCTYPE_START_BNO;
-try_another_ag:
/*
* Make sure there is sufficient room left in the AG to
* complete a full tree split for an extent insert. If
@@ -477,22 +450,6 @@ try_another_ag:
if (error)
goto error0;
- /*
- * During a CoW operation, the allocation and bmbt updates occur in
- * different transactions. The mapping code tries to put new bmbt
- * blocks near extents being mapped, but the only way to guarantee this
- * is if the alloc and the mapping happen in a single transaction that
- * has a block reservation. That isn't the case here, so if we run out
- * of space we'll try again with another AG.
- */
- if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
- args.fsbno == NULLFSBLOCK &&
- args.type == XFS_ALLOCTYPE_NEAR_BNO) {
- args.fsbno = cur->bc_private.b.firstblock;
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- goto try_another_ag;
- }
-
if (args.fsbno == NULLFSBLOCK && args.minleft) {
/*
* Could not find an AG with enough free space to satisfy
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 819a8a4dee95..9da5a8d4f184 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -25,14 +25,6 @@ struct xfs_inode;
struct xfs_trans;
/*
- * Extent state and extent format macros.
- */
-#define XFS_EXTFMT_INODE(x) \
- (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
- XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
-#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
-
-/*
* Btree block header size depends on a superblock flag.
*/
#define XFS_BMBT_BLOCK_LEN(mp) \
@@ -140,4 +132,18 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
+/*
+ * Validate the extent flag bits of a bmbt record.  A record with none of the
+ * top BMBT_EXNTFLAG_BITLEN bits of l0 set always passes.  A record with a
+ * flag bit set passes only when it belongs to the data fork and the
+ * superblock has the extent flag feature bit.
+ */
+static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork,
+		struct xfs_bmbt_rec_host *ep)
+{
+	bool	flag_set = (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN)) != 0;
+
+	if (!flag_set)
+		return true;
+	return whichfork == XFS_DATA_FORK &&
+	       xfs_sb_version_hasextflgbit(&mp->m_sb);
+}
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 3059a3ec7ecb..5392674bf893 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4842,6 +4842,21 @@ xfs_btree_query_range(
fn, priv);
}
+/*
+ * Query a btree for all records, by running a range query from an all-zeroes
+ * low record to an all-ones (0xFF-filled) high record.
+ */
+int
+xfs_btree_query_all(
+	struct xfs_btree_cur		*cur,
+	xfs_btree_query_range_fn	fn,
+	void				*priv)
+{
+	union xfs_btree_irec		lowest;
+	union xfs_btree_irec		highest;
+
+	memset(&highest, 0xFF, sizeof(highest));
+	memset(&lowest, 0, sizeof(lowest));
+	return xfs_btree_query_range(cur, &lowest, &highest, fn, priv);
+}
+
/*
* Calculate the number of blocks needed to store a given number of records
* in a short-format (per-AG metadata) btree.
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4bb62580a7fd..27bed08261c5 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -496,6 +496,8 @@ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
int xfs_btree_query_range(struct xfs_btree_cur *cur,
union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec,
xfs_btree_query_range_fn fn, void *priv);
+int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
+ void *priv);
typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
void *data);
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index ac9a003dd29a..747085b4ef44 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -35,13 +35,8 @@ int
xfs_calc_dquots_per_chunk(
unsigned int nbblks) /* basic block units */
{
- unsigned int ndquots;
-
ASSERT(nbblks > 0);
- ndquots = BBTOB(nbblks);
- do_div(ndquots, sizeof(xfs_dqblk_t));
-
- return ndquots;
+ return BBTOB(nbblks) / sizeof(xfs_dqblk_t);
}
/*
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 6b7579e7b60a..a1dccd8d96bc 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -930,10 +930,8 @@ static inline uint xfs_dinode_size(int version)
/*
* The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
* Since the pathconf interface is signed, we use 2^31 - 1 instead.
- * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
*/
#define XFS_MAXLINK ((1U << 31) - 1U)
-#define XFS_MAXLINK_1 65535U
/*
* Values for di_format
@@ -1578,19 +1576,10 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
}
/*
- * Possible extent formats.
- */
-typedef enum {
- XFS_EXTFMT_NOSTATE = 0,
- XFS_EXTFMT_HASSTATE
-} xfs_exntfmt_t;
-
-/*
* Possible extent states.
*/
typedef enum {
XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
- XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
} xfs_exntst_t;
/*
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index b72dc821d78b..095bdf049a3f 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -92,6 +92,18 @@ struct getbmapx {
#define BMV_OF_LAST 0x4 /* segment is the last in the file */
#define BMV_OF_SHARED 0x8 /* segment shared with another file */
+/* fmr_owner special values for FS_IOC_GETFSMAP */
+#define XFS_FMR_OWN_FREE FMR_OWN_FREE /* free space */
+#define XFS_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */
+#define XFS_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */
+#define XFS_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */
+#define XFS_FMR_OWN_AG FMR_OWNER('X', 3) /* per-AG metadata */
+#define XFS_FMR_OWN_INOBT FMR_OWNER('X', 4) /* inode btree blocks */
+#define XFS_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */
+#define XFS_FMR_OWN_REFC FMR_OWNER('X', 6) /* refcount tree */
+#define XFS_FMR_OWN_COW FMR_OWNER('X', 7) /* cow staging */
+#define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */
+
/*
* Structure for XFS_IOC_FSSETDM.
* For use by backup and restore programs to set the XFS on-disk inode
@@ -502,6 +514,7 @@ typedef struct xfs_swapext
#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
+/* XFS_IOC_GETFSMAP ------ hoisted 59 */
/*
* ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index d93f9d918cfc..09c3d1aecef2 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -508,7 +508,7 @@ xfs_iread(
/* even unallocated inodes are verified */
if (!xfs_dinode_verify(mp, ip->i_ino, dip)) {
- xfs_alert(mp, "%s: validation failed for inode %lld failed",
+ xfs_alert(mp, "%s: validation failed for inode %lld",
__func__, ip->i_ino);
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 8a37efe04de3..0e80f34fe97c 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -42,35 +42,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
-#ifdef DEBUG
-/*
- * Make sure that the extents in the given memory buffer
- * are valid.
- */
-void
-xfs_validate_extents(
- xfs_ifork_t *ifp,
- int nrecs,
- xfs_exntfmt_t fmt)
-{
- xfs_bmbt_irec_t irec;
- xfs_bmbt_rec_host_t rec;
- int i;
-
- for (i = 0; i < nrecs; i++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
- rec.l0 = get_unaligned(&ep->l0);
- rec.l1 = get_unaligned(&ep->l1);
- xfs_bmbt_get_all(&rec, &irec);
- if (fmt == XFS_EXTFMT_NOSTATE)
- ASSERT(irec.br_state == XFS_EXT_NORM);
- }
-}
-#else /* DEBUG */
-#define xfs_validate_extents(ifp, nrecs, fmt)
-#endif /* DEBUG */
-
-
/*
* Move inode type and inode format specific information from the
* on-disk inode to the in-core inode. For fifos, devs, and sockets
@@ -352,40 +323,33 @@ xfs_iformat_local(
}
/*
- * The file consists of a set of extents all
- * of which fit into the on-disk inode.
- * If there are few enough extents to fit into
- * the if_inline_ext, then copy them there.
- * Otherwise allocate a buffer for them and copy
- * them into it. Either way, set if_extents
- * to point at the extents.
+ * The file consists of a set of extents all of which fit into the on-disk
+ * inode. If there are few enough extents to fit into the if_inline_ext, then
+ * copy them there. Otherwise allocate a buffer for them and copy them into it.
+ * Either way, set if_extents to point at the extents.
*/
STATIC int
xfs_iformat_extents(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork)
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork)
{
- xfs_bmbt_rec_t *dp;
- xfs_ifork_t *ifp;
- int nex;
- int size;
- int i;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- nex = XFS_DFORK_NEXTENTS(dip, whichfork);
- size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+ int size = nex * sizeof(xfs_bmbt_rec_t);
+ struct xfs_bmbt_rec *dp;
+ int i;
/*
- * If the number of extents is unreasonable, then something
- * is wrong and we just bail out rather than crash in
- * kmem_alloc() or memcpy() below.
+ * If the number of extents is unreasonable, then something is wrong and
+ * we just bail out rather than crash in kmem_alloc() or memcpy() below.
*/
- if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
(unsigned long long) ip->i_ino, nex);
XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
+ mp, dip);
return -EFSCORRUPTED;
}
@@ -400,22 +364,17 @@ xfs_iformat_extents(
ifp->if_bytes = size;
if (size) {
dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
- xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
for (i = 0; i < nex; i++, dp++) {
xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
ep->l0 = get_unaligned_be64(&dp->l0);
ep->l1 = get_unaligned_be64(&dp->l1);
+ if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) {
+ XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+ XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
}
XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
- if (whichfork != XFS_DATA_FORK ||
- XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
- if (unlikely(xfs_check_nostate_extents(
- ifp, 0, nex))) {
- XFS_ERROR_REPORT("xfs_iformat_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return -EFSCORRUPTED;
- }
}
ifp->if_flags |= XFS_IFEXTENTS;
return 0;
@@ -518,7 +477,6 @@ xfs_iread_extents(
xfs_iext_destroy(ifp);
return error;
}
- xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
ifp->if_flags |= XFS_IFEXTENTS;
return 0;
}
@@ -837,6 +795,9 @@ xfs_iextents_copy(
copied = 0;
for (i = 0; i < nrecs; i++) {
xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+
+ ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep));
+
start_block = xfs_bmbt_get_startblock(ep);
if (isnullstartblock(start_block)) {
/*
@@ -852,7 +813,6 @@ xfs_iextents_copy(
copied++;
}
ASSERT(copied != 0);
- xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
return (copied * (uint)sizeof(xfs_bmbt_rec_t));
}
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 3a8cc7139912..06cfb93c2ef9 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2001,14 +2001,14 @@ xfs_rmap_query_range_helper(
/* Find all rmaps between two keys. */
int
xfs_rmap_query_range(
- struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *low_rec,
- struct xfs_rmap_irec *high_rec,
- xfs_rmap_query_range_fn fn,
- void *priv)
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *low_rec,
+ struct xfs_rmap_irec *high_rec,
+ xfs_rmap_query_range_fn fn,
+ void *priv)
{
- union xfs_btree_irec low_brec;
- union xfs_btree_irec high_brec;
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
struct xfs_rmap_query_range_info query;
low_brec.r = *low_rec;
@@ -2019,6 +2019,20 @@ xfs_rmap_query_range(
xfs_rmap_query_range_helper, &query);
}
+/*
+ * Find all rmaps: bundle @fn and @priv into a query-info structure and pass
+ * it, together with xfs_rmap_query_range_helper, to xfs_btree_query_all().
+ */
+int
+xfs_rmap_query_all(
+	struct xfs_btree_cur			*cur,
+	xfs_rmap_query_range_fn			fn,
+	void					*priv)
+{
+	struct xfs_rmap_query_range_info	info;
+
+	info.fn = fn;
+	info.priv = priv;
+
+	return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &info);
+}
+
/* Clean up after calling xfs_rmap_finish_one. */
void
xfs_rmap_finish_one_cleanup(
@@ -2291,3 +2305,31 @@ xfs_rmap_free_extent(
return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner,
XFS_DATA_FORK, &bmap);
}
+
+/*
+ * Order two rmap records by start block, then by owner, then by packed
+ * offset.  Returns -1 if a sorts before b, 1 if a sorts after b, and 0 if
+ * all three keys match.
+ */
+int
+xfs_rmap_compare(
+	const struct xfs_rmap_irec	*a,
+	const struct xfs_rmap_irec	*b)
+{
+	__u64				off_a = xfs_rmap_irec_offset_pack(a);
+	__u64				off_b = xfs_rmap_irec_offset_pack(b);
+
+	if (a->rm_startblock != b->rm_startblock)
+		return a->rm_startblock < b->rm_startblock ? -1 : 1;
+	if (a->rm_owner != b->rm_owner)
+		return a->rm_owner < b->rm_owner ? -1 : 1;
+	if (off_a != off_b)
+		return off_a < off_b ? -1 : 1;
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 789930599339..98f908fea103 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -162,6 +162,8 @@ typedef int (*xfs_rmap_query_range_fn)(
int xfs_rmap_query_range(struct xfs_btree_cur *cur,
struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec,
xfs_rmap_query_range_fn fn, void *priv);
+int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn,
+ void *priv);
enum xfs_rmap_intent_type {
XFS_RMAP_MAP,
@@ -212,5 +214,7 @@ int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
struct xfs_rmap_irec *irec, int *stat);
+int xfs_rmap_compare(const struct xfs_rmap_irec *a,
+ const struct xfs_rmap_irec *b);
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index ea45584a9913..e47b99e59f60 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1016,3 +1016,73 @@ xfs_rtfree_extent(
}
return 0;
}
+
+/*
+ * Find all the free records within a given range.
+ *
+ * Walks from low_rec->ar_startblock up to, but not including,
+ * high_rec->ar_startblock and calls fn(tp, &rec, priv) for every run that
+ * xfs_rtcheck_range() flags as free.  Only the ar_startblock member of the
+ * two keys is read here.
+ *
+ * Returns -EINVAL if the low start is above the high start, 0 if the two
+ * starts are equal, and otherwise the first nonzero value returned by
+ * xfs_rtcheck_range(), xfs_rtfind_forw() or fn (0 if the walk completes).
+ */
+int
+xfs_rtalloc_query_range(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *low_rec,
+ struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rtalloc_rec rec;
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rtblock_t rtstart;
+ xfs_rtblock_t rtend;
+ xfs_rtblock_t rem;
+ int is_free;
+ int error = 0;
+
+ if (low_rec->ar_startblock > high_rec->ar_startblock)
+ return -EINVAL;
+ else if (low_rec->ar_startblock == high_rec->ar_startblock)
+ return 0;
+
+ /*
+ * Iterate the bitmap, looking for discrepancies.  rtstart is the start
+ * of the run being examined and rem is how much of the requested range
+ * has not been visited yet.
+ */
+ rtstart = low_rec->ar_startblock;
+ rem = high_rec->ar_startblock - rtstart;
+ while (rem) {
+ /*
+ * Is the first block free?  Only is_free is consumed from this
+ * call; rtend is overwritten by the forward search below.
+ */
+ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
+ &is_free);
+ if (error)
+ break;
+
+ /*
+ * How long does the extent go for?  The search limit is one
+ * less than the high key, so the high start itself is excluded.
+ */
+ error = xfs_rtfind_forw(mp, tp, rtstart,
+ high_rec->ar_startblock - 1, &rtend);
+ if (error)
+ break;
+
+ if (is_free) {
+ rec.ar_startblock = rtstart;
+ rec.ar_blockcount = rtend - rtstart + 1;
+
+ error = fn(tp, &rec, priv);
+ if (error)
+ break;
+ }
+
+ rem -= rtend - rtstart + 1;
+ rtstart = rtend + 1;
+ }
+
+ return error;
+}
+
+/*
+ * Find all the free records: query the range that starts at 0 and ends at
+ * the superblock's sb_rblocks, with both key block counts set to zero.
+ */
+int
+xfs_rtalloc_query_all(
+	struct xfs_trans		*tp,
+	xfs_rtalloc_query_range_fn	fn,
+	void				*priv)
+{
+	struct xfs_rtalloc_rec		lo_key;
+	struct xfs_rtalloc_rec		hi_key;
+
+	lo_key.ar_startblock = 0;
+	lo_key.ar_blockcount = 0;
+
+	hi_key.ar_startblock = tp->t_mountp->m_sb.sb_rblocks;
+	hi_key.ar_blockcount = 0;
+
+	return xfs_rtalloc_query_range(tp, &lo_key, &hi_key, fn, priv);
+}
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 7917f6e44286..d787c677d2a3 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -21,8 +21,20 @@
/*
* Components of space reservations.
*/
+
+/* Worst case number of rmaps that can be held in a block. */
#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
+
+/* Adding one rmap could split every level up to the top of the tree. */
+#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
+
+/*
+ * Blocks we might need to add "b" rmaps to a tree: "b" divided by
+ * XFS_MAX_CONTIG_RMAPS_PER_BLOCK (rounded up), times XFS_RMAPADD_SPACE_RES.
+ * "b" is parenthesized so that an expression argument (a shift, a ternary,
+ * a bitwise-or, ...) is evaluated as a unit before the addition.
+ */
+#define XFS_NRMAPADD_SPACE_RES(mp, b)\
+	((((b) + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+	  XFS_RMAPADD_SPACE_RES(mp))
+
#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \
(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1)
@@ -30,13 +42,12 @@
(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
XFS_EXTENTADD_SPACE_RES(mp,w))
+
+/* Blocks we might need to add "b" mappings & rmappings to a file. */
#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
- (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
- XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
- XFS_EXTENTADD_SPACE_RES(mp,w) + \
- ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
- XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
- (mp)->m_rmap_maxlevels)
+ (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \
+ XFS_NRMAPADD_SPACE_RES((mp), (b)))
+
#define XFS_DAENTER_1B(mp,w) \
((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
#define XFS_DAENTER_DBS(mp,w) \
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 05eca126c688..09af0f7cd55e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,11 +111,11 @@ xfs_finish_page_writeback(
bsize = bh->b_size;
do {
+ if (off > end)
+ break;
next = bh->b_this_page;
if (off < bvec->bv_offset)
goto next_bh;
- if (off > end)
- break;
bh->b_end_io(bh, !error);
next_bh:
off += bsize;
@@ -1261,8 +1261,8 @@ xfs_get_blocks(
if (nimaps) {
trace_xfs_get_blocks_found(ip, offset, size,
- ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
- : XFS_IO_OVERWRITE, &imap);
+ imap.br_state == XFS_EXT_UNWRITTEN ?
+ XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
xfs_iunlock(ip, lockmode);
} else {
trace_xfs_get_blocks_notfound(ip, offset, size);
@@ -1276,9 +1276,7 @@ xfs_get_blocks(
* For unwritten extents do not report a disk address in the buffered
* read case (treat as if we're reading into a hole).
*/
- if (imap.br_startblock != HOLESTARTBLOCK &&
- imap.br_startblock != DELAYSTARTBLOCK &&
- !ISUNWRITTEN(&imap))
+ if (xfs_bmap_is_real_extent(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
/*
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 9bf57c76623b..d419d23fa214 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -34,6 +34,8 @@
#include "xfs_bmap.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
kmem_zone_t *xfs_bui_zone;
@@ -215,6 +217,7 @@ void
xfs_bui_release(
struct xfs_bui_log_item *buip)
{
+ ASSERT(atomic_read(&buip->bui_refcount) > 0);
if (atomic_dec_and_test(&buip->bui_refcount)) {
xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_bui_item_free(buip);
@@ -446,7 +449,8 @@ xfs_bui_recover(
return -EIO;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
if (error)
return error;
budp = xfs_trans_get_bud(tp, buip);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 8795e9cd867c..2b954308a1d6 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -448,10 +448,9 @@ xfs_getbmap_adjust_shared(
next_map->br_blockcount = 0;
/* Only written data blocks can be shared. */
- if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK ||
- map->br_startblock == DELAYSTARTBLOCK ||
- map->br_startblock == HOLESTARTBLOCK ||
- ISUNWRITTEN(map))
+ if (!xfs_is_reflink_inode(ip) ||
+ whichfork != XFS_DATA_FORK ||
+ !xfs_bmap_is_real_extent(map))
return 0;
agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
@@ -904,9 +903,9 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
}
/*
- * This is called by xfs_inactive to free any blocks beyond eof
- * when the link count isn't zero and by xfs_dm_punch_hole() when
- * punching a hole to EOF.
+ * This is called to free any blocks beyond eof. The caller must hold
+ * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
+ * reference to the inode.
*/
int
xfs_free_eofblocks(
@@ -921,8 +920,6 @@ xfs_free_eofblocks(
struct xfs_bmbt_irec imap;
struct xfs_mount *mp = ip->i_mount;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-
/*
* Figure out if there are any blocks beyond the end
* of the file. If not, then there is nothing to do.
@@ -1209,11 +1206,8 @@ xfs_adjust_extent_unmap_boundaries(
return error;
if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- xfs_daddr_t block;
-
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- block = imap.br_startblock;
- mod = do_div(block, mp->m_sb.sb_rextsize);
+ mod = do_mod(imap.br_startblock, mp->m_sb.sb_rextsize);
if (mod)
*startoffset_fsb += mp->m_sb.sb_rextsize - mod;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ca09061369cb..62fa39276a24 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1079,6 +1079,8 @@ void
xfs_buf_unlock(
struct xfs_buf *bp)
{
+ ASSERT(xfs_buf_islocked(bp));
+
XB_CLEAR_OWNER(bp);
up(&bp->b_sema);
@@ -1815,6 +1817,28 @@ error:
}
/*
+ * Cancel a delayed write list.
+ *
+ * Remove each buffer from the list, clear the delwri queue flag and drop the
+ * associated buffer reference.
+ */
+void
+xfs_buf_delwri_cancel(
+	struct list_head	*list)
+{
+	/* Keep taking the first buffer until nothing is left on the list. */
+	while (!list_empty(list)) {
+		struct xfs_buf	*bp;
+
+		bp = list_first_entry(list, struct xfs_buf, b_list);
+
+		/* Lock it, clear the delwri queue flag, unlink, release. */
+		xfs_buf_lock(bp);
+		bp->b_flags &= ~_XBF_DELWRI_Q;
+		list_del_init(&bp->b_list);
+		xfs_buf_relse(bp);
+	}
+}
+
+/*
* Add a buffer to the delayed write list.
*
* This queues a buffer for writeout if it hasn't already been. Note that
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3c867e5a63e1..8d1d44f87ce9 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -291,7 +291,6 @@ xfs_buf_readahead(
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
@@ -330,6 +329,7 @@ extern void *xfs_buf_offset(struct xfs_buf *, size_t);
extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
+extern void xfs_buf_delwri_cancel(struct list_head *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index ad9396e516f6..20b7a5c6eb2f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -394,6 +394,7 @@ xfs_dir2_leaf_readbuf(
/*
* Do we need more readahead?
+ * Each loop tries to process 1 full dir blk; last may be partial.
*/
blk_start_plug(&plug);
for (mip->ra_index = mip->ra_offset = i = 0;
@@ -404,7 +405,8 @@ xfs_dir2_leaf_readbuf(
* Read-ahead a contiguous directory block.
*/
if (i > mip->ra_current &&
- map[mip->ra_index].br_blockcount >= geo->fsbcount) {
+ (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
+ geo->fsbcount) {
xfs_dir3_data_readahead(dp,
map[mip->ra_index].br_startoff + mip->ra_offset,
XFS_FSB_TO_DADDR(dp->i_mount,
@@ -425,14 +427,19 @@ xfs_dir2_leaf_readbuf(
}
/*
- * Advance offset through the mapping table.
+ * Advance offset through the mapping table, processing a full
+ * dir block even if it is fragmented into several extents.
+ * But stop if we have consumed all valid mappings, even if
+ * it's not yet a full directory block.
*/
- for (j = 0; j < geo->fsbcount; j += length ) {
+ for (j = 0;
+ j < geo->fsbcount && mip->ra_index < mip->map_valid;
+ j += length ) {
/*
* The rest of this extent but not more than a dir
* block.
*/
- length = min_t(int, geo->fsbcount,
+ length = min_t(int, geo->fsbcount - j,
map[mip->ra_index].br_blockcount -
mip->ra_offset);
mip->ra_offset += length;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d796ffac7296..6a05d278da64 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -132,6 +132,11 @@ next_extent:
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto out_del_cursor;
+
+ if (fatal_signal_pending(current)) {
+ error = -ERESTARTSYS;
+ goto out_del_cursor;
+ }
}
out_del_cursor:
@@ -196,8 +201,11 @@ xfs_ioc_trim(
for (agno = start_agno; agno <= end_agno; agno++) {
error = xfs_trim_extents(mp, agno, start, end, minlen,
&blocks_trimmed);
- if (error)
+ if (error) {
last_error = error;
+ if (error == -ERESTARTSYS)
+ break;
+ }
}
if (last_error)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d7bc14906af8..44f8c5451210 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -290,6 +290,7 @@ void
xfs_efi_release(
struct xfs_efi_log_item *efip)
{
+ ASSERT(atomic_read(&efip->efi_refcount) > 0);
if (atomic_dec_and_test(&efip->efi_refcount)) {
xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
xfs_efi_item_free(efip);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
new file mode 100644
index 000000000000..3683819887a5
--- /dev/null
+++ b/fs/xfs/xfs_fsmap.c
@@ -0,0 +1,940 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include <linux/fsmap.h>
+#include "xfs_fsmap.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rtalloc.h"
+
+/*
+ * Convert an xfs_fsmap to an fsmap.  The physical, offset and length fields
+ * are passed through BBTOB(); device, flags and owner are copied unchanged;
+ * the three reserved words are set to zero.
+ */
+void
+xfs_fsmap_from_internal(
+	struct fsmap		*dest,
+	struct xfs_fsmap	*src)
+{
+	/* Fields copied through as-is. */
+	dest->fmr_device = src->fmr_device;
+	dest->fmr_flags = src->fmr_flags;
+	dest->fmr_owner = src->fmr_owner;
+
+	/* Fields converted with BBTOB(). */
+	dest->fmr_physical = BBTOB(src->fmr_physical);
+	dest->fmr_offset = BBTOB(src->fmr_offset);
+	dest->fmr_length = BBTOB(src->fmr_length);
+
+	/* Reserved words are always written as zero. */
+	dest->fmr_reserved[0] = 0;
+	dest->fmr_reserved[1] = 0;
+	dest->fmr_reserved[2] = 0;
+}
+
+/*
+ * Convert an fsmap to an xfs_fsmap.  The physical, offset and length fields
+ * are passed through BTOBBT(); device, flags and owner are copied unchanged.
+ */
+void
+xfs_fsmap_to_internal(
+	struct xfs_fsmap	*dest,
+	struct fsmap		*src)
+{
+	/* Fields copied through as-is. */
+	dest->fmr_device = src->fmr_device;
+	dest->fmr_flags = src->fmr_flags;
+	dest->fmr_owner = src->fmr_owner;
+
+	/* Fields converted with BTOBBT(). */
+	dest->fmr_physical = BTOBBT(src->fmr_physical);
+	dest->fmr_offset = BTOBBT(src->fmr_offset);
+	dest->fmr_length = BTOBBT(src->fmr_length);
+}
+
+/*
+ * Convert an fsmap owner into an rmapbt owner.
+ *
+ * If FMR_OF_SPECIAL_OWNER is not set in src->fmr_flags, the owner value is
+ * copied verbatim.  Otherwise the XFS_FMR_OWN_* code is mapped onto the
+ * matching XFS_RMAP_OWN_* value.  Returns 0 on success, or -EINVAL for
+ * XFS_FMR_OWN_DEFECTIVE and for any special owner code not listed here.
+ */
+static int
+xfs_fsmap_owner_to_rmap(
+	struct xfs_rmap_irec	*dest,
+	struct xfs_fsmap	*src)
+{
+	if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) {
+		dest->rm_owner = src->fmr_owner;
+		return 0;
+	}
+
+	switch (src->fmr_owner) {
+	case 0:			/* "lowest owner id possible" */
+	case -1ULL:		/* "highest owner id possible" */
+		dest->rm_owner = 0;
+		return 0;
+	case XFS_FMR_OWN_FREE:
+		dest->rm_owner = XFS_RMAP_OWN_NULL;
+		return 0;
+	case XFS_FMR_OWN_UNKNOWN:
+		dest->rm_owner = XFS_RMAP_OWN_UNKNOWN;
+		return 0;
+	case XFS_FMR_OWN_FS:
+		dest->rm_owner = XFS_RMAP_OWN_FS;
+		return 0;
+	case XFS_FMR_OWN_LOG:
+		dest->rm_owner = XFS_RMAP_OWN_LOG;
+		return 0;
+	case XFS_FMR_OWN_AG:
+		dest->rm_owner = XFS_RMAP_OWN_AG;
+		return 0;
+	case XFS_FMR_OWN_INOBT:
+		dest->rm_owner = XFS_RMAP_OWN_INOBT;
+		return 0;
+	case XFS_FMR_OWN_INODES:
+		dest->rm_owner = XFS_RMAP_OWN_INODES;
+		return 0;
+	case XFS_FMR_OWN_REFC:
+		dest->rm_owner = XFS_RMAP_OWN_REFC;
+		return 0;
+	case XFS_FMR_OWN_COW:
+		dest->rm_owner = XFS_RMAP_OWN_COW;
+		return 0;
+	case XFS_FMR_OWN_DEFECTIVE:	/* not implemented */
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Convert an rmapbt owner into an fsmap owner.
+ *
+ * Owners for which XFS_RMAP_NON_INODE_OWNER() is false are copied verbatim
+ * with fmr_flags cleared.  All others get fmr_flags set to
+ * FMR_OF_SPECIAL_OWNER and are mapped from XFS_RMAP_OWN_* to the matching
+ * XFS_FMR_OWN_* code.  Returns 0 on success or -EFSCORRUPTED if the special
+ * owner is not one of the values handled here.
+ */
+static int
+xfs_fsmap_owner_from_rmap(
+	struct xfs_fsmap	*dest,
+	struct xfs_rmap_irec	*src)
+{
+	if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) {
+		dest->fmr_flags = 0;
+		dest->fmr_owner = src->rm_owner;
+		return 0;
+	}
+
+	dest->fmr_flags = FMR_OF_SPECIAL_OWNER;
+
+	switch (src->rm_owner) {
+	case XFS_RMAP_OWN_FS:
+		dest->fmr_owner = XFS_FMR_OWN_FS;
+		return 0;
+	case XFS_RMAP_OWN_LOG:
+		dest->fmr_owner = XFS_FMR_OWN_LOG;
+		return 0;
+	case XFS_RMAP_OWN_AG:
+		dest->fmr_owner = XFS_FMR_OWN_AG;
+		return 0;
+	case XFS_RMAP_OWN_INOBT:
+		dest->fmr_owner = XFS_FMR_OWN_INOBT;
+		return 0;
+	case XFS_RMAP_OWN_INODES:
+		dest->fmr_owner = XFS_FMR_OWN_INODES;
+		return 0;
+	case XFS_RMAP_OWN_REFC:
+		dest->fmr_owner = XFS_FMR_OWN_REFC;
+		return 0;
+	case XFS_RMAP_OWN_COW:
+		dest->fmr_owner = XFS_FMR_OWN_COW;
+		return 0;
+	case XFS_RMAP_OWN_NULL:	/* "free" */
+		dest->fmr_owner = XFS_FMR_OWN_FREE;
+		return 0;
+	default:
+		break;
+	}
+
+	return -EFSCORRUPTED;
+}
+
+/*
+ * getfsmap query state.
+ *
+ * One of these is threaded through the per-record helpers as their private
+ * argument; xfs_getfsmap_helper() reads and updates it for every record.
+ */
+struct xfs_getfsmap_info {
+ struct xfs_fsmap_head *head; /* fmh_count is the limit, fmh_entries the running total */
+ xfs_fsmap_format_t formatter; /* formatting fn, called as formatter(&fmr, format_arg) */
+ void *format_arg; /* format buffer */
+ struct xfs_buf *agf_bp; /* AGF, for refcount queries */
+ xfs_daddr_t next_daddr; /* next daddr we expect; a record starting past it is preceded by a gap */
+ u64 missing_owner; /* owner of holes */
+ u32 dev; /* device id */
+ xfs_agnumber_t agno; /* AG number, if applicable */
+ struct xfs_rmap_irec low; /* low rmap key; records comparing below it are skipped */
+ struct xfs_rmap_irec high; /* high rmap key */
+ bool last; /* last extent? if set, only a trailing gap is reported, not the record */
+};
+
+/*
+ * Associate a device with a getfsmap handler.
+ *
+ * xfs_getfsmap_dev_compare() orders these entries by their dev member.
+ */
+struct xfs_getfsmap_dev {
+ u32 dev; /* device id this handler serves */
+ int (*fn)(struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info); /* query fn: transaction, search keys, query state */
+};
+
+/*
+ * Compare two getfsmap device handlers by device id.
+ *
+ * Returns -1, 0 or 1 if p1's device id is less than, equal to, or greater
+ * than p2's.  The ids are u32, so they are compared explicitly rather than
+ * subtracted: a difference of 2^31 or more does not fit in the int return
+ * value and would be reported with the wrong sign.
+ */
+static int
+xfs_getfsmap_dev_compare(
+	const void			*p1,
+	const void			*p2)
+{
+	const struct xfs_getfsmap_dev	*d1 = p1;
+	const struct xfs_getfsmap_dev	*d2 = p2;
+
+	if (d1->dev < d2->dev)
+		return -1;
+	if (d1->dev > d2->dev)
+		return 1;
+	return 0;
+}
+
+/*
+ * Decide if this mapping is shared.
+ *
+ * *stat is set to true only when the refcount btree lookup finds a shared
+ * run of nonzero length inside the record; it is false when the filesystem
+ * lacks reflink support or info->agno is NULLAGNUMBER.  Returns 0 on
+ * success or the error from xfs_refcount_find_shared().
+ */
+STATIC int
+xfs_getfsmap_is_shared(
+	struct xfs_trans		*tp,
+	struct xfs_getfsmap_info	*info,
+	struct xfs_rmap_irec		*rec,
+	bool				*stat)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*cur;
+	xfs_agblock_t			fbno;
+	xfs_extlen_t			flen = 0;
+	int				error;
+
+	*stat = false;
+
+	/* No reflink, or an rt file (agno is NULLAGNUMBER): nothing to do. */
+	if (!xfs_sb_version_hasreflink(&mp->m_sb) ||
+	    info->agno == NULLAGNUMBER)
+		return 0;
+
+	/* Are there any shared blocks here? */
+	cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, info->agno,
+			NULL);
+	error = xfs_refcount_find_shared(cur, rec->rm_startblock,
+			rec->rm_blockcount, &fbno, &flen, false);
+	if (error) {
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		return error;
+	}
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+	if (flen > 0)
+		*stat = true;
+	return 0;
+}
+
+/*
+ * Format a reverse mapping for getfsmap, having translated rm_startblock
+ * into the appropriate daddr units.
+ *
+ * rec_daddr is the record's start address as computed by the caller.  For
+ * each record this function:
+ *  - bails out with -EINTR if a fatal signal is pending;
+ *  - skips records that xfs_rmap_compare() orders before info->low, while
+ *    still advancing info->next_daddr past them;
+ *  - when head->fmh_count is zero, only counts entries in fmh_entries;
+ *  - otherwise emits a gap record (owned by info->missing_owner) if the
+ *    record starts past info->next_daddr, then the record itself unless
+ *    info->last is set.
+ *
+ * Returns XFS_BTREE_QUERY_RANGE_CONTINUE to keep iterating,
+ * XFS_BTREE_QUERY_RANGE_ABORT once fmh_entries has reached fmh_count, or a
+ * nonzero value passed up from the formatter, the owner conversion or the
+ * shared-extent check.
+ */
+STATIC int
+xfs_getfsmap_helper(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_rmap_irec *rec,
+ xfs_daddr_t rec_daddr)
+{
+ struct xfs_fsmap fmr;
+ struct xfs_mount *mp = tp->t_mountp;
+ bool shared;
+ int error;
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ /*
+ * Filter out records that start before our startpoint, if the
+ * caller requested that.  The end of a filtered record still moves
+ * next_daddr forward, so no gap is reported for the space it covers.
+ */
+ if (xfs_rmap_compare(rec, &info->low) < 0) {
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ }
+
+ /*
+ * Are we just counting mappings?  A zero fmh_count means nothing is
+ * formatted: a preceding gap adds one entry, and the record itself
+ * adds another unless this is the "last" call.
+ */
+ if (info->head->fmh_count == 0) {
+ if (rec_daddr > info->next_daddr)
+ info->head->fmh_entries++;
+
+ if (info->last)
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+
+ info->head->fmh_entries++;
+
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ }
+
+ /*
+ * If the record starts past the last physical block we saw,
+ * then we've found a gap. Report the gap as being owned by
+ * whatever the caller specified is the missing owner.
+ */
+ if (rec_daddr > info->next_daddr) {
+ if (info->head->fmh_entries >= info->head->fmh_count)
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+
+ fmr.fmr_device = info->dev;
+ fmr.fmr_physical = info->next_daddr;
+ fmr.fmr_owner = info->missing_owner;
+ fmr.fmr_offset = 0;
+ fmr.fmr_length = rec_daddr - info->next_daddr;
+ fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+ error = info->formatter(&fmr, info->format_arg);
+ if (error)
+ return error;
+ info->head->fmh_entries++;
+ }
+
+ if (info->last)
+ goto out;
+
+ /*
+ * Fill out the extent we found.  The space check is repeated here
+ * because the gap record above may have used the last free slot.
+ */
+ if (info->head->fmh_entries >= info->head->fmh_count)
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+
+ trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
+
+ fmr.fmr_device = info->dev;
+ fmr.fmr_physical = rec_daddr;
+ error = xfs_fsmap_owner_from_rmap(&fmr, rec);
+ if (error)
+ return error;
+ fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset);
+ fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (rec->rm_flags & XFS_RMAP_UNWRITTEN)
+ fmr.fmr_flags |= FMR_OF_PREALLOC;
+ if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
+ fmr.fmr_flags |= FMR_OF_ATTR_FORK;
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ fmr.fmr_flags |= FMR_OF_EXTENT_MAP;
+ if (fmr.fmr_flags == 0) {
+ error = xfs_getfsmap_is_shared(tp, info, rec, &shared);
+ if (error)
+ return error;
+ if (shared)
+ fmr.fmr_flags |= FMR_OF_SHARED;
+ }
+ error = info->formatter(&fmr, info->format_arg);
+ if (error)
+ return error;
+ info->head->fmh_entries++;
+
+out:
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+}
+
+/*
+ * Transform a rmapbt irec into a fsmap: turn the record's AG-relative start
+ * block into a disk address using the cursor's AG number, then hand the
+ * record to xfs_getfsmap_helper().
+ */
+STATIC int
+xfs_getfsmap_datadev_helper(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_getfsmap_info	*info = priv;
+	struct xfs_mount		*mp = cur->bc_mp;
+	xfs_fsblock_t			start_fsb;
+	xfs_daddr_t			start_daddr;
+
+	start_fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno,
+			rec->rm_startblock);
+	start_daddr = XFS_FSB_TO_DADDR(mp, start_fsb);
+
+	return xfs_getfsmap_helper(cur->bc_tp, info, rec, start_daddr);
+}
+
+/*
+ * Transform a rtbitmap "record" into a fsmap: build a temporary rmap record
+ * owned by XFS_RMAP_OWN_NULL ("free") that covers the same blocks and pass
+ * it to xfs_getfsmap_helper().
+ */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+	struct xfs_trans		*tp,
+	struct xfs_rtalloc_rec		*rec,
+	void				*priv)
+{
+	struct xfs_getfsmap_info	*info = priv;
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_rmap_irec		free_rec = {
+		.rm_startblock	= rec->ar_startblock,
+		.rm_blockcount	= rec->ar_blockcount,
+		.rm_owner	= XFS_RMAP_OWN_NULL,	/* "free" */
+		.rm_offset	= 0,
+		.rm_flags	= 0,
+	};
+
+	return xfs_getfsmap_helper(tp, info, &free_rec,
+			XFS_FSB_TO_BB(mp, rec->ar_startblock));
+}
+
+/*
+ * Transform a bnobt irec into a fsmap: build a temporary rmap record owned
+ * by XFS_RMAP_OWN_NULL ("free") that covers the same blocks, compute its
+ * disk address from the cursor's AG number, and pass both to
+ * xfs_getfsmap_helper().
+ */
+STATIC int
+xfs_getfsmap_datadev_bnobt_helper(
+	struct xfs_btree_cur		*cur,
+	struct xfs_alloc_rec_incore	*rec,
+	void				*priv)
+{
+	struct xfs_getfsmap_info	*info = priv;
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_rmap_irec		free_rec = {
+		.rm_startblock	= rec->ar_startblock,
+		.rm_blockcount	= rec->ar_blockcount,
+		.rm_owner	= XFS_RMAP_OWN_NULL,	/* "free" */
+		.rm_offset	= 0,
+		.rm_flags	= 0,
+	};
+	xfs_daddr_t			start_daddr;
+
+	start_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno,
+			rec->ar_startblock);
+
+	return xfs_getfsmap_helper(cur->bc_tp, info, &free_rec, start_daddr);
+}
+
+/*
+ * Translate the FMR_OF_* flags of a getfsmap key into the matching
+ * XFS_RMAP_* flags.  irec->rm_flags is replaced, not merged: any flags it
+ * held before the call are discarded.
+ */
+static void
+xfs_getfsmap_set_irec_flags(
+	struct xfs_rmap_irec	*irec,
+	struct xfs_fsmap	*fmr)
+{
+	unsigned int		rflags = 0;
+
+	if (fmr->fmr_flags & FMR_OF_ATTR_FORK)
+		rflags |= XFS_RMAP_ATTR_FORK;
+	if (fmr->fmr_flags & FMR_OF_EXTENT_MAP)
+		rflags |= XFS_RMAP_BMBT_BLOCK;
+	if (fmr->fmr_flags & FMR_OF_PREALLOC)
+		rflags |= XFS_RMAP_UNWRITTEN;
+
+	irec->rm_flags = rflags;
+}
+
+/*
+ * Execute a getfsmap query against the log device.
+ *
+ * keys[0] is the low key and keys[1] the high key.  The search keys in
+ * info->low and info->high are filled in and traced, and then at most one
+ * record is reported: a fabricated mapping that starts at block 0, covers
+ * sb_logblocks blocks and is owned by XFS_RMAP_OWN_LOG.  Nothing is
+ * reported when the low key starts past physical address 0.
+ *
+ * Returns 0, an error from xfs_fsmap_owner_to_rmap(), or whatever
+ * xfs_getfsmap_helper() returns for the fabricated record.
+ */
+STATIC int
+xfs_getfsmap_logdev(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rmap_irec rmap;
+ int error;
+
+ /* Set up search keys */
+ info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, keys);
+ if (error)
+ return error;
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ /*
+ * NOTE(review): rm_owner is overwritten with ULLONG_MAX just below, so
+ * this call presumably only matters for rejecting a bad high-key owner.
+ */
+ error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1);
+ if (error)
+ return error;
+ info->high.rm_startblock = -1U;
+ info->high.rm_owner = ULLONG_MAX;
+ info->high.rm_offset = ULLONG_MAX;
+ info->high.rm_blockcount = 0;
+ info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+ info->missing_owner = XFS_FMR_OWN_FREE;
+
+ trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+
+ /* The single log record starts at 0; a later low key excludes it. */
+ if (keys[0].fmr_physical > 0)
+ return 0;
+
+ /* Fabricate an rmap entry for the external log device. */
+ rmap.rm_startblock = 0;
+ rmap.rm_blockcount = mp->m_sb.sb_logblocks;
+ rmap.rm_owner = XFS_RMAP_OWN_LOG;
+ rmap.rm_offset = 0;
+ rmap.rm_flags = 0;
+
+ return xfs_getfsmap_helper(tp, info, &rmap, 0);
+}
+
+/*
+ * Execute a getfsmap query against the realtime device.
+ *
+ * Converts the low (keys[0]) and high (keys[1]) fsmap keys into the search
+ * keys info->low and info->high, traces them, and hands the actual lookup
+ * to @query_fn.  Returns 0 without querying if the low key starts at or
+ * beyond the end of the realtime device; otherwise returns the first error
+ * from xfs_fsmap_owner_to_rmap() or the result of @query_fn.
+ *
+ * Note that keys[1].fmr_physical is clamped in place, so the caller's key
+ * array is modified.
+ */
+STATIC int
+__xfs_getfsmap_rtdev(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ int (*query_fn)(struct xfs_trans *,
+ struct xfs_getfsmap_info *),
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsblock_t start_fsb;
+ xfs_fsblock_t end_fsb;
+ xfs_daddr_t eofs;
+ int error = 0;
+
+ /* Size of the realtime device, in the same units as fmr_physical. */
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ if (keys[1].fmr_physical >= eofs)
+ keys[1].fmr_physical = eofs - 1;
+ /*
+ * NOTE(review): the low key uses XFS_BB_TO_FSBT and the high key
+ * XFS_BB_TO_FSB; presumably one truncates and the other rounds up so
+ * the block range fully covers the requested range -- confirm against
+ * the macro definitions.
+ */
+ start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+ end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical);
+
+ /* Set up search keys */
+ info->low.rm_startblock = start_fsb;
+ error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ if (error)
+ return error;
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ info->high.rm_startblock = end_fsb;
+ error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+ if (error)
+ return error;
+ info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset);
+ info->high.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+
+ trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+
+ return query_fn(tp, info);
+}
+
+/*
+ * Actually query the realtime bitmap.
+ *
+ * With the realtime bitmap inode's ILOCK held shared, run
+ * xfs_rtalloc_query_range() between the start blocks of info->low and
+ * info->high, feeding each record to xfs_getfsmap_rtdev_rtbitmap_helper().
+ * Afterwards the helper is called once more on the high record with
+ * info->last set, so that any gap at the end of the range is reported.
+ *
+ * Returns 0, or the first nonzero value returned by the range query or by
+ * the final helper call.  The ILOCK is dropped on every path.
+ */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_query(
+	struct xfs_trans		*tp,
+	struct xfs_getfsmap_info	*info)
+{
+	/*
+	 * Start from zeroed records: only ar_startblock is assigned below,
+	 * but the helper also reads ar_blockcount of the record it is given,
+	 * so that field must not be left as stack garbage.
+	 */
+	struct xfs_rtalloc_rec		alow = { 0 };
+	struct xfs_rtalloc_rec		ahigh = { 0 };
+	int				error;
+
+	xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+
+	alow.ar_startblock = info->low.rm_startblock;
+	ahigh.ar_startblock = info->high.rm_startblock;
+	error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
+			xfs_getfsmap_rtdev_rtbitmap_helper, info);
+	if (error)
+		goto err;
+
+	/* Report any gaps at the end of the rtbitmap */
+	info->last = true;
+	error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
+err:
+	xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+	return error;
+}
+
+/*
+ * Execute a getfsmap query against the realtime device rtbitmap.
+ *
+ * Sets info->missing_owner to XFS_FMR_OWN_UNKNOWN and runs the generic
+ * realtime query with xfs_getfsmap_rtdev_rtbitmap_query() as the lookup.
+ * NOTE(review): missing_owner is presumably the owner reported for ranges
+ * that the bitmap query returns no record for -- confirm in
+ * xfs_getfsmap_helper().
+ */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+ return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query,
+ info);
+}
+
+/*
+ * Execute a getfsmap query against the regular data device.
+ *
+ * Splits the range described by keys[0] (low) and keys[1] (high) into one
+ * query per AG.  For each AG, from the one holding the low key to the one
+ * holding the high key, the AGF buffer is read and @query_fn is called with
+ * info->low/info->high describing that AG's search range.  @query_fn is
+ * expected to store its btree cursor through the cursor pointer it is
+ * given; this function deletes that cursor and releases the AGF buffer
+ * before moving to the next AG and again on exit.  After the last AG,
+ * @query_fn is called once more with info->last set.
+ *
+ * @priv is handed to @query_fn untouched.  keys[1].fmr_physical is clamped
+ * in place to the last address on the device.  Returns 0 or the first
+ * nonzero value from key conversion, the AGF read or @query_fn.
+ */
+STATIC int
+__xfs_getfsmap_datadev(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info,
+ int (*query_fn)(struct xfs_trans *,
+ struct xfs_getfsmap_info *,
+ struct xfs_btree_cur **,
+ void *),
+ void *priv)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *bt_cur = NULL;
+ xfs_fsblock_t start_fsb;
+ xfs_fsblock_t end_fsb;
+ xfs_agnumber_t start_ag;
+ xfs_agnumber_t end_ag;
+ xfs_daddr_t eofs;
+ int error = 0;
+
+ /* Nothing to do if the low key lies beyond the end of the device. */
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ if (keys[1].fmr_physical >= eofs)
+ keys[1].fmr_physical = eofs - 1;
+ start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical);
+ end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical);
+
+ /*
+ * Convert the fsmap low/high keys to AG based keys. Initialize
+ * low to the fsmap low key and max out the high key to the end
+ * of the AG.
+ */
+ info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ if (error)
+ return error;
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ info->high.rm_startblock = -1U;
+ info->high.rm_owner = ULLONG_MAX;
+ info->high.rm_offset = ULLONG_MAX;
+ info->high.rm_blockcount = 0;
+ info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+
+ start_ag = XFS_FSB_TO_AGNO(mp, start_fsb);
+ end_ag = XFS_FSB_TO_AGNO(mp, end_fsb);
+
+ /* Query each AG */
+ for (info->agno = start_ag; info->agno <= end_ag; info->agno++) {
+ /*
+ * Set the AG high key from the fsmap high key if this
+ * is the last AG that we're querying.
+ */
+ if (info->agno == end_ag) {
+ info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp,
+ end_fsb);
+ info->high.rm_offset = XFS_BB_TO_FSBT(mp,
+ keys[1].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+ if (error)
+ goto err;
+ xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+ }
+
+ /* Done with the previous AG: drop its cursor and AGF buffer. */
+ if (bt_cur) {
+ xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
+ bt_cur = NULL;
+ xfs_trans_brelse(tp, info->agf_bp);
+ info->agf_bp = NULL;
+ }
+
+ error = xfs_alloc_read_agf(mp, tp, info->agno, 0,
+ &info->agf_bp);
+ if (error)
+ goto err;
+
+ trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, info->agno,
+ &info->high);
+
+ error = query_fn(tp, info, &bt_cur, priv);
+ if (error)
+ goto err;
+
+ /*
+ * Set the AG low key to the start of the AG prior to
+ * moving on to the next AG.
+ */
+ if (info->agno == start_ag) {
+ info->low.rm_startblock = 0;
+ info->low.rm_owner = 0;
+ info->low.rm_offset = 0;
+ info->low.rm_flags = 0;
+ }
+ }
+
+ /* Report any gap at the end of the AG */
+ info->last = true;
+ error = query_fn(tp, info, &bt_cur, priv);
+ if (error)
+ goto err;
+
+err:
+ /*
+ * Reached on success as well as failure.  Only a negative value makes
+ * the cursor get torn down as XFS_BTREE_ERROR; positive return values
+ * from query_fn still delete it with XFS_BTREE_NOERROR.
+ */
+ if (bt_cur)
+ xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ if (info->agf_bp) {
+ xfs_trans_brelse(tp, info->agf_bp);
+ info->agf_bp = NULL;
+ }
+
+ return error;
+}
+
+/*
+ * Per-AG query function for the rmap btree.
+ *
+ * On a normal call, a new rmapbt cursor for info->agno is stored in *curpp
+ * and the range [info->low, info->high] is queried with
+ * xfs_getfsmap_datadev_helper() as the record callback.  On the final call
+ * (info->last set) no new cursor is created; the helper is invoked directly
+ * on info->high using the cursor already in *curpp.  @priv is unused.
+ */
+STATIC int
+xfs_getfsmap_datadev_rmapbt_query(
+	struct xfs_trans		*tp,
+	struct xfs_getfsmap_info	*info,
+	struct xfs_btree_cur		**curpp,
+	void				*priv)
+{
+	if (!info->last) {
+		/* Allocate cursor for this AG and query_range it. */
+		*curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp,
+				info->agf_bp, info->agno);
+		return xfs_rmap_query_range(*curpp, &info->low, &info->high,
+				xfs_getfsmap_datadev_helper, info);
+	}
+
+	/* Report any gap at the end of the last AG. */
+	return xfs_getfsmap_datadev_helper(*curpp, &info->high, info);
+}
+
+/*
+ * Execute a getfsmap query against the regular data device rmapbt.
+ *
+ * Sets info->missing_owner to XFS_FMR_OWN_FREE and runs the generic
+ * per-AG data device query with xfs_getfsmap_datadev_rmapbt_query() as the
+ * lookup; no private data is needed, so NULL is passed.
+ * NOTE(review): missing_owner is presumably the owner reported for ranges
+ * no rmap record covers -- confirm in xfs_getfsmap_helper().
+ */
+STATIC int
+xfs_getfsmap_datadev_rmapbt(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ info->missing_owner = XFS_FMR_OWN_FREE;
+ return __xfs_getfsmap_datadev(tp, keys, info,
+ xfs_getfsmap_datadev_rmapbt_query, NULL);
+}
+
+/*
+ * Per-AG query function for the bno btree.
+ *
+ * @priv points at a two-element array of struct xfs_alloc_rec_incore used
+ * as the low ([0]) and high ([1]) query records.  On a normal call, a new
+ * bnobt cursor for info->agno is stored in *curpp, the start blocks of both
+ * records are loaded from info->low/info->high, and the range is queried
+ * with xfs_getfsmap_datadev_bnobt_helper() as the record callback.  On the
+ * final call (info->last set) the helper is invoked directly on the high
+ * record using the cursor already in *curpp.
+ */
+STATIC int
+xfs_getfsmap_datadev_bnobt_query(
+	struct xfs_trans		*tp,
+	struct xfs_getfsmap_info	*info,
+	struct xfs_btree_cur		**curpp,
+	void				*priv)
+{
+	struct xfs_alloc_rec_incore	*low = priv;
+	struct xfs_alloc_rec_incore	*high = low + 1;
+
+	if (!info->last) {
+		/* Allocate cursor for this AG and query_range it. */
+		*curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp,
+				info->agf_bp, info->agno, XFS_BTNUM_BNO);
+		low->ar_startblock = info->low.rm_startblock;
+		high->ar_startblock = info->high.rm_startblock;
+		return xfs_alloc_query_range(*curpp, low, high,
+				xfs_getfsmap_datadev_bnobt_helper, info);
+	}
+
+	/* Report any gap at the end of the last AG. */
+	return xfs_getfsmap_datadev_bnobt_helper(*curpp, high, info);
+}
+
+/*
+ * Execute a getfsmap query against the regular data device's bnobt.
+ *
+ * Sets info->missing_owner to XFS_FMR_OWN_UNKNOWN and runs the generic
+ * per-AG data device query with xfs_getfsmap_datadev_bnobt_query() as the
+ * lookup.  The two-element akeys array is the private low/high record pair
+ * that the query function works on.
+ */
+STATIC int
+xfs_getfsmap_datadev_bnobt(
+	struct xfs_trans		*tp,
+	struct xfs_fsmap		*keys,
+	struct xfs_getfsmap_info	*info)
+{
+	/*
+	 * Start from zeroed records: the query function only ever assigns
+	 * ar_startblock, yet the record helper also reads ar_blockcount of
+	 * the high record on the final "last" call, so that field must not
+	 * be left as stack garbage.
+	 */
+	struct xfs_alloc_rec_incore	akeys[2] = { { 0 } };
+
+	info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+	return __xfs_getfsmap_datadev(tp, keys, info,
+			xfs_getfsmap_datadev_bnobt_query, &akeys[0]);
+}
+
+/*
+ * Do we recognize the device named in this fsmap key?
+ *
+ * The values 0 and UINT_MAX are always accepted.  Otherwise the device
+ * must match the encoded device number of the data device, or of the log
+ * or realtime device if the mount has one.
+ */
+STATIC bool
+xfs_getfsmap_is_valid_device(
+	struct xfs_mount	*mp,
+	struct xfs_fsmap	*fm)
+{
+	dev_t			dev = fm->fmr_device;
+
+	if (dev == 0 || dev == UINT_MAX)
+		return true;
+	if (dev == new_encode_dev(mp->m_ddev_targp->bt_dev))
+		return true;
+	if (mp->m_logdev_targp &&
+	    dev == new_encode_dev(mp->m_logdev_targp->bt_dev))
+		return true;
+
+	return mp->m_rtdev_targp &&
+	       dev == new_encode_dev(mp->m_rtdev_targp->bt_dev);
+}
+
+/*
+ * Ensure that the low key is less than the high key.
+ *
+ * Keys are compared field by field in the order device, physical, owner,
+ * offset; the first field that differs decides.  Keys that are equal in
+ * all four fields are rejected (returns false).
+ */
+STATIC bool
+xfs_getfsmap_check_keys(
+	struct xfs_fsmap		*low_key,
+	struct xfs_fsmap		*high_key)
+{
+	if (low_key->fmr_device != high_key->fmr_device)
+		return low_key->fmr_device < high_key->fmr_device;
+
+	if (low_key->fmr_physical != high_key->fmr_physical)
+		return low_key->fmr_physical < high_key->fmr_physical;
+
+	if (low_key->fmr_owner != high_key->fmr_owner)
+		return low_key->fmr_owner < high_key->fmr_owner;
+
+	return low_key->fmr_offset < high_key->fmr_offset;
+}
+
+/* Number of device slots in the handler table built by xfs_getfsmap(). */
+#define XFS_GETFSMAP_DEVS 3
+/*
+ * Get filesystem's extents as described in head, and format for
+ * output. Calls formatter to fill the user's buffer until all
+ * extents are mapped, until the passed-in head->fmh_count slots have
+ * been filled, or until the formatter short-circuits the loop, if it
+ * is tracking filled-in extents on its own.
+ *
+ * Key to Confusion
+ * ----------------
+ * There are multiple levels of keys and counters at work here:
+ * xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in;
+ * these reflect fs-wide sector addrs.
+ * dkeys -- fmh_keys used to query each device;
+ * these are fmh_keys but w/ the low key
+ * bumped up by fmr_length.
+ * xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this
+ * is how we detect gaps in the fsmap
+ * records and report them.
+ * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from
+ * dkeys; used to query the metadata.
+ *
+ * Parameters and return value
+ * ---------------------------
+ * mp -- mount to query.
+ * head -- in: fmh_iflags, fmh_count and fmh_keys; out: fmh_entries
+ * and fmh_oflags.
+ * formatter -- called for each mapping to be reported.
+ * arg -- opaque pointer passed to every formatter call.
+ *
+ * Returns 0 on success; -EINVAL for unknown input flags, unrecognized
+ * devices or bad keys; otherwise the first nonzero value returned by
+ * xfs_trans_alloc_empty() or by a device handler.
+ */
+int
+xfs_getfsmap(
+ struct xfs_mount *mp,
+ struct xfs_fsmap_head *head,
+ xfs_fsmap_format_t formatter,
+ void *arg)
+{
+ struct xfs_trans *tp = NULL;
+ struct xfs_fsmap dkeys[2]; /* per-dev keys */
+ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS];
+ struct xfs_getfsmap_info info = { NULL };
+ int i;
+ int error = 0;
+
+ if (head->fmh_iflags & ~FMH_IF_VALID)
+ return -EINVAL;
+ if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) ||
+ !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ head->fmh_entries = 0;
+
+ /*
+ * Set up our device handlers.  Slots whose device is absent keep a
+ * NULL fn from the memset and are skipped by the query loop below.
+ */
+ memset(handlers, 0, sizeof(handlers));
+ handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
+ else
+ handlers[0].fn = xfs_getfsmap_datadev_bnobt;
+ if (mp->m_logdev_targp != mp->m_ddev_targp) {
+ handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
+ handlers[1].fn = xfs_getfsmap_logdev;
+ }
+ if (mp->m_rtdev_targp) {
+ handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
+ handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
+ }
+
+ /* Order the handlers with xfs_getfsmap_dev_compare before walking them. */
+ xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev),
+ xfs_getfsmap_dev_compare);
+
+ /*
+ * To continue where we left off, we allow userspace to use the
+ * last mapping from a previous call as the low key of the next.
+ * This is identified by a non-zero length in the low key. We
+ * have to increment the low key in this scenario to ensure we
+ * don't return the same mapping again, and instead return the
+ * very next mapping.
+ *
+ * If the low key mapping refers to file data, the same physical
+ * blocks could be mapped to several other files/offsets.
+ * According to rmapbt record ordering, the minimal next
+ * possible record for the block range is the next starting
+ * offset in the same inode. Therefore, bump the file offset to
+ * continue the search appropriately. For all other low key
+ * mapping types (attr blocks, metadata), bump the physical
+ * offset as there can be no other mapping for the same physical
+ * block range.
+ */
+ dkeys[0] = head->fmh_keys[0];
+ if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) {
+ dkeys[0].fmr_physical += dkeys[0].fmr_length;
+ dkeys[0].fmr_owner = 0;
+ if (dkeys[0].fmr_offset)
+ return -EINVAL;
+ } else
+ dkeys[0].fmr_offset += dkeys[0].fmr_length;
+ dkeys[0].fmr_length = 0;
+ /* The per-device high key starts out as "everything" (all bits set). */
+ memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap));
+
+ /* The adjusted low key must still sort below the caller's high key. */
+ if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ info.next_daddr = head->fmh_keys[0].fmr_physical +
+ head->fmh_keys[0].fmr_length;
+ info.formatter = formatter;
+ info.format_arg = arg;
+ info.head = head;
+
+ /* For each device we support... */
+ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) {
+ /* Is this device within the range the user asked for? */
+ if (!handlers[i].fn)
+ continue;
+ if (head->fmh_keys[0].fmr_device > handlers[i].dev)
+ continue;
+ if (head->fmh_keys[1].fmr_device < handlers[i].dev)
+ break;
+
+ /*
+ * If this device number matches the high key, we have
+ * to pass the high key to the handler to limit the
+ * query results. If the device number exceeds the
+ * low key, zero out the low key so that we get
+ * everything from the beginning.
+ */
+ if (handlers[i].dev == head->fmh_keys[1].fmr_device)
+ dkeys[1] = head->fmh_keys[1];
+ if (handlers[i].dev > head->fmh_keys[0].fmr_device)
+ memset(&dkeys[0], 0, sizeof(struct xfs_fsmap));
+
+ /* Each device is queried under its own empty transaction. */
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ break;
+
+ info.dev = handlers[i].dev;
+ info.last = false;
+ info.agno = NULLAGNUMBER;
+ error = handlers[i].fn(tp, dkeys, &info);
+ if (error)
+ break;
+ xfs_trans_cancel(tp);
+ tp = NULL;
+ /* The next device is scanned from its first address. */
+ info.next_daddr = 0;
+ }
+
+ /* A handler error leaves the loop with the transaction still live. */
+ if (tp)
+ xfs_trans_cancel(tp);
+ head->fmh_oflags = FMH_OF_DEV_T;
+ return error;
+}
diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
new file mode 100644
index 000000000000..0b9bf822595c
--- /dev/null
+++ b/fs/xfs/xfs_fsmap.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_FSMAP_H__
+#define __XFS_FSMAP_H__
+
+struct fsmap;
+
+/*
+ * internal fsmap representation
+ *
+ * NOTE(review): xfs_fsmap.c fills fmr_offset and fmr_length from
+ * XFS_FSB_TO_BB() results and compares fmr_physical against XFS_FSB_TO_BB()
+ * values, so these three fields appear to be kept in the units that macro
+ * produces rather than in filesystem blocks -- confirm before relying on it.
+ */
+struct xfs_fsmap {
+ dev_t fmr_device; /* device id */
+ uint32_t fmr_flags; /* mapping flags */
+ uint64_t fmr_physical; /* device offset of segment */
+ uint64_t fmr_owner; /* owner id */
+ xfs_fileoff_t fmr_offset; /* file offset of segment */
+ xfs_filblks_t fmr_length; /* length of segment, blocks */
+};
+
+/*
+ * Internal getfsmap request header passed to xfs_getfsmap().  The caller
+ * fills in fmh_iflags, fmh_count and both keys; xfs_getfsmap() writes
+ * fmh_entries and fmh_oflags.
+ */
+struct xfs_fsmap_head {
+ uint32_t fmh_iflags; /* control flags */
+ uint32_t fmh_oflags; /* output flags */
+ unsigned int fmh_count; /* # of entries in array incl. input */
+ unsigned int fmh_entries; /* # of entries filled in (output). */
+
+ struct xfs_fsmap fmh_keys[2]; /* low and high keys */
+};
+
+/* Convert between the userspace struct fsmap and the internal form. */
+void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src);
+void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src);
+
+/*
+ * fsmap to userspace formatter - copy to user & advance pointer
+ *
+ * Called with each mapping to report and the opaque argument that was given
+ * to xfs_getfsmap().  A nonzero return value is passed back up by the
+ * caller instead of continuing with the next mapping.
+ */
+typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *);
+
+/* Run a getfsmap query described by @head, reporting through @formatter. */
+int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head,
+ xfs_fsmap_format_t formatter, void *arg);
+
+#endif /* __XFS_FSMAP_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3531f8f72fa5..f61c84f8e31a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -262,6 +262,22 @@ xfs_inode_clear_reclaim_tag(
xfs_perag_clear_reclaim_tag(pag);
}
+/*
+ * Sleep, uninterruptibly, until XFS_INEW is no longer set in ip->i_flags.
+ *
+ * Waiters queue on the bit waitqueue for __XFS_INEW_BIT of ip->i_flags; the
+ * matching wakeups are the wake_up_bit() calls made on that same bit after
+ * XFS_INEW has been cleared.
+ */
+static void
+xfs_inew_wait(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
+
+ do {
+ /* Queue ourselves first, then test the flag; re-test after every wakeup. */
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (!xfs_iflags_test(ip, XFS_INEW))
+ break;
+ schedule();
+ } while (true);
+ finish_wait(wq, &wait.wait);
+}
+
/*
* When we recycle a reclaimable inode, we need to re-initialise the VFS inode
* part of the structure. This is made more complex by the fact we store
@@ -366,14 +382,17 @@ xfs_iget_cache_hit(
error = xfs_reinit_inode(mp, inode);
if (error) {
+ bool wake;
/*
* Re-initializing the inode failed, and we are in deep
* trouble. Try to re-add it to the reclaim list.
*/
rcu_read_lock();
spin_lock(&ip->i_flags_lock);
-
+ wake = !!__xfs_iflags_test(ip, XFS_INEW);
ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+ if (wake)
+ wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
trace_xfs_iget_reclaim_fail(ip);
goto out_error;
@@ -623,9 +642,11 @@ out_error_or_again:
STATIC int
xfs_inode_ag_walk_grab(
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ int flags)
{
struct inode *inode = VFS_I(ip);
+ bool newinos = !!(flags & XFS_AGITER_INEW_WAIT);
ASSERT(rcu_read_lock_held());
@@ -643,7 +664,8 @@ xfs_inode_ag_walk_grab(
goto out_unlock_noent;
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
- if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+ if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
+ __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
goto out_unlock_noent;
spin_unlock(&ip->i_flags_lock);
@@ -671,7 +693,8 @@ xfs_inode_ag_walk(
void *args),
int flags,
void *args,
- int tag)
+ int tag,
+ int iter_flags)
{
uint32_t first_index;
int last_error = 0;
@@ -713,7 +736,7 @@ restart:
for (i = 0; i < nr_found; i++) {
struct xfs_inode *ip = batch[i];
- if (done || xfs_inode_ag_walk_grab(ip))
+ if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
batch[i] = NULL;
/*
@@ -741,6 +764,9 @@ restart:
for (i = 0; i < nr_found; i++) {
if (!batch[i])
continue;
+ if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
+ xfs_iflags_test(batch[i], XFS_INEW))
+ xfs_inew_wait(batch[i]);
error = execute(batch[i], flags, args);
IRELE(batch[i]);
if (error == -EAGAIN) {
@@ -820,12 +846,13 @@ xfs_cowblocks_worker(
}
int
-xfs_inode_ag_iterator(
+xfs_inode_ag_iterator_flags(
struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
- void *args)
+ void *args,
+ int iter_flags)
{
struct xfs_perag *pag;
int error = 0;
@@ -835,7 +862,8 @@ xfs_inode_ag_iterator(
ag = 0;
while ((pag = xfs_perag_get(mp, ag))) {
ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
+ iter_flags);
xfs_perag_put(pag);
if (error) {
last_error = error;
@@ -847,6 +875,17 @@ xfs_inode_ag_iterator(
}
+/*
+ * Thin wrapper around xfs_inode_ag_iterator_flags() that passes 0 for
+ * iter_flags, i.e. without XFS_AGITER_INEW_WAIT.
+ */
int
+xfs_inode_ag_iterator(
+ struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, int flags,
+ void *args),
+ int flags,
+ void *args)
+{
+ return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
+}
+
+int
xfs_inode_ag_iterator_tag(
struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags,
@@ -863,7 +902,8 @@ xfs_inode_ag_iterator_tag(
ag = 0;
while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
+ 0);
xfs_perag_put(pag);
if (error) {
last_error = error;
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 8a7c849b4dea..9183f77958ef 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -48,6 +48,11 @@ struct xfs_eofblocks {
#define XFS_IGET_UNTRUSTED 0x2
#define XFS_IGET_DONTCACHE 0x4
+/*
+ * flags for AG inode iterator
+ */
+#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */
+
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);
@@ -79,6 +84,9 @@ void xfs_cowblocks_worker(struct work_struct *);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args);
+int xfs_inode_ag_iterator_flags(struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
+ int flags, void *args, int iter_flags);
int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args, int tag);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7605d8396596..ec9826c56500 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1906,12 +1906,13 @@ xfs_inactive(
* force is true because we are evicting an inode from the
* cache. Post-eof blocks must be freed, lest we end up with
* broken free space accounting.
+ *
+ * Note: don't bother with iolock here since lockdep complains
+ * about acquiring it in reclaim context. We have the only
+ * reference to the inode at this point anyways.
*/
- if (xfs_can_free_eofblocks(ip, true)) {
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (xfs_can_free_eofblocks(ip, true))
xfs_free_eofblocks(ip);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- }
return;
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 10dcf27b4c85..10e89fcb49d7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -216,7 +216,8 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
#define XFS_ISTALE (1 << 1) /* inode has been staled */
#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
-#define XFS_INEW (1 << 3) /* inode has just been allocated */
+#define __XFS_INEW_BIT 3 /* inode has just been allocated */
+#define XFS_INEW (1 << __XFS_INEW_BIT)
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
@@ -464,6 +465,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
xfs_iflags_clear(ip, XFS_INEW);
barrier();
unlock_new_inode(VFS_I(ip));
+ wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
}
static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d90e7811ccdd..08cb7d1a4a3a 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -731,22 +731,27 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
- struct xfs_log_item *log_items[need_ail];
- int i = 0;
+ bool mlip_changed = false;
+
+ /* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->xa_lock);
for (blip = lip; blip; blip = blip->li_bio_list) {
- iip = INODE_ITEM(blip);
- if (iip->ili_logged &&
- blip->li_lsn == iip->ili_flush_lsn) {
- log_items[i++] = blip;
- }
- ASSERT(i <= need_ail);
+ if (INODE_ITEM(blip)->ili_logged &&
+ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
+ mlip_changed |= xfs_ail_delete_one(ailp, blip);
}
- /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
- xfs_trans_ail_delete_bulk(ailp, log_items, i,
- SHUTDOWN_CORRUPT_INCORE);
- }
+ if (mlip_changed) {
+ if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+ xlog_assign_tail_lsn_locked(ailp->xa_mount);
+ if (list_empty(&ailp->xa_ail))
+ wake_up_all(&ailp->xa_empty);
+ }
+ spin_unlock(&ailp->xa_lock);
+
+ if (mlip_changed)
+ xfs_log_space_wake(ailp->xa_mount);
+ }
/*
* clean up and unlock the flush lock now we are done. We can clear the
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 2fd7fdf5438f..6190697603c9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -41,6 +41,9 @@
#include "xfs_trans.h"
#include "xfs_pnfs.h"
#include "xfs_acl.h"
+#include "xfs_btree.h"
+#include <linux/fsmap.h>
+#include "xfs_fsmap.h"
#include <linux/capability.h>
#include <linux/cred.h>
@@ -1543,10 +1546,11 @@ xfs_ioc_getbmap(
unsigned int cmd,
void __user *arg)
{
- struct getbmapx bmx;
+ struct getbmapx bmx = { 0 };
int error;
- if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
+ /* struct getbmap is a strict subset of struct getbmapx. */
+ if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags)))
return -EFAULT;
if (bmx.bmv_count < 2)
@@ -1608,6 +1612,84 @@ xfs_ioc_getbmapx(
return 0;
}
+/* State shared between xfs_ioc_getfsmap() and its formatter callback. */
+struct getfsmap_info {
+ struct xfs_mount *mp; /* mount being queried (used for tracing) */
+ struct fsmap_head __user *data; /* user's header; records go in fmh_recs */
+ unsigned int idx; /* next fmh_recs slot to fill */
+ __u32 last_flags; /* fmr_flags of the most recent record */
+};
+
+/*
+ * Formatter callback for FS_IOC_GETFSMAP: convert one internal mapping to
+ * the userspace struct fsmap and copy it into the next free slot of the
+ * caller's fmh_recs array.  The mapping's flags are remembered in
+ * info->last_flags for the ioctl handler.  Returns 0, or -EFAULT if the
+ * copy to userspace fails.
+ */
+STATIC int
+xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv)
+{
+	struct getfsmap_info	*info = priv;
+	struct fsmap __user	*slot;
+	struct fsmap		fm;
+
+	trace_xfs_getfsmap_mapping(info->mp, xfm);
+
+	info->last_flags = xfm->fmr_flags;
+	xfs_fsmap_from_internal(&fm, xfm);
+
+	/* The slot index advances even if the copy below faults. */
+	slot = &info->data->fmh_recs[info->idx++];
+	if (copy_to_user(slot, &fm, sizeof(fm)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * FS_IOC_GETFSMAP handler.
+ *
+ * Copies the struct fsmap_head in from userspace, rejects it with -EINVAL
+ * if any reserved field in the header or in either key is nonzero, converts
+ * it to the internal form and runs xfs_getfsmap() with
+ * xfs_getfsmap_format() writing records straight into the user's fmh_recs
+ * array.  On success the header is copied back with fmh_entries and
+ * fmh_oflags updated.
+ *
+ * Returns 0, -EFAULT if a user copy fails, -EINVAL as above, or the error
+ * from xfs_getfsmap().  When xfs_getfsmap() fails, the header is not copied
+ * back even though some records may already have been written.
+ */
+STATIC int
+xfs_ioc_getfsmap(
+ struct xfs_inode *ip,
+ struct fsmap_head __user *arg)
+{
+ struct getfsmap_info info = { NULL };
+ struct xfs_fsmap_head xhead = {0};
+ struct fsmap_head head;
+ bool aborted = false;
+ int error;
+
+ if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
+ return -EFAULT;
+ /* Reserved fields must be zero in the header and in both keys. */
+ if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
+ memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
+ sizeof(head.fmh_keys[0].fmr_reserved)) ||
+ memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
+ sizeof(head.fmh_keys[1].fmr_reserved)))
+ return -EINVAL;
+
+ xhead.fmh_iflags = head.fmh_iflags;
+ xhead.fmh_count = head.fmh_count;
+ xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
+ xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
+
+ trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+ trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
+
+ info.mp = ip->i_mount;
+ info.data = arg;
+ error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
+ /* An aborted query is not a failure; it only suppresses FMR_OF_LAST. */
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ error = 0;
+ aborted = true;
+ } else if (error)
+ return error;
+
+ /* If we didn't abort, set the "last" flag in the last fmx */
+ if (!aborted && info.idx) {
+ /* Rewrite only the fmr_flags field of the last record copied out. */
+ info.last_flags |= FMR_OF_LAST;
+ if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags,
+ &info.last_flags, sizeof(info.last_flags)))
+ return -EFAULT;
+ }
+
+ /* copy back header */
+ head.fmh_entries = xhead.fmh_entries;
+ head.fmh_oflags = xhead.fmh_oflags;
+ if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
+ return -EFAULT;
+
+ return 0;
+}
+
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
@@ -1788,6 +1870,9 @@ xfs_file_ioctl(
case XFS_IOC_GETBMAPX:
return xfs_ioc_getbmapx(ip, arg);
+ case FS_IOC_GETFSMAP:
+ return xfs_ioc_getfsmap(ip, arg);
+
case XFS_IOC_FD_TO_HANDLE:
case XFS_IOC_PATH_TO_HANDLE:
case XFS_IOC_PATH_TO_FSHANDLE: {
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 7c49938c5aed..fa0bc4d46065 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -20,6 +20,7 @@
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/fsmap.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
@@ -554,6 +555,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_GOINGDOWN:
case XFS_IOC_ERROR_INJECTION:
case XFS_IOC_ERROR_CLEARALL:
+ case FS_IOC_GETFSMAP:
return xfs_file_ioctl(filp, cmd, p);
#ifndef BROKEN_X86_ALIGNMENT
/* These are handled fine if no alignment issues */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 288ee5b840d7..a63f61c256bd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -240,7 +240,7 @@ xfs_iomap_write_direct(
*/
if (IS_DAX(VFS_I(ip))) {
bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
- if (ISUNWRITTEN(imap)) {
+ if (imap->br_state == XFS_EXT_UNWRITTEN) {
tflags |= XFS_TRANS_RESERVE;
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
}
@@ -945,7 +945,7 @@ static inline bool imap_needs_alloc(struct inode *inode,
return !nimaps ||
imap->br_startblock == HOLESTARTBLOCK ||
imap->br_startblock == DELAYSTARTBLOCK ||
- (IS_DAX(inode) && ISUNWRITTEN(imap));
+ (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
}
static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
@@ -976,6 +976,7 @@ xfs_file_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false, trimmed = false;
unsigned lockmode;
+ struct block_device *bdev;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1063,6 +1064,14 @@ xfs_file_iomap_begin(
}
xfs_bmbt_to_iomap(ip, iomap, &imap);
+
+ /* optionally associate a dax device with the iomap bdev */
+ bdev = iomap->bdev;
+ if (blk_queue_dax(bdev->bd_queue))
+ iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ else
+ iomap->dax_dev = NULL;
+
if (shared)
iomap->flags |= IOMAP_F_SHARED;
return 0;
@@ -1140,6 +1149,7 @@ xfs_file_iomap_end(
unsigned flags,
struct iomap *iomap)
{
+ put_dax(iomap->dax_dev);
if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
length, written, iomap);
@@ -1170,10 +1180,10 @@ xfs_xattr_iomap_begin(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- lockmode = xfs_ilock_data_map_shared(ip);
+ lockmode = xfs_ilock_attr_map_shared(ip);
/* if there are no attribute fork or extents, return ENOENT */
- if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+ if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
error = -ENOENT;
goto out_unlock;
}
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 592fdf7111cb..044fb0e15390 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -212,88 +212,6 @@ static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
#define xfs_stack_trace() dump_stack()
-
-/* Move the kernel do_div definition off to one side */
-
-#if defined __i386__
-/* For ia32 we need to pull some tricks to get past various versions
- * of the compiler which do not like us using do_div in the middle
- * of large functions.
- */
-static inline __u32 xfs_do_div(void *a, __u32 b, int n)
-{
- __u32 mod;
-
- switch (n) {
- case 4:
- mod = *(__u32 *)a % b;
- *(__u32 *)a = *(__u32 *)a / b;
- return mod;
- case 8:
- {
- unsigned long __upper, __low, __high, __mod;
- __u64 c = *(__u64 *)a;
- __upper = __high = c >> 32;
- __low = c;
- if (__high) {
- __upper = __high % (b);
- __high = __high / (b);
- }
- asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
- asm("":"=A" (c):"a" (__low),"d" (__high));
- *(__u64 *)a = c;
- return __mod;
- }
- }
-
- /* NOTREACHED */
- return 0;
-}
-
-/* Side effect free 64 bit mod operation */
-static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
-{
- switch (n) {
- case 4:
- return *(__u32 *)a % b;
- case 8:
- {
- unsigned long __upper, __low, __high, __mod;
- __u64 c = *(__u64 *)a;
- __upper = __high = c >> 32;
- __low = c;
- if (__high) {
- __upper = __high % (b);
- __high = __high / (b);
- }
- asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
- asm("":"=A" (c):"a" (__low),"d" (__high));
- return __mod;
- }
- }
-
- /* NOTREACHED */
- return 0;
-}
-#else
-static inline __u32 xfs_do_div(void *a, __u32 b, int n)
-{
- __u32 mod;
-
- switch (n) {
- case 4:
- mod = *(__u32 *)a % b;
- *(__u32 *)a = *(__u32 *)a / b;
- return mod;
- case 8:
- mod = do_div(*(__u64 *)a, b);
- return mod;
- }
-
- /* NOTREACHED */
- return 0;
-}
-
/* Side effect free 64 bit mod operation */
static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
{
@@ -310,10 +228,7 @@ static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
/* NOTREACHED */
return 0;
}
-#endif
-#undef do_div
-#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a))
#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a))
static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b1469f0a91a6..3731f13f63e9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1293,7 +1293,7 @@ void
xfs_log_work_queue(
struct xfs_mount *mp)
{
- queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
+ queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work,
msecs_to_jiffies(xfs_syncd_centisecs * 10));
}
@@ -1852,7 +1852,7 @@ xlog_sync(
*/
if (log->l_badcrc_factor &&
(prandom_u32() % log->l_badcrc_factor == 0)) {
- iclog->ic_header.h_crc &= 0xAAAAAAAA;
+ iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
iclog->ic_state |= XLOG_STATE_IOABORT;
xfs_warn(log->l_mp,
"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 4a98762ec8b4..cd0b077deb35 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3796,7 +3796,7 @@ xlog_recover_bud_pass2(
* This routine is called when an inode create format structure is found in a
* committed transaction in the log. It's purpose is to initialise the inodes
* being allocated on disk. This requires us to get inode cluster buffers that
- * match the range to be intialised, stamped with inode templates and written
+ * match the range to be initialised, stamped with inode templates and written
* by delayed write so that subsequent modifications will hit the cached buffer
* and only need writing out at the end of recovery.
*/
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 688ebff1f663..2eaf81859166 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -73,6 +73,10 @@ xfs_uuid_mount(
uuid_t *uuid = &mp->m_sb.sb_uuid;
int hole, i;
+ /* Publish UUID in struct super_block */
+ BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t));
+ memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t));
+
if (mp->m_flags & XFS_MOUNT_NOUUID)
return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 6db6fd6b82b0..9fa312a41c93 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -183,6 +183,7 @@ typedef struct xfs_mount {
struct workqueue_struct *m_reclaim_workqueue;
struct workqueue_struct *m_log_workqueue;
struct workqueue_struct *m_eofblocks_workqueue;
+ struct workqueue_struct *m_sync_workqueue;
/*
* Generation of the filesysyem layout. This is incremented by each
@@ -312,7 +313,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
static inline xfs_agnumber_t
xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
{
- xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+ xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
do_div(ld, mp->m_sb.sb_agblocks);
return (xfs_agnumber_t) ld;
}
@@ -320,7 +321,7 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
static inline xfs_agblock_t
xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
{
- xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+ xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b669b123287b..5fe6e70b88ef 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -851,8 +851,8 @@ xfs_qm_reset_dqcounts(
* started afresh by xfs_qm_quotacheck.
*/
#ifdef DEBUG
- j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
- do_div(j, sizeof(xfs_dqblk_t));
+ j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) /
+ sizeof(xfs_dqblk_t);
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
dqb = bp->b_addr;
@@ -1384,12 +1384,7 @@ xfs_qm_quotacheck(
mp->m_qflags |= flags;
error_return:
- while (!list_empty(&buffer_list)) {
- struct xfs_buf *bp =
- list_first_entry(&buffer_list, struct xfs_buf, b_list);
- list_del_init(&bp->b_list);
- xfs_buf_relse(bp);
- }
+ xfs_buf_delwri_cancel(&buffer_list);
if (error) {
xfs_warn(mp,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 475a3882a81f..9cb5c381b01c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
+ xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL,
+ XFS_AGITER_INEW_WAIT);
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 6e4c7446c3d4..96fe209b5eb6 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -221,6 +221,7 @@ void
xfs_cui_release(
struct xfs_cui_log_item *cuip)
{
+ ASSERT(atomic_read(&cuip->cui_refcount) > 0);
if (atomic_dec_and_test(&cuip->cui_refcount)) {
xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_cui_item_free(cuip);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 4a84c5ea266d..ffe6fe7a7eb5 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -206,11 +206,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_reflink_inode(ip) ||
- ISUNWRITTEN(irec) ||
- irec->br_startblock == HOLESTARTBLOCK ||
- irec->br_startblock == DELAYSTARTBLOCK ||
- isnullstartblock(irec->br_startblock)) {
+ if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
*shared = false;
return 0;
}
@@ -709,8 +705,22 @@ xfs_reflink_end_cow(
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
- /* Start a rolling transaction to switch the mappings */
- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ /*
+ * Start a rolling transaction to switch the mappings. We're
+ * unlikely ever to have to remap 16T worth of single-block
+ * extents, so just cap the worst case extent count to 2^32-1.
+ * Stick a warning in just in case, and avoid 64-bit division.
+ */
+ BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
+ if (end_fsb - offset_fsb > UINT_MAX) {
+ error = -EFSCORRUPTED;
+ xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
+ ASSERT(0);
+ goto out;
+ }
+ resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
+ (unsigned int)(end_fsb - offset_fsb),
+ XFS_DATA_FORK);
error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
resblks, 0, 0, &tp);
if (error)
@@ -1045,12 +1055,12 @@ xfs_reflink_remap_extent(
xfs_off_t new_isize)
{
struct xfs_mount *mp = ip->i_mount;
+ bool real_extent = xfs_bmap_is_real_extent(irec);
struct xfs_trans *tp;
xfs_fsblock_t firstfsb;
unsigned int resblks;
struct xfs_defer_ops dfops;
struct xfs_bmbt_irec uirec;
- bool real_extent;
xfs_filblks_t rlen;
xfs_filblks_t unmap_len;
xfs_off_t newlen;
@@ -1059,11 +1069,6 @@ xfs_reflink_remap_extent(
unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
- /* Only remap normal extents. */
- real_extent = (irec->br_startblock != HOLESTARTBLOCK &&
- irec->br_startblock != DELAYSTARTBLOCK &&
- !ISUNWRITTEN(irec));
-
/* No reflinking if we're low on space */
if (real_extent) {
error = xfs_reflink_ag_has_free_space(mp,
@@ -1359,9 +1364,7 @@ xfs_reflink_dirty_extents(
goto out;
if (nmaps == 0)
break;
- if (map[0].br_startblock == HOLESTARTBLOCK ||
- map[0].br_startblock == DELAYSTARTBLOCK ||
- ISUNWRITTEN(&map[0]))
+ if (!xfs_bmap_is_real_extent(&map[0]))
goto next;
map[1] = map[0];
@@ -1435,9 +1438,7 @@ xfs_reflink_clear_inode_flag(
return error;
if (nmaps == 0)
break;
- if (map.br_startblock == HOLESTARTBLOCK ||
- map.br_startblock == DELAYSTARTBLOCK ||
- ISUNWRITTEN(&map))
+ if (!xfs_bmap_is_real_extent(&map))
goto next;
agno = XFS_FSB_TO_AGNO(mp, map.br_startblock);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 73c827831551..f3b139c9aa16 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -243,6 +243,7 @@ void
xfs_rui_release(
struct xfs_rui_log_item *ruip)
{
+ ASSERT(atomic_read(&ruip->rui_refcount) > 0);
if (atomic_dec_and_test(&ruip->rui_refcount)) {
xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_rui_item_free(ruip);
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 51dd3c726608..f13133e6f19f 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,6 +23,16 @@
struct xfs_mount;
struct xfs_trans;
+struct xfs_rtalloc_rec {
+ xfs_rtblock_t ar_startblock;
+ xfs_rtblock_t ar_blockcount;
+};
+
+typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *rec,
+ void *priv);
+
#ifdef CONFIG_XFS_RT
/*
* Function prototypes for exported functions.
@@ -118,13 +128,21 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-
-
+int xfs_rtalloc_query_range(struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *low_rec,
+ struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
+int xfs_rtalloc_query_all(struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
# define xfs_growfs_rt(mp,in) (ENOSYS)
+# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
+# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 685c042a120f..47d239dcf3f4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -877,8 +877,15 @@ xfs_init_mount_workqueues(
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
+ mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
+ mp->m_fsname);
+ if (!mp->m_sync_workqueue)
+ goto out_destroy_eofb;
+
return 0;
+out_destroy_eofb:
+ destroy_workqueue(mp->m_eofblocks_workqueue);
out_destroy_log:
destroy_workqueue(mp->m_log_workqueue);
out_destroy_reclaim:
@@ -899,6 +906,7 @@ STATIC void
xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
+ destroy_workqueue(mp->m_sync_workqueue);
destroy_workqueue(mp->m_eofblocks_workqueue);
destroy_workqueue(mp->m_log_workqueue);
destroy_workqueue(mp->m_reclaim_workqueue);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 7f17ae6d709a..5d95fe348294 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -47,6 +47,7 @@
#include "xfs_inode_item.h"
#include "xfs_bmap_btree.h"
#include "xfs_filestream.h"
+#include "xfs_fsmap.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 383ac227ce2c..7c5a16528d8b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -40,6 +40,8 @@ struct xfs_inode_log_format;
struct xfs_bmbt_irec;
struct xfs_btree_cur;
struct xfs_refcount_irec;
+struct xfs_fsmap;
+struct xfs_rmap_irec;
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -2190,7 +2192,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+ TP_printk("dev %d:%d agno %u agbno %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2253,8 +2255,8 @@ DECLARE_EVENT_CLASS(xfs_defer_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(void *, dop)
- __field(bool, committed)
- __field(bool, low)
+ __field(char, committed)
+ __field(char, low)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
@@ -2262,7 +2264,7 @@ DECLARE_EVENT_CLASS(xfs_defer_class,
__entry->committed = dop->dop_committed;
__entry->low = dop->dop_low;
),
- TP_printk("dev %d:%d ops %p committed %d low %d\n",
+ TP_printk("dev %d:%d ops %p committed %d low %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->dop,
__entry->committed,
@@ -2279,8 +2281,8 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(void *, dop)
- __field(bool, committed)
- __field(bool, low)
+ __field(char, committed)
+ __field(char, low)
__field(int, error)
),
TP_fast_assign(
@@ -2290,7 +2292,7 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class,
__entry->low = dop->dop_low;
__entry->error = error;
),
- TP_printk("dev %d:%d ops %p committed %d low %d err %d\n",
+ TP_printk("dev %d:%d ops %p committed %d low %d err %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->dop,
__entry->committed,
@@ -2309,7 +2311,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
__field(dev_t, dev)
__field(int, type)
__field(void *, intent)
- __field(bool, committed)
+ __field(char, committed)
__field(int, nr)
),
TP_fast_assign(
@@ -2319,7 +2321,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
__entry->committed = dfp->dfp_done != NULL;
__entry->nr = dfp->dfp_count;
),
- TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n",
+ TP_printk("dev %d:%d optype %d intent %p committed %d nr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->type,
__entry->intent,
@@ -2614,7 +2616,8 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class,
__entry->asked = r ? r->ar_asked : 0;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n",
+ TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u "
+ "resv %u ask %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->resv,
@@ -2667,7 +2670,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
__entry->agbno = agbno;
__entry->dir = dir;
),
- TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n",
+ TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2700,7 +2703,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n",
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
@@ -2735,7 +2738,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
__entry->refcount = irec->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n",
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
@@ -2776,7 +2779,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
__entry->i2_refcount = i2->rc_refcount;
),
TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u\n",
+ "agbno %u len %u refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -2822,7 +2825,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__entry->agbno = agbno;
),
TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u @ agbno %u\n",
+ "agbno %u len %u refcount %u @ agbno %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -2875,7 +2878,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
),
TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
"agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u\n",
+ "agbno %u len %u refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -3001,31 +3004,6 @@ DEFINE_EVENT(xfs_inode_error_class, name, \
unsigned long caller_ip), \
TP_ARGS(ip, error, caller_ip))
-/* reflink allocator */
-TRACE_EVENT(xfs_bmap_remap_alloc,
- TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno,
- xfs_extlen_t len),
- TP_ARGS(ip, fsbno, len),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fsblock_t, fsbno)
- __field(xfs_extlen_t, len)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->fsbno = fsbno;
- __entry->len = len;
- ),
- TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->fsbno,
- __entry->len)
-);
-DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error);
-
/* reflink tracepoint classes */
/* two-file io tracepoint class */
@@ -3227,7 +3205,7 @@ TRACE_EVENT(xfs_ioctl_clone,
),
TP_printk("dev %d:%d "
"ino 0x%lx isize 0x%llx -> "
- "ino 0x%lx isize 0x%llx\n",
+ "ino 0x%lx isize 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->src_ino,
__entry->src_isize,
@@ -3267,6 +3245,88 @@ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
+/* fsmap traces */
+DECLARE_EVENT_CLASS(xfs_fsmap_class,
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno,
+ struct xfs_rmap_irec *rmap),
+ TP_ARGS(mp, keydev, agno, rmap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, keydev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_fsblock_t, bno)
+ __field(xfs_filblks_t, len)
+ __field(__uint64_t, owner)
+ __field(__uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->keydev = new_decode_dev(keydev);
+ __entry->agno = agno;
+ __entry->bno = rmap->rm_startblock;
+ __entry->len = rmap->rm_blockcount;
+ __entry->owner = rmap->rm_owner;
+ __entry->offset = rmap->rm_offset;
+ __entry->flags = rmap->rm_flags;
+ ),
+ TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->keydev), MINOR(__entry->keydev),
+ __entry->agno,
+ __entry->bno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+)
+#define DEFINE_FSMAP_EVENT(name) \
+DEFINE_EVENT(xfs_fsmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \
+ struct xfs_rmap_irec *rmap), \
+ TP_ARGS(mp, keydev, agno, rmap))
+DEFINE_FSMAP_EVENT(xfs_fsmap_low_key);
+DEFINE_FSMAP_EVENT(xfs_fsmap_high_key);
+DEFINE_FSMAP_EVENT(xfs_fsmap_mapping);
+
+DECLARE_EVENT_CLASS(xfs_getfsmap_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap),
+ TP_ARGS(mp, fsmap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, keydev)
+ __field(xfs_daddr_t, block)
+ __field(xfs_daddr_t, len)
+ __field(__uint64_t, owner)
+ __field(__uint64_t, offset)
+ __field(__uint64_t, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->keydev = new_decode_dev(fsmap->fmr_device);
+ __entry->block = fsmap->fmr_physical;
+ __entry->len = fsmap->fmr_length;
+ __entry->owner = fsmap->fmr_owner;
+ __entry->offset = fsmap->fmr_offset;
+ __entry->flags = fsmap->fmr_flags;
+ ),
+ TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->keydev), MINOR(__entry->keydev),
+ __entry->block,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+)
+#define DEFINE_GETFSMAP_EVENT(name) \
+DEFINE_EVENT(xfs_getfsmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), \
+ TP_ARGS(mp, fsmap))
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f5969c8274fc..2011620008de 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -263,6 +263,28 @@ xfs_trans_alloc(
}
/*
+ * Create an empty transaction with no reservation. This is a defensive
+ * mechanism for routines that query metadata without actually modifying
+ * them -- if the metadata being queried is somehow cross-linked (think a
+ * btree block pointer that points higher in the tree), we risk deadlock.
+ * However, blocks grabbed as part of a transaction can be re-grabbed.
+ * The verifiers will notice the corrupt block and the operation will fail
+ * back to userspace without deadlocking.
+ *
+ * Note the zero-length reservation; this transaction MUST be cancelled
+ * without any dirty data.
+ */
+int
+xfs_trans_alloc_empty(
+ struct xfs_mount *mp,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans_res resv = {0};
+
+ return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp);
+}
+
+/*
* Record the indicated change to the given field for application
* to the file system's superblock when the transaction commits.
* For now, just store the change in the transaction structure.
@@ -1012,17 +1034,14 @@ xfs_trans_cancel(
* chunk we've been working on and get a new transaction to continue.
*/
int
-__xfs_trans_roll(
+xfs_trans_roll(
struct xfs_trans **tpp,
- struct xfs_inode *dp,
- int *committed)
+ struct xfs_inode *dp)
{
struct xfs_trans *trans;
struct xfs_trans_res tres;
int error;
- *committed = 0;
-
/*
* Ensure that the inode is always logged.
*/
@@ -1048,7 +1067,6 @@ __xfs_trans_roll(
if (error)
return error;
- *committed = 1;
trans = *tpp;
/*
@@ -1071,12 +1089,3 @@ __xfs_trans_roll(
xfs_trans_ijoin(trans, dp, 0);
return 0;
}
-
-int
-xfs_trans_roll(
- struct xfs_trans **tpp,
- struct xfs_inode *dp)
-{
- int committed;
- return __xfs_trans_roll(tpp, dp, &committed);
-}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 1646f659b60f..a07acbf0bd8a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -158,6 +158,8 @@ typedef struct xfs_trans {
int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
uint blocks, uint rtextents, uint flags,
struct xfs_trans **tpp);
+int xfs_trans_alloc_empty(struct xfs_mount *mp,
+ struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp,
@@ -226,7 +228,6 @@ int xfs_trans_free_extent(struct xfs_trans *,
struct xfs_efd_log_item *, xfs_fsblock_t,
xfs_extlen_t, struct xfs_owner_info *);
int xfs_trans_commit(struct xfs_trans *);
-int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
void xfs_trans_cancel(xfs_trans_t *);
int xfs_trans_ail_init(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d6c9c3e9e02b..9056c0f34a3c 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk(
}
}
-/*
- * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
+bool
+xfs_ail_delete_one(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_log_item *mlip = xfs_ail_min(ailp);
+
+ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+ xfs_ail_delete(ailp, lip);
+ lip->li_flags &= ~XFS_LI_IN_AIL;
+ lip->li_lsn = 0;
+
+ return mlip == lip;
+}
+
+/**
+ * Remove a log items from the AIL
*
* @xfs_trans_ail_delete_bulk takes an array of log items that all need to
* removed from the AIL. The caller is already holding the AIL lock, and done
@@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk(
* before returning.
*/
void
-xfs_trans_ail_delete_bulk(
+xfs_trans_ail_delete(
struct xfs_ail *ailp,
- struct xfs_log_item **log_items,
- int nr_items,
+ struct xfs_log_item *lip,
int shutdown_type) __releases(ailp->xa_lock)
{
- xfs_log_item_t *mlip;
- int mlip_changed = 0;
- int i;
+ struct xfs_mount *mp = ailp->xa_mount;
+ bool mlip_changed;
- mlip = xfs_ail_min(ailp);
-
- for (i = 0; i < nr_items; i++) {
- struct xfs_log_item *lip = log_items[i];
- if (!(lip->li_flags & XFS_LI_IN_AIL)) {
- struct xfs_mount *mp = ailp->xa_mount;
-
- spin_unlock(&ailp->xa_lock);
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
- "%s: attempting to delete a log item that is not in the AIL",
- __func__);
- xfs_force_shutdown(mp, shutdown_type);
- }
- return;
+ if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+ spin_unlock(&ailp->xa_lock);
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+ "%s: attempting to delete a log item that is not in the AIL",
+ __func__);
+ xfs_force_shutdown(mp, shutdown_type);
}
-
- trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
- xfs_ail_delete(ailp, lip);
- lip->li_flags &= ~XFS_LI_IN_AIL;
- lip->li_lsn = 0;
- if (mlip == lip)
- mlip_changed = 1;
+ return;
}
+ mlip_changed = xfs_ail_delete_one(ailp, lip);
if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
- xlog_assign_tail_lsn_locked(ailp->xa_mount);
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xlog_assign_tail_lsn_locked(mp);
if (list_empty(&ailp->xa_ail))
wake_up_all(&ailp->xa_empty);
- spin_unlock(&ailp->xa_lock);
+ }
+ spin_unlock(&ailp->xa_lock);
+ if (mlip_changed)
xfs_log_space_wake(ailp->xa_mount);
- } else {
- spin_unlock(&ailp->xa_lock);
- }
}
int
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 49931b72da8a..d91706c56c63 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -106,18 +106,9 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
- struct xfs_log_item **log_items, int nr_items,
- int shutdown_type)
- __releases(ailp->xa_lock);
-static inline void
-xfs_trans_ail_delete(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip,
- int shutdown_type) __releases(ailp->xa_lock)
-{
- xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
-}
+bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
+ int shutdown_type) __releases(ailp->xa_lock);
static inline void
xfs_trans_ail_remove(