diff options
Diffstat (limited to 'fs')
189 files changed, 7126 insertions, 3251 deletions
diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 2f8bab390d13..773749be8290 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -7,7 +7,7 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/amigaffs.h> +#include "amigaffs.h" #include <linux/mutex.h> #include <linux/workqueue.h> @@ -173,7 +173,7 @@ extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); extern int affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); -extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry, +extern int affs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h new file mode 100644 index 000000000000..43b41c06aa37 --- /dev/null +++ b/fs/affs/amigaffs.h @@ -0,0 +1,144 @@ +#ifndef AMIGAFFS_H +#define AMIGAFFS_H + +#include <linux/types.h> +#include <asm/byteorder.h> + +#define FS_OFS 0x444F5300 +#define FS_FFS 0x444F5301 +#define FS_INTLOFS 0x444F5302 +#define FS_INTLFFS 0x444F5303 +#define FS_DCOFS 0x444F5304 +#define FS_DCFFS 0x444F5305 +#define MUFS_FS 0x6d754653 /* 'muFS' */ +#define MUFS_OFS 0x6d754600 /* 'muF\0' */ +#define MUFS_FFS 0x6d754601 /* 'muF\1' */ +#define MUFS_INTLOFS 0x6d754602 /* 'muF\2' */ +#define MUFS_INTLFFS 0x6d754603 /* 'muF\3' */ +#define MUFS_DCOFS 0x6d754604 /* 'muF\4' */ +#define MUFS_DCFFS 0x6d754605 /* 'muF\5' */ + +#define T_SHORT 2 +#define T_LIST 16 +#define T_DATA 8 + +#define ST_LINKFILE -4 +#define ST_FILE -3 +#define ST_ROOT 1 +#define ST_USERDIR 2 +#define ST_SOFTLINK 3 +#define ST_LINKDIR 4 + +#define AFFS_ROOT_BMAPS 25 + +struct affs_date { + __be32 days; + __be32 mins; + __be32 ticks; +}; + +struct affs_short_date { + __be16 days; + __be16 mins; + __be16 ticks; +}; + +struct affs_root_head { + __be32 ptype; + __be32 spare1; + __be32 spare2; + __be32 hash_size; + __be32 spare3; + __be32 checksum; + __be32 hashtable[1]; +}; + +struct affs_root_tail { + __be32 bm_flag; + __be32 bm_blk[AFFS_ROOT_BMAPS]; + __be32 bm_ext; + struct affs_date root_change; + u8 disk_name[32]; + __be32 spare1; + __be32 spare2; + struct affs_date disk_change; + struct affs_date disk_create; + __be32 spare3; + __be32 spare4; + __be32 dcache; + __be32 stype; +}; + +struct affs_head { + __be32 ptype; + __be32 key; + __be32 block_count; + __be32 spare1; + __be32 first_data; + __be32 checksum; + __be32 table[1]; +}; + +struct affs_tail { + __be32 spare1; + __be16 uid; + __be16 gid; + __be32 protect; + __be32 size; + u8 comment[92]; + struct affs_date change; + u8 name[32]; + __be32 spare2; + __be32 original; + __be32 link_chain; + __be32 spare[5]; + __be32 hash_chain; + __be32 parent; + __be32 extension; + __be32 stype; +}; + +struct slink_front +{ + __be32 ptype; + __be32 key; + __be32 spare1[3]; + __be32 checksum; + u8 symname[1]; /* depends on block size */ +}; + +struct affs_data_head +{ + __be32 ptype; + __be32 key; + __be32 sequence; + __be32 size; + __be32 next; + __be32 checksum; + u8 data[1]; /* depends on block size */ +}; + +/* Permission bits */ + +#define FIBF_OTR_READ 0x8000 +#define FIBF_OTR_WRITE 0x4000 +#define FIBF_OTR_EXECUTE 0x2000 +#define FIBF_OTR_DELETE 0x1000 +#define FIBF_GRP_READ 0x0800 +#define FIBF_GRP_WRITE 0x0400 +#define FIBF_GRP_EXECUTE 0x0200 +#define FIBF_GRP_DELETE 0x0100 + +#define FIBF_HIDDEN 0x0080 +#define FIBF_SCRIPT 0x0040 +#define FIBF_PURE 0x0020 /* no use under linux */ +#define FIBF_ARCHIVED 0x0010 /* never set, always cleared on write */ +#define FIBF_NOREAD 0x0008 /* 0 means allowed */ +#define FIBF_NOWRITE 0x0004 /* 0 means allowed */ +#define FIBF_NOEXECUTE 0x0002 /* 0 means allowed, ignored under linux */ +#define FIBF_NODELETE 0x0001 /* 0 means allowed */ + +#define FIBF_OWNER 0x000F /* Bits pertaining to owner */ +#define FIBF_MASK 0xEE0E /* Bits modified by Linux */ + +#endif diff --git a/fs/affs/dir.c b/fs/affs/dir.c index f1e7294381c5..591ecd7f3063 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -35,7 +35,7 @@ const struct inode_operations affs_dir_inode_operations = { .symlink = affs_symlink, .mkdir = affs_mkdir, .rmdir = affs_rmdir, - .rename = affs_rename, + .rename = affs_rename2, .setattr = affs_notify_change, }; diff --git a/fs/affs/file.c b/fs/affs/file.c index 0deec9cc2362..196ee7f6fdc4 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -499,7 +499,7 @@ affs_getemptyblk_ino(struct inode *inode, int block) } static int -affs_do_readpage_ofs(struct page *page, unsigned to) +affs_do_readpage_ofs(struct page *page, unsigned to, int create) { struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; @@ -518,7 +518,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to) boff = tmp % bsize; while (pos < to) { - bh = affs_bread_ino(inode, bidx, 0); + bh = affs_bread_ino(inode, bidx, create); if (IS_ERR(bh)) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); @@ -620,7 +620,7 @@ affs_readpage_ofs(struct file *file, struct page *page) memset(page_address(page) + to, 0, PAGE_SIZE - to); } - err = affs_do_readpage_ofs(page, to); + err = affs_do_readpage_ofs(page, to, 0); if (!err) SetPageUptodate(page); unlock_page(page); @@ -657,7 +657,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping return 0; /* XXX: inefficient but safe in the face of short writes */ - err = affs_do_readpage_ofs(page, PAGE_SIZE); + err = affs_do_readpage_ofs(page, PAGE_SIZE, 1); if (err) { unlock_page(page); put_page(page); @@ -679,7 +679,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, int written; from = pos & (PAGE_SIZE - 1); - to = pos + len; + to = from + len; /* * XXX: not sure if this can handle short copies (len < copied), but * we don't have to, because the page should always be uptodate here, diff --git a/fs/affs/inode.c b/fs/affs/inode.c index abcc59899229..fd4ef3c40e40 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &affs_file_operations; break; case ST_SOFTLINK: + inode->i_size = strlen((char *)AFFS_HEAD(bh)->table); inode->i_mode |= S_IFLNK; inode_nohighmem(inode); inode->i_op = &affs_symlink_inode_operations; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 96dd1d09a273..46d3ace6761d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -365,6 +365,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) symname++; } *p = 0; + inode->i_size = i + 1; mark_buffer_dirty_inode(bh, inode); affs_brelse(bh); mark_inode_dirty(inode); @@ -393,21 +394,14 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) return affs_add_entry(dir, inode, dentry, ST_LINKFILE); } -int +static int affs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) + struct inode *new_dir, struct dentry *new_dentry) { struct super_block *sb = old_dir->i_sb; struct buffer_head *bh = NULL; int retval; - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, - old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); - retval = affs_check_name(new_dentry->d_name.name, new_dentry->d_name.len, affs_nofilenametruncate(old_dentry)); @@ -447,6 +441,76 @@ done: return retval; } +static int +affs_xrename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + + struct super_block *sb = old_dir->i_sb; + struct buffer_head *bh_old = NULL; + struct buffer_head *bh_new = NULL; + int retval; + + bh_old = affs_bread(sb, d_inode(old_dentry)->i_ino); + if (!bh_old) + return -EIO; + + bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino); + if (!bh_new) + return -EIO; + + /* Remove old header from its parent directory. */ + affs_lock_dir(old_dir); + retval = affs_remove_hash(old_dir, bh_old); + affs_unlock_dir(old_dir); + if (retval) + goto done; + + /* Remove new header from its parent directory. */ + affs_lock_dir(new_dir); + retval = affs_remove_hash(new_dir, bh_new); + affs_unlock_dir(new_dir); + if (retval) + goto done; + + /* Insert old into the new directory with the new name. */ + affs_copy_name(AFFS_TAIL(sb, bh_old)->name, new_dentry); + affs_fix_checksum(sb, bh_old); + affs_lock_dir(new_dir); + retval = affs_insert_hash(new_dir, bh_old); + affs_unlock_dir(new_dir); + + /* Insert new into the old directory with the old name. */ + affs_copy_name(AFFS_TAIL(sb, bh_new)->name, old_dentry); + affs_fix_checksum(sb, bh_new); + affs_lock_dir(old_dir); + retval = affs_insert_hash(old_dir, bh_new); + affs_unlock_dir(old_dir); +done: + mark_buffer_dirty_inode(bh_old, new_dir); + mark_buffer_dirty_inode(bh_new, old_dir); + affs_brelse(bh_old); + affs_brelse(bh_new); + return retval; +} + +int affs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, + old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); + + if (flags & RENAME_EXCHANGE) + return affs_xrename(old_dir, old_dentry, new_dir, new_dentry); + + return affs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static struct dentry *affs_get_parent(struct dentry *child) { struct inode *parent; @@ -477,11 +541,6 @@ static struct inode *affs_nfs_get_inode(struct super_block *sb, u64 ino, if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { - iput(inode); - return ERR_PTR(-ESTALE); - } - return inode; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index c500e954debb..63e7c4760bfb 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -58,6 +58,7 @@ static struct dentry *befs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); static struct dentry *befs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); +static struct dentry *befs_get_parent(struct dentry *child); static const struct super_operations befs_sops = { .alloc_inode = befs_alloc_inode, /* allocate a new inode */ @@ -93,6 +94,7 @@ static const struct address_space_operations befs_symlink_aops = { static const struct export_operations befs_export_operations = { .fh_to_dentry = befs_fh_to_dentry, .fh_to_parent = befs_fh_to_parent, + .get_parent = befs_get_parent, }; /* @@ -667,6 +669,19 @@ static struct dentry *befs_fh_to_parent(struct super_block *sb, befs_nfs_get_inode); } +static struct dentry *befs_get_parent(struct dentry *child) +{ + struct inode *parent; + struct befs_inode_info *befs_ino = BEFS_I(d_inode(child)); + + parent = befs_iget(child->d_sb, + (unsigned long)befs_ino->i_parent.start); + if (IS_ERR(parent)) + return ERR_CAST(parent); + + return d_obtain_alias(parent); +} + enum { Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, }; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index bee1a36bc2ec..f4718098ac31 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -818,7 +818,7 @@ static const struct super_operations s_ops = { static int bm_fill_super(struct super_block *sb, void *data, int silent) { int err; - static struct tree_descr bm_files[] = { + static const struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} diff --git a/fs/block_dev.c b/fs/block_dev.c index 0d435c794d76..2a305c1a2d88 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/blkpg.h> #include <linux/magic.h> +#include <linux/dax.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/pagevec.h> @@ -716,50 +717,18 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); -/** - * bdev_direct_access() - Get the address for directly-accessibly memory - * @bdev: The device containing the memory - * @dax: control and output parameters for ->direct_access - * - * If a block device is made up of directly addressable memory, this function - * will tell the caller the PFN and the address of the memory. The address - * may be directly dereferenced within the kernel without the need to call - * ioremap(), kmap() or similar. The PFN is suitable for inserting into - * page tables. - * - * Return: negative errno if an error occurs, otherwise the number of bytes - * accessible at this address. - */ -long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) +int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, + pgoff_t *pgoff) { - sector_t sector = dax->sector; - long avail, size = dax->size; - const struct block_device_operations *ops = bdev->bd_disk->fops; + phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; - /* - * The device driver is allowed to sleep, in order to make the - * memory directly accessible. - */ - might_sleep(); - - if (size < 0) - return size; - if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access) - return -EOPNOTSUPP; - if ((sector + DIV_ROUND_UP(size, 512)) > - part_nr_sects_read(bdev->bd_part)) - return -ERANGE; - sector += get_start_sect(bdev); - if (sector % (PAGE_SIZE / 512)) + if (pgoff) + *pgoff = PHYS_PFN(phys_off); + if (phys_off % PAGE_SIZE || size % PAGE_SIZE) return -EINVAL; - avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); - if (!avail) - return -ERANGE; - if (avail > 0 && avail & ~PAGE_MASK) - return -ENXIO; - return min(avail, size); + return 0; } -EXPORT_SYMBOL_GPL(bdev_direct_access); +EXPORT_SYMBOL(bdev_dax_pgoff); /** * bdev_dax_supported() - Check if the device supports dax for filesystem @@ -773,62 +742,46 @@ EXPORT_SYMBOL_GPL(bdev_direct_access); */ int bdev_dax_supported(struct super_block *sb, int blocksize) { - struct blk_dax_ctl dax = { - .sector = 0, - .size = PAGE_SIZE, - }; - int err; + struct block_device *bdev = sb->s_bdev; + struct dax_device *dax_dev; + pgoff_t pgoff; + int err, id; + void *kaddr; + pfn_t pfn; + long len; if (blocksize != PAGE_SIZE) { vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); return -EINVAL; } - err = bdev_direct_access(sb->s_bdev, &dax); - if (err < 0) { - switch (err) { - case -EOPNOTSUPP: - vfs_msg(sb, KERN_ERR, - "error: device does not support dax"); - break; - case -EINVAL: - vfs_msg(sb, KERN_ERR, - "error: unaligned partition for dax"); - break; - default: - vfs_msg(sb, KERN_ERR, - "error: dax access failed (%d)", err); - } + err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); + if (err) { + vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax"); return err; } - return 0; -} -EXPORT_SYMBOL_GPL(bdev_dax_supported); - -/** - * bdev_dax_capable() - Return if the raw device is capable for dax - * @bdev: The device for raw block device access - */ -bool bdev_dax_capable(struct block_device *bdev) -{ - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - }; + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev) { + vfs_msg(sb, KERN_ERR, "error: device does not support dax"); + return -EOPNOTSUPP; + } - if (!IS_ENABLED(CONFIG_FS_DAX)) - return false; + id = dax_read_lock(); + len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); + dax_read_unlock(id); - dax.sector = 0; - if (bdev_direct_access(bdev, &dax) < 0) - return false; + put_dax(dax_dev); - dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); - if (bdev_direct_access(bdev, &dax) < 0) - return false; + if (len < 1) { + vfs_msg(sb, KERN_ERR, + "error: dax access failed (%ld)", len); + return len < 0 ? len : -EIO; + } - return true; + return 0; } +EXPORT_SYMBOL_GPL(bdev_dax_supported); /* * pseudo-fs diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7dc8844037e0..1c3b6c54d5ee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5392,13 +5392,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); + tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); if (!tmp_buf) { - tmp_buf = vmalloc(fs_info->nodesize); - if (!tmp_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } left_path->search_commit_root = 1; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index dd7fb22a955a..fc0bd8406758 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -167,8 +167,7 @@ static u8 *alloc_bitmap(u32 bitmap_size) if (mem) return mem; - return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL); } int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dabfc7ac48a6..922a66fce401 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3539,12 +3539,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); - if (!buf) { - buf = vmalloc(fs_info->nodesize); - if (!buf) - return ret; - } + buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!buf) + return ret; path = btrfs_alloc_path(); if (!path) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a60d5bfb8a49..3f645cd67b54 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6360,22 +6360,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); if (!sctx->send_buf) { - sctx->send_buf = vmalloc(sctx->send_max_size); - if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } - sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); + sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); if (!sctx->read_buf) { - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } sctx->pending_dir_moves = RB_ROOT; @@ -6396,13 +6390,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); if (arg->clone_sources_count) { - clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); + clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); if (!clone_sources_tmp) { - clone_sources_tmp = vmalloc(alloc_size); - if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, diff --git a/fs/buffer.c b/fs/buffer.c index 9196f2a270da..161be58c5cb0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,7 +49,6 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1830,7 +1829,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); nr_underway++; } bh = next; @@ -1884,7 +1883,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); nr_underway++; } bh = next; @@ -2379,8 +2378,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) goto out; err = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, - &page, &fsdata); + AOP_FLAG_CONT_EXPAND, &page, &fsdata); if (err) goto out; @@ -2415,9 +2413,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = PAGE_SIZE - zerofrom; - err = pagecache_write_begin(file, mapping, curpos, len, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + err = pagecache_write_begin(file, mapping, curpos, len, 0, + &page, &fsdata); if (err) goto out; zero_user(page, zerofrom, len); @@ -2449,9 +2446,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = offset - zerofrom; - err = pagecache_write_begin(file, mapping, curpos, len, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + err = pagecache_write_begin(file, mapping, curpos, len, 0, + &page, &fsdata); if (err) goto out; zero_user(page, zerofrom, len); @@ -3095,7 +3091,7 @@ void guard_bio_eod(int op, struct bio *bio) } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags, struct writeback_control *wbc) + struct writeback_control *wbc) { struct bio *bio; @@ -3130,7 +3126,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; - bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ guard_bio_eod(op, bio); @@ -3145,16 +3140,9 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, return 0; } -int _submit_bh(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags) +int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL); -} -EXPORT_SYMBOL_GPL(_submit_bh); - -int submit_bh(int op, int op_flags, struct buffer_head *bh) -{ - return submit_bh_wbc(op, op_flags, bh, 0, NULL); + return submit_bh_wbc(op, op_flags, bh, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 26cc95421cca..18c045e2ead6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -74,12 +74,9 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes, align = (unsigned long)(it->iov->iov_base + it->iov_offset) & (PAGE_SIZE - 1); npages = calc_pages_for(align, nbytes); - pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL); - if (!pages) { - pages = vmalloc(sizeof(*pages) * npages); - if (!pages) - return ERR_PTR(-ENOMEM); - } + pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); for (idx = 0; idx < npages; ) { size_t start; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c681762d76e6..1d3fa90d40b9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1666,6 +1666,7 @@ struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + struct timespec ts; if (!req) return ERR_PTR(-ENOMEM); @@ -1684,7 +1685,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); - req->r_stamp = current_fs_time(mdsc->fsc->sb); + ktime_get_real_ts(&ts); + req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran); req->r_op = op; req->r_direct_mode = mode; diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 02b071bf3732..a0b3e7d1be48 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -83,6 +83,9 @@ convert_sfm_char(const __u16 src_char, char *target) case SFM_COLON: *target = ':'; break; + case SFM_DOUBLEQUOTE: + *target = '"'; + break; case SFM_ASTERISK: *target = '*'; break; @@ -418,6 +421,9 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string) case ':': dest_char = cpu_to_le16(SFM_COLON); break; + case '"': + dest_char = cpu_to_le16(SFM_DOUBLEQUOTE); + break; case '*': dest_char = cpu_to_le16(SFM_ASTERISK); break; diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 3d7298cc0aeb..8a79a34e66b8 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -57,6 +57,7 @@ * not conflict (although almost does) with the mapping above. */ +#define SFM_DOUBLEQUOTE ((__u16) 0xF020) #define SFM_ASTERISK ((__u16) 0xF021) #define SFM_QUESTION ((__u16) 0xF025) #define SFM_COLON ((__u16) 0xF022) @@ -64,8 +65,8 @@ #define SFM_LESSTHAN ((__u16) 0xF023) #define SFM_PIPE ((__u16) 0xF027) #define SFM_SLASH ((__u16) 0xF026) -#define SFM_PERIOD ((__u16) 0xF028) -#define SFM_SPACE ((__u16) 0xF029) +#define SFM_SPACE ((__u16) 0xF028) +#define SFM_PERIOD ((__u16) 0xF029) /* * Mapping mechanism to use when one of the seven reserved characters is diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 058ac9b36f04..68abbb0db608 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -478,6 +478,7 @@ find_timestamp(struct cifs_ses *ses) unsigned char *blobptr; unsigned char *blobend; struct ntlmssp2_name *attrptr; + struct timespec ts; if (!ses->auth_key.len || !ses->auth_key.response) return 0; @@ -502,7 +503,8 @@ find_timestamp(struct cifs_ses *ses) blobptr += attrsize; /* advance attr value */ } - return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + ktime_get_real_ts(&ts); + return cpu_to_le64(cifs_UnixTimeToNT(ts)); } static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d0d11b73b2af..9a1667e0e8d6 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -88,6 +88,7 @@ extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; +struct workqueue_struct *cifsoplockd_wq; __u32 cifs_lock_secret; /* @@ -1375,9 +1376,16 @@ init_cifs(void) goto out_clean_proc; } + cifsoplockd_wq = alloc_workqueue("cifsoplockd", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!cifsoplockd_wq) { + rc = -ENOMEM; + goto out_destroy_cifsiod_wq; + } + rc = cifs_fscache_register(); if (rc) - goto out_destroy_wq; + goto out_destroy_cifsoplockd_wq; rc = cifs_init_inodecache(); if (rc) @@ -1425,7 +1433,9 @@ out_destroy_inodecache: cifs_destroy_inodecache(); out_unreg_fscache: cifs_fscache_unregister(); -out_destroy_wq: +out_destroy_cifsoplockd_wq: + destroy_workqueue(cifsoplockd_wq); +out_destroy_cifsiod_wq: destroy_workqueue(cifsiod_wq); out_clean_proc: cifs_proc_clean(); @@ -1448,6 +1458,7 @@ exit_cifs(void) cifs_destroy_mids(); cifs_destroy_inodecache(); cifs_fscache_unregister(); + destroy_workqueue(cifsoplockd_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 37f5a41cc50c..8be55be70faf 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1115,6 +1115,23 @@ struct cifs_io_parms { struct cifs_tcon *tcon; }; +struct cifs_aio_ctx { + struct kref refcount; + struct list_head list; + struct mutex aio_mutex; + struct completion done; + struct iov_iter iter; + struct kiocb *iocb; + struct cifsFileInfo *cfile; + struct bio_vec *bv; + loff_t pos; + unsigned int npages; + ssize_t rc; + unsigned int len; + unsigned int total_len; + bool should_dirty; +}; + struct cifs_readdata; /* asynchronous read support */ @@ -1124,6 +1141,7 @@ struct cifs_readdata { struct completion done; struct cifsFileInfo *cfile; struct address_space *mapping; + struct cifs_aio_ctx *ctx; __u64 offset; unsigned int bytes; unsigned int got_bytes; @@ -1154,6 +1172,7 @@ struct cifs_writedata { enum writeback_sync_modes sync_mode; struct work_struct work; struct cifsFileInfo *cfile; + struct cifs_aio_ctx *ctx; __u64 offset; pid_t pid; unsigned int bytes; @@ -1683,6 +1702,7 @@ void cifs_oplock_break(struct work_struct *work); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; +extern struct workqueue_struct *cifsoplockd_wq; extern __u32 cifs_lock_secret; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 97e5d236d265..e49958c3f8bb 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -535,4 +535,7 @@ int __cifs_calc_signature(struct smb_rqst *rqst, struct shash_desc *shash); enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, enum securityEnum); +struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); +void cifs_aio_ctx_release(struct kref *refcount); +int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5d21f00ae341..4c01b3f9abf0 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -478,14 +478,14 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) * this requirement. */ int val, seconds, remain, result; - struct timespec ts, utc; - utc = CURRENT_TIME; + struct timespec ts; + unsigned long utc = ktime_get_real_seconds(); ts = cnvrtDosUnixTm(rsp->SrvTime.Date, rsp->SrvTime.Time, 0); cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n", - (int)ts.tv_sec, (int)utc.tv_sec, - (int)(utc.tv_sec - ts.tv_sec)); - val = (int)(utc.tv_sec - ts.tv_sec); + (int)ts.tv_sec, (int)utc, + (int)(utc - ts.tv_sec)); + val = (int)(utc - ts.tv_sec); seconds = abs(val); result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; remain = seconds % MIN_TZ_ADJ; @@ -718,6 +718,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server) if (rc) return rc; + if (server->capabilities & CAP_UNICODE) + smb->hdr.Flags2 |= SMBFLG2_UNICODE; + /* set up echo request */ smb->hdr.Tid = 0xffff; smb->hdr.WordCount = 1; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9bc0b4d6d065..9365c0cf77ad 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1946,9 +1946,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } if (!got_ip) { + int len; + const char *slash; + /* No ip= option specified? Try to get it from UNC */ - if (!cifs_convert_address(dstaddr, &vol->UNC[2], - strlen(&vol->UNC[2]))) { + /* Use the address part of the UNC. */ + slash = strchr(&vol->UNC[2], '\\'); + len = slash - &vol->UNC[2]; + if (!cifs_convert_address(dstaddr, &vol->UNC[2], len)) { pr_err("Unable to determine destination address.\n"); goto cifs_parse_mount_err; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 21d404535739..6ef78ad838e6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2458,11 +2458,14 @@ cifs_uncached_writedata_release(struct kref *refcount) struct cifs_writedata *wdata = container_of(refcount, struct cifs_writedata, refcount); + kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release); for (i = 0; i < wdata->nr_pages; i++) put_page(wdata->pages[i]); cifs_writedata_release(refcount); } +static void collect_uncached_write_data(struct cifs_aio_ctx *ctx); + static void cifs_uncached_writev_complete(struct work_struct *work) { @@ -2478,7 +2481,8 @@ cifs_uncached_writev_complete(struct work_struct *work) spin_unlock(&inode->i_lock); complete(&wdata->done); - + collect_uncached_write_data(wdata->ctx); + /* the below call can possibly free the last ref to aio ctx */ kref_put(&wdata->refcount, cifs_uncached_writedata_release); } @@ -2527,7 +2531,8 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, static int cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, struct cifsFileInfo *open_file, - struct cifs_sb_info *cifs_sb, struct list_head *wdata_list) + struct cifs_sb_info *cifs_sb, struct list_head *wdata_list, + struct cifs_aio_ctx *ctx) { int rc = 0; size_t cur_len; @@ -2595,6 +2600,8 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata->pagesz = PAGE_SIZE; wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); wdata->credits = credits; + wdata->ctx = ctx; + kref_get(&ctx->refcount); if (!wdata->cfile->invalidHandle || !(rc = cifs_reopen_file(wdata->cfile, false))) @@ -2620,81 +2627,61 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, return rc; } -ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) +static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) { - struct file *file = iocb->ki_filp; - ssize_t total_written = 0; - struct cifsFileInfo *open_file; + struct cifs_writedata *wdata, *tmp; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; - struct cifs_writedata *wdata, *tmp; - struct list_head wdata_list; - struct iov_iter saved_from = *from; + struct dentry *dentry = ctx->cfile->dentry; + unsigned int i; int rc; - /* - * BB - optimize the way when signing is disabled. We can drop this - * extra memory-to-memory copying and use iovec buffers for constructing - * write request. - */ - - rc = generic_write_checks(iocb, from); - if (rc <= 0) - return rc; - - INIT_LIST_HEAD(&wdata_list); - cifs_sb = CIFS_FILE_SB(file); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_writev) - return -ENOSYS; + tcon = tlink_tcon(ctx->cfile->tlink); + cifs_sb = CIFS_SB(dentry->d_sb); - rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from, - open_file, cifs_sb, &wdata_list); + mutex_lock(&ctx->aio_mutex); - /* - * If at least one write was successfully sent, then discard any rc - * value from the later writes. If the other write succeeds, then - * we'll end up returning whatever was written. If it fails, then - * we'll get a new rc value from that. - */ - if (!list_empty(&wdata_list)) - rc = 0; + if (list_empty(&ctx->list)) { + mutex_unlock(&ctx->aio_mutex); + return; + } + rc = ctx->rc; /* * Wait for and collect replies for any successful sends in order of - * increasing offset. Once an error is hit or we get a fatal signal - * while waiting, then return without waiting for any more replies. + * increasing offset. Once an error is hit, then return without waiting + * for any more replies. */ restart_loop: - list_for_each_entry_safe(wdata, tmp, &wdata_list, list) { + list_for_each_entry_safe(wdata, tmp, &ctx->list, list) { if (!rc) { - /* FIXME: freezable too? */ - rc = wait_for_completion_killable(&wdata->done); - if (rc) - rc = -EINTR; - else if (wdata->result) + if (!try_wait_for_completion(&wdata->done)) { + mutex_unlock(&ctx->aio_mutex); + return; + } + + if (wdata->result) rc = wdata->result; else - total_written += wdata->bytes; + ctx->total_len += wdata->bytes; /* resend call if it's a retryable error */ if (rc == -EAGAIN) { struct list_head tmp_list; - struct iov_iter tmp_from = saved_from; + struct iov_iter tmp_from = ctx->iter; INIT_LIST_HEAD(&tmp_list); list_del_init(&wdata->list); iov_iter_advance(&tmp_from, - wdata->offset - iocb->ki_pos); + wdata->offset - ctx->pos); rc = cifs_write_from_iter(wdata->offset, wdata->bytes, &tmp_from, - open_file, cifs_sb, &tmp_list); + ctx->cfile, cifs_sb, &tmp_list, + ctx); - list_splice(&tmp_list, &wdata_list); + list_splice(&tmp_list, &ctx->list); kref_put(&wdata->refcount, cifs_uncached_writedata_release); @@ -2705,12 +2692,111 @@ restart_loop: kref_put(&wdata->refcount, cifs_uncached_writedata_release); } + for (i = 0; i < ctx->npages; i++) + put_page(ctx->bv[i].bv_page); + + cifs_stats_bytes_written(tcon, ctx->total_len); + set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags); + + ctx->rc = (rc == 0) ? ctx->total_len : rc; + + mutex_unlock(&ctx->aio_mutex); + + if (ctx->iocb && ctx->iocb->ki_complete) + ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + else + complete(&ctx->done); +} + +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + ssize_t total_written = 0; + struct cifsFileInfo *cfile; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + struct cifs_aio_ctx *ctx; + struct iov_iter saved_from = *from; + int rc; + + /* + * BB - optimize the way when signing is disabled. We can drop this + * extra memory-to-memory copying and use iovec buffers for constructing + * write request. + */ + + rc = generic_write_checks(iocb, from); + if (rc <= 0) + return rc; + + cifs_sb = CIFS_FILE_SB(file); + cfile = file->private_data; + tcon = tlink_tcon(cfile->tlink); + + if (!tcon->ses->server->ops->async_writev) + return -ENOSYS; + + ctx = cifs_aio_ctx_alloc(); + if (!ctx) + return -ENOMEM; + + ctx->cfile = cifsFileInfo_get(cfile); + + if (!is_sync_kiocb(iocb)) + ctx->iocb = iocb; + + ctx->pos = iocb->ki_pos; + + rc = setup_aio_ctx_iter(ctx, from, WRITE); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + /* grab a lock here due to read response handlers can access ctx */ + mutex_lock(&ctx->aio_mutex); + + rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &saved_from, + cfile, cifs_sb, &ctx->list, ctx); + + /* + * If at least one write was successfully sent, then discard any rc + * value from the later writes. If the other write succeeds, then + * we'll end up returning whatever was written. If it fails, then + * we'll get a new rc value from that. + */ + if (!list_empty(&ctx->list)) + rc = 0; + + mutex_unlock(&ctx->aio_mutex); + + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + if (!is_sync_kiocb(iocb)) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -EIOCBQUEUED; + } + + rc = wait_for_completion_killable(&ctx->done); + if (rc) { + mutex_lock(&ctx->aio_mutex); + ctx->rc = rc = -EINTR; + total_written = ctx->total_len; + mutex_unlock(&ctx->aio_mutex); + } else { + rc = ctx->rc; + total_written = ctx->total_len; + } + + kref_put(&ctx->refcount, cifs_aio_ctx_release); + if (unlikely(!total_written)) return rc; iocb->ki_pos += total_written; - set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(file_inode(file))->flags); - cifs_stats_bytes_written(tcon, total_written); return total_written; } @@ -2859,6 +2945,7 @@ cifs_uncached_readdata_release(struct kref *refcount) struct cifs_readdata, refcount); unsigned int i; + kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); for (i = 0; i < rdata->nr_pages; i++) { put_page(rdata->pages[i]); rdata->pages[i] = NULL; @@ -2900,6 +2987,8 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) return remaining ? -EFAULT : 0; } +static void collect_uncached_read_data(struct cifs_aio_ctx *ctx); + static void cifs_uncached_readv_complete(struct work_struct *work) { @@ -2907,6 +2996,8 @@ cifs_uncached_readv_complete(struct work_struct *work) struct cifs_readdata, work); complete(&rdata->done); + collect_uncached_read_data(rdata->ctx); + /* the below call can possibly free the last ref to aio ctx */ kref_put(&rdata->refcount, cifs_uncached_readdata_release); } @@ -2973,7 +3064,8 @@ cifs_uncached_copy_into_pages(struct TCP_Server_Info *server, static int cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, - struct cifs_sb_info *cifs_sb, struct list_head *rdata_list) + struct cifs_sb_info *cifs_sb, struct list_head *rdata_list, + struct cifs_aio_ctx *ctx) { struct cifs_readdata *rdata; unsigned int npages, rsize, credits; @@ -3020,6 +3112,8 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->read_into_pages = cifs_uncached_read_into_pages; rdata->copy_into_pages = cifs_uncached_copy_into_pages; rdata->credits = credits; + rdata->ctx = ctx; + kref_get(&ctx->refcount); if (!rdata->cfile->invalidHandle || !(rc = cifs_reopen_file(rdata->cfile, true))) @@ -3042,50 +3136,37 @@ error: return rc; } -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +static void +collect_uncached_read_data(struct cifs_aio_ctx *ctx) { - struct file *file = iocb->ki_filp; - ssize_t rc; - size_t len; - ssize_t total_read = 0; - loff_t offset = iocb->ki_pos; + struct cifs_readdata *rdata, *tmp; + struct iov_iter *to = &ctx->iter; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - struct cifsFileInfo *open_file; - struct cifs_readdata *rdata, *tmp; - struct list_head rdata_list; - - len = iov_iter_count(to); - if (!len) - return 0; - - INIT_LIST_HEAD(&rdata_list); - cifs_sb = CIFS_FILE_SB(file); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_readv) - return -ENOSYS; + unsigned int i; + int rc; - if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cifs_dbg(FYI, "attempting read on write only file instance\n"); + tcon = tlink_tcon(ctx->cfile->tlink); + cifs_sb = CIFS_SB(ctx->cfile->dentry->d_sb); - rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list); + mutex_lock(&ctx->aio_mutex); - /* if at least one read request send succeeded, then reset rc */ - if (!list_empty(&rdata_list)) - rc = 0; + if (list_empty(&ctx->list)) { + mutex_unlock(&ctx->aio_mutex); + return; + } - len = iov_iter_count(to); + rc = ctx->rc; /* the loop below should proceed in the order of increasing offsets */ again: - list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { + list_for_each_entry_safe(rdata, tmp, &ctx->list, list) { if (!rc) { - /* FIXME: freezable sleep too? */ - rc = wait_for_completion_killable(&rdata->done); - if (rc) - rc = -EINTR; - else if (rdata->result == -EAGAIN) { + if (!try_wait_for_completion(&rdata->done)) { + mutex_unlock(&ctx->aio_mutex); + return; + } + + if (rdata->result == -EAGAIN) { /* resend call if it's a retryable error */ struct list_head tmp_list; unsigned int got_bytes = rdata->got_bytes; @@ -3111,9 +3192,9 @@ again: rdata->offset + got_bytes, rdata->bytes - got_bytes, rdata->cfile, cifs_sb, - &tmp_list); + &tmp_list, ctx); - list_splice(&tmp_list, &rdata_list); + list_splice(&tmp_list, &ctx->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); @@ -3131,14 +3212,110 @@ again: kref_put(&rdata->refcount, cifs_uncached_readdata_release); } - total_read = len - iov_iter_count(to); + for (i = 0; i < ctx->npages; i++) { + if (ctx->should_dirty) + set_page_dirty(ctx->bv[i].bv_page); + put_page(ctx->bv[i].bv_page); + } + + ctx->total_len = ctx->len - iov_iter_count(to); - cifs_stats_bytes_read(tcon, total_read); + cifs_stats_bytes_read(tcon, ctx->total_len); /* mask nodata case */ if (rc == -ENODATA) rc = 0; + ctx->rc = (rc == 0) ? ctx->total_len : rc; + + mutex_unlock(&ctx->aio_mutex); + + if (ctx->iocb && ctx->iocb->ki_complete) + ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + else + complete(&ctx->done); +} + +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + ssize_t rc; + size_t len; + ssize_t total_read = 0; + loff_t offset = iocb->ki_pos; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *cfile; + struct cifs_aio_ctx *ctx; + + len = iov_iter_count(to); + if (!len) + return 0; + + cifs_sb = CIFS_FILE_SB(file); + cfile = file->private_data; + tcon = tlink_tcon(cfile->tlink); + + if (!tcon->ses->server->ops->async_readv) + return -ENOSYS; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + ctx = cifs_aio_ctx_alloc(); + if (!ctx) + return -ENOMEM; + + ctx->cfile = cifsFileInfo_get(cfile); + + if (!is_sync_kiocb(iocb)) + ctx->iocb = iocb; + + if (to->type & ITER_IOVEC) + ctx->should_dirty = true; + + rc = setup_aio_ctx_iter(ctx, to, READ); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + len = ctx->len; + + /* grab a lock here due to read response handlers can access ctx */ + mutex_lock(&ctx->aio_mutex); + + rc = cifs_send_async_read(offset, len, cfile, cifs_sb, &ctx->list, ctx); + + /* if at least one read request send succeeded, then reset rc */ + if (!list_empty(&ctx->list)) + rc = 0; + + mutex_unlock(&ctx->aio_mutex); + + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + if (!is_sync_kiocb(iocb)) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -EIOCBQUEUED; + } + + rc = wait_for_completion_killable(&ctx->done); + if (rc) { + mutex_lock(&ctx->aio_mutex); + ctx->rc = rc = -EINTR; + total_read = ctx->total_len; + mutex_unlock(&ctx->aio_mutex); + } else { + rc = ctx->rc; + total_read = ctx->total_len; + } + + kref_put(&ctx->refcount, cifs_aio_ctx_release); + if (total_read) { iocb->ki_pos += total_read; return total_read; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b261db34103c..c3b2fa0b2ec8 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -322,9 +322,9 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; fattr->cf_uid = cifs_sb->mnt_uid; fattr->cf_gid = cifs_sb->mnt_gid; - fattr->cf_atime = CURRENT_TIME; - fattr->cf_ctime = CURRENT_TIME; - fattr->cf_mtime = CURRENT_TIME; + ktime_get_real_ts(&fattr->cf_mtime); + fattr->cf_mtime = timespec_trunc(fattr->cf_mtime, sb->s_time_gran); + fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime; fattr->cf_nlink = 2; fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; } @@ -586,9 +586,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ static void cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, - struct cifs_sb_info *cifs_sb, bool adjust_tz, + struct super_block *sb, bool adjust_tz, bool symlink) { + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); memset(fattr, 0, sizeof(*fattr)); @@ -598,8 +599,10 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, if (info->LastAccessTime) fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); - else - fattr->cf_atime = CURRENT_TIME; + else { + ktime_get_real_ts(&fattr->cf_atime); + fattr->cf_atime = timespec_trunc(fattr->cf_atime, sb->s_time_gran); + } fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); @@ -659,7 +662,6 @@ cifs_get_file_info(struct file *filp) FILE_ALL_INFO find_data; struct cifs_fattr fattr; struct inode *inode = file_inode(filp); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -671,7 +673,7 @@ cifs_get_file_info(struct file *filp) rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); switch (rc) { case 0: - cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false, + cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false, false); break; case -EREMOTE: @@ -753,7 +755,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } if (!rc) { - cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz, + cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); } else if (rc == -EREMOTE) { cifs_create_dfs_fattr(&fattr, sb); @@ -1363,9 +1365,9 @@ out_reval: cifs_inode = CIFS_I(inode); cifs_inode->time = 0; /* will force revalidate to get info when needed */ - inode->i_ctime = current_fs_time(sb); + inode->i_ctime = current_time(inode); } - dir->i_ctime = dir->i_mtime = current_fs_time(sb); + dir->i_ctime = dir->i_mtime = current_time(dir); cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ unlink_out: @@ -1633,7 +1635,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifsInode->time = 0; d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime = - current_fs_time(inode->i_sb); + current_time(inode); rmdir_exit: kfree(full_path); @@ -1806,7 +1808,7 @@ unlink_target: CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime = - target_dir->i_mtime = current_fs_time(source_dir->i_sb); + target_dir->i_mtime = current_time(source_dir); cifs_rename_exit: kfree(info_buf_source); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 7f4bba574930..76fb0917dc8c 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -209,10 +209,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; break; case CIFS_IOC_GET_MNT_INFO: + if (pSMBFile == NULL) + break; tcon = tlink_tcon(pSMBFile->tlink); rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; case CIFS_ENUMERATE_SNAPSHOTS: + if (pSMBFile == NULL) + break; if (arg == 0) { rc = -EINVAL; goto cifs_ioc_exit; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 843787850435..b08531977daa 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -22,6 +22,7 @@ #include <linux/slab.h> #include <linux/ctype.h> #include <linux/mempool.h> +#include <linux/vmalloc.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -488,7 +489,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &pCifsInode->flags); - queue_work(cifsiod_wq, + queue_work(cifsoplockd_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -741,3 +742,122 @@ parse_DFS_referrals_exit: } return rc; } + +struct cifs_aio_ctx * +cifs_aio_ctx_alloc(void) +{ + struct cifs_aio_ctx *ctx; + + ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + INIT_LIST_HEAD(&ctx->list); + mutex_init(&ctx->aio_mutex); + init_completion(&ctx->done); + kref_init(&ctx->refcount); + return ctx; +} + +void +cifs_aio_ctx_release(struct kref *refcount) +{ + struct cifs_aio_ctx *ctx = container_of(refcount, + struct cifs_aio_ctx, refcount); + + cifsFileInfo_put(ctx->cfile); + kvfree(ctx->bv); + kfree(ctx); +} + +#define CIFS_AIO_KMALLOC_LIMIT (1024 * 1024) + +int +setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) +{ + ssize_t rc; + unsigned int cur_npages; + unsigned int npages = 0; + unsigned int i; + size_t len; + size_t count = iov_iter_count(iter); + unsigned int saved_len; + size_t start; + unsigned int max_pages = iov_iter_npages(iter, INT_MAX); + struct page **pages = NULL; + struct bio_vec *bv = NULL; + + if (iter->type & ITER_KVEC) { + memcpy(&ctx->iter, iter, sizeof(struct iov_iter)); + ctx->len = count; + iov_iter_advance(iter, count); + return 0; + } + + if (max_pages * sizeof(struct bio_vec) <= CIFS_AIO_KMALLOC_LIMIT) + bv = kmalloc_array(max_pages, sizeof(struct bio_vec), + GFP_KERNEL); + + if (!bv) { + bv = vmalloc(max_pages * sizeof(struct bio_vec)); + if (!bv) + return -ENOMEM; + } + + if (max_pages * sizeof(struct page *) <= CIFS_AIO_KMALLOC_LIMIT) + pages = kmalloc_array(max_pages, sizeof(struct page *), + GFP_KERNEL); + + if (!pages) { + pages = vmalloc(max_pages * sizeof(struct page *)); + if (!bv) { + kvfree(bv); + return -ENOMEM; + } + } + + saved_len = count; + + while (count && npages < max_pages) { + rc = iov_iter_get_pages(iter, pages, count, max_pages, &start); + if (rc < 0) { + cifs_dbg(VFS, "couldn't get user pages (rc=%zd)\n", rc); + break; + } + + if (rc > count) { + cifs_dbg(VFS, "get pages rc=%zd more than %zu\n", rc, + count); + break; + } + + iov_iter_advance(iter, rc); + count -= rc; + rc += start; + cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE); + + if (npages + cur_npages > max_pages) { + cifs_dbg(VFS, "out of vec array capacity (%u vs %u)\n", + npages + cur_npages, max_pages); + break; + } + + for (i = 0; i < cur_npages; i++) { + len = rc > PAGE_SIZE ? PAGE_SIZE : rc; + bv[npages + i].bv_page = pages[i]; + bv[npages + i].bv_offset = start; + bv[npages + i].bv_len = len - start; + rc -= len; + start = 0; + } + + npages += cur_npages; + } + + kvfree(pages); + ctx->bv = bv; + ctx->len = saved_len - count; + ctx->npages = npages; + iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len); + return 0; +} diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index abae6dd2c6b9..cc88f4f0325e 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -980,10 +980,10 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) cifs_dbg(VFS, "illegal hours %d\n", st->Hours); days = sd->Day; month = sd->Month; - if ((days > 31) || (month > 12)) { + if (days < 1 || days > 31 || month < 1 || month > 12) { cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days); - if (month > 12) - month = 12; + days = clamp(days, 1, 31); + month = clamp(month, 1, 12); } month -= 1; days += total_days_of_prev_months[month]; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1a04b3a5beb1..7b08a1446a7f 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -499,7 +499,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, else cfile->oplock_break_cancelled = true; - queue_work(cifsiod_wq, &cfile->oplock_break); + queue_work(cifsoplockd_wq, &cfile->oplock_break); kfree(lw); return true; } @@ -643,7 +643,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); spin_unlock(&cfile->file_info_lock); - queue_work(cifsiod_wq, &cfile->oplock_break); + queue_work(cifsoplockd_wq, + &cfile->oplock_break); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 152e37f2ad92..c58691834eb2 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -942,6 +942,7 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, } if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) { rc = -ERANGE; + kfree(retbuf); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index fb0da096c2ce..48ff7703b919 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -633,8 +633,12 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) } if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { - cifs_dbg(VFS, "invalid size of protocol negotiate response\n"); - return -EIO; + cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n", + rsplen); + + /* relax check since Mac returns max bufsize allowed on ioctl */ + if (rsplen > CIFSMaxBufSize) + return -EIO; } /* check validate negotiate info response matches what we got earlier */ @@ -1854,8 +1858,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, * than one credit. Windows typically sets this smaller, but for some * ioctls it may be useful to allow server to send more. No point * limiting what the server can send as long as fits in one credit + * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE + * (by default, note that it can be overridden to make max larger) + * in responses (except for read responses which can be bigger. + * We may want to bump this limit up */ - req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */ + req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize); if (is_fsctl) req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 11d087b2b28e..6116d5275a3e 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -833,7 +833,7 @@ static int compat_ioctl_preallocate(struct file *file, */ #define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) -#define COMPATIBLE_IOCTL(cmd) XFORM(cmd), +#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd), /* ioctl should not be warned about even if it's not implemented. Valid reasons to use this: - It is implemented with ->compat_ioctl on some device, but programs diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 37b49894c762..d1bb02b1ee58 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -230,11 +232,14 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. + * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -336,10 +349,35 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -373,25 +411,37 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index e39696e64494..1e1f8a361b75 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -13,8 +13,6 @@ #include <linux/fscrypt_supp.h> -#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 - /* Encryption parameters */ #define FS_XTS_TWEAK_SIZE 16 #define FS_AES_128_ECB_KEY_SIZE 16 @@ -22,10 +20,6 @@ #define FS_AES_256_CBC_KEY_SIZE 32 #define FS_AES_256_CTS_KEY_SIZE 32 #define FS_AES_256_XTS_KEY_SIZE 64 -#define FS_MAX_KEY_SIZE 64 - -#define FS_KEY_DESC_PREFIX "fscrypt:" -#define FS_KEY_DESC_PREFIX_SIZE 8 #define FS_KEY_DERIVATION_NONCE_SIZE 16 @@ -51,13 +45,6 @@ struct fscrypt_context { #define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -/* This is passed in from userspace into the kernel keyring */ -struct fscrypt_key { - u32 mode; - u8 raw[FS_MAX_KEY_SIZE]; - u32 size; -} __packed; - /* * A pointer to this structure is stored in the file system's in-core * representation of an inode. diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 8cdfddce2b34..179e578b875b 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -183,9 +183,6 @@ int fscrypt_get_encryption_info(struct inode *inode) if (res) return res; - if (!inode->i_sb->s_cop->get_context) - return -EOPNOTSUPP; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (!fscrypt_dummy_context_enabled(inode) || diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 4908906d54d5..210976e7a269 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -34,9 +34,6 @@ static int create_encryption_context_from_policy(struct inode *inode, { struct fscrypt_context ctx; - if (!inode->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); @@ -87,8 +84,6 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) ret = -ENOTDIR; - else if (!inode->i_sb->s_cop->empty_dir) - ret = -EOPNOTSUPP; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else @@ -118,8 +113,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->get_context || - !inode->i_sb->s_cop->is_encrypted(inode)) + if (!inode->i_sb->s_cop->is_encrypted(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); @@ -143,27 +137,61 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) } EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/** + * fscrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail + * the filesystem operation with EPERM. + */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - struct fscrypt_info *parent_ci, *child_ci; + const struct fscrypt_operations *cops = parent->i_sb->s_cop; + const struct fscrypt_info *parent_ci, *child_ci; + struct fscrypt_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - printk(KERN_ERR "parent %p child %p\n", parent, child); - BUG_ON(1); - } - /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && !S_ISLNK(child->i_mode)) return 1; - /* no restrictions if the parent directory is not encrypted */ - if (!parent->i_sb->s_cop->is_encrypted(parent)) + /* No restrictions if the parent directory is unencrypted */ + if (!cops->is_encrypted(parent)) return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!parent->i_sb->s_cop->is_encrypted(child)) + + /* Encrypted directories must not contain unencrypted files */ + if (!cops->is_encrypted(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". + */ + res = fscrypt_get_encryption_info(parent); if (res) return 0; @@ -172,17 +200,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 0; parent_ci = parent->i_crypt_info; child_ci = child->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } EXPORT_SYMBOL(fscrypt_has_permitted_context); @@ -202,9 +245,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, struct fscrypt_info *ci; int res; - if (!parent->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - res = fscrypt_get_encryption_info(parent); if (res < 0) return res; @@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); -static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) -{ - struct request_queue *q = bdev->bd_queue; - long rc = -EIO; - - dax->addr = ERR_PTR(-EIO); - if (blk_queue_enter(q, true) != 0) - return rc; - - rc = bdev_direct_access(bdev, dax); - if (rc < 0) { - dax->addr = ERR_PTR(rc); - blk_queue_exit(q); - return rc; - } - return rc; -} - -static void dax_unmap_atomic(struct block_device *bdev, - const struct blk_dax_ctl *dax) -{ - if (IS_ERR(dax->addr)) - return; - blk_queue_exit(bdev->bd_queue); -} - static int dax_is_pmd_entry(void *entry) { return (unsigned long)entry & RADIX_DAX_PMD; @@ -101,26 +75,6 @@ static int dax_is_empty_entry(void *entry) return (unsigned long)entry & RADIX_DAX_EMPTY; } -struct page *read_dax_sector(struct block_device *bdev, sector_t n) -{ - struct page *page = alloc_pages(GFP_KERNEL, 0); - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), - }; - long rc; - - if (!page) - return ERR_PTR(-ENOMEM); - - rc = dax_map_atomic(bdev, &dax); - if (rc < 0) - return ERR_PTR(rc); - memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); - dax_unmap_atomic(bdev, &dax); - return page; -} - /* * DAX radix tree locking */ @@ -555,21 +509,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { + struct inode *inode = mapping->host; struct page *page; int ret; /* Hole page already exists? Return it... */ if (!radix_tree_exceptional_entry(*entry)) { page = *entry; - goto out; + goto finish_fault; } /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - out: + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + +finish_fault: vmf->page = page; ret = finish_fault(vmf); vmf->page = NULL; @@ -577,26 +535,37 @@ static int dax_load_hole(struct address_space *mapping, void **entry, if (!ret) { /* Grab reference for PTE that is now referencing the page */ get_page(page); - return VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; } +out: + trace_dax_load_hole(inode, vmf, ret); return ret; } -static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, - struct page *to, unsigned long vaddr) +static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, + sector_t sector, size_t size, struct page *to, + unsigned long vaddr) { - struct blk_dax_ctl dax = { - .sector = sector, - .size = size, - }; - void *vto; - - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); + void *vto, *kaddr; + pgoff_t pgoff; + pfn_t pfn; + long rc; + int id; + + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; + + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } vto = kmap_atomic(to); - copy_user_page(vto, (void __force *)dax.addr, vaddr, to); + copy_user_page(vto, (void __force *)kaddr, vaddr, to); kunmap_atomic(vto); - dax_unmap_atomic(bdev, &dax); + dax_read_unlock(id); return 0; } @@ -764,12 +733,16 @@ unlock_pte: } static int dax_writeback_one(struct block_device *bdev, - struct address_space *mapping, pgoff_t index, void *entry) + struct dax_device *dax_dev, struct address_space *mapping, + pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - struct blk_dax_ctl dax; - void *entry2, **slot; - int ret = 0; + void *entry2, **slot, *kaddr; + long ret = 0, id; + sector_t sector; + pgoff_t pgoff; + size_t size; + pfn_t pfn; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -818,26 +791,29 @@ static int dax_writeback_one(struct block_device *bdev, * 'entry'. This allows us to flush for PMD_SIZE and not have to * worry about partial PMD writebacks. */ - dax.sector = dax_radix_sector(entry); - dax.size = PAGE_SIZE << dax_radix_order(entry); + sector = dax_radix_sector(entry); + size = PAGE_SIZE << dax_radix_order(entry); + + id = dax_read_lock(); + ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (ret) + goto dax_unlock; /* - * We cannot hold tree_lock while calling dax_map_atomic() because it - * eventually calls cond_resched(). + * dax_direct_access() may sleep, so cannot hold tree_lock over + * its invocation. */ - ret = dax_map_atomic(bdev, &dax); - if (ret < 0) { - put_locked_mapping_entry(mapping, index, entry); - return ret; - } + ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); + if (ret < 0) + goto dax_unlock; - if (WARN_ON_ONCE(ret < dax.size)) { + if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { ret = -EIO; - goto unmap; + goto dax_unlock; } - dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); - wb_cache_pmem(dax.addr, dax.size); + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); + wb_cache_pmem(kaddr, size); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -847,8 +823,9 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); - unmap: - dax_unmap_atomic(bdev, &dax); + trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); + dax_unlock: + dax_read_unlock(id); put_locked_mapping_entry(mapping, index, entry); return ret; @@ -869,6 +846,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct inode *inode = mapping->host; pgoff_t start_index, end_index; pgoff_t indices[PAGEVEC_SIZE]; + struct dax_device *dax_dev; struct pagevec pvec; bool done = false; int i, ret = 0; @@ -879,9 +857,15 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev) + return -EIO; + start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; + trace_dax_writeback_range(inode, start_index, end_index); + tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); @@ -899,38 +883,50 @@ int dax_writeback_mapping_range(struct address_space *mapping, break; } - ret = dax_writeback_one(bdev, mapping, indices[i], - pvec.pages[i]); + ret = dax_writeback_one(bdev, dax_dev, mapping, + indices[i], pvec.pages[i]); if (ret < 0) - return ret; + goto out; } } - return 0; +out: + put_dax(dax_dev); + trace_dax_writeback_range_done(inode, start_index, end_index); + return (ret < 0 ? ret : 0); } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, - struct block_device *bdev, sector_t sector, size_t size, - void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) + struct block_device *bdev, struct dax_device *dax_dev, + sector_t sector, size_t size, void **entryp, + struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = vmf->address; - struct blk_dax_ctl dax = { - .sector = sector, - .size = size, - }; - void *ret; void *entry = *entryp; + void *ret, *kaddr; + pgoff_t pgoff; + int id, rc; + pfn_t pfn; - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - dax_unmap_atomic(bdev, &dax); + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; + + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } + dax_read_unlock(id); - ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); *entryp = ret; - return vm_insert_mixed(vma, vaddr, dax.pfn); + trace_dax_insert_mapping(mapping->host, vmf, ret); + return vm_insert_mixed(vma, vaddr, pfn); } /** @@ -941,6 +937,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -950,6 +947,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) if (entry) put_unlocked_mapping_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); + trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); @@ -962,6 +960,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) */ finish_mkwrite_fault(vmf); put_locked_mapping_entry(mapping, index, entry); + trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -979,24 +978,34 @@ static bool dax_range_is_aligned(struct block_device *bdev, return true; } -int __dax_zero_page_range(struct block_device *bdev, sector_t sector, - unsigned int offset, unsigned int length) +int __dax_zero_page_range(struct block_device *bdev, + struct dax_device *dax_dev, sector_t sector, + unsigned int offset, unsigned int size) { - struct blk_dax_ctl dax = { - .sector = sector, - .size = PAGE_SIZE, - }; - - if (dax_range_is_aligned(bdev, offset, length)) { - sector_t start_sector = dax.sector + (offset >> 9); + if (dax_range_is_aligned(bdev, offset, size)) { + sector_t start_sector = sector + (offset >> 9); return blkdev_issue_zeroout(bdev, start_sector, - length >> 9, GFP_NOFS, 0); + size >> 9, GFP_NOFS, 0); } else { - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - dax_unmap_atomic(bdev, &dax); + pgoff_t pgoff; + long rc, id; + void *kaddr; + pfn_t pfn; + + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; + + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, + &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } + clear_pmem(kaddr + offset, size); + dax_read_unlock(id); } return 0; } @@ -1011,9 +1020,12 @@ static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) { + struct block_device *bdev = iomap->bdev; + struct dax_device *dax_dev = iomap->dax_dev; struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; + int id; if (iov_iter_rw(iter) == READ) { end = min(end, i_size_read(inode)); @@ -1038,34 +1050,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, (end - 1) >> PAGE_SHIFT); } + id = dax_read_lock(); while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); - struct blk_dax_ctl dax = { 0 }; + const size_t size = ALIGN(length + offset, PAGE_SIZE); + const sector_t sector = dax_iomap_sector(iomap, pos); ssize_t map_len; + pgoff_t pgoff; + void *kaddr; + pfn_t pfn; if (fatal_signal_pending(current)) { ret = -EINTR; break; } - dax.sector = dax_iomap_sector(iomap, pos); - dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; - map_len = dax_map_atomic(iomap->bdev, &dax); + ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (ret) + break; + + map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), + &kaddr, &pfn); if (map_len < 0) { ret = map_len; break; } - dax.addr += offset; + map_len = PFN_PHYS(map_len); + kaddr += offset; map_len -= offset; if (map_len > end - pos) map_len = end - pos; if (iov_iter_rw(iter) == WRITE) - map_len = copy_from_iter_pmem(dax.addr, map_len, iter); + map_len = copy_from_iter_pmem(kaddr, map_len, iter); else - map_len = copy_to_iter(dax.addr, map_len, iter); - dax_unmap_atomic(iomap->bdev, &dax); + map_len = copy_to_iter(kaddr, map_len, iter); if (map_len <= 0) { ret = map_len ? map_len : -EFAULT; break; @@ -1075,6 +1095,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, length -= map_len; done += map_len; } + dax_read_unlock(id); return done ? done : ret; } @@ -1142,13 +1163,16 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, int vmf_ret = 0; void *entry; + trace_dax_pte_fault(inode, vmf, vmf_ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ - if (pos >= i_size_read(inode)) - return VM_FAULT_SIGBUS; + if (pos >= i_size_read(inode)) { + vmf_ret = VM_FAULT_SIGBUS; + goto out; + } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; @@ -1159,8 +1183,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); - if (error) - return dax_fault_return(error); + if (error) { + vmf_ret = dax_fault_return(error); + goto out; + } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { vmf_ret = dax_fault_return(-EIO); /* fs corruption? */ goto finish_iomap; @@ -1181,8 +1207,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, clear_user_highpage(vmf->cow_page, vaddr); break; case IOMAP_MAPPED: - error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, - vmf->cow_page, vaddr); + error = copy_user_dax(iomap.bdev, iomap.dax_dev, + sector, PAGE_SIZE, vmf->cow_page, vaddr); break; default: WARN_ON_ONCE(1); @@ -1207,8 +1233,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(mapping, iomap.bdev, sector, - PAGE_SIZE, &entry, vmf->vma, vmf); + error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, + sector, PAGE_SIZE, &entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; @@ -1244,6 +1270,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } +out: + trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } @@ -1258,41 +1286,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, loff_t pos, void **entryp) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; + const sector_t sector = dax_iomap_sector(iomap, pos); + struct dax_device *dax_dev = iomap->dax_dev; struct block_device *bdev = iomap->bdev; struct inode *inode = mapping->host; - struct blk_dax_ctl dax = { - .sector = dax_iomap_sector(iomap, pos), - .size = PMD_SIZE, - }; - long length = dax_map_atomic(bdev, &dax); - void *ret = NULL; - - if (length < 0) /* dax_map_atomic() failed */ + const size_t size = PMD_SIZE; + void *ret = NULL, *kaddr; + long length = 0; + pgoff_t pgoff; + pfn_t pfn; + int id; + + if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) goto fallback; - if (length < PMD_SIZE) - goto unmap_fallback; - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) - goto unmap_fallback; - if (!pfn_t_devmap(dax.pfn)) - goto unmap_fallback; - - dax_unmap_atomic(bdev, &dax); - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, + id = dax_read_lock(); + length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (length < 0) + goto unlock_fallback; + length = PFN_PHYS(length); + + if (length < size) + goto unlock_fallback; + if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) + goto unlock_fallback; + if (!pfn_t_devmap(pfn)) + goto unlock_fallback; + dax_read_unlock(id); + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; *entryp = ret; - trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); + trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - dax.pfn, vmf->flags & FAULT_FLAG_WRITE); + pfn, vmf->flags & FAULT_FLAG_WRITE); - unmap_fallback: - dax_unmap_atomic(bdev, &dax); +unlock_fallback: + dax_read_unlock(id); fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, - dax.pfn, ret); + trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); return VM_FAULT_FALLBACK; } diff --git a/fs/dcache.c b/fs/dcache.c index 95d71eda8142..cddf39777835 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -419,6 +419,8 @@ static void dentry_lru_add(struct dentry *dentry) { if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) d_lru_add(dentry); + else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) + dentry->d_flags |= DCACHE_REFERENCED; } /** @@ -779,8 +781,6 @@ repeat: goto kill_it; } - if (!(dentry->d_flags & DCACHE_REFERENCED)) - dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); dentry->d_lockref.count--; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7fd4ec4bb214..e892ae7d89f8 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -199,7 +199,7 @@ static const struct dentry_operations debugfs_dops = { static int debug_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr debug_files[] = {{""}}; + static const struct tree_descr debug_files[] = {{""}}; struct debugfs_fs_info *fsi; int err; diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 42f9a0a0c4ca..8eeb694332fe 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -405,8 +405,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, int err; lock_page(page); - err = exofs_write_begin(NULL, page->mapping, pos, len, - AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL); if (err) EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n", err); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 3a38c1b84e3c..26d77f9f8c12 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -799,6 +799,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock, static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { + struct block_device *bdev; unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; @@ -812,8 +813,13 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return ret; iomap->flags = 0; - iomap->bdev = inode->i_sb->s_bdev; + bdev = inode->i_sb->s_bdev; + iomap->bdev = bdev; iomap->offset = (u64)first_block << blkbits; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; if (ret == 0) { iomap->type = IOMAP_HOLE; @@ -835,6 +841,7 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { + put_dax(iomap->dax_dev); if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 354103f3490c..d9beca1653c5 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -4,11 +4,11 @@ obj-$(CONFIG_EXT4_FS) += ext4.o -ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ - xattr_trusted.o inline.o readpage.o sysfs.o +ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ + extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ + indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ + mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ + super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f7b465b4fb69..8e8046104f4d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2356,17 +2356,16 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); -int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - struct ext4_filename *fname); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!ext4_has_feature_dir_index(inode->i_sb)) ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } -static unsigned char ext4_filetype_table[] = { +static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; @@ -3050,7 +3049,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode *inode, struct buffer_head *bh); #define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c new file mode 100644 index 000000000000..b19436098837 --- /dev/null +++ b/fs/ext4/fsmap.c @@ -0,0 +1,722 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "ext4.h" +#include <linux/fsmap.h> +#include "fsmap.h" +#include "mballoc.h" +#include <linux/sort.h> +#include <linux/list_sort.h> +#include <trace/events/ext4.h> + +/* Convert an ext4_fsmap to an fsmap. */ +void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, + struct ext4_fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = src->fmr_physical << sb->s_blocksize_bits; + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = 0; + dest->fmr_length = src->fmr_length << sb->s_blocksize_bits; + dest->fmr_reserved[0] = 0; + dest->fmr_reserved[1] = 0; + dest->fmr_reserved[2] = 0; +} + +/* Convert an fsmap to an ext4_fsmap. */ +void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, + struct fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = src->fmr_physical >> sb->s_blocksize_bits; + dest->fmr_owner = src->fmr_owner; + dest->fmr_length = src->fmr_length >> sb->s_blocksize_bits; +} + +/* getfsmap query state */ +struct ext4_getfsmap_info { + struct ext4_fsmap_head *gfi_head; + ext4_fsmap_format_t gfi_formatter; /* formatting fn */ + void *gfi_format_arg;/* format buffer */ + ext4_fsblk_t gfi_next_fsblk; /* next fsblock we expect */ + u32 gfi_dev; /* device id */ + ext4_group_t gfi_agno; /* bg number, if applicable */ + struct ext4_fsmap gfi_low; /* low rmap key */ + struct ext4_fsmap gfi_high; /* high rmap key */ + struct ext4_fsmap gfi_lastfree; /* free ext at end of last bg */ + struct list_head gfi_meta_list; /* fixed metadata list */ + bool gfi_last; /* last extent? */ +}; + +/* Associate a device with a getfsmap handler. */ +struct ext4_getfsmap_dev { + int (*gfd_fn)(struct super_block *sb, + struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info); + u32 gfd_dev; +}; + +/* Compare two getfsmap device handlers. */ +static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) +{ + const struct ext4_getfsmap_dev *d1 = p1; + const struct ext4_getfsmap_dev *d2 = p2; + + return d1->gfd_dev - d2->gfd_dev; +} + +/* Compare a record against our starting point */ +static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, + struct ext4_fsmap *rec) +{ + return rec->fmr_physical < info->gfi_low.fmr_physical; +} + +/* + * Format a reverse mapping for getfsmap, having translated rm_startblock + * into the appropriate daddr units. + */ +static int ext4_getfsmap_helper(struct super_block *sb, + struct ext4_getfsmap_info *info, + struct ext4_fsmap *rec) +{ + struct ext4_fsmap fmr; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t rec_fsblk = rec->fmr_physical; + ext4_group_t agno; + ext4_grpblk_t cno; + int error; + + if (fatal_signal_pending(current)) + return -EINTR; + + /* + * Filter out records that start before our startpoint, if the + * caller requested that. + */ + if (ext4_getfsmap_rec_before_low_key(info, rec)) { + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; + } + + /* Are we just counting mappings? */ + if (info->gfi_head->fmh_count == 0) { + if (rec_fsblk > info->gfi_next_fsblk) + info->gfi_head->fmh_entries++; + + if (info->gfi_last) + return EXT4_QUERY_RANGE_CONTINUE; + + info->gfi_head->fmh_entries++; + + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; + } + + /* + * If the record starts past the last physical block we saw, + * then we've found a gap. Report the gap as being owned by + * whatever the caller specified is the missing owner. + */ + if (rec_fsblk > info->gfi_next_fsblk) { + if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) + return EXT4_QUERY_RANGE_ABORT; + + ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk, + &agno, &cno); + trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, + EXT4_C2B(sbi, cno), + rec_fsblk - info->gfi_next_fsblk, + EXT4_FMR_OWN_UNKNOWN); + + fmr.fmr_device = info->gfi_dev; + fmr.fmr_physical = info->gfi_next_fsblk; + fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN; + fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + error = info->gfi_formatter(&fmr, info->gfi_format_arg); + if (error) + return error; + info->gfi_head->fmh_entries++; + } + + if (info->gfi_last) + goto out; + + /* Fill out the extent we found */ + if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) + return EXT4_QUERY_RANGE_ABORT; + + ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno); + trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), + rec->fmr_length, rec->fmr_owner); + + fmr.fmr_device = info->gfi_dev; + fmr.fmr_physical = rec_fsblk; + fmr.fmr_owner = rec->fmr_owner; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + fmr.fmr_length = rec->fmr_length; + error = info->gfi_formatter(&fmr, info->gfi_format_arg); + if (error) + return error; + info->gfi_head->fmh_entries++; + +out: + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; +} + +static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) +{ + return fmr->fmr_physical + fmr->fmr_length; +} + +/* Transform a blockgroup's free record into a fsmap */ +static int ext4_getfsmap_datadev_helper(struct super_block *sb, + ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_fsmap irec; + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb; + ext4_fsblk_t fslen; + int error; + + fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno)); + fslen = EXT4_C2B(sbi, len); + + /* If the retained free extent record is set... */ + if (info->gfi_lastfree.fmr_owner) { + /* ...and abuts this one, lengthen it and return. */ + if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) { + info->gfi_lastfree.fmr_length += fslen; + return 0; + } + + /* + * There's a gap between the two free extents; emit the + * retained extent prior to merging the meta_list. + */ + error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + + /* Merge in any relevant extents from the meta_list */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + } else if (p->fmr_physical < fsb) { + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + + list_del(&p->fmr_list); + kfree(p); + } + } + + irec.fmr_device = 0; + irec.fmr_physical = fsb; + irec.fmr_length = fslen; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + + /* If this is a free extent at the end of a bg, buffer it. */ + if (ext4_fsmap_next_pblk(&irec) == + ext4_group_first_block_no(sb, agno + 1)) { + info->gfi_lastfree = irec; + return 0; + } + + /* Otherwise, emit it */ + return ext4_getfsmap_helper(sb, info, &irec); +} + +/* Execute a getfsmap query against the log device. */ +static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + struct ext4_fsmap irec; + + /* Set up search keys */ + info->gfi_low = keys[0]; + info->gfi_low.fmr_length = 0; + + memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); + + trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0, + info->gfi_low.fmr_physical, + info->gfi_low.fmr_length, + info->gfi_low.fmr_owner); + + trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0, + info->gfi_high.fmr_physical, + info->gfi_high.fmr_length, + info->gfi_high.fmr_owner); + + if (keys[0].fmr_physical > 0) + return 0; + + /* Fabricate an rmap entry for the external log device. */ + irec.fmr_physical = journal->j_blk_offset; + irec.fmr_length = journal->j_maxlen; + irec.fmr_owner = EXT4_FMR_OWN_LOG; + irec.fmr_flags = 0; + + return ext4_getfsmap_helper(sb, info, &irec); +} + +/* Helper to fill out an ext4_fsmap. */ +static inline int ext4_getfsmap_fill(struct list_head *meta_list, + ext4_fsblk_t fsb, ext4_fsblk_t len, + uint64_t owner) +{ + struct ext4_fsmap *fsm; + + fsm = kmalloc(sizeof(*fsm), GFP_NOFS); + if (!fsm) + return -ENOMEM; + fsm->fmr_device = 0; + fsm->fmr_flags = 0; + fsm->fmr_physical = fsb; + fsm->fmr_owner = owner; + fsm->fmr_length = len; + list_add_tail(&fsm->fmr_list, meta_list); + + return 0; +} + +/* + * This function returns the number of file system metadata blocks at + * the beginning of a block group, including the reserved gdt blocks. + */ +static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, + ext4_group_t agno, + struct list_head *meta_list) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno); + ext4_fsblk_t len; + unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb); + int error; + + /* Record the superblock. */ + if (ext4_bg_has_super(sb, agno)) { + error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS); + if (error) + return error; + fsb++; + } + + /* Record the group descriptors. */ + len = ext4_bg_num_gdb(sb, agno); + if (!len) + return 0; + error = ext4_getfsmap_fill(meta_list, fsb, len, + EXT4_FMR_OWN_GDT); + if (error) + return error; + fsb += len; + + /* Reserved GDT blocks */ + if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { + len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + error = ext4_getfsmap_fill(meta_list, fsb, len, + EXT4_FMR_OWN_RESV_GDT); + if (error) + return error; + } + + return 0; +} + +/* Compare two fsmap items. */ +static int ext4_getfsmap_compare(void *priv, + struct list_head *a, + struct list_head *b) +{ + struct ext4_fsmap *fa; + struct ext4_fsmap *fb; + + fa = container_of(a, struct ext4_fsmap, fmr_list); + fb = container_of(b, struct ext4_fsmap, fmr_list); + if (fa->fmr_physical < fb->fmr_physical) + return -1; + else if (fa->fmr_physical > fb->fmr_physical) + return 1; + return 0; +} + +/* Merge adjacent extents of fixed metadata. */ +static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list) +{ + struct ext4_fsmap *p; + struct ext4_fsmap *prev = NULL; + struct ext4_fsmap *tmp; + + list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { + if (!prev) { + prev = p; + continue; + } + + if (prev->fmr_owner == p->fmr_owner && + prev->fmr_physical + prev->fmr_length == p->fmr_physical) { + prev->fmr_length += p->fmr_length; + list_del(&p->fmr_list); + kfree(p); + } else + prev = p; + } +} + +/* Free a list of fixed metadata. */ +static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) +{ + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + + list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { + list_del(&p->fmr_list); + kfree(p); + } +} + +/* Find all the fixed metadata in the filesystem. */ +int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, + struct list_head *meta_list) +{ + struct ext4_group_desc *gdp; + ext4_group_t agno; + int error; + + INIT_LIST_HEAD(meta_list); + + /* Collect everything. */ + for (agno = 0; agno < EXT4_SB(sb)->s_groups_count; agno++) { + gdp = ext4_get_group_desc(sb, agno, NULL); + if (!gdp) { + error = -EFSCORRUPTED; + goto err; + } + + /* Superblock & GDT */ + error = ext4_getfsmap_find_sb(sb, agno, meta_list); + if (error) + goto err; + + /* Block bitmap */ + error = ext4_getfsmap_fill(meta_list, + ext4_block_bitmap(sb, gdp), 1, + EXT4_FMR_OWN_BLKBM); + if (error) + goto err; + + /* Inode bitmap */ + error = ext4_getfsmap_fill(meta_list, + ext4_inode_bitmap(sb, gdp), 1, + EXT4_FMR_OWN_INOBM); + if (error) + goto err; + + /* Inodes */ + error = ext4_getfsmap_fill(meta_list, + ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group, + EXT4_FMR_OWN_INODES); + if (error) + goto err; + } + + /* Sort the list */ + list_sort(NULL, meta_list, ext4_getfsmap_compare); + + /* Merge adjacent extents */ + ext4_getfsmap_merge_fixed_metadata(meta_list); + + return 0; +err: + ext4_getfsmap_free_fixed_metadata(meta_list); + return error; +} + +/* Execute a getfsmap query against the buddy bitmaps */ +static int ext4_getfsmap_datadev(struct super_block *sb, + struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start_fsb; + ext4_fsblk_t end_fsb; + ext4_fsblk_t eofs; + ext4_group_t start_ag; + ext4_group_t end_ag; + ext4_grpblk_t first_cluster; + ext4_grpblk_t last_cluster; + int error = 0; + + eofs = ext4_blocks_count(sbi->s_es); + if (keys[0].fmr_physical >= eofs) + return 0; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + start_fsb = keys[0].fmr_physical; + end_fsb = keys[1].fmr_physical; + + /* Determine first and last group to examine based on start and end */ + ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster); + ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster); + + /* + * Convert the fsmap low/high keys to bg based keys. Initialize + * low to the fsmap low key and max out the high key to the end + * of the bg. + */ + info->gfi_low = keys[0]; + info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster); + info->gfi_low.fmr_length = 0; + + memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); + + /* Assemble a list of all the fixed-location metadata. */ + error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list); + if (error) + goto err; + + /* Query each bg */ + for (info->gfi_agno = start_ag; + info->gfi_agno <= end_ag; + info->gfi_agno++) { + /* + * Set the bg high key from the fsmap high key if this + * is the last bg that we're querying. + */ + if (info->gfi_agno == end_ag) { + info->gfi_high = keys[1]; + info->gfi_high.fmr_physical = EXT4_C2B(sbi, + last_cluster); + info->gfi_high.fmr_length = 0; + } + + trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno, + info->gfi_low.fmr_physical, + info->gfi_low.fmr_length, + info->gfi_low.fmr_owner); + + trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno, + info->gfi_high.fmr_physical, + info->gfi_high.fmr_length, + info->gfi_high.fmr_owner); + + error = ext4_mballoc_query_range(sb, info->gfi_agno, + EXT4_B2C(sbi, info->gfi_low.fmr_physical), + EXT4_B2C(sbi, info->gfi_high.fmr_physical), + ext4_getfsmap_datadev_helper, info); + if (error) + goto err; + + /* + * Set the bg low key to the start of the bg prior to + * moving on to the next bg. + */ + if (info->gfi_agno == start_ag) + memset(&info->gfi_low, 0, sizeof(info->gfi_low)); + } + + /* Do we have a retained free extent? */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); + if (error) + goto err; + } + + /* Report any gaps at the end of the bg */ + info->gfi_last = true; + error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info); + if (error) + goto err; + +err: + ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list); + return error; +} + +/* Do we recognize the device? */ +static bool ext4_getfsmap_is_valid_device(struct super_block *sb, + struct ext4_fsmap *fm) +{ + if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || + fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) + return true; + if (EXT4_SB(sb)->journal_bdev && + fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev)) + return true; + return false; +} + +/* Ensure that the low key is less than the high key. */ +static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key, + struct ext4_fsmap *high_key) +{ + if (low_key->fmr_device > high_key->fmr_device) + return false; + if (low_key->fmr_device < high_key->fmr_device) + return true; + + if (low_key->fmr_physical > high_key->fmr_physical) + return false; + if (low_key->fmr_physical < high_key->fmr_physical) + return true; + + if (low_key->fmr_owner > high_key->fmr_owner) + return false; + if (low_key->fmr_owner < high_key->fmr_owner) + return true; + + return false; +} + +#define EXT4_GETFSMAP_DEVS 2 +/* + * Get filesystem's extents as described in head, and format for + * output. Calls formatter to fill the user's buffer until all + * extents are mapped, until the passed-in head->fmh_count slots have + * been filled, or until the formatter short-circuits the loop, if it + * is tracking filled-in extents on its own. + * + * Key to Confusion + * ---------------- + * There are multiple levels of keys and counters at work here: + * _fsmap_head.fmh_keys -- low and high fsmap keys passed in; + * these reflect fs-wide block addrs. + * dkeys -- fmh_keys used to query each device; + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. + * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this + * is how we detect gaps in the fsmap + * records and report them. + * _getfsmap_info.gfi_low/high -- per-bg low/high keys computed from + * dkeys; used to query the free space. + */ +int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, + ext4_fsmap_format_t formatter, void *arg) +{ + struct ext4_fsmap dkeys[2]; /* per-dev keys */ + struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS]; + struct ext4_getfsmap_info info = {0}; + int i; + int error = 0; + + if (head->fmh_iflags & ~FMH_IF_VALID) + return -EINVAL; + if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) || + !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1])) + return -EINVAL; + + head->fmh_entries = 0; + + /* Set up our device handlers. */ + memset(handlers, 0, sizeof(handlers)); + handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); + handlers[0].gfd_fn = ext4_getfsmap_datadev; + if (EXT4_SB(sb)->journal_bdev) { + handlers[1].gfd_dev = new_encode_dev( + EXT4_SB(sb)->journal_bdev->bd_dev); + handlers[1].gfd_fn = ext4_getfsmap_logdev; + } + + sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev), + ext4_getfsmap_dev_compare, NULL); + + /* + * To continue where we left off, we allow userspace to use the + * last mapping from a previous call as the low key of the next. + * This is identified by a non-zero length in the low key. We + * have to increment the low key in this scenario to ensure we + * don't return the same mapping again, and instead return the + * very next mapping. + * + * Bump the physical offset as there can be no other mapping for + * the same physical block range. + */ + dkeys[0] = head->fmh_keys[0]; + dkeys[0].fmr_physical += dkeys[0].fmr_length; + dkeys[0].fmr_owner = 0; + dkeys[0].fmr_length = 0; + memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap)); + + if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) + return -EINVAL; + + info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical + + head->fmh_keys[0].fmr_length; + info.gfi_formatter = formatter; + info.gfi_format_arg = arg; + info.gfi_head = head; + + /* For each device we support... */ + for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) { + /* Is this device within the range the user asked for? */ + if (!handlers[i].gfd_fn) + continue; + if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev) + continue; + if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev) + break; + + /* + * If this device number matches the high key, we have + * to pass the high key to the handler to limit the + * query results. If the device number exceeds the + * low key, zero out the low key so that we get + * everything from the beginning. + */ + if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device) + dkeys[1] = head->fmh_keys[1]; + if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device) + memset(&dkeys[0], 0, sizeof(struct ext4_fsmap)); + + info.gfi_dev = handlers[i].gfd_dev; + info.gfi_last = false; + info.gfi_agno = -1; + error = handlers[i].gfd_fn(sb, dkeys, &info); + if (error) + break; + info.gfi_next_fsblk = 0; + } + + head->fmh_oflags = FMH_OF_DEV_T; + return error; +} diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h new file mode 100644 index 000000000000..9a2cd367cc66 --- /dev/null +++ b/fs/ext4/fsmap.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __EXT4_FSMAP_H__ +#define __EXT4_FSMAP_H__ + +struct fsmap; + +/* internal fsmap representation */ +struct ext4_fsmap { + struct list_head fmr_list; + dev_t fmr_device; /* device id */ + uint32_t fmr_flags; /* mapping flags */ + uint64_t fmr_physical; /* device offset of segment */ + uint64_t fmr_owner; /* owner id */ + uint64_t fmr_length; /* length of segment, blocks */ +}; + +struct ext4_fsmap_head { + uint32_t fmh_iflags; /* control flags */ + uint32_t fmh_oflags; /* output flags */ + unsigned int fmh_count; /* # of entries in array incl. input */ + unsigned int fmh_entries; /* # of entries filled in (output). */ + + struct ext4_fsmap fmh_keys[2]; /* low and high keys */ +}; + +void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, + struct ext4_fsmap *src); +void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, + struct fsmap *src); + +/* fsmap to userspace formatter - copy to user & advance pointer */ +typedef int (*ext4_fsmap_format_t)(struct ext4_fsmap *, void *); + +int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, + ext4_fsmap_format_t formatter, void *arg); + +#define EXT4_QUERY_RANGE_ABORT 1 +#define EXT4_QUERY_RANGE_CONTINUE 0 + +/* fmr_owner special values for FS_IOC_GETFSMAP; some share w/ XFS */ +#define EXT4_FMR_OWN_FREE FMR_OWN_FREE /* free space */ +#define EXT4_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */ +#define EXT4_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */ +#define EXT4_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */ +#define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ +#define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */ +#define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */ +#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* inode bitmap */ +#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* block bitmap */ + +#endif /* __EXT4_FSMAP_H__ */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 17bc043308f3..98ac2f1f23b3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1098,6 +1098,17 @@ got: if (err) goto fail_drop; + /* + * Since the encryption xattr will always be unique, create it first so + * that it's less likely to end up in an external xattr block and + * prevent its deduplication. + */ + if (encrypt) { + err = fscrypt_inherit_context(dir, inode, handle, true); + if (err) + goto fail_free_drop; + } + err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; @@ -1119,12 +1130,6 @@ got: ei->i_datasync_tid = handle->h_transaction->t_tid; } - if (encrypt) { - err = fscrypt_inherit_context(dir, inode, handle, true); - if (err) - goto fail_free_drop; - } - err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 375fb1c05d49..d5dea4c293ef 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1034,7 +1034,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(dir, inode, de, inline_size, fname); + ext4_insert_dentry(inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 10b574ab354b..5834c4d76be8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1643,6 +1643,7 @@ struct mpage_da_data { */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ + unsigned int do_map:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -2179,6 +2180,9 @@ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, /* First block in the extent? */ if (map->m_len == 0) { + /* We cannot map unless handle is started... */ + if (!mpd->do_map) + return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; @@ -2231,6 +2235,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, /* Found extent to map? */ if (mpd->map.m_len) return 0; + /* Buffer needs mapping and handle is not started? */ + if (!mpd->do_map) + return 0; /* Everything mapped so far and we hit EOF */ break; } @@ -2747,6 +2754,29 @@ retry: tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); done = false; blk_start_plug(&plug); + + /* + * First writeback pages that don't need mapping - we can avoid + * starting a transaction unnecessarily and also avoid being blocked + * in the block layer on device congestion while having transaction + * started. + */ + mpd.do_map = 0; + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd.io_submit.io_end) { + ret = -ENOMEM; + goto unplug; + } + ret = mpage_prepare_extent_to_map(&mpd); + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + ext4_put_io_end_defer(mpd.io_submit.io_end); + mpd.io_submit.io_end = NULL; + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, false); + if (ret < 0) + goto unplug; + while (!done && mpd.first_page <= mpd.last_page) { /* For each extent of pages we use new io_end */ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); @@ -2775,8 +2805,10 @@ retry: wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd.io_submit.io_end); + mpd.io_submit.io_end = NULL; break; } + mpd.do_map = 1; trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); ret = mpage_prepare_extent_to_map(&mpd); @@ -2807,6 +2839,7 @@ retry: if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; + mpd.do_map = 0; } /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); @@ -2824,6 +2857,7 @@ retry: ext4_journal_stop(handle); } else ext4_put_io_end(mpd.io_submit.io_end); + mpd.io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2839,6 +2873,7 @@ retry: if (ret) break; } +unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; @@ -3305,6 +3340,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { + struct block_device *bdev; unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long last_block = (offset + length - 1) >> blkbits; @@ -3373,7 +3409,12 @@ retry: } iomap->flags = 0; - iomap->bdev = inode->i_sb->s_bdev; + bdev = inode->i_sb->s_bdev; + iomap->bdev = bdev; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; iomap->offset = first_block << blkbits; if (ret == 0) { @@ -3406,6 +3447,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; + put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; @@ -5848,6 +5890,11 @@ int ext4_page_mkwrite(struct vm_fault *vmf) file_update_time(vma->vm_file); down_read(&EXT4_I(inode)->i_mmap_sem); + + ret = ext4_convert_inline_data(inode); + if (ret) + goto out_ret; + /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 184e74eb3004..0c21e22acd74 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -19,6 +19,9 @@ #include <linux/delay.h> #include "ext4_jbd2.h" #include "ext4.h" +#include <linux/fsmap.h> +#include "fsmap.h" +#include <trace/events/ext4.h> /** * Swap memory between @a and @b for @len bytes. @@ -443,7 +446,7 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) return iflags; } -int ext4_shutdown(struct super_block *sb, unsigned long arg) +static int ext4_shutdown(struct super_block *sb, unsigned long arg) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u32 flags; @@ -489,6 +492,90 @@ int ext4_shutdown(struct super_block *sb, unsigned long arg) return 0; } +struct getfsmap_info { + struct super_block *gi_sb; + struct fsmap_head __user *gi_data; + unsigned int gi_idx; + __u32 gi_last_flags; +}; + +static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv) +{ + struct getfsmap_info *info = priv; + struct fsmap fm; + + trace_ext4_getfsmap_mapping(info->gi_sb, xfm); + + info->gi_last_flags = xfm->fmr_flags; + ext4_fsmap_from_internal(info->gi_sb, &fm, xfm); + if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm, + sizeof(struct fsmap))) + return -EFAULT; + + return 0; +} + +static int ext4_ioc_getfsmap(struct super_block *sb, + struct fsmap_head __user *arg) +{ + struct getfsmap_info info = {0}; + struct ext4_fsmap_head xhead = {0}; + struct fsmap_head head; + bool aborted = false; + int error; + + if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) + return -EFAULT; + if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || + memchr_inv(head.fmh_keys[0].fmr_reserved, 0, + sizeof(head.fmh_keys[0].fmr_reserved)) || + memchr_inv(head.fmh_keys[1].fmr_reserved, 0, + sizeof(head.fmh_keys[1].fmr_reserved))) + return -EINVAL; + /* + * ext4 doesn't report file extents at all, so the only valid + * file offsets are the magic ones (all zeroes or all ones). + */ + if (head.fmh_keys[0].fmr_offset || + (head.fmh_keys[1].fmr_offset != 0 && + head.fmh_keys[1].fmr_offset != -1ULL)) + return -EINVAL; + + xhead.fmh_iflags = head.fmh_iflags; + xhead.fmh_count = head.fmh_count; + ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]); + ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]); + + trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]); + trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]); + + info.gi_sb = sb; + info.gi_data = arg; + error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); + if (error == EXT4_QUERY_RANGE_ABORT) { + error = 0; + aborted = true; + } else if (error) + return error; + + /* If we didn't abort, set the "last" flag in the last fmx */ + if (!aborted && info.gi_idx) { + info.gi_last_flags |= FMR_OF_LAST; + if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags, + &info.gi_last_flags, + sizeof(info.gi_last_flags))) + return -EFAULT; + } + + /* copy back header */ + head.fmh_entries = xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) + return -EFAULT; + + return 0; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -499,6 +586,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { + case FS_IOC_GETFSMAP: + return ext4_ioc_getfsmap(sb, (void __user *)arg); case EXT4_IOC_GETFLAGS: flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); @@ -1007,6 +1096,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_GET_ENCRYPTION_PWSALT: case EXT4_IOC_GET_ENCRYPTION_POLICY: case EXT4_IOC_SHUTDOWN: + case FS_IOC_GETFSMAP: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 354dc1a894c2..5083bce20ac4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -357,7 +357,7 @@ static struct kmem_cache *ext4_free_data_cachep; #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; -static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { +static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" @@ -2393,7 +2393,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); - new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; @@ -5277,3 +5277,52 @@ out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } + +/* Iterate all the free extents in the group. */ +int +ext4_mballoc_query_range( + struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t end, + ext4_mballoc_query_range_fn formatter, + void *priv) +{ + void *bitmap; + ext4_grpblk_t next; + struct ext4_buddy e4b; + int error; + + error = ext4_mb_load_buddy(sb, group, &e4b); + if (error) + return error; + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); + + start = (e4b.bd_info->bb_first_free > start) ? + e4b.bd_info->bb_first_free : start; + if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + + while (start <= end) { + start = mb_find_next_zero_bit(bitmap, end + 1, start); + if (start > end) + break; + next = mb_find_next_bit(bitmap, end + 1, start); + + ext4_unlock_group(sb, group); + error = formatter(sb, group, start, next - start, priv); + if (error) + goto out_unload; + ext4_lock_group(sb, group); + + start = next + 1; + } + + ext4_unlock_group(sb, group); +out_unload: + ext4_mb_unload_buddy(&e4b); + + return error; +} diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 1aba469f8220..2bed62084a8c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -199,4 +199,21 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, return ext4_group_first_block_no(sb, fex->fe_group) + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } + +typedef int (*ext4_mballoc_query_range_fn)( + struct super_block *sb, + ext4_group_t agno, + ext4_grpblk_t start, + ext4_grpblk_t len, + void *priv); + +int +ext4_mballoc_query_range( + struct super_block *sb, + ext4_group_t agno, + ext4_grpblk_t start, + ext4_grpblk_t end, + ext4_mballoc_query_range_fn formatter, + void *priv); + #endif diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 07e5e1405771..b81f7d46f344 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1237,37 +1237,24 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) } /* - * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. + * Test whether a directory entry matches the filename being searched for. * - * `len <= EXT4_NAME_LEN' is guaranteed by caller. - * `de != NULL' is guaranteed by caller. + * Return: %true if the directory entry matches, otherwise %false. */ -static inline int ext4_match(struct ext4_filename *fname, - struct ext4_dir_entry_2 *de) +static inline bool ext4_match(const struct ext4_filename *fname, + const struct ext4_dir_entry_2 *de) { - const void *name = fname_name(fname); - u32 len = fname_len(fname); + struct fscrypt_name f; if (!de->inode) - return 0; + return false; + f.usr_fname = fname->usr_fname; + f.disk_name = fname->disk_name; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (unlikely(!name)) { - if (fname->usr_fname->name[0] == '_') { - int ret; - if (de->name_len < 16) - return 0; - ret = memcmp(de->name + de->name_len - 16, - fname->crypto_buf.name + 8, 16); - return (ret == 0) ? 1 : 0; - } - name = fname->crypto_buf.name; - len = fname->crypto_buf.len; - } + f.crypto_buf = fname->crypto_buf; #endif - if (de->name_len != len) - return 0; - return (memcmp(de->name, name, len) == 0) ? 1 : 0; + return fscrypt_match_name(&f, de->name, de->name_len); } /* @@ -1281,48 +1268,31 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - int res; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ - if ((char *) de + de->name_len <= dlimit) { - res = ext4_match(fname, de); - if (res < 0) { - res = -1; - goto return_result; - } - if (res > 0) { - /* found a match - just to be sure, do - * a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, - bh->b_size, offset)) { - res = -1; - goto return_result; - } - *res_dir = de; - res = 1; - goto return_result; - } - + if ((char *) de + de->name_len <= dlimit && + ext4_match(fname, de)) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) + return -1; + *res_dir = de; + return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (de_len <= 0) { - res = -1; - goto return_result; - } + if (de_len <= 0) + return -1; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } - - res = 0; -return_result: - return res; + return 0; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, @@ -1616,16 +1586,9 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - int nokey = ext4_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - if (nokey) { - iput(inode); - return ERR_PTR(-ENOKEY); - } ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", - (unsigned long) dir->i_ino, - (unsigned long) inode->i_ino); + dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); } @@ -1833,24 +1796,15 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, int nlen, rlen; unsigned int offset = 0; char *top; - int res; de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, - buf, buf_size, offset)) { - res = -EFSCORRUPTED; - goto return_result; - } - /* Provide crypto context and crypto buffer to ext4 match */ - res = ext4_match(fname, de); - if (res < 0) - goto return_result; - if (res > 0) { - res = -EEXIST; - goto return_result; - } + buf, buf_size, offset)) + return -EFSCORRUPTED; + if (ext4_match(fname, de)) + return -EEXIST; nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) @@ -1858,22 +1812,17 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } - if ((char *) de > top) - res = -ENOSPC; - else { - *dest_de = de; - res = 0; - } -return_result: - return res; + return -ENOSPC; + + *dest_de = de; + return 0; } -int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - struct ext4_filename *fname) +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname) { int nlen, rlen; @@ -1892,7 +1841,6 @@ int ext4_insert_dentry(struct inode *dir, ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); - return 0; } /* @@ -1928,11 +1876,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, return err; } - /* By now the buffer is marked for journaling. Due to crypto operations, - * the following function call may fail */ - err = ext4_insert_dentry(dir, inode, de, blocksize, fname); - if (err < 0) - return err; + /* By now the buffer is marked for journaling */ + ext4_insert_dentry(inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 208241b06662..1a82138ba739 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -297,8 +297,17 @@ static void ext4_end_bio(struct bio *bio) { ext4_io_end_t *io_end = bio->bi_private; sector_t bi_sector = bio->bi_iter.bi_sector; + char b[BDEVNAME_SIZE]; - BUG_ON(!io_end); + if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", + bdevname(bio->bi_bdev, b), + (long long) bio->bi_iter.bi_sector, + (unsigned) bio_sectors(bio), + bio->bi_error)) { + ext4_finish_bio(bio); + bio_put(bio); + return; + } bio->bi_end_io = NULL; if (bio->bi_error) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a9c72e39a4ee..c90edf09b0c3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -49,6 +49,7 @@ #include "xattr.h" #include "acl.h" #include "mballoc.h" +#include "fsmap.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> @@ -1230,7 +1231,7 @@ static const struct fscrypt_operations ext4_cryptops = { #endif #ifdef CONFIG_QUOTA -static char *quotatypes[] = INITQFNAMES; +static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); @@ -1443,7 +1444,8 @@ static ext4_fsblk_t get_sb_block(void **data) } #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" +static const char deprecated_msg[] = + "Mount option \"%s\" will be removed by %s\n" "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; #ifdef CONFIG_QUOTA @@ -2153,7 +2155,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) return 0; size = roundup_pow_of_two(size * sizeof(struct flex_groups)); - new_groups = ext4_kvzalloc(size, GFP_KERNEL); + new_groups = kvzalloc(size, GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size / (int) sizeof(struct flex_groups)); @@ -3887,7 +3889,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } } - sbi->s_group_desc = ext4_kvmalloc(db_count * + sbi->s_group_desc = kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { @@ -3898,6 +3900,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) bgl_lock_init(sbi->s_blockgroup_lock); + /* Pre-read the descriptors into the buffer cache */ + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + sb_breadahead(sb, block); + } + for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); @@ -4650,7 +4658,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (sync) { unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, - test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC); + REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); if (error) return error; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 42145be5c6b4..d74dc5f81a04 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -34,7 +34,7 @@ typedef enum { ptr_ext4_super_block_offset, } attr_ptr_t; -static const char *proc_dirname = "fs/ext4"; +static const char proc_dirname[] = "fs/ext4"; static struct proc_dir_entry *ext4_proc_root; struct ext4_attr { @@ -375,7 +375,7 @@ static const struct file_operations ext4_seq_##name##_fops = { \ PROC_FILE_SHOW_DEFN(es_shrinker_info); PROC_FILE_SHOW_DEFN(options); -static struct ext4_proc_files { +static const struct ext4_proc_files { const char *name; const struct file_operations *fops; } proc_files[] = { @@ -388,7 +388,7 @@ static struct ext4_proc_files { int ext4_register_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_proc_files *p; + const struct ext4_proc_files *p; int err; sbi->s_kobj.kset = &ext4_kset; @@ -412,7 +412,7 @@ int ext4_register_sysfs(struct super_block *sb) void ext4_unregister_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_proc_files *p; + const struct ext4_proc_files *p; if (sbi->s_proc) { for (p = proc_files; p->name; p++) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 996e7900d4c8..8fb7ce14e6eb 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -78,10 +78,8 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); -static int ext4_xattr_list(struct dentry *dentry, char *buffer, - size_t buffer_size); -static const struct xattr_handler *ext4_xattr_handler_map[] = { +static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, @@ -163,20 +161,9 @@ ext4_xattr_handler(int name_index) return handler; } -/* - * Inode operation listxattr() - * - * d_inode(dentry)->i_mutex: don't care - */ -ssize_t -ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - return ext4_xattr_list(dentry, buffer, size); -} - static int -ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, - void *value_start) +ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, + void *value_start) { struct ext4_xattr_entry *e = entry; @@ -230,8 +217,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) return -EFSCORRUPTED; if (!ext4_xattr_block_csum_verify(inode, bh)) return -EFSBADCRC; - error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, - bh->b_data); + error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data); if (!error) set_buffer_verified(bh); return error; @@ -246,7 +233,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, if (end - (void *)header < sizeof(*header) + sizeof(u32) || (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; - error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); + error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); errout: if (error) __ext4_error_inode(inode, function, line, 0, @@ -257,20 +244,9 @@ errout: #define xattr_check_inode(inode, header, end) \ __xattr_check_inode((inode), (header), (end), __func__, __LINE__) -static inline int -ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) -{ - size_t value_size = le32_to_cpu(entry->e_value_size); - - if (entry->e_value_block != 0 || value_size > size || - le16_to_cpu(entry->e_value_offs) + value_size > size) - return -EFSCORRUPTED; - return 0; -} - static int ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, - const char *name, size_t size, int sorted) + const char *name, int sorted) { struct ext4_xattr_entry *entry; size_t name_len; @@ -290,8 +266,6 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, break; } *pentry = entry; - if (!cmp && ext4_xattr_check_entry(entry, size)) - return -EFSCORRUPTED; return cmp ? -ENODATA : 0; } @@ -319,7 +293,6 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(inode, bh)) { -bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EFSCORRUPTED; @@ -327,9 +300,7 @@ bad_block: } ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); - error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); - if (error == -EFSCORRUPTED) - goto bad_block; + error = ext4_xattr_find_entry(&entry, name_index, name, 1); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); @@ -366,13 +337,12 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; - error = ext4_xattr_find_entry(&entry, name_index, name, - end - (void *)entry, 0); + entry = IFIRST(header); + error = ext4_xattr_find_entry(&entry, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); @@ -519,7 +489,9 @@ cleanup: } /* - * ext4_xattr_list() + * Inode operation listxattr() + * + * d_inode(dentry)->i_rwsem: don't care * * Copy a list of attribute names into the buffer * provided, or compute the buffer size required. @@ -528,8 +500,8 @@ cleanup: * Returns a negative error number on failure, or the number of bytes * used / required on success. */ -static int -ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +ssize_t +ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; @@ -804,7 +776,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; error = ext4_xattr_find_entry(&bs->s.here, i->name_index, - i->name, bs->bh->b_size, 1); + i->name, 1); if (error && error != -ENODATA) goto cleanup; bs->s.not_found = error; @@ -1076,8 +1048,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return error; /* Find the named attribute. */ error = ext4_xattr_find_entry(&is->s.here, i->name_index, - i->name, is->s.end - - (void *)is->s.base, 0); + i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0339daf4ca02..ea9c317b5916 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -275,10 +275,11 @@ static int f2fs_write_meta_pages(struct address_space *mapping, get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, META); + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); + trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); @@ -567,7 +568,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (ni.blk_addr != NULL_ADDR) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", + "%s: orphan failed (ino=%x) by kernel, retry mount.", __func__, ino); return -EIO; } @@ -677,7 +678,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); - if (crc_offset >= blk_size) { + if (crc_offset > (blk_size - sizeof(__le32))) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; @@ -816,7 +817,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + if (!f2fs_is_volatile_file(inode)) + list_add_tail(&F2FS_I(inode)->dirty_list, + &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } @@ -941,6 +944,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) return 0; } +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -964,21 +980,26 @@ retry_flush_dents: err = sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; + cond_resched(); goto retry_flush_dents; } + /* + * POR: we should ensure that there are no dirty node pages + * until finishing nat/sit flush. inode->i_blocks can be updated. + */ + down_write(&sbi->node_change); + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) goto out; + cond_resched(); goto retry_flush_dents; } - /* - * POR: we should ensure that there are no dirty node pages - * until finishing nat/sit flush. - */ retry_flush_nodes: down_write(&sbi->node_write); @@ -986,11 +1007,20 @@ retry_flush_nodes: up_write(&sbi->node_write); err = sync_node_pages(sbi, &wbc); if (err) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); goto out; } + cond_resched(); goto retry_flush_nodes; } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. + */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); out: blk_finish_plug(&plug); return err; @@ -1024,16 +1054,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_lock(&sbi->cp_lock); - if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count > + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) disable_nat_bits(sbi, false); - if (cpc->reason == CP_UMOUNT) + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - if (cpc->reason == CP_FASTBOOT) + if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); @@ -1057,7 +1091,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; - nid_t last_nid = nm_i->next_scan_nid; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1074,14 +1107,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return -EIO; } - next_free_nid(sbi, &last_nid); - /* * modify checkpoint * version number is already updated */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1100,10 +1130,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); - /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); spin_lock(&sbi->cp_lock); @@ -1143,7 +1169,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write nat bits */ if (enabled_nat_bits(sbi, cpc)) { __u64 cp_ver = cur_cp_version(ckpt); - unsigned int i; block_t blk; cp_ver |= ((__u64)crc32 << 32); @@ -1250,8 +1275,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || - (cpc->reason == CP_DISCARD && !sbi->discard_blks))) + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1273,7 +1298,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { if (!exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; @@ -1311,7 +1336,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); - if (cpc->reason == CP_RECOVERY) + if (cpc->reason & CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1602b4bccae6..7c0f6bdf817d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -309,7 +309,7 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = REQ_META | REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; } @@ -341,7 +341,7 @@ void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) /* * Fill the locked page with data located in the block address. - * Return unlocked page. + * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { @@ -362,6 +362,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); return 0; } @@ -787,6 +790,21 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) return err; } +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + /* * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with * f2fs_map_blocks structure. @@ -829,7 +847,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (create) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -939,7 +957,7 @@ skip: f2fs_put_dnode(&dn); if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -948,7 +966,7 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -1151,9 +1169,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - prefetchw(&page->flags); if (pages) { page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, @@ -1283,17 +1302,83 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + return 0; + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (!IS_ERR(fio->encrypted_page)) + return 0; + + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_bios(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); +} + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + if (is_cold_data(fio->page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return false; + + return need_inplace_update_policy(inode, fio); +} + +static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +{ + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (fio->old_blkaddr == NULL_ADDR) + return false; + return true; +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + bool ipu_force = false; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + + if (valid_ipu_blkaddr(fio)) { + ipu_force = true; + fio->need_lock = false; + goto got_it; + } + } + + if (fio->need_lock) + f2fs_lock_op(fio->sbi); + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) - return err; + goto out; fio->old_blkaddr = dn.data_blkaddr; @@ -1302,31 +1387,10 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageUptodate(page); goto out_writepage; } - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - gfp_t gfp_flags = GFP_NOFS; - - /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->old_blkaddr); -retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - PAGE_SIZE, 0, - fio->page->index, - gfp_flags); - if (IS_ERR(fio->encrypted_page)) { - err = PTR_ERR(fio->encrypted_page); - if (err == -ENOMEM) { - /* flush pending ios and wait for a while */ - f2fs_flush_merged_bios(F2FS_I_SB(inode)); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - err = 0; - goto retry_encrypt; - } - goto out_writepage; - } - } +got_it: + err = encrypt_one_page(fio); + if (err) + goto out_writepage; set_page_writeback(page); @@ -1334,22 +1398,27 @@ retry_encrypt: * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(fio->old_blkaddr != NEW_ADDR && - !is_cold_data(page) && - !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode))) { - rewrite_data_page(fio); + if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + f2fs_put_dnode(&dn); + if (fio->need_lock) + f2fs_unlock_op(fio->sbi); + err = rewrite_data_page(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - trace_f2fs_do_write_data_page(page, IPU); - } else { - write_data_page(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + return err; } + + /* LFS mode write path */ + write_data_page(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock) + f2fs_unlock_op(fio->sbi); return err; } @@ -1370,9 +1439,11 @@ static int __write_data_page(struct page *page, bool *submitted, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, .submitted = false, + .need_lock = true, }; trace_f2fs_writepage(page, DATA); @@ -1408,6 +1479,7 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + fio.need_lock = false; err = do_write_data_page(&fio); goto done; } @@ -1416,6 +1488,8 @@ write: need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; if (f2fs_has_inline_data(inode)) { @@ -1423,12 +1497,12 @@ write: if (!err) goto out; } - f2fs_lock_op(sbi); + if (err == -EAGAIN) err = do_write_data_page(&fio); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - f2fs_unlock_op(sbi); + done: if (err && err != -ENOENT) goto redirty_out; @@ -1441,12 +1515,14 @@ out: if (wbc->for_reclaim) { f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, DATA, WRITE); + clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); submitted = NULL; } unlock_page(page); - f2fs_balance_fs(sbi, need_balance_fs); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); @@ -1495,6 +1571,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pagevec_init(&pvec, 0); + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1580,8 +1662,10 @@ continue_unlock: last_idx = page->index; } - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { + /* give a priority to WB_SYNC threads */ + if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || + --wbc->nr_to_write <= 0) && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1637,9 +1721,18 @@ static int f2fs_write_data_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, DATA); + /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req); + else if (atomic_read(&sbi->wb_sync_req)) + goto skip_write; + blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc); blk_finish_plug(&plug); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. @@ -1687,7 +1780,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); locked = true; } restart: @@ -1723,7 +1816,8 @@ restart: err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); locked = true; goto restart; } @@ -1737,7 +1831,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } @@ -1951,7 +2045,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return; + return drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ee2d0a485fc3..87f449845f5f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -51,15 +51,26 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); - if (SM_I(sbi) && SM_I(sbi)->fcc_info) - si->nr_flush = - atomic_read(&SM_I(sbi)->fcc_info->submit_flush); - if (SM_I(sbi) && SM_I(sbi)->dcc_info) - si->nr_discard = - atomic_read(&SM_I(sbi)->dcc_info->submit_discard); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -86,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -99,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } for (i = 0; i < 2; i++) { @@ -124,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + blks_per_sec = BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; @@ -156,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; @@ -208,8 +224,11 @@ get_cache: /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); - if (SM_I(sbi)->dcc_info) + if (SM_I(sbi)->dcc_info) { si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + @@ -330,11 +349,16 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n", + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, - si->nr_flush, si->nr_discard); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt); + si->nr_flushing, si->nr_flushed, + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. %4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -347,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", - si->free_nids, si->alloc_nids); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -434,7 +458,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8d5c62b07b28..94756f55a97e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct fscrypt_str de_name = FSTR_INIT(NULL, 0); - struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -130,17 +128,9 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - /* encrypted case */ - de_name.name = d->filename[bit_pos]; - de_name.len = le16_to_cpu(de->name_len); - - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == cpu_to_le32(fname->hash)) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) goto found; if (max_slots && max_len > *max_slots) @@ -170,12 +160,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - f2fs_hash_t namehash; - - if(fname->hash) - namehash = cpu_to_le32(fname->hash); - else - namehash = f2fs_dentry_hash(&name); + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -207,13 +192,9 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - /* This is to increase the speed of f2fs_create */ - if (!de && room) { - F2FS_I(dir)->task = current; - if (F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; - } + if (!de && room && F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; } return de; @@ -254,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, break; } out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } @@ -337,24 +321,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name) -{ - struct page *page; - - if (file_enc_name(to)) - return 0; - - page = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); - - init_dent_inode(name, page); - f2fs_put_page(page, 1); - - return 0; -} - void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { @@ -384,7 +350,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -438,8 +404,11 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (new_name) + if (new_name) { init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } /* * This file should be checkpointed during fsync. @@ -542,7 +511,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name); + dentry_hash = f2fs_dentry_hash(new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -599,11 +568,9 @@ add_dentry: err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -911,7 +878,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c6934f014e0f..2f98d7039701 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -18,6 +18,179 @@ #include "node.h" #include <trace/events/f2fs.h> +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * @next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simpfy the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool force) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs || force) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1 || force) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; @@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en = et->cached_en; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { - stat_inc_cached_node_hit(sbi); - return en; - } - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - stat_inc_rbtree_node_hit(sbi); - return en; - } - } - return NULL; -} - static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { @@ -237,17 +380,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = __lookup_extent_tree(sbi, et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - } - spin_unlock(&sbi->extent_lock); - ret = true; + en = (struct extent_node *)__lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; } + spin_unlock(&sbi->extent_lock); + ret = true; out: stat_inc_total_hit(sbi); read_unlock(&et->lock); @@ -256,83 +406,6 @@ out: return ret; } - -/* - * lookup extent at @fofs, if hit, return the extent - * if not, return NULL and - * @prev_ex: extent before fofs - * @next_ex: extent after fofs - * @insert_p: insert point for new extent at fofs - * in order to simpfy the insertion after. - * tree must stay unchanged between lookup and insertion. - */ -static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, - struct extent_node **prev_ex, - struct extent_node **next_ex, - struct rb_node ***insert_p, - struct rb_node **insert_parent) -{ - struct rb_node **pnode = &et->root.rb_node; - struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en = et->cached_en; - - *insert_p = NULL; - *insert_parent = NULL; - *prev_ex = NULL; - *next_ex = NULL; - - if (RB_EMPTY_ROOT(&et->root)) - return NULL; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - goto lookup_neighbors; - } - - while (*pnode) { - parent = *pnode; - en = rb_entry(*pnode, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - pnode = &(*pnode)->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - pnode = &(*pnode)->rb_right; - else - goto lookup_neighbors; - } - - *insert_p = pnode; - *insert_parent = parent; - - en = rb_entry(parent, struct extent_node, rb_node); - tmp_node = parent; - if (parent && fofs > en->ei.fofs) - tmp_node = rb_next(parent); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - - tmp_node = parent; - if (parent && fofs < en->ei.fofs) - tmp_node = rb_prev(parent); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - return NULL; - -lookup_neighbors: - if (fofs == en->ei.fofs) { - /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&en->rb_node); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - if (fofs == en->ei.fofs + en->ei.len - 1) { - /* lookup next node for merging frontward later */ - tmp_node = rb_next(&en->rb_node); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - return en; -} - static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, @@ -387,17 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) - p = &(*p)->rb_left; - else if (ei->fofs >= en->ei.fofs + en->ei.len) - p = &(*p)->rb_right; - else - f2fs_bug_on(sbi, 1); - } + p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -447,8 +510,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, - &insert_p, &insert_parent); + en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false); if (!en) en = next_en; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0a6e115562f6..2185c7a040a1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -50,6 +50,7 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, FAULT_MAX, @@ -62,7 +63,7 @@ struct f2fs_fault_info { }; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif /* @@ -88,9 +89,9 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -124,22 +125,20 @@ enum { SIT_BITMAP }; -enum { - CP_UMOUNT, - CP_FASTBOOT, - CP_SYNC, - CP_RECOVERY, - CP_DISCARD, -}; +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 #define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) -#define MAX_DISCARD_BLOCKS(sbi) \ - ((1 << (sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec) -#define DISCARD_ISSUE_RATE 8 +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -181,37 +180,63 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; -/* for the list of blockaddresses to be discarded */ +/* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ - block_t blkaddr; /* block address to be discarded */ - int len; /* # of consecutive blocks of the discard */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ + (MAX_PLIST_NUM - 1) : (blk_num - 1)) + enum { D_PREP, D_SUBMIT, D_DONE, }; +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ +}; + struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; struct list_head list; /* command list */ struct completion wait; /* compleation */ - block_t lstart; /* logical start address */ - block_t len; /* length */ - struct bio *bio; /* bio */ - int state; /* state */ + struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ + unsigned char state; /* state */ + int error; /* bio error */ }; struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ - struct list_head discard_entry_list; /* 4KB discard entry list */ - int nr_discards; /* # of discards in the list */ - struct list_head discard_cmd_list; /* discard cmd list */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; - int max_discards; /* max. discards to be issued */ - atomic_t submit_discard; /* # of issued discard */ + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. discards to be issued */ + unsigned int undiscard_blks; /* # of undiscard blocks */ + atomic_t issued_discard; /* # of issued discard */ + atomic_t issing_discard; /* # of issing discard */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root root; /* root of discard rb-tree */ }; /* for the list of fsync inodes, used only during recovery */ @@ -222,13 +247,13 @@ struct fsync_inode_entry { block_t last_dentry; /* block address locating the last dentry */ }; -#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) -#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) @@ -270,11 +295,14 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -311,6 +339,11 @@ struct f2fs_move_range { u64 len; /* size to move */ }; +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + /* * For INODE and NODE manager */ @@ -323,26 +356,24 @@ struct f2fs_dentry_ptr { int max; }; -static inline void make_dentry_ptr(struct inode *inode, - struct f2fs_dentry_ptr *d, void *src, int type) +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} - if (type == 1) { - struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; - - d->max = NR_DENTRY_IN_BLOCK; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } else { - struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; - - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) +{ + d->inode = inode; + d->max = NR_INLINE_DENTRY; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; } /* @@ -374,16 +405,30 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + struct extent_info { unsigned int fofs; /* start offset in a file */ - u32 blk; /* start block address of the extent */ unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ }; struct extent_node { - struct rb_node rb_node; /* rb node located in rb-tree */ + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; struct list_head list; /* node in global extent list of sbi */ - struct extent_info ei; /* extent info */ struct extent_tree *et; /* extent tree pointer */ }; @@ -500,6 +545,24 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front) +{ + return back->lstart + back->len == front->lstart; +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back) +{ + return __is_discard_mergeable(back, cur); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front) +{ + return __is_discard_mergeable(cur, front); +} + static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { @@ -562,7 +625,6 @@ struct f2fs_nm_info { unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid count of NAT block */ - spinlock_t free_nid_lock; /* protect updating of nid count */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ @@ -641,7 +703,8 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - atomic_t submit_flush; /* # of issued flushes */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t issing_flush; /* # of issing flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -672,6 +735,7 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; @@ -722,6 +786,7 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_INVALIDATE, INMEM_REVOKE, IPU, OPU, @@ -737,9 +802,10 @@ struct f2fs_io_info { struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ bool submitted; /* indicate IO submission */ + bool need_lock; /* indicate we need to lock cp_rwsem */ }; -#define is_read_io(rw) (rw == READ) +#define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ @@ -827,6 +893,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct rw_semaphore node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -879,6 +946,9 @@ struct f2fs_sb_info { /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* writeback control */ + atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + /* valid inode count */ struct percpu_counter total_valid_inode_count; @@ -912,11 +982,12 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t aw_cnt; /* # of atomic writes */ + atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ + atomic_t max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif - unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ /* For sysfs suppport */ @@ -971,8 +1042,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) * and the return value is in kbytes. s is of struct f2fs_sb_info. */ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ - s->sectors_written_start) >> 1) +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ + (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { @@ -1193,7 +1264,7 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, { bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - return (cpc) ? (cpc->reason == CP_UMOUNT) && set : set; + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) @@ -1229,7 +1300,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi) static inline bool __remain_node_summaries(int reason) { - return (reason == CP_UMOUNT || reason == CP_FASTBOOT); + return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) @@ -1707,6 +1778,7 @@ enum { FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1869,12 +1941,6 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DATA); } -static inline void f2fs_clear_inline_inode(struct inode *inode) -{ - clear_inode_flag(inode, FI_INLINE_DATA); - clear_inode_flag(inode, FI_DATA_EXIST); -} - static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); @@ -2005,36 +2071,10 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kmalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; -} - -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kzalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); - return ret; -} - #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) -/* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ - ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ - (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) - /* * file.c */ @@ -2096,8 +2136,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct page **page); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode); -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos); @@ -2133,7 +2171,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); /* * node.c @@ -2184,6 +2223,7 @@ void destroy_node_manager_caches(void); */ void register_inmem_page(struct inode *inode, struct page *page); void drop_inmem_pages(struct inode *inode); +void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); @@ -2193,7 +2233,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); @@ -2205,7 +2245,7 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); void write_node_page(unsigned int nid, struct f2fs_io_info *fio); void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); -void rewrite_data_page(struct f2fs_io_info *fio); +int rewrite_data_page(struct f2fs_io_info *fio); void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr); @@ -2310,7 +2350,8 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, int start_gc_thread(struct f2fs_sb_info *sbi); void stop_gc_thread(struct f2fs_sb_info *sbi); block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); void build_gc_manager(struct f2fs_sb_info *sbi); /* @@ -2334,11 +2375,15 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; - int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; + int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int aw_cnt, max_aw_cnt; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2421,11 +2466,22 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) { \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ si->data_segs++; \ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ } else { \ @@ -2435,14 +2491,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } while (0) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ - si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ @@ -2450,7 +2506,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ - si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) int f2fs_build_stats(struct f2fs_sb_info *sbi); @@ -2458,32 +2514,35 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else -#define stat_inc_cp_count(si) -#define stat_inc_bg_cp_count(si) -#define stat_inc_call_count(si) -#define stat_inc_bggc_count(si) -#define stat_inc_dirty_inode(sbi, type) -#define stat_dec_dirty_inode(sbi, type) -#define stat_inc_total_hit(sb) -#define stat_inc_rbtree_node_hit(sb) -#define stat_inc_largest_node_hit(sbi) -#define stat_inc_cached_node_hit(sbi) -#define stat_inc_inline_xattr(inode) -#define stat_dec_inline_xattr(inode) -#define stat_inc_inline_inode(inode) -#define stat_dec_inline_inode(inode) -#define stat_inc_inline_dir(inode) -#define stat_dec_inline_dir(inode) -#define stat_inc_atomic_write(inode) -#define stat_dec_atomic_write(inode) -#define stat_update_max_atomic_write(inode) -#define stat_inc_seg_type(sbi, curseg) -#define stat_inc_block_count(sbi, curseg) -#define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(sbi, type, gc_type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(sbi, blks, gc_type) -#define stat_inc_node_blk_count(sbi, blks, gc_type) +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } @@ -2509,7 +2568,7 @@ extern struct kmem_cache *inode_entry_slab; bool f2fs_may_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void read_inline_data(struct page *page, struct page *ipage); -bool truncate_inline_inode(struct page *ipage, u64 from); +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); @@ -2544,6 +2603,18 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); void f2fs_drop_extent_tree(struct inode *inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5f7317875a67..61af721329fa 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -116,11 +116,6 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - if (update_dent_inode(inode, inode, &dentry->d_name)) { - dput(dentry); - return 0; - } - *pino = parent_ino(dentry); dput(dentry); return 1; @@ -528,7 +523,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - return 0; + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); @@ -566,9 +561,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - truncate_inline_inode(ipage, from); - if (from == 0) - clear_inode_flag(inode, FI_DATA_EXIST); + truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; @@ -617,6 +610,12 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } +#endif /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -1012,11 +1011,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1188,8 +1187,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - if (offset + len > new_size) - new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { @@ -1257,8 +1254,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) int ret = 0; new_size = i_size_read(inode) + len; - if (new_size > inode->i_sb->s_maxbytes) - return -EFBIG; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; if (offset >= i_size_read(inode)) return -EINVAL; @@ -1428,6 +1426,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); @@ -1474,10 +1473,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if (ret) return ret; - flags = f2fs_mask_flags(inode->i_mode, flags); - inode_lock(inode); + flags = f2fs_mask_flags(inode->i_mode, flags); + oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -1491,10 +1490,11 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - inode_unlock(inode); inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); + + inode_unlock(inode); out: mnt_drop_write_file(filp); return ret; @@ -1515,6 +1515,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1529,20 +1532,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; set_inode_flag(inode, FI_ATOMIC_FILE); + set_inode_flag(inode, FI_HOT_DATA); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - goto out; + goto inc_stat; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); -out: + goto out; + } + +inc_stat: stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); +out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1592,6 +1600,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1605,6 +1616,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -1660,6 +1674,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } @@ -1841,7 +1856,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync, true); + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); out: mnt_drop_write_file(filp); return ret; @@ -1879,13 +1894,12 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; - unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; block_t blk_end = 0; bool fragmented = false; int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode)) + if (need_inplace_update_policy(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -1943,7 +1957,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk = pg_start; map.m_len = pg_end - pg_start; - sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -2020,42 +2034,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) return -EINVAL; - err = mnt_want_write_file(filp); - if (err) - return err; - - if (f2fs_readonly(sbi->sb)) { - err = -EROFS; - goto out; - } + if (f2fs_readonly(sbi->sb)) + return -EROFS; if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, - sizeof(range))) { - err = -EFAULT; - goto out; - } + sizeof(range))) + return -EFAULT; /* verify alignment of offset & size */ - if (range.start & (F2FS_BLKSIZE - 1) || - range.len & (F2FS_BLKSIZE - 1)) { - err = -EINVAL; - goto out; - } + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + f2fs_update_time(sbi, REQ_TIME); if (err < 0) - goto out; + return err; if (copy_to_user((struct f2fs_defragment __user *)arg, &range, sizeof(range))) - err = -EFAULT; -out: - mnt_drop_write_file(filp); - return err; + return -EFAULT; + + return 0; } static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, @@ -2189,6 +2201,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); + if (err) + goto err_out; if (copy_to_user((struct f2fs_move_range __user *)arg, &range, sizeof(range))) @@ -2198,6 +2212,69 @@ err_out: return err; } +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -2235,6 +2312,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); default: return -ENOTTY; } @@ -2302,8 +2381,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - break; case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 418fd9881646..026522107ca3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -84,7 +84,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -172,7 +172,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - p->offset = sbi->last_victim[p->gc_mode]; + /* let's select beginning hot/small space first */ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -182,7 +186,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, if (p->alloc_mode == SSR) return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return sbi->blocks_per_seg * p->ofs_unit; + return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -207,7 +211,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) continue; clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -215,8 +219,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; @@ -225,7 +229,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) for (i = 0; i < sbi->segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); mtime = div_u64(mtime, sbi->segs_per_sec); vblocks = div_u64(vblocks, sbi->segs_per_sec); @@ -248,7 +252,7 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int valid_blocks = - get_valid_blocks(sbi, segno, sbi->segs_per_sec); + get_valid_blocks(sbi, segno, true); return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? valid_blocks * 2 : valid_blocks; @@ -291,6 +295,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); @@ -304,10 +309,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = get_max_cost(sbi, &p); + if (*result != NULL_SEGNO) { + if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && + get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + if (p.max_search == 0) goto out; - last_victim = sbi->last_victim[p.gc_mode]; + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -320,9 +333,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); if (segno >= last_segment) { - if (sbi->last_victim[p.gc_mode]) { - last_segment = sbi->last_victim[p.gc_mode]; - sbi->last_victim[p.gc_mode] = 0; + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } @@ -339,7 +353,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; @@ -357,17 +371,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } next: if (nsearched >= p.max_search) { - if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) - sbi->last_victim[p.gc_mode] = last_victim + 1; + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; else - sbi->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } if (p.min_segno != NULL_SEGNO) { got_it: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) sbi->cur_victim_sec = secno; else @@ -550,8 +565,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, get_node_info(sbi, nid, dni); if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return false; + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); } *nofs = ofs_of_node(node_page); @@ -697,8 +714,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .need_lock = true, }; bool is_dirty = PageDirty(page); int err; @@ -890,7 +909,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, 1) == 0 || + if (get_valid_blocks(sbi, segno, false) == 0 || !PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto next; @@ -905,7 +924,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - mutex_lock(sentry_lock) - change_curseg() * - lock_page(sum_page) */ - if (type == SUM_TYPE_NODE) gc_node_segment(sbi, sum->entries, segno, gc_type); else @@ -924,7 +942,7 @@ next: blk_finish_plug(&plug); if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + get_valid_blocks(sbi, start_segno, true) == 0) sec_freed = 1; stat_inc_call_count(sbi->stat_info); @@ -932,13 +950,14 @@ next: return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) { - unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; int ret = -EINVAL; struct cp_control cpc; + unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), @@ -959,9 +978,11 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; + if (prefree_segments(sbi)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } if (has_not_enough_free_secs(sbi, 0, 0)) gc_type = FG_GC; } @@ -981,13 +1002,17 @@ gc_more: sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + segno = NULL_SEGNO; goto gc_more; + } if (gc_type == FG_GC) ret = write_checkpoint(sbi, &cpc); } stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); @@ -999,7 +1024,7 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { - u64 main_count, resv_count, ovp_count, blocks_per_sec; + u64 main_count, resv_count, ovp_count; DIRTY_I(sbi)->v_ops = &default_v_ops; @@ -1007,8 +1032,12 @@ void build_gc_manager(struct f2fs_sb_info *sbi) main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; - blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; - sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec, - (main_count - resv_count)); + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * + BLKS_PER_SEC(sbi), (main_count - resv_count)); + + /* give warm/cold data area from slower device */ + if (sbi->s_ndevs && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431e..eb2e031ea887 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e32a9e527968..e4c527c4e7d0 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -63,19 +63,21 @@ void read_inline_data(struct page *page, struct page *ipage) SetPageUptodate(page); } -bool truncate_inline_inode(struct page *ipage, u64 from) +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; if (from >= MAX_INLINE_DATA) - return false; + return; addr = inline_data_addr(ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); set_page_dirty(ipage); - return true; + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -135,6 +137,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { @@ -146,11 +149,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode_page, 0); + truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); - f2fs_clear_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); f2fs_put_dnode(dn); return 0; } @@ -267,9 +270,8 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - if (!truncate_inline_inode(ipage, 0)) - return false; - f2fs_clear_inline_inode(inode); + truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -296,11 +298,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(NULL, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -319,7 +321,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, dentry_blk = inline_data_addr(ipage); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + make_dentry_ptr_inline(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -380,7 +382,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -400,7 +402,7 @@ static int f2fs_add_inline_entries(struct inode *dir, unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(NULL, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -455,7 +457,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, } memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -527,14 +529,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(new_name); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + name_hash = f2fs_dentry_hash(new_name, NULL); + make_dentry_ptr_inline(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -623,7 +623,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 24bb8213d974..518f49643092 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -316,7 +316,6 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -339,7 +338,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode) && wbc && wbc->nr_to_write) + update_inode_page(inode); + if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } @@ -372,13 +372,6 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) { - f2fs_show_injection_info(FAULT_EVICT_INODE); - goto no_delete; - } -#endif - remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -389,6 +382,12 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); + err = -EIO; + } +#endif if (!err) { f2fs_lock_op(sbi); err = remove_inode_page(inode); @@ -411,7 +410,10 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + /* ino == 0, if f2fs_new_inode() was failed t*/ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { @@ -448,6 +450,7 @@ void handle_failed_inode(struct inode *inode) * in a panic when flushing dirty inodes in gdirty_list. */ update_inode_page(inode); + f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 98f00a3a7f50..c31b40e5f9cf 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -148,8 +148,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -163,6 +161,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -324,9 +324,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - bool nokey = f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - err = nokey ? -ENOKEY : -EPERM; + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; goto err_out; } return d_splice_alias(inode, dentry); @@ -423,8 +424,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -487,6 +486,8 @@ err_out: } kfree(sd); + + f2fs_balance_fs(sbi, true); return err; out: handle_failed_inode(inode); @@ -508,8 +509,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - f2fs_balance_fs(sbi, true); - set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -524,6 +523,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out_fail: @@ -554,8 +555,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -569,6 +568,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -595,8 +596,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -622,6 +621,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); return 0; release_out: @@ -720,13 +721,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - err = update_dent_inode(old_inode, new_inode, - &new_dentry->d_name); - if (err) { - release_orphan_inode(sbi); - goto put_out_dir; - } - f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); @@ -779,8 +773,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, down_write(&F2FS_I(old_inode)->i_sem); file_lost_pino(old_inode); - if (new_inode && file_enc_name(new_inode)) - file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -908,8 +900,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old_nlink = old_dir_entry ? -1 : 1; new_nlink = -old_nlink; err = -EMLINK; - if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || - (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) goto out_new_dir; } @@ -917,18 +909,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); - if (err) - goto out_unlock; - if (file_enc_name(new_inode)) - file_set_enc_name(old_inode); - - err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); - if (err) - goto out_undo; - if (file_enc_name(old_inode)) - file_set_enc_name(new_inode); - /* update ".." directory entry info of old dentry */ if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); @@ -972,14 +952,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); return 0; -out_undo: - /* - * Still we may fail to recover name info of f2fs_inode here - * Drop it, once its name is set as encrypted - */ - update_dent_inode(old_inode, old_inode, &old_dentry->d_name); -out_unlock: - f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { f2fs_dentry_kunmap(new_inode, new_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 481aa8dc79f4..4547c5c5cd98 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -22,7 +22,7 @@ #include "trace.h" #include <trace/events/f2fs.h> -#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) +#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -63,8 +63,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) int i; for (i = 0; i <= UPDATE_INO; i++) - mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_SHIFT; + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * @@ -177,18 +178,12 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry_set *set, struct nat_entry *ne) { - nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); - struct nat_entry_set *head; - - head = radix_tree_lookup(&nm_i->nat_set_root, set); - if (head) { - list_move_tail(&ne->list, &nm_i->nat_entries); - set_nat_flag(ne, IS_DIRTY, false); - head->entry_cnt--; - nm_i->dirty_nat_cnt--; - } + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->dirty_nat_cnt--; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -381,6 +376,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct page *page = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; + pgoff_t index; int i; ni->nid = nid; @@ -406,17 +402,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); } up_read(&curseg->journal_rwsem); - if (i >= 0) + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); goto cache; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); /* cache nat entry */ down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); @@ -1463,6 +1463,9 @@ continue_unlock: f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + if (!atomic || page == last_page) { set_fsync_mark(page, 1); if (IS_INODE(page)) { @@ -1766,40 +1769,67 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i; + struct free_nid *i, *e; struct nat_entry *ne; - int err; + int err = -EINVAL; + bool ret = false; /* 0 nid should not be used */ if (unlikely(nid == 0)) return false; - if (build) { - /* do not add allocated nids */ - ne = __lookup_nat_cache(nm_i, nid); - if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || - nat_get_blkaddr(ne) != NULL_ADDR)) - return false; - } - i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; - if (radix_tree_preload(GFP_NOFS)) { - kmem_cache_free(free_nid_slab, i); - return true; - } + if (radix_tree_preload(GFP_NOFS)) + goto err; spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - alloc_nid + * - __insert_nid_to_list(ALLOC_NID_LIST) + * - f2fs_balance_fs_bg + * - build_free_nids + * - __build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - init_inode_metadata + * - new_inode_page + * - new_node_page + * - set_node_addr + * - alloc_nid_done + * - __remove_nid_from_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(FREE_NID_LIST) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == NID_NEW) + ret = true; + goto err_out; + } + } + ret = true; err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); +err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); - if (err) { +err: + if (err) kmem_cache_free(free_nid_slab, i); - return true; - } - return true; + return ret; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) @@ -1821,7 +1851,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) } static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build, bool locked) + bool set, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1835,14 +1865,10 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, else __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - if (!locked) - spin_lock(&nm_i->free_nid_lock); if (set) nm_i->free_nid_count[nat_ofs]++; else if (!build) nm_i->free_nid_count[nat_ofs]--; - if (!locked) - spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1871,7 +1897,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed, true, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, freed, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -1927,6 +1955,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) int i = 0; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; @@ -2026,7 +2057,7 @@ retry: __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; - update_free_nid_bitmap(sbi, *nid, false, false, false); + update_free_nid_bitmap(sbi, *nid, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2082,7 +2113,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false, false); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&nm_i->nid_list_lock); @@ -2407,16 +2438,16 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); - __clear_nat_cache_dirty(NM_I(sbi), ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false, false); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false, false, false); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2428,10 +2459,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } - f2fs_bug_on(sbi, set->entry_cnt); - - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } } /* @@ -2476,8 +2508,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); - - f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); + /* Allow dirty nats by node block allocation in write_begin */ } static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) @@ -2541,10 +2572,10 @@ inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) nid = i * NAT_ENTRY_PER_BLOCK; last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; - spin_lock(&nm_i->free_nid_lock); + spin_lock(&NM_I(sbi)->nid_list_lock); for (; nid < last_nid; nid++) - update_free_nid_bitmap(sbi, nid, true, true, true); - spin_unlock(&nm_i->free_nid_lock); + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } for (i = 0; i < nm_i->nat_blocks; i++) { @@ -2621,23 +2652,20 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; - nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8, + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; - - spin_lock_init(&nm_i->free_nid_lock); - return 0; } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 2f9603fa85a5..558048e33cf9 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -9,10 +9,10 @@ * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 @@ -62,16 +62,16 @@ struct nat_entry { struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) -#define inc_node_version(version) (++version) +#define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) @@ -200,13 +200,16 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d025aa83fb5b..907d6b7dde6a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -198,7 +198,8 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) { struct curseg_info *curseg; struct page *page = NULL; @@ -225,7 +226,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { - if (IS_INODE(page) && is_dent_dnode(page)) { + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; @@ -569,7 +571,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) mutex_lock(&sbi->cp_mutex); /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list); + err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 29ef7088c558..96845854e7ee 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -250,6 +250,36 @@ void drop_inmem_pages(struct inode *inode) stat_dec_atomic_write(inode); } +void drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(cur, head, list) { + if (cur->page == page) + break; + } + + f2fs_bug_on(sbi, !cur || cur->page != page); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); +} + static int __commit_inmem_pages(struct inode *inode, struct list_head *revoke_list) { @@ -261,7 +291,6 @@ static int __commit_inmem_pages(struct inode *inode, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, - .encrypted_page = NULL, }; pgoff_t last_idx = ULONG_MAX; int err = 0; @@ -281,6 +310,9 @@ static int __commit_inmem_pages(struct inode *inode, } fio.page = page; + fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; + fio.need_lock = false, err = do_write_data_page(&fio); if (err) { unlock_page(page); @@ -358,11 +390,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } #endif - if (!need) - return; - /* balance_fs_bg is able to be pending */ - if (excess_cached_nats(sbi)) + if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); /* @@ -371,7 +400,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false, false); + f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -390,7 +419,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false, false); - if (!is_idle(sbi)) + if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ @@ -411,32 +440,34 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } -static int __submit_flush_wait(struct block_device *bdev) +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) { struct bio *bio = f2fs_bio_alloc(0); int ret; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; bio->bi_bdev = bdev; ret = submit_bio_wait(bio); bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi) { - int ret = __submit_flush_wait(sbi->sb->s_bdev); + int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); int i; - if (sbi->s_ndevs && !ret) { - for (i = 1; i < sbi->s_ndevs; i++) { - trace_f2fs_issue_flush(FDEV(i).bdev, - test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); - ret = __submit_flush_wait(FDEV(i).bdev); - if (ret) - break; - } + if (!sbi->s_ndevs || ret) + return ret; + + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; } return ret; } @@ -458,6 +489,8 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; @@ -475,25 +508,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; + int ret; if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) - return submit_flush_wait(sbi); - - if (!atomic_read(&fcc->submit_flush)) { - int ret; + if (!test_opt(sbi, FLUSH_MERGE)) { + ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + return ret; + } - atomic_inc(&fcc->submit_flush); + if (!atomic_read(&fcc->issing_flush)) { + atomic_inc(&fcc->issing_flush); ret = submit_flush_wait(sbi); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); return ret; } init_completion(&cmd.wait); - atomic_inc(&fcc->submit_flush); + atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) @@ -501,10 +538,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); } else { llist_del_all(&fcc->issue_list); - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issing_flush, 0); } return cmd.ret; @@ -524,7 +561,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; @@ -597,8 +635,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) - clear_bit(GET_SECNO(sbi, segno), + if (get_valid_blocks(sbi, segno, true) == 0) + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); } } @@ -618,7 +656,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == 0) { __locate_dirty_segment(sbi, segno, PRE); @@ -633,162 +671,407 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static void __add_discard_cmd(struct f2fs_sb_info *sbi, - struct bio *bio, block_t lstart, block_t len) +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *cmd_list = &(dcc->discard_cmd_list); + struct list_head *pend_list; struct discard_cmd *dc; + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); - dc->bio = bio; - bio->bi_private = dc; + dc->bdev = bdev; dc->lstart = lstart; + dc->start = start; dc->len = len; + dc->ref = 0; dc->state = D_PREP; + dc->error = 0; init_completion(&dc->wait); + list_add_tail(&dc->list, pend_list); + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; - mutex_lock(&dcc->cmd_lock); - list_add_tail(&dc->list, cmd_list); - mutex_unlock(&dcc->cmd_lock); + return dc; } -static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) { - int err = dc->bio->bi_error; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; - if (dc->state == D_DONE) - atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard)); + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); - if (err == -EOPNOTSUPP) - err = 0; + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); + + return dc; +} + +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_dec(&dcc->issing_discard); - if (err) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); - bio_put(dc->bio); list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); } -/* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->discard_cmd_list); - struct discard_cmd *dc, *tmp; - struct blk_plug plug; - mutex_lock(&dcc->cmd_lock); + if (dc->error == -EOPNOTSUPP) + dc->error = 0; - blk_start_plug(&plug); + if (dc->error) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", dc->error); + __detach_discard_cmd(dcc, dc); +} - list_for_each_entry_safe(dc, tmp, wait_list, list) { +static void f2fs_submit_discard_endio(struct bio *bio) +{ + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - if (blkaddr == NULL_ADDR) { - if (dc->state == D_PREP) { - dc->state = D_SUBMIT; - submit_bio(dc->bio); - atomic_inc(&dcc->submit_discard); - } - continue; + dc->error = bio->bi_error; + dc->state = D_DONE; + complete(&dc->wait); + bio_put(bio); +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static void __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct bio *bio = NULL; + + if (dc->state != D_PREP) + return; + + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + + dc->error = __blkdev_issue_discard(dc->bdev, + SECTOR_FROM_BLOCK(dc->start), + SECTOR_FROM_BLOCK(dc->len), + GFP_NOFS, 0, &bio); + if (!dc->error) { + /* should keep before submission to avoid D_DONE right away */ + dc->state = D_SUBMIT; + atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); + if (bio) { + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + bio->bi_opf |= REQ_SYNC; + submit_bio(bio); + list_move_tail(&dc->list, &dcc->wait_list); } + } else { + __remove_discard_cmd(sbi, dc); + } +} - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { - if (dc->state == D_SUBMIT) - wait_for_completion_io(&dc->wait); - else - __remove_discard_cmd(sbi, dc); +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = &dcc->root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + } else { + dc->lstart++; + dc->len--; + dc->start++; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); } } - blk_finish_plug(&plug); +} - /* this comes from f2fs_put_super */ - if (blkaddr == NULL_ADDR) { - list_for_each_entry_safe(dc, tmp, wait_list, list) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + block_t end = lstart + len; + + mutex_lock(&dcc->cmd_lock); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + merged = true; + } + + if (!merged) { + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); } + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); } + mutex_unlock(&dcc->cmd_lock); } -static void f2fs_submit_discard_endio(struct bio *bio) +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + block_t lblkstart = blkstart; - complete(&dc->wait); - dc->state = D_DONE; + trace_f2fs_queue_discard(bdev, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + return 0; } -static int issue_discard_thread(void *data) +static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) { - struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *cmd_list = &dcc->discard_cmd_list; + struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int iter = 0; -repeat: - if (kthread_should_stop()) - return 0; + int i, iter = 0; + mutex_lock(&dcc->cmd_lock); blk_start_plug(&plug); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + if (!issue_cond || is_idle(sbi)) + __submit_discard_cmd(sbi, dc); + if (issue_cond && iter++ > DISCARD_ISSUE_RATE) + goto out; + } + } +out: + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->wait_list); + struct discard_cmd *dc, *tmp; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, cmd_list, list) { - if (dc->state == D_PREP) { - dc->state = D_SUBMIT; - submit_bio(dc->bio); - atomic_inc(&dcc->submit_discard); - if (iter++ > DISCARD_ISSUE_RATE) - break; - } else if (dc->state == D_DONE) { + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (!wait_cond || dc->state == D_DONE) { + if (dc->ref) + continue; + wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); } } mutex_unlock(&dcc->cmd_lock); +} - blk_finish_plug(&plug); +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; - iter = 0; - congestion_wait(BLK_RW_SYNC, HZ/50); + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); - wait_event_interruptible(*q, - kthread_should_stop() || !list_empty(&dcc->discard_cmd_list)); - goto repeat; + if (need_wait) { + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); + } } - -/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, - struct block_device *bdev, block_t blkstart, block_t blklen) +/* This comes from f2fs_put_super */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { - struct bio *bio = NULL; - block_t lblkstart = blkstart; - int err; + __issue_discard_cmd(sbi, false); + __wait_discard_cmd(sbi, false); +} - trace_f2fs_issue_discard(bdev, blkstart, blklen); +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; - if (sbi->s_ndevs) { - int devi = f2fs_target_device_index(sbi, blkstart); + __issue_discard_cmd(sbi, true); + __wait_discard_cmd(sbi, true); - blkstart -= FDEV(devi).start_blk; - } - err = __blkdev_issue_discard(bdev, - SECTOR_FROM_BLOCK(blkstart), - SECTOR_FROM_BLOCK(blklen), - GFP_NOFS, 0, &bio); - if (!err && bio) { - bio->bi_end_io = f2fs_submit_discard_endio; - bio->bi_opf |= REQ_SYNC; + congestion_wait(BLK_RW_SYNC, HZ/50); - __add_discard_cmd(sbi, bio, lblkstart, blklen); - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); - } - return err; + wait_event_interruptible(*q, kthread_should_stop() || + atomic_read(&dcc->discard_cmd_cnt)); + goto repeat; } #ifdef CONFIG_BLK_DEV_ZONED @@ -796,6 +1079,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t sector, nr_sects; + block_t lblkstart = blkstart; int devi = 0; if (sbi->s_ndevs) { @@ -813,7 +1097,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: sector = SECTOR_FROM_BLOCK(blkstart); @@ -845,7 +1129,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -888,32 +1172,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return err; } -static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, struct seg_entry *se, - unsigned int start, unsigned int end) -{ - struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; - struct discard_entry *new, *last; - - if (!list_empty(head)) { - last = list_last_entry(head, struct discard_entry, list); - if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len && - last->len < MAX_DISCARD_BLOCKS(sbi)) { - last->len += end - start; - goto done; - } - } - - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; - new->len = end - start; - list_add_tail(&new->list, head); -done: - SM_I(sbi)->dcc_info->nr_discards += end - start; -} - static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { @@ -925,7 +1183,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -957,14 +1217,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (check_only) return true; - __add_discard_entry(sbi, cpc, se, start, end); + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -990,13 +1260,13 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -1026,10 +1296,10 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) continue; } next: - secno = GET_SECNO(sbi, start); - start_segno = secno * sbi->segs_per_sec; + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && - !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), sbi->segs_per_sec << sbi->log_blocks_per_seg); @@ -1043,22 +1313,46 @@ next: /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (force && entry->len < cpc->trim_minlen) - goto skip; - f2fs_issue_discard(sbi, entry->blkaddr, entry->len); - cpc->trimmed += entry->len; + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (force && len < cpc->trim_minlen) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + cpc->trimmed += len; + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + list_del(&entry->list); - SM_I(sbi)->dcc_info->nr_discards -= entry->len; + SM_I(sbi)->dcc_info->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } + + wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc; - int err = 0; + int err = 0, i; if (SM_I(sbi)->dcc_info) { dcc = SM_I(sbi)->dcc_info; @@ -1069,12 +1363,18 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; - INIT_LIST_HEAD(&dcc->discard_entry_list); - INIT_LIST_HEAD(&dcc->discard_cmd_list); + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) + INIT_LIST_HEAD(&dcc->pend_list[i]); + INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); - atomic_set(&dcc->submit_discard, 0); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; - dcc->max_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->undiscard_blks = 0; + dcc->root = RB_ROOT; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; @@ -1091,20 +1391,22 @@ init_thread: return err; } -static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - if (dcc && dcc->f2fs_issue_discard) { + if (!dcc) + return; + + if (dcc->f2fs_issue_discard) { struct task_struct *discard_thread = dcc->f2fs_issue_discard; dcc->f2fs_issue_discard = NULL; kthread_stop(discard_thread); } - if (free) { - kfree(dcc); - SM_I(sbi)->dcc_info = NULL; - } + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -1345,6 +1647,17 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } +static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno + 1; + struct free_segmap_info *free_i = FREE_I(sbi); + + if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, free_i->free_segmap); + return 0; +} + /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG @@ -1355,8 +1668,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int left_start = hint; bool init = true; int go_left = 0; @@ -1366,8 +1679,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - (hint + 1) * sbi->segs_per_sec, *newseg + 1); - if (segno < (hint + 1) * sbi->segs_per_sec) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } find_other_zone: @@ -1398,8 +1711,8 @@ find_other_zone: secno = left_start; skip_left: hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -1443,7 +1756,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; @@ -1456,6 +1769,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + /* if segs_per_sec is large than 1, we need to keep original policy. */ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + return CURSEG_I(sbi, type)->segno; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. @@ -1474,6 +1801,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) if (test_opt(sbi, NOHEAP)) dir = ALLOC_RIGHT; + segno = __get_next_segno(sbi, type); get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); @@ -1549,12 +1877,15 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; int i, cnt; bool reversed = false; /* need_SSR() already forces to do this */ - if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) + if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + curseg->next_segno = segno; return 1; + } /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(type)) { @@ -1578,9 +1909,10 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) for (; cnt-- > 0; reversed ? i-- : i++) { if (i == type) continue; - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, i, SSR)) + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; return 1; + } } return 0; } @@ -1592,17 +1924,21 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { + struct curseg_info *curseg = CURSEG_I(sbi, type); + if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else new_curseg(sbi, type, false); - stat_inc_seg_type(sbi, CURSEG_I(sbi, type)); + stat_inc_seg_type(sbi, curseg); } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -1734,18 +2070,16 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) if (p_type == DATA) { struct inode *inode = page->mapping->host; - if (S_ISDIR(inode->i_mode)) - return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - else - return CURSEG_WARM_DATA; + if (is_inode_flag_set(inode, FI_HOT_DATA)) + return CURSEG_HOT_DATA; + return CURSEG_WARM_DATA; } else { if (IS_DNODE(page)) return is_cold_node(page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } @@ -1788,15 +2122,14 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); /* - * SIT information should be updated before segment allocation, - * since SSR needs latest valid block information. + * SIT information should be updated after segment allocation, + * since we need to keep dirty segments precisely under SSR. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - mutex_unlock(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) @@ -1868,11 +2201,11 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) f2fs_update_data_blkaddr(dn, fio->new_blkaddr); } -void rewrite_data_page(struct f2fs_io_info *fio) +int rewrite_data_page(struct f2fs_io_info *fio) { fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - f2fs_submit_page_mbio(fio); + return f2fs_submit_page_bio(fio); } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -2437,7 +2770,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (cpc->reason != CP_DISCARD) { + if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } @@ -2473,7 +2806,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) @@ -2501,13 +2834,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -2540,7 +2873,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -2573,7 +2906,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); - sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; + sit_i->mounted_time = ktime_get_real_seconds(); mutex_init(&sit_i->sentry_lock); return 0; } @@ -2591,12 +2924,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2672,10 +3005,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2699,10 +3039,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2746,7 +3091,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) continue; if (valid_blocks > sbi->blocks_per_seg) { @@ -2764,7 +3109,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2786,7 +3131,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2852,6 +3197,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; @@ -2988,7 +3334,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); - destroy_discard_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5e8ad4280a50..010f336a7573 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -21,78 +21,84 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ /* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) -#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) #define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) #define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) -#define MAIN_SECS(sbi) (sbi->total_sections) +#define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) -#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ - sbi->log_blocks_per_seg)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ + ((segno) / (sbi)->segs_per_sec) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + ((secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) + ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) + ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ - (segno / SIT_ENTRY_PER_BLOCK) + ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ @@ -103,7 +109,7 @@ #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ - (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -132,7 +138,10 @@ enum { */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -227,6 +236,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { @@ -303,17 +314,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (section > 1) + if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; @@ -358,8 +369,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -379,7 +390,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -390,8 +402,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -412,7 +424,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; @@ -477,12 +490,12 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) static inline int overprovision_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi)); } static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool need_SSR(struct f2fs_sb_info *sbi) @@ -495,7 +508,7 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) return false; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + 1); + 2 * reserved_sections(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, @@ -540,6 +553,7 @@ static inline int utilization(struct f2fs_sb_info *sbi) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 enum { F2FS_IPU_FORCE, @@ -547,17 +561,15 @@ enum { F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode) +static inline bool need_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; - /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (test_opt(sbi, LFS)) return false; @@ -572,6 +584,15 @@ static inline bool need_inplace_update(struct inode *inode) utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && is_inode_flag_set(inode, FI_NEED_IPU)) @@ -691,8 +712,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); - return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec - - sit_i->mounted_time; + time64_t now = ktime_get_real_seconds(); + + return sit_i->elapsed_time + now - sit_i->mounted_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, @@ -719,7 +741,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= sbi->fggc_threshold) return true; return false; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96fe8ed73100..83355ec4a92c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", }; @@ -82,6 +83,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_noheap, + Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, @@ -116,6 +118,7 @@ static match_table_t f2fs_tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, @@ -293,6 +296,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); @@ -318,6 +322,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), @@ -436,6 +441,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noheap: set_opt(sbi, NOHEAP); break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); @@ -787,7 +795,14 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bio(sbi, NULL_ADDR); + f2fs_wait_discard_bios(sbi); + + if (!sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); @@ -913,7 +928,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -986,7 +1003,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) if ((i % 10) == 0) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, 1)); + get_valid_blocks(sbi, i, false)); if ((i % 10) == 9 || i == (total_segs - 1)) seq_putc(seq, '\n'); else @@ -1012,7 +1029,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, 1)); + get_valid_blocks(sbi, i, false)); for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) seq_printf(seq, " %.2x", se->cur_valid_map[j]); seq_putc(seq, '\n'); @@ -1046,6 +1063,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { @@ -1307,7 +1325,7 @@ static int __f2fs_commit_super(struct buffer_head *bh, unlock_buffer(bh); /* it's rare case, we can do fua all the time */ - return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA); + return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, @@ -1483,6 +1501,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; @@ -1555,6 +1580,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + atomic_set(&sbi->wb_sync_req, 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1917,6 +1944,7 @@ try_onemore: mutex_init(&sbi->gc_mutex); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); @@ -2022,6 +2050,10 @@ try_onemore: f2fs_join_shrinker(sbi); + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + /* if there are nt orphan nodes free them */ err = recover_orphan_inodes(sbi); if (err) @@ -2046,10 +2078,6 @@ try_onemore: goto free_root_inode; } - err = f2fs_build_stats(sbi); - if (err) - goto free_root_inode; - if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -2143,7 +2171,6 @@ free_proc: remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } - f2fs_destroy_stats(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2161,6 +2188,7 @@ free_node_inode: truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); + f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 73b4e1d1912a..bccbbf2616d2 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page) pid_t pid = task_pid_nr(current); void *p; - page->private = pid; + set_page_private(page, (unsigned long)pid); if (radix_tree_preload(GFP_NOFS)) return; @@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, radix_tree_for_each_slot(slot, &pids, &iter, first_index) { results[ret] = iter.index; - if (++ret == PIDVEC_SIZE) + if (++ret == max_items) break; } return ret; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7298a4488f7f..832c5110abab 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -250,15 +250,13 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; - unsigned int inline_size = 0; + unsigned int inline_size = inline_xattr_size(inode); int err = 0; - inline_size = inline_xattr_size(inode); - if (!size && !inline_size) return -ENODATA; - txattr_addr = kzalloc(inline_size + size + sizeof(__u32), + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -328,13 +326,14 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; - size_t size = PAGE_SIZE, inline_size = 0; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); void *txattr_addr; int err; - inline_size = inline_xattr_size(inode); - - txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -358,19 +357,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, } /* read from xattr node block */ - if (F2FS_I(inode)->i_xattr_nid) { + if (xnid) { struct page *xpage; void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = get_node_page(sbi, xnid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); goto fail; } xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + memcpy(txattr_addr + inline_size, xattr_addr, size); f2fs_put_page(xpage, 1); } @@ -392,14 +391,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - size_t inline_size = 0; + size_t inline_size = inline_xattr_size(inode); void *xattr_addr; struct page *xpage; nid_t new_nid = 0; int err; - inline_size = inline_xattr_size(inode); - if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) return -ENOSPC; @@ -454,7 +451,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); @@ -546,7 +543,9 @@ static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, const void *value, size_t size) { void *pval = entry->e_name + entry->e_name_len; - return (entry->e_value_size == size) && !memcmp(pval, value, size); + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); } static int __f2fs_setxattr(struct inode *inode, int index, diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index d5a94928c116..dbcd1d16e669 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -58,10 +58,10 @@ struct f2fs_xattr_entry { #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) @@ -72,8 +72,8 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) -#define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) -#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) #define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ VALID_XATTR_BLOCK_SIZE) diff --git a/fs/fcntl.c b/fs/fcntl.c index 8bd81c2e89b2..f4e7267d117f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -899,16 +899,10 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( - O_RDONLY | O_WRONLY | O_RDWR | - O_CREAT | O_EXCL | O_NOCTTY | - O_TRUNC | O_APPEND | /* O_NONBLOCK | */ - __O_SYNC | O_DSYNC | FASYNC | - O_DIRECT | O_LARGEFILE | O_DIRECTORY | - O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC | O_PATH | __O_TMPFILE | - __FMODE_NONOTIFY - )); + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != + HWEIGHT32( + (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | + __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); diff --git a/fs/file.c b/fs/file.c index ad6f094f2eff..1c2972e3a405 100644 --- a/fs/file.c +++ b/fs/file.c @@ -42,7 +42,7 @@ static void *alloc_fdmem(size_t size) if (data != NULL) return data; } - return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); } static void __free_fdtable(struct fdtable *fdt) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 6e22748b0704..b9ea99c5b5b3 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -292,7 +292,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) { - struct tree_descr empty_descr = {""}; + static const struct tree_descr empty_descr = {""}; struct fuse_conn *fc; int err; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 01b97c012c6e..4d810be532dd 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -38,11 +38,6 @@ struct metapath { __u16 mp_list[GFS2_MAX_META_HEIGHT]; }; -struct strip_mine { - int sm_first; - unsigned int sm_height; -}; - /** * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page * @ip: the inode @@ -253,6 +248,19 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp) } /** + * metaptr1 - Return the first possible metadata pointer in a metaath buffer + * @height: The metadata height (0 = dinode) + * @mp: The metapath + */ +static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp) +{ + struct buffer_head *bh = mp->mp_bh[height]; + if (height == 0) + return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode))); + return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header))); +} + +/** * metapointer - Return pointer to start of metadata in a buffer * @height: The metadata height (0 = dinode) * @mp: The metapath @@ -264,10 +272,8 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp) static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) { - struct buffer_head *bh = mp->mp_bh[height]; - unsigned int head_size = (height > 0) ? - sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); - return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; + __be64 *p = metaptr1(height, mp); + return p + mp->mp_list[height]; } static void gfs2_metapath_ra(struct gfs2_glock *gl, @@ -296,6 +302,23 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, } /** + * lookup_mp_height - helper function for lookup_metapath + * @ip: the inode + * @mp: the metapath + * @h: the height which needs looking up + */ +static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h) +{ + __be64 *ptr = metapointer(h, mp); + u64 dblock = be64_to_cpu(*ptr); + + if (!dblock) + return h + 1; + + return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]); +} + +/** * lookup_metapath - Walk the metadata tree to a specific point * @ip: The inode * @mp: The metapath @@ -316,17 +339,10 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) { unsigned int end_of_metadata = ip->i_height - 1; unsigned int x; - __be64 *ptr; - u64 dblock; int ret; for (x = 0; x < end_of_metadata; x++) { - ptr = metapointer(x, mp); - dblock = be64_to_cpu(*ptr); - if (!dblock) - return x + 1; - - ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]); + ret = lookup_mp_height(ip, mp, x); if (ret) return ret; } @@ -334,6 +350,35 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) return ip->i_height; } +/** + * fillup_metapath - fill up buffers for the metadata path to a specific height + * @ip: The inode + * @mp: The metapath + * @h: The height to which it should be mapped + * + * Similar to lookup_metapath, but does lookups for a range of heights + * + * Returns: error or height of metadata tree + */ + +static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h) +{ + unsigned int start_h = h - 1; + int ret; + + if (h) { + /* find the first buffer we need to look up. */ + while (start_h > 0 && mp->mp_bh[start_h] == NULL) + start_h--; + for (; start_h < h; start_h++) { + ret = lookup_mp_height(ip, mp, start_h); + if (ret) + return ret; + } + } + return ip->i_height; +} + static inline void release_metapath(struct metapath *mp) { int i; @@ -422,6 +467,13 @@ enum alloc_state { /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ }; +static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt) +{ + if (hgt) + return sdp->sd_inptrs; + return sdp->sd_diptrs; +} + /** * gfs2_bmap_alloc - Build a metadata tree of the requested height * @inode: The GFS2 inode @@ -620,7 +672,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, BUG_ON(maxlen == 0); - memset(mp.mp_bh, 0, sizeof(mp.mp_bh)); + memset(&mp, 0, sizeof(mp)); bmap_lock(ip, create); clear_buffer_mapped(bh_map); clear_buffer_new(bh_map); @@ -702,252 +754,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi } /** - * do_strip - Look for a layer a particular layer of the file and strip it off - * @ip: the inode - * @dibh: the dinode buffer - * @bh: A buffer of pointers - * @top: The first pointer in the buffer - * @bottom: One more than the last pointer - * @height: the height this buffer is at - * @sm: a pointer to a struct strip_mine - * - * Returns: errno - */ - -static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, - struct buffer_head *bh, __be64 *top, __be64 *bottom, - unsigned int height, struct strip_mine *sm) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_rgrp_list rlist; - struct gfs2_trans *tr; - u64 bn, bstart; - u32 blen, btotal; - __be64 *p; - unsigned int rg_blocks = 0; - int metadata; - unsigned int revokes = 0; - int x; - int error; - int jblocks_rqsted; - - error = gfs2_rindex_update(sdp); - if (error) - return error; - - if (!*top) - sm->sm_first = 0; - - if (height != sm->sm_height) - return 0; - - if (sm->sm_first) { - top++; - sm->sm_first = 0; - } - - metadata = (height != ip->i_height - 1); - if (metadata) - revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; - else if (ip->i_depth) - revokes = sdp->sd_inptrs; - - memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - bstart = 0; - blen = 0; - - for (p = top; p < bottom; p++) { - if (!*p) - continue; - - bn = be64_to_cpu(*p); - - if (bstart + blen == bn) - blen++; - else { - if (bstart) - gfs2_rlist_add(ip, &rlist, bstart); - - bstart = bn; - blen = 1; - } - } - - if (bstart) - gfs2_rlist_add(ip, &rlist, bstart); - else - goto out; /* Nothing to do */ - - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); - - for (x = 0; x < rlist.rl_rgrps; x++) { - struct gfs2_rgrpd *rgd; - rgd = rlist.rl_ghs[x].gh_gl->gl_object; - rg_blocks += rgd->rd_length; - } - - error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); - if (error) - goto out_rlist; - - if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */ - gfs2_rs_deltree(&ip->i_res); - -restart: - jblocks_rqsted = rg_blocks + RES_DINODE + - RES_INDIRECT + RES_STATFS + RES_QUOTA + - gfs2_struct2blk(sdp, revokes, sizeof(u64)); - if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2)) - jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2); - error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes); - if (error) - goto out_rg_gunlock; - - tr = current->journal_info; - down_write(&ip->i_rw_mutex); - - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_trans_add_meta(ip->i_gl, bh); - - bstart = 0; - blen = 0; - btotal = 0; - - for (p = top; p < bottom; p++) { - if (!*p) - continue; - - /* check for max reasonable journal transaction blocks */ - if (tr->tr_num_buf_new + RES_STATFS + - RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) { - if (rg_blocks >= tr->tr_num_buf_new) - rg_blocks -= tr->tr_num_buf_new; - else - rg_blocks = 0; - break; - } - - bn = be64_to_cpu(*p); - - if (bstart + blen == bn) - blen++; - else { - if (bstart) { - __gfs2_free_blocks(ip, bstart, blen, metadata); - btotal += blen; - } - - bstart = bn; - blen = 1; - } - - *p = 0; - gfs2_add_inode_blocks(&ip->i_inode, -1); - } - if (p == bottom) - rg_blocks = 0; - - if (bstart) { - __gfs2_free_blocks(ip, bstart, blen, metadata); - btotal += blen; - } - - gfs2_statfs_change(sdp, 0, +btotal, 0); - gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, - ip->i_inode.i_gid); - - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); - - gfs2_dinode_out(ip, dibh->b_data); - - up_write(&ip->i_rw_mutex); - - gfs2_trans_end(sdp); - - if (rg_blocks) - goto restart; - -out_rg_gunlock: - gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); -out_rlist: - gfs2_rlist_free(&rlist); -out: - return error; -} - -/** - * recursive_scan - recursively scan through the end of a file - * @ip: the inode - * @dibh: the dinode buffer - * @mp: the path through the metadata to the point to start - * @height: the height the recursion is at - * @block: the indirect block to look at - * @first: 1 if this is the first block - * @sm: data opaque to this function to pass to @bc - * - * When this is first called @height and @block should be zero and - * @first should be 1. - * - * Returns: errno - */ - -static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, - struct metapath *mp, unsigned int height, - u64 block, int first, struct strip_mine *sm) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head *bh = NULL; - __be64 *top, *bottom; - u64 bn; - int error; - int mh_size = sizeof(struct gfs2_meta_header); - - if (!height) { - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - return error; - dibh = bh; - - top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; - bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; - } else { - error = gfs2_meta_indirect_buffer(ip, height, block, &bh); - if (error) - return error; - - top = (__be64 *)(bh->b_data + mh_size) + - (first ? mp->mp_list[height] : 0); - - bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; - } - - error = do_strip(ip, dibh, bh, top, bottom, height, sm); - if (error) - goto out; - - if (height < ip->i_height - 1) { - - gfs2_metapath_ra(ip->i_gl, bh, top); - - for (; top < bottom; top++, first = 0) { - if (!*top) - continue; - - bn = be64_to_cpu(*top); - - error = recursive_scan(ip, dibh, mp, height + 1, bn, - first, sm); - if (error) - break; - } - } -out: - brelse(bh); - return error; -} - - -/** * gfs2_block_truncate_page - Deal with zeroing out data for truncate * * This is partly borrowed from ext3. @@ -1106,41 +912,406 @@ out: return error; } -static int trunc_dealloc(struct gfs2_inode *ip, u64 size) +/** + * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein + * @ip: inode + * @rg_gh: holder of resource group glock + * @mp: current metapath fully populated with buffers + * @btotal: place to keep count of total blocks freed + * @hgt: height we're processing + * @first: true if this is the first call to this function for this height + * + * We sweep a metadata buffer (provided by the metapath) for blocks we need to + * free, and free them all. However, we do it one rgrp at a time. If this + * block has references to multiple rgrps, we break it into individual + * transactions. This allows other processes to use the rgrps while we're + * focused on a single one, for better concurrency / performance. + * At every transaction boundary, we rewrite the inode into the journal. + * That way the bitmaps are kept consistent with the inode and we can recover + * if we're interrupted by power-outages. + * + * Returns: 0, or return code if an error occurred. + * *btotal has the total number of blocks freed + */ +static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh, + const struct metapath *mp, u32 *btotal, int hgt, + bool preserve1) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int height = ip->i_height; - u64 lblock; - struct metapath mp; - int error; + struct gfs2_rgrpd *rgd; + struct gfs2_trans *tr; + struct buffer_head *bh = mp->mp_bh[hgt]; + __be64 *top, *bottom, *p; + int blks_outside_rgrp; + u64 bn, bstart, isize_blks; + s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */ + int meta = ((hgt != ip->i_height - 1) ? 1 : 0); + int ret = 0; + bool buf_in_tr = false; /* buffer was added to transaction */ + + if (gfs2_metatype_check(sdp, bh, + (hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI))) + return -EIO; + +more_rgrps: + blks_outside_rgrp = 0; + bstart = 0; + blen = 0; + top = metapointer(hgt, mp); /* first ptr from metapath */ + /* If we're keeping some data at the truncation point, we've got to + preserve the metadata tree by adding 1 to the starting metapath. */ + if (preserve1) + top++; + + bottom = (__be64 *)(bh->b_data + bh->b_size); + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + bn = be64_to_cpu(*p); + if (gfs2_holder_initialized(rd_gh)) { + rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object; + gfs2_assert_withdraw(sdp, + gfs2_glock_is_locked_by_me(rd_gh->gh_gl)); + } else { + rgd = gfs2_blk2rgrpd(sdp, bn, false); + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + 0, rd_gh); + if (ret) + goto out; + + /* Must be done with the rgrp glock held: */ + if (gfs2_rs_active(&ip->i_res) && + rgd == ip->i_res.rs_rbm.rgd) + gfs2_rs_deltree(&ip->i_res); + } + + if (!rgrp_contains_block(rgd, bn)) { + blks_outside_rgrp++; + continue; + } + + /* The size of our transactions will be unknown until we + actually process all the metadata blocks that relate to + the rgrp. So we estimate. We know it can't be more than + the dinode's i_blocks and we don't want to exceed the + journal flush threshold, sd_log_thresh2. */ + if (current->journal_info == NULL) { + unsigned int jblocks_rqsted, revokes; + + jblocks_rqsted = rgd->rd_length + RES_DINODE + + RES_INDIRECT; + isize_blks = gfs2_get_inode_blocks(&ip->i_inode); + if (isize_blks > atomic_read(&sdp->sd_log_thresh2)) + jblocks_rqsted += + atomic_read(&sdp->sd_log_thresh2); + else + jblocks_rqsted += isize_blks; + revokes = jblocks_rqsted; + if (meta) + revokes += hptrs(sdp, hgt); + else if (ip->i_depth) + revokes += sdp->sd_inptrs; + ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes); + if (ret) + goto out_unlock; + down_write(&ip->i_rw_mutex); + } + /* check if we will exceed the transaction blocks requested */ + tr = current->journal_info; + if (tr->tr_num_buf_new + RES_STATFS + + RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) { + /* We set blks_outside_rgrp to ensure the loop will + be repeated for the same rgrp, but with a new + transaction. */ + blks_outside_rgrp++; + /* This next part is tricky. If the buffer was added + to the transaction, we've already set some block + pointers to 0, so we better follow through and free + them, or we will introduce corruption (so break). + This may be impossible, or at least rare, but I + decided to cover the case regardless. + + If the buffer was not added to the transaction + (this call), doing so would exceed our transaction + size, so we need to end the transaction and start a + new one (so goto). */ + + if (buf_in_tr) + break; + goto out_unlock; + } + + gfs2_trans_add_meta(ip->i_gl, bh); + buf_in_tr = true; + *p = 0; + if (bstart + blen == bn) { + blen++; + continue; + } + if (bstart) { + __gfs2_free_blocks(ip, bstart, (u32)blen, meta); + (*btotal) += blen; + gfs2_add_inode_blocks(&ip->i_inode, -blen); + } + bstart = bn; + blen = 1; + } + if (bstart) { + __gfs2_free_blocks(ip, bstart, (u32)blen, meta); + (*btotal) += blen; + gfs2_add_inode_blocks(&ip->i_inode, -blen); + } +out_unlock: + if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks + outside the rgrp we just processed, + do it all over again. */ + if (current->journal_info) { + struct buffer_head *dibh = mp->mp_bh[0]; + + /* Every transaction boundary, we rewrite the dinode + to keep its di_blocks current in case of failure. */ + ip->i_inode.i_mtime = ip->i_inode.i_ctime = + current_time(&ip->i_inode); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + } + gfs2_glock_dq_uninit(rd_gh); + cond_resched(); + goto more_rgrps; + } +out: + return ret; +} + +/** + * find_nonnull_ptr - find a non-null pointer given a metapath and height + * assumes the metapath is valid (with buffers) out to height h + * @mp: starting metapath + * @h: desired height to search + * + * Returns: true if a non-null pointer was found in the metapath buffer + * false if all remaining pointers are NULL in the buffer + */ +static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, + unsigned int h) +{ + __be64 *ptr; + unsigned int ptrs = hptrs(sdp, h) - 1; + + while (true) { + ptr = metapointer(h, mp); + if (*ptr) /* if we have a non-null pointer */ + return true; + + if (mp->mp_list[h] < ptrs) + mp->mp_list[h]++; + else + return false; /* no more pointers in this buffer */ + } +} + +enum dealloc_states { + DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */ + DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */ + DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */ + DEALLOC_DONE = 3, /* process complete */ +}; - if (!size) +/** + * trunc_dealloc - truncate a file down to a desired size + * @ip: inode to truncate + * @newsize: The desired size of the file + * + * This function truncates a file to newsize. It works from the + * bottom up, and from the right to the left. In other words, it strips off + * the highest layer (data) before stripping any of the metadata. Doing it + * this way is best in case the operation is interrupted by power failure, etc. + * The dinode is rewritten in every transaction to guarantee integrity. + */ +static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct metapath mp; + struct buffer_head *dibh, *bh; + struct gfs2_holder rd_gh; + u64 lblock; + __u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */ + unsigned int strip_h = ip->i_height - 1; + u32 btotal = 0; + int ret, state; + int mp_h; /* metapath buffers are read in to this height */ + sector_t last_ra = 0; + u64 prev_bnr = 0; + bool preserve1; /* need to preserve the first meta pointer? */ + + if (!newsize) lblock = 0; else - lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; + lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift; + memset(&mp, 0, sizeof(mp)); find_metapath(sdp, lblock, &mp, ip->i_height); - error = gfs2_rindex_update(sdp); - if (error) - return error; - error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); - if (error) - return error; + memcpy(&nbof, &mp.mp_list, sizeof(nbof)); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (ret) + return ret; - while (height--) { - struct strip_mine sm; - sm.sm_first = !!size; - sm.sm_height = height; + mp.mp_bh[0] = dibh; + ret = lookup_metapath(ip, &mp); + if (ret == ip->i_height) + state = DEALLOC_MP_FULL; /* We have a complete metapath */ + else + state = DEALLOC_FILL_MP; /* deal with partial metapath */ - error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); - if (error) + ret = gfs2_rindex_update(sdp); + if (ret) + goto out_metapath; + + ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (ret) + goto out_metapath; + gfs2_holder_mark_uninitialized(&rd_gh); + + mp_h = strip_h; + + while (state != DEALLOC_DONE) { + switch (state) { + /* Truncate a full metapath at the given strip height. + * Note that strip_h == mp_h in order to be in this state. */ + case DEALLOC_MP_FULL: + if (mp_h > 0) { /* issue read-ahead on metadata */ + __be64 *top; + + bh = mp.mp_bh[mp_h - 1]; + if (bh->b_blocknr != last_ra) { + last_ra = bh->b_blocknr; + top = metaptr1(mp_h - 1, &mp); + gfs2_metapath_ra(ip->i_gl, bh, top); + } + } + /* If we're truncating to a non-zero size and the mp is + at the beginning of file for the strip height, we + need to preserve the first metadata pointer. */ + preserve1 = (newsize && + (mp.mp_list[mp_h] == nbof[mp_h])); + bh = mp.mp_bh[mp_h]; + gfs2_assert_withdraw(sdp, bh); + if (gfs2_assert_withdraw(sdp, + prev_bnr != bh->b_blocknr)) { + printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, " + "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n", + sdp->sd_fsname, + (unsigned long long)ip->i_no_addr, + prev_bnr, ip->i_height, strip_h, mp_h); + } + prev_bnr = bh->b_blocknr; + ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal, + mp_h, preserve1); + /* If we hit an error or just swept dinode buffer, + just exit. */ + if (ret || !mp_h) { + state = DEALLOC_DONE; + break; + } + state = DEALLOC_MP_LOWER; + break; + + /* lower the metapath strip height */ + case DEALLOC_MP_LOWER: + /* We're done with the current buffer, so release it, + unless it's the dinode buffer. Then back up to the + previous pointer. */ + if (mp_h) { + brelse(mp.mp_bh[mp_h]); + mp.mp_bh[mp_h] = NULL; + } + /* If we can't get any lower in height, we've stripped + off all we can. Next step is to back up and start + stripping the previous level of metadata. */ + if (mp_h == 0) { + strip_h--; + memcpy(&mp.mp_list, &nbof, sizeof(nbof)); + mp_h = strip_h; + state = DEALLOC_FILL_MP; + break; + } + mp.mp_list[mp_h] = 0; + mp_h--; /* search one metadata height down */ + if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1) + break; /* loop around in the same state */ + mp.mp_list[mp_h]++; + /* Here we've found a part of the metapath that is not + * allocated. We need to search at that height for the + * next non-null pointer. */ + if (find_nonnull_ptr(sdp, &mp, mp_h)) { + state = DEALLOC_FILL_MP; + mp_h++; + } + /* No more non-null pointers at this height. Back up + to the previous height and try again. */ + break; /* loop around in the same state */ + + /* Fill the metapath with buffers to the given height. */ + case DEALLOC_FILL_MP: + /* Fill the buffers out to the current height. */ + ret = fillup_metapath(ip, &mp, mp_h); + if (ret < 0) + goto out; + + /* If buffers found for the entire strip height */ + if ((ret == ip->i_height) && (mp_h == strip_h)) { + state = DEALLOC_MP_FULL; + break; + } + if (ret < ip->i_height) /* We have a partial height */ + mp_h = ret - 1; + + /* If we find a non-null block pointer, crawl a bit + higher up in the metapath and try again, otherwise + we need to look lower for a new starting point. */ + if (find_nonnull_ptr(sdp, &mp, mp_h)) + mp_h++; + else + state = DEALLOC_MP_LOWER; break; + } } - gfs2_quota_unhold(ip); + if (btotal) { + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + + RES_QUOTA, 0); + if (ret) + goto out; + down_write(&ip->i_rw_mutex); + } + gfs2_statfs_change(sdp, 0, +btotal, 0); + gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, + ip->i_inode.i_gid); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + } - return error; +out: + if (gfs2_holder_initialized(&rd_gh)) + gfs2_glock_dq_uninit(&rd_gh); + if (current->journal_info) { + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + cond_resched(); + } + gfs2_quota_unhold(ip); +out_metapath: + release_metapath(&mp); + return ret; } static int trunc_end(struct gfs2_inode *ip) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6fe2a59c6a9a..c2062a108d19 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -911,11 +911,15 @@ out_qunlock: static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; - if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + /* fallocate is needed by gfs2_grow to reserve space in the rindex */ + if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex) return -EOPNOTSUPP; inode_lock(inode); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ec0848fcca02..959a19ced4d5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(lru_lock); static struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, - .key_len = sizeof(struct lm_lockname), + .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), .head_offset = offsetof(struct gfs2_glock, gl_node), }; @@ -449,6 +449,9 @@ __acquires(&gl->gl_lockref.lock) unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0); int ret; + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) && + target != LM_ST_UNLOCKED) + return; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); GLOCK_BUG_ON(gl, gl->gl_state == target); @@ -484,7 +487,8 @@ __acquires(&gl->gl_lockref.lock) } else if (ret) { pr_err("lm_lock ret %d\n", ret); - GLOCK_BUG_ON(gl, 1); + GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN, + &sdp->sd_flags)); } } else { /* lock_nolock */ finish_xmote(gl, target); @@ -653,10 +657,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type, .ln_sbd = sdp }; - struct gfs2_glock *gl, *tmp = NULL; + struct gfs2_glock *gl, *tmp; struct address_space *mapping; struct kmem_cache *cachep; - int ret, tries = 0; + int ret = 0; rcu_read_lock(); gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); @@ -721,35 +725,32 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, } again: - ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node, - ht_parms); - if (ret == 0) { + rcu_read_lock(); + tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node, + ht_parms); + if (!tmp) { *glp = gl; - return 0; + goto out; } - - if (ret == -EEXIST) { - ret = 0; - rcu_read_lock(); - tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); - if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) { - if (++tries < 100) { - rcu_read_unlock(); - cond_resched(); - goto again; - } - tmp = NULL; - ret = -ENOMEM; - } - rcu_read_unlock(); - } else { - WARN_ON_ONCE(ret); + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto out_free; } + if (lockref_get_not_dead(&tmp->gl_lockref)) { + *glp = tmp; + goto out_free; + } + rcu_read_unlock(); + cond_resched(); + goto again; + +out_free: kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(cachep, gl); atomic_dec(&sdp->sd_glock_disposal); - *glp = tmp; +out: + rcu_read_unlock(); return ret; } @@ -1918,10 +1919,10 @@ static const struct seq_operations gfs2_sbstats_seq_ops = { #define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL) -static int gfs2_glocks_open(struct inode *inode, struct file *file) +static int __gfs2_glocks_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) { - int ret = seq_open_private(file, &gfs2_glock_seq_ops, - sizeof(struct gfs2_glock_iter)); + int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter)); if (ret == 0) { struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; @@ -1932,11 +1933,16 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file) if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); + rhashtable_walk_enter(&gl_hash_table, &gi->hti); } return ret; } +static int gfs2_glocks_open(struct inode *inode, struct file *file) +{ + return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops); +} + static int gfs2_glocks_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; @@ -1949,20 +1955,7 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) static int gfs2_glstats_open(struct inode *inode, struct file *file) { - int ret = seq_open_private(file, &gfs2_glstats_seq_ops, - sizeof(struct gfs2_glock_iter)); - if (ret == 0) { - struct seq_file *seq = file->private_data; - struct gfs2_glock_iter *gi = seq->private; - gi->sdp = inode->i_private; - gi->last_pos = 0; - seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); - if (seq->buf) - seq->size = GFS2_SEQ_GOODSIZE; - gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); - } - return ret; + return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops); } static int gfs2_sbstats_open(struct inode *inode, struct file *file) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 511e1ed7e2de..b7cf65d13561 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -203,11 +203,15 @@ enum { DFL_DLM_RECOVERY = 6, }; +/* + * We are using struct lm_lockname as an rhashtable key. Avoid holes within + * the struct; padding at the end is fine. + */ struct lm_lockname { - struct gfs2_sbd *ln_sbd; u64 ln_number; + struct gfs2_sbd *ln_sbd; unsigned int ln_type; -} __packed __aligned(sizeof(int)); +}; #define lm_name_equal(name1, name2) \ (((name1)->ln_number == (name2)->ln_number) && \ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e279c3ce27be..9f605ea4810c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -202,8 +202,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; ip->i_iopen_gh.gh_gl->gl_object = NULL; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); @@ -667,6 +666,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, ip->i_height = 0; ip->i_depth = 0; ip->i_entries = 0; + ip->i_no_addr = 0; /* Temporarily zero until real addr is assigned */ switch(mode & S_IFMT) { case S_IFREG: diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 86ccc0159393..83c9909ff14a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -483,13 +483,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) } } -static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) -{ - u64 first = rgd->rd_data0; - u64 last = first + rgd->rd_data; - return first <= block && block < last; -} - /** * gfs2_blk2rgrpd - Find resource group for a given data/meta block number * @sdp: The GFS2 superblock diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 66b51cf66dfa..e90478e2f545 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs) return rs && !RB_EMPTY_NODE(&rs->rs_node); } +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) +{ + u64 first = rgd->rd_data0; + u64 last = first + rgd->rd_data; + return first <= block && block < last; +} + extern void check_and_update_goal(struct gfs2_inode *ip); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 361796a84fce..29b0473f6e74 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -793,7 +793,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) return; - + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (ret) { @@ -1538,8 +1539,7 @@ static void gfs2_evict_inode(struct inode *inode) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; } @@ -1617,7 +1617,7 @@ out_unlock: if (gfs2_holder_initialized(&ip->i_iopen_gh)) { if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_glock_dq(&ip->i_iopen_gh); } gfs2_holder_uninit(&ip->i_iopen_gh); } @@ -1639,8 +1639,7 @@ out: if (gfs2_holder_initialized(&ip->i_iopen_gh)) { ip->i_iopen_gh.gh_gl->gl_object = NULL; ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&ip->i_iopen_gh); } } diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index e33a0d36a93e..5d0182654580 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -485,8 +485,8 @@ void hfs_file_truncate(struct inode *inode) /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; - res = pagecache_write_begin(NULL, mapping, size+1, 0, - AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + res = pagecache_write_begin(NULL, mapping, size+1, 0, 0, + &page, &fsdata); if (!res) { res = pagecache_write_end(NULL, mapping, size+1, 0, 0, page, fsdata); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index feca524ce2a5..a3eb640b4f8f 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -545,9 +545,8 @@ void hfsplus_file_truncate(struct inode *inode) void *fsdata; loff_t size = inode->i_size; - res = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + res = pagecache_write_begin(NULL, mapping, size, 0, 0, + &page, &fsdata); if (res) return; res = pagecache_write_end(NULL, mapping, size, diff --git a/fs/inode.c b/fs/inode.c index 131b2bcebc48..db5914783a71 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -119,7 +119,7 @@ static int no_open(struct inode *inode, struct file *file) } /** - * inode_init_always - perform inode structure intialisation + * inode_init_always - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise * @@ -402,6 +402,8 @@ static void inode_lru_list_add(struct inode *inode) { if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); + else + inode->i_state |= I_REFERENCED; } /* @@ -1489,7 +1491,6 @@ static void iput_final(struct inode *inode) drop = generic_drop_inode(inode); if (!drop && (sb->s_flags & MS_ACTIVE)) { - inode->i_state |= I_REFERENCED; inode_add_lru(inode); spin_unlock(&inode->i_lock); return; diff --git a/fs/internal.h b/fs/internal.h index 076751d90ba2..9676fe11c093 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -126,8 +126,6 @@ static inline bool atime_needs_update_rcu(const struct path *path, return __atime_needs_update(path, inode, true); } -extern bool atime_needs_update_rcu(const struct path *, struct inode *); - /* * fs-writeback.c */ diff --git a/fs/iomap.c b/fs/iomap.c index 1c25ae30500e..4b10892967a5 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -158,12 +158,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, ssize_t written = 0; unsigned int flags = AOP_FLAG_NOFS; - /* - * Copies from kernel address space cannot fail (NFSD is a big user). - */ - if (!iter_is_iovec(i)) - flags |= AOP_FLAG_UNINTERRUPTIBLE; - do { struct page *page; unsigned long offset; /* Offset into pagecache page */ @@ -291,8 +285,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, return PTR_ERR(rpage); status = iomap_write_begin(inode, pos, bytes, - AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE, - &page, iomap); + AOP_FLAG_NOFS, &page, iomap); put_page(rpage); if (unlikely(status)) return status; @@ -343,8 +336,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, struct page *page; int status; - status = iomap_write_begin(inode, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap); + status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page, + iomap); if (status) return status; @@ -360,7 +353,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, sector_t sector = iomap->blkno + (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); - return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, + offset, bytes); } static loff_t @@ -909,6 +903,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, break; } pos += ret; + + if (iov_iter_rw(iter) == READ && pos >= dio->i_size) + break; } while ((count = iov_iter_count(iter)) > 0); blk_finish_plug(&plug); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c43fe83ee708..5a0245e36240 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -700,8 +700,21 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; - jbd2_might_wait_for_commit(journal); read_lock(&journal->j_state_lock); +#ifdef CONFIG_PROVE_LOCKING + /* + * Some callers make sure transaction is already committing and in that + * case we cannot block on open handles anymore. So don't warn in that + * case. + */ + if (tid_gt(tid, journal->j_commit_sequence) && + (!journal->j_committing_transaction || + journal->j_committing_transaction->t_tid != tid)) { + read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); + read_lock(&journal->j_state_lock); + } +#endif #ifdef CONFIG_JBD2_DEBUG if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_ERR @@ -922,7 +935,8 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, + REQ_SYNC | REQ_FUA); if (ret) goto out; @@ -1323,7 +1337,7 @@ static int journal_reset(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - REQ_FUA); + REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); @@ -1463,7 +1477,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, REQ_FUA); + jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -1730,7 +1744,7 @@ int jbd2_journal_destroy(journal_t *journal) write_unlock(&journal->j_state_lock); jbd2_mark_journal_empty(journal, - REQ_PREFLUSH | REQ_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1989,7 +2003,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal, REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2035,7 +2049,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal, REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/libfs.c b/fs/libfs.c index a8b62e5d43a9..a04395334bb1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -507,7 +507,7 @@ EXPORT_SYMBOL(simple_write_end); * to pass it an appropriate max_reserved value to avoid collisions. */ int simple_fill_super(struct super_block *s, unsigned long magic, - struct tree_descr *files) + const struct tree_descr *files) { struct inode *inode; struct dentry *root; diff --git a/fs/namei.c b/fs/namei.c index 9a7f8bd748d8..7286f87ce863 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4766,7 +4766,7 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) struct page *page; void *fsdata; int err; - unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; + unsigned int flags = 0; if (nofs) flags |= AOP_FLAG_NOFS; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9dc65d7ae754..7b38fedb7e03 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -139,7 +139,7 @@ struct nfs_mount_request { }; struct nfs_mount_info { - int (*fill_super)(struct super_block *, struct nfs_mount_info *); + void (*fill_super)(struct super_block *, struct nfs_mount_info *); int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *); struct nfs_parsed_mount_data *parsed; struct nfs_clone_mount *cloned; @@ -407,7 +407,7 @@ struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void * struct dentry * nfs_xdev_mount_common(struct file_system_type *, int, const char *, struct nfs_mount_info *); void nfs_kill_super(struct super_block *); -int nfs_fill_super(struct super_block *, struct nfs_mount_info *); +void nfs_fill_super(struct super_block *, struct nfs_mount_info *); extern struct rpc_stat nfs_rpcstat; @@ -458,7 +458,7 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ -int nfs_clone_super(struct super_block *, struct nfs_mount_info *); +void nfs_clone_super(struct super_block *, struct nfs_mount_info *); void nfs_umount_begin(struct super_block *); int nfs_statfs(struct dentry *, struct kstatfs *); int nfs_show_options(struct seq_file *, struct dentry *); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index dc69314d455e..2f3822a4a7d5 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2321,11 +2321,10 @@ inline void nfs_initialise_sb(struct super_block *sb) /* * Finish setting up an NFS2/3 superblock */ -int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) +void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) { struct nfs_parsed_mount_data *data = mount_info->parsed; struct nfs_server *server = NFS_SB(sb); - int ret; sb->s_blocksize_bits = 0; sb->s_blocksize = 0; @@ -2343,21 +2342,13 @@ int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) } nfs_initialise_sb(sb); - - ret = super_setup_bdi_name(sb, "%u:%u", MAJOR(server->s_dev), - MINOR(server->s_dev)); - if (ret) - return ret; - sb->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; - return 0; - } EXPORT_SYMBOL_GPL(nfs_fill_super); /* * Finish setting up a cloned NFS2/3/4 superblock */ -int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) +void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) { const struct super_block *old_sb = mount_info->cloned->sb; struct nfs_server *server = NFS_SB(sb); @@ -2377,10 +2368,6 @@ int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) } nfs_initialise_sb(sb); - - sb->s_bdi = bdi_get(old_sb->s_bdi); - - return 0; } static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) @@ -2600,14 +2587,19 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, nfs_free_server(server); server = NULL; } else { + error = super_setup_bdi_name(s, "%u:%u", MAJOR(server->s_dev), + MINOR(server->s_dev)); + if (error) { + mntroot = ERR_PTR(error); + goto error_splat_super; + } + s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; server->super = s; } if (!s->s_root) { /* initial superblock/root creation */ - error = mount_info->fill_super(s, mount_info); - if (error) - goto error_splat_super; + mount_info->fill_super(s, mount_info); nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 8bf8f667a8cf..6493df6b1bd5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1146,7 +1146,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) static int nfsd_fill_super(struct super_block * sb, void * data, int silent) { - static struct tree_descr nfsd_files[] = { + static const struct tree_descr nfsd_files[] = { [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, diff --git a/fs/nsfs.c b/fs/nsfs.c index 323f492e0822..f3db56e83dd2 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -196,9 +196,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, { struct ns_common *ns; int res = -ENOENT; + const char *name; ns = ns_ops->get(task); if (ns) { - res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + name = ns_ops->real_ns_name ? : ns_ops->name; + res = snprintf(buf, size, "%s:[%u]", name, ns->inum); ns_ops->put(ns); } return res; diff --git a/fs/open.c b/fs/open.c index 4d23f729dcc6..373787afd638 100644 --- a/fs/open.c +++ b/fs/open.c @@ -900,6 +900,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o int lookup_flags = 0; int acc_mode = ACC_MODE(flags); + /* + * Clear out all open flags we don't know about so that we don't report + * them in fcntl(F_GETFD) or similar interfaces. + */ + flags &= VALID_OPEN_FLAGS; + if (flags & (O_CREAT | __O_TMPFILE)) op->mode = (mode & S_IALLUGO) | S_IFREG; else diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index e1534c9bab16..c19f0787c9c6 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -180,6 +180,10 @@ static ssize_t orangefs_devreq_read(struct file *file, return -EINVAL; } + /* Check for an empty list before locking. */ + if (list_empty(&orangefs_request_list)) + return -EAGAIN; + restart: /* Get next op (if any) from top of list. */ spin_lock(&orangefs_request_list_lock); diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 284373a57a08..d327cbd17756 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -1,396 +1,404 @@ /* - * (C) 2001 Clemson University and The University of Chicago - * - * See COPYING in top-level directory. + * Copyright 2017 Omnibond Systems, L.L.C. */ #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" +struct orangefs_dir_part { + struct orangefs_dir_part *next; + size_t len; +}; + +struct orangefs_dir { + __u64 token; + struct orangefs_dir_part *part; + loff_t end; + int error; +}; + +#define PART_SHIFT (24) +#define PART_SIZE (1<<24) +#define PART_MASK (~(PART_SIZE - 1)) + /* - * decode routine used by kmod to deal with the blob sent from - * userspace for readdirs. The blob contains zero or more of these - * sub-blobs: - * __u32 - represents length of the character string that follows. - * string - between 1 and ORANGEFS_NAME_MAX bytes long. - * padding - (if needed) to cause the __u32 plus the string to be - * eight byte aligned. - * khandle - sizeof(khandle) bytes. + * There can be up to 512 directory entries. Each entry is encoded as + * follows: + * 4 bytes: string size (n) + * n bytes: string + * 1 byte: trailing zero + * padding to 8 bytes + * 16 bytes: khandle + * padding to 8 bytes + * + * The trailer_buf starts with a struct orangefs_readdir_response_s + * which must be skipped to get to the directory data. + * + * The data which is received from the userspace daemon is termed a + * part and is stored in a linked list in case more than one part is + * needed for a large directory. + * + * The position pointer (ctx->pos) encodes the part and offset on which + * to begin reading at. Bits above PART_SHIFT encode the part and bits + * below PART_SHIFT encode the offset. Parts are stored in a linked + * list which grows as data is received from the server. The overhead + * associated with managing the list is presumed to be small compared to + * the overhead of communicating with the server. + * + * As data is received from the server, it is placed at the end of the + * part list. Data is parsed from the current position as it is needed. + * When data is determined to be corrupt, it is either because the + * userspace component has sent back corrupt data or because the file + * pointer has been moved to an invalid location. Since the two cannot + * be differentiated, return EIO. + * + * Part zero is synthesized to contains `.' and `..'. Part one is the + * first part of the part list. */ -static long decode_dirents(char *ptr, size_t size, - struct orangefs_readdir_response_s *readdir) + +static int do_readdir(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct orangefs_kernel_op_s *op) { - int i; - struct orangefs_readdir_response_s *rd = - (struct orangefs_readdir_response_s *) ptr; - char *buf = ptr; - int khandle_size = sizeof(struct orangefs_khandle); - size_t offset = offsetof(struct orangefs_readdir_response_s, - dirent_array); - /* 8 reflects eight byte alignment */ - int smallest_blob = khandle_size + 8; - __u32 len; - int aligned_len; - int sizeof_u32 = sizeof(__u32); - long ret; - - gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size); - - /* size is = offset on empty dirs, > offset on non-empty dirs... */ - if (size < offset) { - gossip_err("%s: size:%zu: offset:%zu:\n", - __func__, - size, - offset); - ret = -EINVAL; - goto out; - } + struct orangefs_readdir_response_s *resp; + int bufi, r; - if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) { - gossip_err("%s: size:%zu: dirent_outcount:%d:\n", - __func__, - size, - readdir->orangefs_dirent_outcount); - ret = -EINVAL; - goto out; - } + /* + * Despite the badly named field, readdir does not use shared + * memory. However, there are a limited number of readdir + * slots, which must be allocated here. This flag simply tells + * the op scheduler to return the op here for retry. + */ + op->uses_shared_memory = 1; + op->upcall.req.readdir.refn = oi->refn; + op->upcall.req.readdir.token = od->token; + op->upcall.req.readdir.max_dirent_count = + ORANGEFS_MAX_DIRENT_COUNT_READDIR; - readdir->token = rd->token; - readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount; - readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount, - sizeof(*readdir->dirent_array), - GFP_KERNEL); - if (readdir->dirent_array == NULL) { - gossip_err("%s: kcalloc failed.\n", __func__); - ret = -ENOMEM; - goto out; +again: + bufi = orangefs_readdir_index_get(); + if (bufi < 0) { + od->error = bufi; + return bufi; } - buf += offset; - size -= offset; - - for (i = 0; i < readdir->orangefs_dirent_outcount; i++) { - if (size < smallest_blob) { - gossip_err("%s: size:%zu: smallest_blob:%d:\n", - __func__, - size, - smallest_blob); - ret = -EINVAL; - goto free; - } + op->upcall.req.readdir.buf_index = bufi; - len = *(__u32 *)buf; - if ((len < 1) || (len > ORANGEFS_NAME_MAX)) { - gossip_err("%s: len:%d:\n", __func__, len); - ret = -EINVAL; - goto free; - } + r = service_operation(op, "orangefs_readdir", + get_interruptible_flag(dentry->d_inode)); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: size:%zu: len:%d:\n", - __func__, - size, - len); + orangefs_readdir_index_put(bufi); - readdir->dirent_array[i].d_name = buf + sizeof_u32; - readdir->dirent_array[i].d_length = len; + if (op_state_purged(op)) { + if (r == -EAGAIN) { + vfree(op->downcall.trailer_buf); + goto again; + } else if (r == -EIO) { + vfree(op->downcall.trailer_buf); + od->error = r; + return r; + } + } - /* - * Calculate "aligned" length of this string and its - * associated __u32 descriptor. - */ - aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7; - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: aligned_len:%d:\n", - __func__, - aligned_len); + if (r < 0) { + vfree(op->downcall.trailer_buf); + od->error = r; + return r; + } else if (op->downcall.status) { + vfree(op->downcall.trailer_buf); + od->error = op->downcall.status; + return op->downcall.status; + } - /* - * The end of the blob should coincide with the end - * of the last sub-blob. - */ - if (size < aligned_len + khandle_size) { - gossip_err("%s: ran off the end of the blob.\n", - __func__); - ret = -EINVAL; - goto free; - } - size -= aligned_len + khandle_size; + /* + * The maximum size is size per entry times the 512 entries plus + * the header. This is well under the limit. + */ + if (op->downcall.trailer_size > PART_SIZE) { + vfree(op->downcall.trailer_buf); + od->error = -EIO; + return -EIO; + } - buf += aligned_len; + resp = (struct orangefs_readdir_response_s *) + op->downcall.trailer_buf; + od->token = resp->token; + return 0; +} - readdir->dirent_array[i].khandle = - *(struct orangefs_khandle *) buf; - buf += khandle_size; +static int parse_readdir(struct orangefs_dir *od, + struct orangefs_kernel_op_s *op) +{ + struct orangefs_dir_part *part, *new; + size_t count; + + count = 1; + part = od->part; + while (part) { + count++; + if (part->next) + part = part->next; + else + break; } - ret = buf - ptr; - gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret); - goto out; -free: - kfree(readdir->dirent_array); - readdir->dirent_array = NULL; + new = (void *)op->downcall.trailer_buf; + new->next = NULL; + new->len = op->downcall.trailer_size - + sizeof(struct orangefs_readdir_response_s); + if (!od->part) + od->part = new; + else + part->next = new; + count++; + od->end = count << PART_SHIFT; -out: - return ret; + return 0; } -/* - * Read directory entries from an instance of an open directory. - */ -static int orangefs_readdir(struct file *file, struct dir_context *ctx) +static int orangefs_dir_more(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry) { - int ret = 0; - int buffer_index; - /* - * ptoken supports Orangefs' distributed directory logic, added - * in 2.9.2. - */ - __u64 *ptoken = file->private_data; - __u64 pos = 0; - ino_t ino = 0; - struct dentry *dentry = file->f_path.dentry; - struct orangefs_kernel_op_s *new_op = NULL; - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode); - struct orangefs_readdir_response_s readdir_response; - void *dents_buf; - int i = 0; - int len = 0; - ino_t current_ino = 0; - char *current_entry = NULL; - long bytes_decoded; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: ctx->pos:%lld, ptoken = %llu\n", - __func__, - lld(ctx->pos), - llu(*ptoken)); - - pos = (__u64) ctx->pos; - - /* are we done? */ - if (pos == ORANGEFS_READDIR_END) { - gossip_debug(GOSSIP_DIR_DEBUG, - "Skipping to termination path\n"); - return 0; + struct orangefs_kernel_op_s *op; + int r; + + op = op_alloc(ORANGEFS_VFS_OP_READDIR); + if (!op) { + od->error = -ENOMEM; + return -ENOMEM; + } + r = do_readdir(oi, od, dentry, op); + if (r) { + od->error = r; + goto out; + } + r = parse_readdir(od, op); + if (r) { + od->error = r; + goto out; } - gossip_debug(GOSSIP_DIR_DEBUG, - "orangefs_readdir called on %pd (pos=%llu)\n", - dentry, llu(pos)); + od->error = 0; +out: + op_release(op); + return od->error; +} - memset(&readdir_response, 0, sizeof(readdir_response)); +static int fill_from_part(struct orangefs_dir_part *part, + struct dir_context *ctx) +{ + const int offset = sizeof(struct orangefs_readdir_response_s); + struct orangefs_khandle *khandle; + __u32 *len, padlen; + loff_t i; + char *s; + i = ctx->pos & ~PART_MASK; - new_op = op_alloc(ORANGEFS_VFS_OP_READDIR); - if (!new_op) - return -ENOMEM; + /* The file offset from userspace is too large. */ + if (i > part->len) + return 1; /* - * Only the indices are shared. No memory is actually shared, but the - * mechanism is used. + * If the seek pointer is positioned just before an entry it + * should find the next entry. */ - new_op->uses_shared_memory = 1; - new_op->upcall.req.readdir.refn = orangefs_inode->refn; - new_op->upcall.req.readdir.max_dirent_count = - ORANGEFS_MAX_DIRENT_COUNT_READDIR; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: upcall.req.readdir.refn.khandle: %pU\n", - __func__, - &new_op->upcall.req.readdir.refn.khandle); + if (i % 8) + i = i + (8 - i%8)%8; - new_op->upcall.req.readdir.token = *ptoken; - -get_new_buffer_index: - buffer_index = orangefs_readdir_index_get(); - if (buffer_index < 0) { - ret = buffer_index; - gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n", - ret); - goto out_free_op; + while (i < part->len) { + if (part->len < i + sizeof *len) + break; + len = (void *)part + offset + i; + /* + * len is the size of the string itself. padlen is the + * total size of the encoded string. + */ + padlen = (sizeof *len + *len + 1) + + (8 - (sizeof *len + *len + 1)%8)%8; + if (part->len < i + padlen + sizeof *khandle) + goto next; + s = (void *)part + offset + i + sizeof *len; + if (s[*len] != 0) + goto next; + khandle = (void *)part + offset + i + padlen; + if (!dir_emit(ctx, s, *len, + orangefs_khandle_to_ino(khandle), + DT_UNKNOWN)) + return 0; + i += padlen + sizeof *khandle; + i = i + (8 - i%8)%8; + BUG_ON(i > part->len); + ctx->pos = (ctx->pos & PART_MASK) | i; + continue; +next: + i += 8; } - new_op->upcall.req.readdir.buf_index = buffer_index; - - ret = service_operation(new_op, - "orangefs_readdir", - get_interruptible_flag(dentry->d_inode)); + return 1; +} - gossip_debug(GOSSIP_DIR_DEBUG, - "Readdir downcall status is %d. ret:%d\n", - new_op->downcall.status, - ret); +static int orangefs_dir_fill(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct dir_context *ctx) +{ + struct orangefs_dir_part *part; + size_t count; - orangefs_readdir_index_put(buffer_index); + count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1; - if (ret == -EAGAIN && op_state_purged(new_op)) { - /* Client-core indices are invalid after it restarted. */ - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: Getting new buffer_index for retry of readdir..\n", - __func__); - goto get_new_buffer_index; + part = od->part; + while (part->next && count) { + count--; + part = part->next; } - - if (ret == -EIO && op_state_purged(new_op)) { - gossip_err("%s: Client is down. Aborting readdir call.\n", - __func__); - goto out_free_op; + /* This means the userspace file offset is invalid. */ + if (count) { + od->error = -EIO; + return -EIO; } - if (ret < 0 || new_op->downcall.status != 0) { - gossip_debug(GOSSIP_DIR_DEBUG, - "Readdir request failed. Status:%d\n", - new_op->downcall.status); - if (ret >= 0) - ret = new_op->downcall.status; - goto out_free_op; + while (part && part->len) { + int r; + r = fill_from_part(part, ctx); + if (r < 0) { + od->error = r; + return r; + } else if (r == 0) { + /* Userspace buffer is full. */ + break; + } else { + /* + * The part ran out of data. Move to the next + * part. */ + ctx->pos = (ctx->pos & PART_MASK) + + (1 << PART_SHIFT); + part = part->next; + } } + return 0; +} - dents_buf = new_op->downcall.trailer_buf; - if (dents_buf == NULL) { - gossip_err("Invalid NULL buffer in readdir response\n"); - ret = -ENOMEM; - goto out_free_op; +static loff_t orangefs_dir_llseek(struct file *file, loff_t offset, + int whence) +{ + struct orangefs_dir *od = file->private_data; + /* + * Delete the stored data so userspace sees new directory + * entries. + */ + if (!whence && offset < od->end) { + struct orangefs_dir_part *part = od->part; + while (part) { + struct orangefs_dir_part *next = part->next; + vfree(part); + part = next; + } + od->token = ORANGEFS_ITERATE_START; + od->part = NULL; + od->end = 1 << PART_SHIFT; } + return default_llseek(file, offset, whence); +} - bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size, - &readdir_response); - if (bytes_decoded < 0) { - ret = bytes_decoded; - gossip_err("Could not decode readdir from buffer %d\n", ret); - goto out_vfree; - } +static int orangefs_dir_iterate(struct file *file, + struct dir_context *ctx) +{ + struct orangefs_inode_s *oi; + struct orangefs_dir *od; + struct dentry *dentry; + int r; - if (bytes_decoded != new_op->downcall.trailer_size) { - gossip_err("orangefs_readdir: # bytes decoded (%ld) " - "!= trailer size (%ld)\n", - bytes_decoded, - (long)new_op->downcall.trailer_size); - ret = -EINVAL; - goto out_destroy_handle; - } + dentry = file->f_path.dentry; + oi = ORANGEFS_I(dentry->d_inode); + od = file->private_data; - /* - * orangefs doesn't actually store dot and dot-dot, but - * we need to have them represented. - */ - if (pos == 0) { - ino = get_ino_from_khandle(dentry->d_inode); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: calling dir_emit of \".\" with pos = %llu\n", - __func__, - llu(pos)); - ret = dir_emit(ctx, ".", 1, ino, DT_DIR); - pos += 1; - } + if (od->error) + return od->error; - if (pos == 1) { - ino = get_parent_ino_from_dentry(dentry); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: calling dir_emit of \"..\" with pos = %llu\n", - __func__, - llu(pos)); - ret = dir_emit(ctx, "..", 2, ino, DT_DIR); - pos += 1; + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) + return 0; + ctx->pos++; + } + if (ctx->pos == 1) { + if (!dir_emit_dotdot(file, ctx)) + return 0; + ctx->pos = 1 << PART_SHIFT; } /* - * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around - * to prevent "finding" dot and dot-dot on any iteration - * other than the first. + * The seek position is in the first synthesized part but is not + * valid. */ - if (ctx->pos == ORANGEFS_ITERATE_NEXT) - ctx->pos = 0; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: dirent_outcount:%d:\n", - __func__, - readdir_response.orangefs_dirent_outcount); - for (i = ctx->pos; - i < readdir_response.orangefs_dirent_outcount; - i++) { - len = readdir_response.dirent_array[i].d_length; - current_entry = readdir_response.dirent_array[i].d_name; - current_ino = orangefs_khandle_to_ino( - &readdir_response.dirent_array[i].khandle); - - gossip_debug(GOSSIP_DIR_DEBUG, - "calling dir_emit for %s with len %d" - ", ctx->pos %ld\n", - current_entry, - len, - (unsigned long)ctx->pos); - /* - * type is unknown. We don't return object type - * in the dirent_array. This leaves getdents - * clueless about type. - */ - ret = - dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN); - if (!ret) - break; - ctx->pos++; - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: ctx->pos:%lld\n", - __func__, - lld(ctx->pos)); + if ((ctx->pos & PART_MASK) == 0) + return -EIO; - } + r = 0; /* - * we ran all the way through the last batch, set up for - * getting another batch... + * Must read more if the user has sought past what has been read + * so far. Stop a user who has sought past the end. */ - if (ret) { - *ptoken = readdir_response.token; - ctx->pos = ORANGEFS_ITERATE_NEXT; + while (od->token != ORANGEFS_ITERATE_END && + ctx->pos > od->end) { + r = orangefs_dir_more(oi, od, dentry); + if (r) + return r; + } + if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end) + return -EIO; + + /* Then try to fill if there's any left in the buffer. */ + if (ctx->pos < od->end) { + r = orangefs_dir_fill(oi, od, dentry, ctx); + if (r) + return r; } - /* - * Did we hit the end of the directory? - */ - if (readdir_response.token == ORANGEFS_READDIR_END) { - gossip_debug(GOSSIP_DIR_DEBUG, - "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n"); - ctx->pos = ORANGEFS_READDIR_END; + /* Finally get some more and try to fill. */ + if (od->token != ORANGEFS_ITERATE_END) { + r = orangefs_dir_more(oi, od, dentry); + if (r) + return r; + r = orangefs_dir_fill(oi, od, dentry, ctx); } -out_destroy_handle: - /* kfree(NULL) is safe */ - kfree(readdir_response.dirent_array); -out_vfree: - gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf); - vfree(dents_buf); -out_free_op: - op_release(new_op); - gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret); - return ret; + return r; } static int orangefs_dir_open(struct inode *inode, struct file *file) { - __u64 *ptoken; - - file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL); + struct orangefs_dir *od; + file->private_data = kmalloc(sizeof(struct orangefs_dir), + GFP_KERNEL); if (!file->private_data) return -ENOMEM; - - ptoken = file->private_data; - *ptoken = ORANGEFS_READDIR_START; + od = file->private_data; + od->token = ORANGEFS_ITERATE_START; + od->part = NULL; + od->end = 1 << PART_SHIFT; + od->error = 0; return 0; } static int orangefs_dir_release(struct inode *inode, struct file *file) { + struct orangefs_dir *od = file->private_data; + struct orangefs_dir_part *part = od->part; orangefs_flush_inode(inode); - kfree(file->private_data); + while (part) { + struct orangefs_dir_part *next = part->next; + vfree(part); + part = next; + } + kfree(od); return 0; } -/** ORANGEFS implementation of VFS directory operations */ const struct file_operations orangefs_dir_operations = { + .llseek = orangefs_dir_llseek, .read = generic_read_dir, - .iterate = orangefs_readdir, + .iterate = orangefs_dir_iterate, .open = orangefs_dir_open, - .release = orangefs_dir_release, + .release = orangefs_dir_release }; diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h index 3b8923f8bf21..163001c95501 100644 --- a/fs/orangefs/downcall.h +++ b/fs/orangefs/downcall.h @@ -40,16 +40,6 @@ struct orangefs_mkdir_response { struct orangefs_object_kref refn; }; -/* - * duplication of some system interface structures so that I don't have - * to allocate extra memory - */ -struct orangefs_dirent { - char *d_name; - int d_length; - struct orangefs_khandle khandle; -}; - struct orangefs_statfs_response { __s64 block_size; __s64 blocks_total; @@ -131,12 +121,16 @@ struct orangefs_downcall_s { } resp; }; +/* + * The readdir response comes in the trailer. It is followed by the + * directory entries as described in dir.c. + */ + struct orangefs_readdir_response_s { __u64 token; __u64 directory_version; __u32 __pad2; __u32 orangefs_dirent_outcount; - struct orangefs_dirent *dirent_array; }; #endif /* __DOWNCALL_H */ diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 1cd37ebc4f25..28f38d813ad2 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -474,7 +474,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite /* Make sure generic_write_checks sees an up to date inode size. */ if (file->f_flags & O_APPEND) { - rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1); + rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, + STATX_SIZE); if (rc == -ESTALE) rc = -EIO; if (rc) { @@ -692,7 +693,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) * NOTE: We are only interested in file size here, * so we set mask accordingly. */ - ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1); + ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, + STATX_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a304bf34b212..9428ea0aac16 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -161,7 +161,7 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) iattr->ia_size); /* Ensure that we have a up to date size, so we know if it changed. */ - ret = orangefs_inode_getattr(inode, 0, 1); + ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { @@ -218,8 +218,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) if (ret) goto out; - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { + if (iattr->ia_valid & ATTR_SIZE) { ret = orangefs_setattr_size(inode, iattr); if (ret) goto out; @@ -256,13 +255,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, "orangefs_getattr: called on %pd\n", path->dentry); - ret = orangefs_inode_getattr(inode, 0, 0); + ret = orangefs_inode_getattr(inode, 0, 0, request_mask); if (ret == 0) { generic_fillattr(inode, stat); /* override block size reported to stat */ orangefs_inode = ORANGEFS_I(inode); stat->blksize = orangefs_inode->blksize; + + if (request_mask & STATX_SIZE) + stat->result_mask = STATX_BASIC_STATS; + else + stat->result_mask = STATX_BASIC_STATS & + ~STATX_SIZE; } return ret; } @@ -277,7 +282,7 @@ int orangefs_permission(struct inode *inode, int mask) gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__); /* Make sure the permission (and other common attrs) are up to date. */ - ret = orangefs_inode_getattr(inode, 0, 0); + ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE); if (ret < 0) return ret; @@ -375,7 +380,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref if (!inode || !(inode->i_state & I_NEW)) return inode; - error = orangefs_inode_getattr(inode, 1, 1); + error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); if (error) { iget_failed(inode); return ERR_PTR(error); @@ -420,7 +425,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ - error = orangefs_inode_getattr(inode, 1, 1); + error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); if (error) goto out_iput; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index a290ff6ec756..478e88bd7f9d 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -74,6 +74,7 @@ static int orangefs_create(struct inode *dir, unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "%s: dentry instantiated for %pd\n", @@ -193,8 +194,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - ORANGEFS_I(inode)->getattr_time = jiffies - 1; - gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d " "Found good inode [%lu] with count [%d]\n", @@ -324,6 +323,7 @@ static int orangefs_symlink(struct inode *dir, unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Symlink) %pU -> %pd\n", @@ -388,6 +388,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Directory) %pU -> %pd\n", diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 791912da97d7..716ed337f166 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -440,6 +440,9 @@ static ssize_t orangefs_debug_write(struct file *file, "orangefs_debug_write: %pD\n", file); + if (count == 0) + return 0; + /* * Thwart users who try to jamb a ridiculous number * of bytes into the debug file... diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h index f380f9ed1b28..efe08c763e56 100644 --- a/fs/orangefs/orangefs-dev-proto.h +++ b/fs/orangefs/orangefs-dev-proto.h @@ -52,12 +52,7 @@ */ #define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000800 -/* - * The maximum number of directory entries in a single request is 96. - * XXX: Why can this not be higher. The client-side code can handle up to 512. - * XXX: What happens if we expect more than the client can return? - */ -#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96 +#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 512 #include "upcall.h" #include "downcall.h" diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 8afac46fcc87..ea0ce507a6ab 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -215,6 +215,7 @@ struct orangefs_inode_s { unsigned long pinode_flags; unsigned long getattr_time; + u32 getattr_mask; }; #define P_ATIME_FLAG 0 @@ -340,11 +341,6 @@ static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode) return &(ORANGEFS_I(inode)->refn.khandle); } -static inline __s32 get_fsid_from_ino(struct inode *inode) -{ - return ORANGEFS_I(inode)->refn.fs_id; -} - static inline ino_t get_ino_from_khandle(struct inode *inode) { struct orangefs_khandle *khandle; @@ -500,7 +496,8 @@ int orangefs_inode_setxattr(struct inode *inode, size_t size, int flags); -int orangefs_inode_getattr(struct inode *inode, int new, int bypass); +int orangefs_inode_getattr(struct inode *inode, int new, int bypass, + u32 request_mask); int orangefs_inode_check_changed(struct inode *inode); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 9b96b99539d6..aab6f1842963 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -251,7 +251,8 @@ static int orangefs_inode_is_stale(struct inode *inode, int new, return 0; } -int orangefs_inode_getattr(struct inode *inode, int new, int bypass) +int orangefs_inode_getattr(struct inode *inode, int new, int bypass, + u32 request_mask) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -262,7 +263,13 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) get_khandle_from_ino(inode)); if (!new && !bypass) { - if (time_before(jiffies, orangefs_inode->getattr_time)) + /* + * Must have all the attributes in the mask and be within cache + * time. + */ + if ((request_mask & orangefs_inode->getattr_mask) == + request_mask && + time_before(jiffies, orangefs_inode->getattr_time)) return 0; } @@ -270,7 +277,15 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) if (!new_op) return -ENOMEM; new_op->upcall.req.getattr.refn = orangefs_inode->refn; - new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; + /* + * Size is the hardest attribute to get. The incremental cost of any + * other attribute is essentially zero. + */ + if (request_mask & STATX_SIZE || new) + new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; + else + new_op->upcall.req.getattr.mask = + ORANGEFS_ATTR_SYS_ALL_NOHINT & ~ORANGEFS_ATTR_SYS_SIZE; ret = service_operation(new_op, __func__, get_interruptible_flag(inode)); @@ -291,25 +306,29 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) case S_IFREG: inode->i_flags = orangefs_inode_flags(&new_op-> downcall.resp.getattr.attributes); - inode_size = (loff_t)new_op-> - downcall.resp.getattr.attributes.size; - rounded_up_size = - (inode_size + (4096 - (inode_size % 4096))); - inode->i_size = inode_size; - orangefs_inode->blksize = - new_op->downcall.resp.getattr.attributes.blksize; - spin_lock(&inode->i_lock); - inode->i_bytes = inode_size; - inode->i_blocks = - (unsigned long)(rounded_up_size / 512); - spin_unlock(&inode->i_lock); + if (request_mask & STATX_SIZE || new) { + inode_size = (loff_t)new_op-> + downcall.resp.getattr.attributes.size; + rounded_up_size = + (inode_size + (4096 - (inode_size % 4096))); + inode->i_size = inode_size; + orangefs_inode->blksize = + new_op->downcall.resp.getattr.attributes.blksize; + spin_lock(&inode->i_lock); + inode->i_bytes = inode_size; + inode->i_blocks = + (unsigned long)(rounded_up_size / 512); + spin_unlock(&inode->i_lock); + } break; case S_IFDIR: - inode->i_size = PAGE_SIZE; - orangefs_inode->blksize = i_blocksize(inode); - spin_lock(&inode->i_lock); - inode_set_bytes(inode, inode->i_size); - spin_unlock(&inode->i_lock); + if (request_mask & STATX_SIZE || new) { + inode->i_size = PAGE_SIZE; + orangefs_inode->blksize = i_blocksize(inode); + spin_lock(&inode->i_lock); + inode_set_bytes(inode, inode->i_size); + spin_unlock(&inode->i_lock); + } set_nlink(inode, 1); break; case S_IFLNK: @@ -349,6 +368,10 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) orangefs_inode->getattr_time = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; + if (request_mask & STATX_SIZE || new) + orangefs_inode->getattr_mask = STATX_BASIC_STATS; + else + orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE; ret = 0; out: op_release(new_op); @@ -500,41 +523,6 @@ int orangefs_flush_inode(struct inode *inode) return ret; } -int orangefs_unmount_sb(struct super_block *sb) -{ - int ret = -EINVAL; - struct orangefs_kernel_op_s *new_op = NULL; - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_unmount_sb called on sb %p\n", - sb); - - new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); - if (!new_op) - return -ENOMEM; - new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id; - new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id; - strncpy(new_op->upcall.req.fs_umount.orangefs_config_server, - ORANGEFS_SB(sb)->devname, - ORANGEFS_MAX_SERVER_ADDR_LEN); - - gossip_debug(GOSSIP_UTILS_DEBUG, - "Attempting ORANGEFS Unmount via host %s\n", - new_op->upcall.req.fs_umount.orangefs_config_server); - - ret = service_operation(new_op, "orangefs_fs_umount", 0); - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_unmount: got return value of %d\n", ret); - if (ret) - sb = ERR_PTR(ret); - else - ORANGEFS_SB(sb)->mount_pending = 1; - - op_release(new_op); - return ret; -} - void orangefs_make_bad_inode(struct inode *inode) { if (is_root_handle(inode)) { diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 971307ad69be..48bcc1bbe415 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -138,13 +138,8 @@ typedef __s64 ORANGEFS_offset; #define ORANGEFS_G_SGID (1 << 10) #define ORANGEFS_U_SUID (1 << 11) -/* definition taken from stdint.h */ -#define INT32_MAX (2147483647) -#define ORANGEFS_ITERATE_START (INT32_MAX - 1) -#define ORANGEFS_ITERATE_END (INT32_MAX - 2) -#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3) -#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START -#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END +#define ORANGEFS_ITERATE_START 2147483646 +#define ORANGEFS_ITERATE_END 2147483645 #define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL #define ORANGEFS_APPEND_FL FS_APPEND_FL #define ORANGEFS_NOATIME_FL FS_NOATIME_FL diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 629d8c917fa6..5c7c273e17ec 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -376,6 +376,25 @@ static const struct export_operations orangefs_export_ops = { .fh_to_dentry = orangefs_fh_to_dentry, }; +static int orangefs_unmount(int id, __s32 fs_id, const char *devname) +{ + struct orangefs_kernel_op_s *op; + int r; + op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); + if (!op) + return -ENOMEM; + op->upcall.req.fs_umount.id = id; + op->upcall.req.fs_umount.fs_id = fs_id; + strncpy(op->upcall.req.fs_umount.orangefs_config_server, + devname, ORANGEFS_MAX_SERVER_ADDR_LEN); + r = service_operation(op, "orangefs_fs_umount", 0); + /* Not much to do about an error here. */ + if (r) + gossip_err("orangefs_unmount: service_operation %d\n", r); + op_release(op); + return r; +} + static int orangefs_fill_sb(struct super_block *sb, struct orangefs_fs_mount_response *fs_mount, void *data, int silent) @@ -484,6 +503,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst, if (IS_ERR(sb)) { d = ERR_CAST(sb); + orangefs_unmount(new_op->downcall.resp.fs_mount.id, + new_op->downcall.resp.fs_mount.fs_id, devname); goto free_op; } @@ -539,6 +560,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, free_sb_and_op: /* Will call orangefs_kill_sb with sb not in list. */ ORANGEFS_SB(sb)->no_list = 1; + /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */ deactivate_locked_super(sb); free_op: gossip_err("orangefs_mount: mount request failed with %d\n", ret); @@ -554,6 +576,7 @@ free_op: void orangefs_kill_sb(struct super_block *sb) { + int r; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n"); /* provided sb cleanup */ @@ -563,7 +586,10 @@ void orangefs_kill_sb(struct super_block *sb) * issue the unmount to userspace to tell it to remove the * dynamic mount info it has for this superblock */ - orangefs_unmount_sb(sb); + r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id, + ORANGEFS_SB(sb)->devname); + if (!r) + ORANGEFS_SB(sb)->mount_pending = 1; if (!ORANGEFS_SB(sb)->no_list) { /* remove the sb from our list of orangefs specific sb's */ diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index abcfa3fa9992..61e2ca7fec55 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -124,7 +124,14 @@ retry_servicing: gossip_debug(GOSSIP_WAIT_DEBUG, "%s:client core is NOT in service.\n", __func__); - timeout = op_timeout_secs * HZ; + /* + * Don't wait for the userspace component to return if + * the filesystem is being umounted anyway. + */ + if (op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT) + timeout = 0; + else + timeout = op_timeout_secs * HZ; } spin_unlock(&orangefs_request_list_lock); diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 74a81b1daaac..237c9c04dc3b 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -76,11 +76,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err("Invalid key length (%d)\n", - (int)strlen(name)); + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; - } fsuid = from_kuid(&init_user_ns, current_fsuid()); fsgid = from_kgid(&init_user_ns, current_fsgid()); @@ -172,6 +169,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name, struct orangefs_kernel_op_s *new_op = NULL; int ret = -ENOMEM; + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) + return -EINVAL; + down_write(&orangefs_inode->xattr_sem); new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR); if (!new_op) @@ -231,23 +231,13 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name, "%s: name %s, buffer_size %zd\n", __func__, name, size); - if (size >= ORANGEFS_MAX_XATTR_VALUELEN || - flags < 0) { - gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n", - (int)size, - flags); + if (size > ORANGEFS_MAX_XATTR_VALUELEN) + return -EINVAL; + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; - } internal_flag = convert_to_internal_xattr_flags(flags); - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err - ("orangefs_inode_setxattr: bogus key size (%d)\n", - (int)(strlen(name))); - return -EINVAL; - } - /* This is equivalent to a removexattr */ if (size == 0 && value == NULL) { gossip_debug(GOSSIP_XATTR_DEBUG, @@ -358,7 +348,7 @@ try_again: returned_count = new_op->downcall.resp.listxattr.returned_count; if (returned_count < 0 || - returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) { + returned_count > ORANGEFS_MAX_XATTR_LISTLEN) { gossip_err("%s: impossible value for returned_count:%d:\n", __func__, returned_count); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e3ac5c11780..45f6bf68fff3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -821,10 +821,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - /* Maybe we should limit FOLL_FORCE to actual ptrace users? */ - flags = FOLL_FORCE; - if (write) - flags |= FOLL_WRITE; + flags = write ? FOLL_WRITE : 0; while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index ee27feb34cf4..9425c0d97262 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -472,6 +472,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->data = NULL; ent->proc_fops = NULL; ent->proc_iops = NULL; + parent->nlink++; if (proc_register(parent, ent) < 0) { kfree(ent); parent->nlink--; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2cc7a8030275..e250910cffc8 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) struct proc_inode *ei; struct inode *inode; - ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 766f0c637ad1..3803b24ca220 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = { #endif #ifdef CONFIG_PID_NS &pidns_operations, + &pidns_for_children_operations, #endif #ifdef CONFIG_USER_NS &userns_operations, diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d04ea4349909..67985a7233c2 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -408,10 +408,6 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr *pentry = entry; } -void register_sysctl_root(struct ctl_table_root *root) -{ -} - /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index aca73dd73906..e3c558d1b78c 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -724,18 +724,18 @@ static void errcatch_print_vi(struct virtual_item *vi) } static struct item_operations errcatch_ops = { - errcatch_bytes_number, - errcatch_decrement_key, - errcatch_is_left_mergeable, - errcatch_print_item, - errcatch_check_item, - - errcatch_create_vi, - errcatch_check_left, - errcatch_check_right, - errcatch_part_size, - errcatch_unit_num, - errcatch_print_vi + .bytes_number = errcatch_bytes_number, + .decrement_key = errcatch_decrement_key, + .is_left_mergeable = errcatch_is_left_mergeable, + .print_item = errcatch_print_item, + .check_item = errcatch_check_item, + + .create_vi = errcatch_create_vi, + .check_left = errcatch_check_left, + .check_right = errcatch_check_right, + .part_size = errcatch_part_size, + .unit_num = errcatch_unit_num, + .print_vi = errcatch_print_vi }; #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) diff --git a/fs/select.c b/fs/select.c index bd4b2ccfd346..d6c652a31e99 100644 --- a/fs/select.c +++ b/fs/select.c @@ -633,10 +633,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, goto out_nofds; alloc_size = 6 * size; - bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN); - if (!bits && alloc_size > PAGE_SIZE) - bits = vmalloc(alloc_size); - + bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } diff --git a/fs/seq_file.c b/fs/seq_file.c index ca69fb99e41a..dc7c2be963ed 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -25,21 +25,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - void *buf; - gfp_t gfp = GFP_KERNEL; - - /* - * For high order allocations, use __GFP_NORETRY to avoid oom-killing - - * it's better to fall back to vmalloc() than to kill things. For small - * allocations, just use GFP_KERNEL which will oom kill, thus no need - * for vmalloc fallback. - */ - if (size > PAGE_SIZE) - gfp |= __GFP_NORETRY | __GFP_NOWARN; - buf = kmalloc(size, gfp); - if (!buf && size > PAGE_SIZE) - buf = vmalloc(size); - return buf; + return kvmalloc(size, GFP_KERNEL); } /** diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 21d36d284735..328e89c2cf83 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -266,7 +266,7 @@ static const struct super_operations tracefs_super_operations = { static int trace_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr trace_files[] = {{""}}; + static const struct tree_descr trace_files[] = {{""}}; struct tracefs_fs_info *fsi; int err; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index b777bddaa1dd..566079d9b402 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -121,7 +121,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = - ubifs_current_time(inode); + current_time(inode); inode->i_mapping->nrpages = 0; switch (mode & S_IFMT) { @@ -285,6 +285,15 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, goto out_dent; } + if (ubifs_crypt_is_encrypted(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; + goto out_inode; + } + done: kfree(dent); fscrypt_free_filename(&nm); @@ -295,6 +304,8 @@ done: d_add(dentry, inode); return NULL; +out_inode: + iput(inode); out_dent: kfree(dent); out_fname: @@ -755,7 +766,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); ihold(inode); - inode->i_ctime = ubifs_current_time(inode); + inode->i_ctime = current_time(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; @@ -830,7 +841,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = ubifs_current_time(dir); + inode->i_ctime = current_time(dir); drop_nlink(inode); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; @@ -934,7 +945,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = ubifs_current_time(dir); + inode->i_ctime = current_time(dir); clear_nlink(inode); drop_nlink(dir); dir->i_size -= sz_change; @@ -1411,7 +1422,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the @i_ctime for inodes on a * rename. */ - time = ubifs_current_time(old_dir); + time = current_time(old_dir); old_inode->i_ctime = time; /* We must adjust parent link count when renaming directories */ @@ -1584,7 +1595,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, lock_4_inodes(old_dir, new_dir, NULL, NULL); - time = ubifs_current_time(old_dir); + time = current_time(old_dir); fst_inode->i_ctime = time; snd_inode->i_ctime = time; old_dir->i_mtime = old_dir->i_ctime = time; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index d9ae86f96df7..2cda3d67e2d0 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1196,7 +1196,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); ui->ui_size = inode->i_size; /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); /* Other attributes may be changed at the same time as well */ do_attr_changes(inode, attr); err = ubifs_jnl_truncate(c, inode, old_size, new_size); @@ -1243,7 +1243,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); if (attr->ia_valid & ATTR_SIZE) { /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); /* 'truncate_setsize()' changed @i_size, update @ui_size */ ui->ui_size = inode->i_size; } @@ -1420,7 +1420,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, */ static int update_mctime(struct inode *inode) { - struct timespec now = ubifs_current_time(inode); + struct timespec now = current_time(inode); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1434,7 +1434,7 @@ static int update_mctime(struct inode *inode) return err; mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); @@ -1511,7 +1511,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; - struct timespec now = ubifs_current_time(inode); + struct timespec now = current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; @@ -1579,7 +1579,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index da519ba205f6..12b9eb5005ff 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -126,7 +126,7 @@ static int setflags(struct inode *inode, int flags) ui->flags = ioctl2ubifs(flags); ubifs_set_inode_flags(inode); - inode->i_ctime = ubifs_current_time(inode); + inode->i_ctime = current_time(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 8ece6ca58c0b..caf83d68fb38 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -225,16 +225,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c, } /** - * ubifs_current_time - round current time to time granularity. - * @inode: inode - */ -static inline struct timespec ubifs_current_time(struct inode *inode) -{ - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? - current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -} - -/** * ubifs_tnc_lookup - look up a file-system node. * @c: UBIFS file-system description object * @key: node key to lookup diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 7f1ead29e727..8c25081a5109 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -84,6 +84,8 @@ static int create_default_filesystem(struct ubifs_info *c) int min_leb_cnt = UBIFS_MIN_LEB_CNT; long long tmp64, main_bytes; __le64 tmp_le64; + __le32 tmp_le32; + struct timespec ts; /* Some functions called from here depend on the @c->key_len filed */ c->key_len = UBIFS_SK_LEN; @@ -298,13 +300,17 @@ static int create_default_filesystem(struct ubifs_info *c) ino->ch.node_type = UBIFS_INO_NODE; ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); ino->nlink = cpu_to_le32(2); - tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); + + ktime_get_real_ts(&ts); + ts = timespec_trunc(ts, DEFAULT_TIME_GRAN); + tmp_le64 = cpu_to_le64(ts.tv_sec); ino->atime_sec = tmp_le64; ino->ctime_sec = tmp_le64; ino->mtime_sec = tmp_le64; - ino->atime_nsec = 0; - ino->ctime_nsec = 0; - ino->mtime_nsec = 0; + tmp_le32 = cpu_to_le32(ts.tv_nsec); + ino->atime_nsec = tmp_le32; + ino->ctime_nsec = tmp_le32; + ino->mtime_nsec = tmp_le32; ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index efe00fcb8b75..3e53fdbf7997 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -152,7 +152,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); + host->i_ctime = current_time(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -234,7 +234,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, mutex_unlock(&ui->ui_mutex); mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); + host->i_ctime = current_time(host); host_ui->xattr_size -= CALC_XATTR_BYTES(old_size); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -488,7 +488,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, return err; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); + host->i_ctime = current_time(host); host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 9774555b3721..d1dd8cc33179 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -176,6 +176,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; struct inode * inode; + struct timespec64 ts; unsigned cg, bit, i, j, start; struct ufs_inode_info *ufsi; int err = -ENOSPC; @@ -323,8 +324,9 @@ cg_found: lock_buffer(bh); ufs2_inode = (struct ufs2_inode *)bh->b_data; ufs2_inode += ufs_inotofsbo(inode->i_ino); - ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec); - ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec); + ktime_get_real_ts64(&ts); + ufs2_inode->ui_birthtime = cpu_to_fs64(sb, ts.tv_sec); + ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec); mark_buffer_dirty(bh); unlock_buffer(bh); if (sb->s_flags & MS_SYNCHRONOUS) diff --git a/fs/xattr.c b/fs/xattr.c index 7e3317cf4045..464c94bf65f9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -431,12 +431,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if (size) { if (size > XATTR_SIZE_MAX) return -E2BIG; - kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!kvalue) { - kvalue = vmalloc(size); - if (!kvalue) - return -ENOMEM; - } + kvalue = kvmalloc(size, GFP_KERNEL); + if (!kvalue) + return -ENOMEM; if (copy_from_user(kvalue, value, size)) { error = -EFAULT; goto out; @@ -528,12 +525,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, if (size) { if (size > XATTR_SIZE_MAX) size = XATTR_SIZE_MAX; - kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!kvalue) { - kvalue = vmalloc(size); - if (!kvalue) - return -ENOMEM; - } + kvalue = kvzalloc(size, GFP_KERNEL); + if (!kvalue) + return -ENOMEM; } error = vfs_getxattr(d, kname, kvalue, size); @@ -611,12 +605,9 @@ listxattr(struct dentry *d, char __user *list, size_t size) if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; - klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL); - if (!klist) { - klist = vmalloc(size); - if (!klist) - return -ENOMEM; - } + klist = kvmalloc(size, GFP_KERNEL); + if (!klist) + return -ENOMEM; } error = vfs_listxattr(d, klist, size); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 26ef1958b65b..5c90f82b8f6b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -79,6 +79,7 @@ xfs-y += xfs_aops.o \ xfs_extent_busy.o \ xfs_file.o \ xfs_filestream.o \ + xfs_fsmap.o \ xfs_fsops.o \ xfs_globals.o \ xfs_icache.o \ diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 780fc8986dab..393b6849aeb3 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -67,7 +67,7 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); - ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 369adcc18c02..7486401ccbd3 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2868,3 +2868,60 @@ err: xfs_trans_brelse(tp, agbp); return error; } + +struct xfs_alloc_query_range_info { + xfs_alloc_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_alloc_query_range_helper( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_alloc_query_range_info *query = priv; + struct xfs_alloc_rec_incore irec; + + irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); + irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); + return query->fn(cur, &irec, query->priv); +} + +/* Find all free space within a given range of blocks. */ +int +xfs_alloc_query_range( + struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *low_rec, + struct xfs_alloc_rec_incore *high_rec, + xfs_alloc_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_brec; + union xfs_btree_irec high_brec; + struct xfs_alloc_query_range_info query; + + ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + low_brec.a = *low_rec; + high_brec.a = *high_rec; + query.priv = priv; + query.fn = fn; + return xfs_btree_query_range(cur, &low_brec, &high_brec, + xfs_alloc_query_range_helper, &query); +} + +/* Find all free space records. */ +int +xfs_alloc_query_all( + struct xfs_btree_cur *cur, + xfs_alloc_query_range_fn fn, + void *priv) +{ + struct xfs_alloc_query_range_info query; + + ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + query.priv = priv; + query.fn = fn; + return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 2a8d0fa6fbbe..77d9c27330ab 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -219,4 +219,16 @@ int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); +typedef int (*xfs_alloc_query_range_fn)( + struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *rec, + void *priv); + +int xfs_alloc_query_range(struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *low_rec, + struct xfs_alloc_rec_incore *high_rec, + xfs_alloc_query_range_fn fn, void *priv); +int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, + void *priv); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index efb467b10a71..e1fcfe7f0a9a 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -205,19 +205,37 @@ xfs_allocbt_init_key_from_rec( union xfs_btree_key *key, union xfs_btree_rec *rec) { - ASSERT(rec->alloc.ar_startblock != 0); - key->alloc.ar_startblock = rec->alloc.ar_startblock; key->alloc.ar_blockcount = rec->alloc.ar_blockcount; } STATIC void +xfs_bnobt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->alloc.ar_startblock); + x += be32_to_cpu(rec->alloc.ar_blockcount) - 1; + key->alloc.ar_startblock = cpu_to_be32(x); + key->alloc.ar_blockcount = 0; +} + +STATIC void +xfs_cntbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->alloc.ar_blockcount = rec->alloc.ar_blockcount; + key->alloc.ar_startblock = 0; +} + +STATIC void xfs_allocbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { - ASSERT(cur->bc_rec.a.ar_startblock != 0); - rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); } @@ -236,18 +254,24 @@ xfs_allocbt_init_ptr_from_cur( } STATIC __int64_t -xfs_allocbt_key_diff( +xfs_bnobt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) { xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; xfs_alloc_key_t *kp = &key->alloc; - __int64_t diff; - if (cur->bc_btnum == XFS_BTNUM_BNO) { - return (__int64_t)be32_to_cpu(kp->ar_startblock) - - rec->ar_startblock; - } + return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; +} + +STATIC __int64_t +xfs_cntbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; + xfs_alloc_key_t *kp = &key->alloc; + __int64_t diff; diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; if (diff) @@ -256,6 +280,33 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } +STATIC __int64_t +xfs_bnobt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (__int64_t)be32_to_cpu(k1->alloc.ar_startblock) - + be32_to_cpu(k2->alloc.ar_startblock); +} + +STATIC __int64_t +xfs_cntbt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + __int64_t diff; + + diff = be32_to_cpu(k1->alloc.ar_blockcount) - + be32_to_cpu(k2->alloc.ar_blockcount); + if (diff) + return diff; + + return be32_to_cpu(k1->alloc.ar_startblock) - + be32_to_cpu(k2->alloc.ar_startblock); +} + static bool xfs_allocbt_verify( struct xfs_buf *bp) @@ -346,44 +397,54 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { #if defined(DEBUG) || defined(XFS_WARN) STATIC int -xfs_allocbt_keys_inorder( +xfs_bnobt_keys_inorder( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - if (cur->bc_btnum == XFS_BTNUM_BNO) { - return be32_to_cpu(k1->alloc.ar_startblock) < - be32_to_cpu(k2->alloc.ar_startblock); - } else { - return be32_to_cpu(k1->alloc.ar_blockcount) < - be32_to_cpu(k2->alloc.ar_blockcount) || - (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && - be32_to_cpu(k1->alloc.ar_startblock) < - be32_to_cpu(k2->alloc.ar_startblock)); - } + return be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock); } STATIC int -xfs_allocbt_recs_inorder( +xfs_bnobt_recs_inorder( struct xfs_btree_cur *cur, union xfs_btree_rec *r1, union xfs_btree_rec *r2) { - if (cur->bc_btnum == XFS_BTNUM_BNO) { - return be32_to_cpu(r1->alloc.ar_startblock) + - be32_to_cpu(r1->alloc.ar_blockcount) <= - be32_to_cpu(r2->alloc.ar_startblock); - } else { - return be32_to_cpu(r1->alloc.ar_blockcount) < - be32_to_cpu(r2->alloc.ar_blockcount) || - (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && - be32_to_cpu(r1->alloc.ar_startblock) < - be32_to_cpu(r2->alloc.ar_startblock)); - } + return be32_to_cpu(r1->alloc.ar_startblock) + + be32_to_cpu(r1->alloc.ar_blockcount) <= + be32_to_cpu(r2->alloc.ar_startblock); +} + +STATIC int +xfs_cntbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->alloc.ar_blockcount) < + be32_to_cpu(k2->alloc.ar_blockcount) || + (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && + be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock)); } -#endif /* DEBUG */ -static const struct xfs_btree_ops xfs_allocbt_ops = { +STATIC int +xfs_cntbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->alloc.ar_blockcount) < + be32_to_cpu(r2->alloc.ar_blockcount) || + (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && + be32_to_cpu(r1->alloc.ar_startblock) < + be32_to_cpu(r2->alloc.ar_startblock)); +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -395,13 +456,39 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec, .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, - .key_diff = xfs_allocbt_key_diff, + .key_diff = xfs_bnobt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, + .diff_two_keys = xfs_bnobt_diff_two_keys, #if defined(DEBUG) || defined(XFS_WARN) - .keys_inorder = xfs_allocbt_keys_inorder, - .recs_inorder = xfs_allocbt_recs_inorder, + .keys_inorder = xfs_bnobt_keys_inorder, + .recs_inorder = xfs_bnobt_recs_inorder, +#endif +}; + +static const struct xfs_btree_ops xfs_cntbt_ops = { + .rec_len = sizeof(xfs_alloc_rec_t), + .key_len = sizeof(xfs_alloc_key_t), + + .dup_cursor = xfs_allocbt_dup_cursor, + .set_root = xfs_allocbt_set_root, + .alloc_block = xfs_allocbt_alloc_block, + .free_block = xfs_allocbt_free_block, + .update_lastrec = xfs_allocbt_update_lastrec, + .get_minrecs = xfs_allocbt_get_minrecs, + .get_maxrecs = xfs_allocbt_get_maxrecs, + .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, + .key_diff = xfs_cntbt_key_diff, + .buf_ops = &xfs_allocbt_buf_ops, + .diff_two_keys = xfs_cntbt_diff_two_keys, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_cntbt_keys_inorder, + .recs_inorder = xfs_cntbt_recs_inorder, #endif }; @@ -427,16 +514,15 @@ xfs_allocbt_init_cursor( cur->bc_mp = mp; cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_allocbt_ops; - if (btnum == XFS_BTNUM_BNO) - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); - else - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); if (btnum == XFS_BTNUM_CNT) { + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); + cur->bc_ops = &xfs_cntbt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; } else { + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + cur->bc_ops = &xfs_bnobt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9bd104f32908..f02eb7673392 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -764,7 +764,6 @@ xfs_bmap_extents_to_btree( args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { args.type = XFS_ALLOCTYPE_START_BNO; -try_another_ag: args.fsbno = *firstblock; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -779,20 +778,6 @@ try_another_ag: return error; } - /* - * During a CoW operation, the allocation and bmbt updates occur in - * different transactions. The mapping code tries to put new bmbt - * blocks near extents being mapped, but the only way to guarantee this - * is if the alloc and the mapping happen in a single transaction that - * has a block reservation. That isn't the case here, so if we run out - * of space we'll try again with another AG. - */ - if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && - args.fsbno == NULLFSBLOCK && - args.type == XFS_ALLOCTYPE_NEAR_BNO) { - args.type = XFS_ALLOCTYPE_FIRST_AG; - goto try_another_ag; - } if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { xfs_iroot_realloc(ip, -1, whichfork); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -925,7 +910,6 @@ xfs_bmap_local_to_extents( * file currently fits in an inode. */ if (*firstblock == NULLFSBLOCK) { -try_another_ag: args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); args.type = XFS_ALLOCTYPE_START_BNO; } else { @@ -938,19 +922,6 @@ try_another_ag: if (error) goto done; - /* - * During a CoW operation, the allocation and bmbt updates occur in - * different transactions. The mapping code tries to put new bmbt - * blocks near extents being mapped, but the only way to guarantee this - * is if the alloc and the mapping happen in a single transaction that - * has a block reservation. That isn't the case here, so if we run out - * of space we'll try again with another AG. - */ - if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) && - args.fsbno == NULLFSBLOCK && - args.type == XFS_ALLOCTYPE_NEAR_BNO) { - goto try_another_ag; - } /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); @@ -1260,7 +1231,6 @@ xfs_bmap_read_extents( xfs_fsblock_t bno; /* block # of "block" */ xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ - xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ xfs_extnum_t i, j; /* index into the extents list */ xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ @@ -1271,8 +1241,6 @@ xfs_bmap_read_extents( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : - XFS_EXTFMT_INODE(ip); block = ifp->if_broot; /* * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. @@ -1340,18 +1308,9 @@ xfs_bmap_read_extents( xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); trp->l0 = be64_to_cpu(frp->l0); trp->l1 = be64_to_cpu(frp->l1); - } - if (exntf == XFS_EXTFMT_NOSTATE) { - /* - * Check all attribute bmap btree records and - * any "older" data bmap btree records for a - * set bit in the "extent flag" position. - */ - if (unlikely(xfs_check_nostate_extents(ifp, - start, num_recs))) { + if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) { XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); + XFS_ERRLEVEL_LOW, mp); goto error0; } } @@ -2879,27 +2838,30 @@ xfs_bmap_add_extent_hole_delay( */ STATIC int /* error */ xfs_bmap_add_extent_hole_real( - struct xfs_bmalloca *bma, - int whichfork) + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_extnum_t *idx, + struct xfs_btree_cur **curp, + struct xfs_bmbt_irec *new, + xfs_fsblock_t *first, + struct xfs_defer_ops *dfops, + int *logflagsp) { - struct xfs_bmbt_irec *new = &bma->got; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_cur *cur = *curp; int error; /* error return value */ int i; /* temp state */ - xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - struct xfs_mount *mp; - mp = bma->ip->i_mount; - ifp = XFS_IFORK_PTR(bma->ip, whichfork); - - ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= xfs_iext_count(ifp)); + ASSERT(*idx >= 0); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!bma->cur || - !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2912,9 +2874,9 @@ xfs_bmap_add_extent_hole_real( /* * Check and set flags if this segment has a left neighbor. */ - if (bma->idx > 0) { + if (*idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -2923,9 +2885,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (bma->idx < xfs_iext_count(ifp)) { + if (*idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -2962,36 +2924,36 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --bma->idx; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + xfs_iext_remove(ip, *idx + 1, 1, state); - XFS_IFORK_NEXT_SET(bma->ip, whichfork, - XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); - if (bma->cur == NULL) { + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + error = xfs_bmbt_lookup_eq(cur, right.br_startoff, right.br_startblock, right.br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_btree_delete(bma->cur, &i); + error = xfs_btree_delete(cur, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_btree_decrement(bma->cur, 0, &i); + error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, left.br_startoff, + error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount + @@ -3008,23 +2970,23 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --bma->idx; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (bma->cur == NULL) { + if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, left.br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, left.br_startoff, + error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount, @@ -3040,25 +3002,25 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (bma->cur == NULL) { + if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, + error = xfs_bmbt_lookup_eq(cur, right.br_startoff, right.br_startblock, right.br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, new->br_startoff, + error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, @@ -3074,22 +3036,22 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(bma->ip, bma->idx, 1, new, state); - XFS_IFORK_NEXT_SET(bma->ip, whichfork, - XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); - if (bma->cur == NULL) { + xfs_iext_insert(ip, *idx, 1, new, state); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, + error = xfs_bmbt_lookup_eq(cur, new->br_startoff, new->br_startblock, new->br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = new->br_state; - error = xfs_btree_insert(bma->cur, &i); + cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(cur, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -3098,30 +3060,30 @@ xfs_bmap_add_extent_hole_real( } /* add reverse mapping */ - error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new); + error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new); if (error) goto done; /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, whichfork)) { + if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ - ASSERT(bma->cur == NULL); - error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->dfops, &bma->cur, + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, curp, 0, &tmp_logflags, whichfork); - bma->logflags |= tmp_logflags; + *logflagsp |= tmp_logflags; + cur = *curp; if (error) goto done; } /* clear out the allocated field, done with it now in any case. */ - if (bma->cur) - bma->cur->bc_private.b.allocated = 0; + if (cur) + cur->bc_private.b.allocated = 0; - xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); + xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: - bma->logflags |= rval; + *logflagsp |= rval; return error; } @@ -3853,60 +3815,6 @@ xfs_bmap_btalloc( } /* - * For a remap operation, just "allocate" an extent at the address that the - * caller passed in, and ensure that the AGFL is the right size. The caller - * will then map the "allocated" extent into the file somewhere. - */ -STATIC int -xfs_bmap_remap_alloc( - struct xfs_bmalloca *ap) -{ - struct xfs_trans *tp = ap->tp; - struct xfs_mount *mp = tp->t_mountp; - xfs_agblock_t bno; - struct xfs_alloc_arg args; - int error; - - /* - * validate that the block number is legal - the enables us to detect - * and handle a silent filesystem corruption rather than crashing. - */ - memset(&args, 0, sizeof(struct xfs_alloc_arg)); - args.tp = ap->tp; - args.mp = ap->tp->t_mountp; - bno = *ap->firstblock; - args.agno = XFS_FSB_TO_AGNO(mp, bno); - args.agbno = XFS_FSB_TO_AGBNO(mp, bno); - if (args.agno >= mp->m_sb.sb_agcount || - args.agbno >= mp->m_sb.sb_agblocks) - return -EFSCORRUPTED; - - /* "Allocate" the extent from the range we passed in. */ - trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length); - ap->blkno = bno; - ap->ip->i_d.di_nblocks += ap->length; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - - /* Fix the freelist, like a real allocator does. */ - args.datatype = ap->datatype; - args.pag = xfs_perag_get(args.mp, args.agno); - ASSERT(args.pag); - - /* - * The freelist fixing code will decline the allocation if - * the size and shape of the free space doesn't allow for - * allocating the extent and updating all the metadata that - * happens during an allocation. We're remapping, not - * allocating, so skip that check by pretending to be freeing. - */ - error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); - xfs_perag_put(args.pag); - if (error) - trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_); - return error; -} - -/* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. */ @@ -3914,8 +3822,6 @@ STATIC int xfs_bmap_alloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - if (ap->flags & XFS_BMAPI_REMAP) - return xfs_bmap_remap_alloc(ap); if (XFS_IS_REALTIME_INODE(ap->ip) && xfs_alloc_is_userdata(ap->datatype)) return xfs_bmap_rtalloc(ap); @@ -4386,7 +4292,9 @@ xfs_bmapi_allocate( if (bma->wasdel) error = xfs_bmap_add_extent_delay_real(bma, whichfork); else - error = xfs_bmap_add_extent_hole_real(bma, whichfork); + error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, + whichfork, &bma->idx, &bma->cur, &bma->got, + bma->firstblock, bma->dfops, &bma->logflags); bma->logflags |= tmp_logflags; if (error) @@ -4549,9 +4457,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); - ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); - ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_REMAP)); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4635,13 +4541,8 @@ xfs_bmapi_write( } else { need_alloc = true; } - } else { - /* - * Make sure we only reflink into a hole. - */ - ASSERT(!(flags & XFS_BMAPI_REMAP)); - if (isnullstartblock(bma.got.br_startblock)) - wasdelay = true; + } else if (isnullstartblock(bma.got.br_startblock)) { + wasdelay = true; } /* @@ -4770,6 +4671,93 @@ error0: return error; } +static int +xfs_bmapi_remap( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + xfs_fsblock_t startblock, + struct xfs_defer_ops *dfops) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_btree_cur *cur = NULL; + xfs_fsblock_t firstblock = NULLFSBLOCK; + struct xfs_bmbt_irec got; + xfs_extnum_t idx; + int logflags = 0, error; + + ASSERT(len > 0); + ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) { + /* make sure we only reflink into a hole. */ + ASSERT(got.br_startoff > bno); + ASSERT(got.br_startoff - bno >= len); + } + + ip->i_d.di_nblocks += len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); + cur->bc_private.b.firstblock = firstblock; + cur->bc_private.b.dfops = dfops; + cur->bc_private.b.flags = 0; + } + + got.br_startoff = bno; + got.br_startblock = startblock; + got.br_blockcount = len; + got.br_state = XFS_EXT_NORM; + + error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur, + &got, &firstblock, dfops, &logflags); + if (error) + goto error0; + + if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) { + int tmp_logflags = 0; + + error = xfs_bmap_btree_to_extents(tp, ip, cur, + &tmp_logflags, XFS_DATA_FORK); + logflags |= tmp_logflags; + } + +error0: + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) + logflags &= ~XFS_ILOG_DEXT; + else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + logflags &= ~XFS_ILOG_DBROOT; + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (cur) { + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + /* * When a delalloc extent is split (e.g., due to a hole punch), the original * indlen reservation must be shared across the two new extents that are left @@ -4887,7 +4875,7 @@ xfs_bmap_del_extent_delay( ASSERT(got_endoff >= del_endoff); if (isrt) { - int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); + uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_frextents(mp, rtexts); @@ -6488,27 +6476,15 @@ xfs_bmap_finish_one( xfs_filblks_t blockcount, xfs_exntst_t state) { - struct xfs_bmbt_irec bmap; - int nimaps = 1; - xfs_fsblock_t firstfsb; - int flags = XFS_BMAPI_REMAP; - int done; - int error = 0; - - bmap.br_startblock = startblock; - bmap.br_startoff = startoff; - bmap.br_blockcount = blockcount; - bmap.br_state = state; + int error = 0, done; trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), ip->i_ino, whichfork, startoff, blockcount, state); - if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) + if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) return -EFSCORRUPTED; - if (whichfork == XFS_ATTR_FORK) - flags |= XFS_BMAPI_ATTRFORK; if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE, @@ -6517,16 +6493,12 @@ xfs_bmap_finish_one( switch (type) { case XFS_BMAP_MAP: - firstfsb = bmap.br_startblock; - error = xfs_bmapi_write(tp, ip, bmap.br_startoff, - bmap.br_blockcount, flags, &firstfsb, - bmap.br_blockcount, &bmap, &nimaps, - dfops); + error = xfs_bmapi_remap(tp, ip, startoff, blockcount, + startblock, dfops); break; case XFS_BMAP_UNMAP: - error = xfs_bunmapi(tp, ip, bmap.br_startoff, - bmap.br_blockcount, flags, 1, &firstfsb, - dfops, &done); + error = xfs_bunmapi(tp, ip, startoff, blockcount, + XFS_BMAPI_REMAP, 1, &startblock, dfops, &done); ASSERT(done); break; default: diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index cdef87db5262..c35a14fa1527 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -172,6 +172,18 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) /* + * Return true if the extent is a real, allocated extent, or false if it is a + * delayed allocation, and unwritten extent or a hole. + */ +static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +{ + return irec->br_state != XFS_EXT_UNWRITTEN && + irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !isnullstartblock(irec->br_startblock); +} + +/* * This macro is used to determine how many extents will be shifted * in one write transaction. We could require two splits, * an extent move on the first and an extent merge on the second, @@ -232,8 +244,6 @@ int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *del); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); -int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, - xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index fd55db479385..6cba69aff077 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -366,32 +366,6 @@ xfs_bmbt_to_bmdr( memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } -/* - * Check extent records, which have just been read, for - * any bit in the extent flag field. ASSERT on debug - * kernels, as this condition should not occur. - * Return an error condition (1) if any flags found, - * otherwise return 0. - */ - -int -xfs_check_nostate_extents( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - xfs_extnum_t num) -{ - for (; num > 0; num--, idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - if ((ep->l0 >> - (64 - BMBT_EXNTFLAG_BITLEN)) != 0) { - ASSERT(0); - return 1; - } - } - return 0; -} - - STATIC struct xfs_btree_cur * xfs_bmbt_dup_cursor( struct xfs_btree_cur *cur) @@ -448,7 +422,6 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); args.type = XFS_ALLOCTYPE_START_BNO; -try_another_ag: /* * Make sure there is sufficient room left in the AG to * complete a full tree split for an extent insert. If @@ -477,22 +450,6 @@ try_another_ag: if (error) goto error0; - /* - * During a CoW operation, the allocation and bmbt updates occur in - * different transactions. The mapping code tries to put new bmbt - * blocks near extents being mapped, but the only way to guarantee this - * is if the alloc and the mapping happen in a single transaction that - * has a block reservation. That isn't the case here, so if we run out - * of space we'll try again with another AG. - */ - if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && - args.fsbno == NULLFSBLOCK && - args.type == XFS_ALLOCTYPE_NEAR_BNO) { - args.fsbno = cur->bc_private.b.firstblock; - args.type = XFS_ALLOCTYPE_FIRST_AG; - goto try_another_ag; - } - if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 819a8a4dee95..9da5a8d4f184 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -25,14 +25,6 @@ struct xfs_inode; struct xfs_trans; /* - * Extent state and extent format macros. - */ -#define XFS_EXTFMT_INODE(x) \ - (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \ - XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE) -#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) - -/* * Btree block header size depends on a superblock flag. */ #define XFS_BMBT_BLOCK_LEN(mp) \ @@ -140,4 +132,18 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +/* + * Check that the extent does not contain an invalid unwritten extent flag. + */ +static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork, + struct xfs_bmbt_rec_host *ep) +{ + if (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN) == 0) + return true; + if (whichfork == XFS_DATA_FORK && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + return true; + return false; +} + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 3059a3ec7ecb..5392674bf893 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4842,6 +4842,21 @@ xfs_btree_query_range( fn, priv); } +/* Query a btree for all records. */ +int +xfs_btree_query_all( + struct xfs_btree_cur *cur, + xfs_btree_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_rec; + union xfs_btree_irec high_rec; + + memset(&low_rec, 0, sizeof(low_rec)); + memset(&high_rec, 0xFF, sizeof(high_rec)); + return xfs_btree_query_range(cur, &low_rec, &high_rec, fn, priv); +} + /* * Calculate the number of blocks needed to store a given number of records * in a short-format (per-AG metadata) btree. diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4bb62580a7fd..27bed08261c5 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -496,6 +496,8 @@ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, int xfs_btree_query_range(struct xfs_btree_cur *cur, union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv); +int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, + void *priv); typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, void *data); diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index ac9a003dd29a..747085b4ef44 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -35,13 +35,8 @@ int xfs_calc_dquots_per_chunk( unsigned int nbblks) /* basic block units */ { - unsigned int ndquots; - ASSERT(nbblks > 0); - ndquots = BBTOB(nbblks); - do_div(ndquots, sizeof(xfs_dqblk_t)); - - return ndquots; + return BBTOB(nbblks) / sizeof(xfs_dqblk_t); } /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 6b7579e7b60a..a1dccd8d96bc 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -930,10 +930,8 @@ static inline uint xfs_dinode_size(int version) /* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. * Since the pathconf interface is signed, we use 2^31 - 1 instead. - * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. */ #define XFS_MAXLINK ((1U << 31) - 1U) -#define XFS_MAXLINK_1 65535U /* * Values for di_format @@ -1578,19 +1576,10 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x) } /* - * Possible extent formats. - */ -typedef enum { - XFS_EXTFMT_NOSTATE = 0, - XFS_EXTFMT_HASSTATE -} xfs_exntfmt_t; - -/* * Possible extent states. */ typedef enum { XFS_EXT_NORM, XFS_EXT_UNWRITTEN, - XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID } xfs_exntst_t; /* diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b72dc821d78b..095bdf049a3f 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -92,6 +92,18 @@ struct getbmapx { #define BMV_OF_LAST 0x4 /* segment is the last in the file */ #define BMV_OF_SHARED 0x8 /* segment shared with another file */ +/* fmr_owner special values for FS_IOC_GETFSMAP */ +#define XFS_FMR_OWN_FREE FMR_OWN_FREE /* free space */ +#define XFS_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */ +#define XFS_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */ +#define XFS_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */ +#define XFS_FMR_OWN_AG FMR_OWNER('X', 3) /* per-AG metadata */ +#define XFS_FMR_OWN_INOBT FMR_OWNER('X', 4) /* inode btree blocks */ +#define XFS_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ +#define XFS_FMR_OWN_REFC FMR_OWNER('X', 6) /* refcount tree */ +#define XFS_FMR_OWN_COW FMR_OWNER('X', 7) /* cow staging */ +#define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */ + /* * Structure for XFS_IOC_FSSETDM. * For use by backup and restore programs to set the XFS on-disk inode @@ -502,6 +514,7 @@ typedef struct xfs_swapext #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) #define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) +/* XFS_IOC_GETFSMAP ------ hoisted 59 */ /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d93f9d918cfc..09c3d1aecef2 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -508,7 +508,7 @@ xfs_iread( /* even unallocated inodes are verified */ if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { - xfs_alert(mp, "%s: validation failed for inode %lld failed", + xfs_alert(mp, "%s: validation failed for inode %lld", __func__, ip->i_ino); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 8a37efe04de3..0e80f34fe97c 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -42,35 +42,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); -#ifdef DEBUG -/* - * Make sure that the extents in the given memory buffer - * are valid. - */ -void -xfs_validate_extents( - xfs_ifork_t *ifp, - int nrecs, - xfs_exntfmt_t fmt) -{ - xfs_bmbt_irec_t irec; - xfs_bmbt_rec_host_t rec; - int i; - - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - rec.l0 = get_unaligned(&ep->l0); - rec.l1 = get_unaligned(&ep->l1); - xfs_bmbt_get_all(&rec, &irec); - if (fmt == XFS_EXTFMT_NOSTATE) - ASSERT(irec.br_state == XFS_EXT_NORM); - } -} -#else /* DEBUG */ -#define xfs_validate_extents(ifp, nrecs, fmt) -#endif /* DEBUG */ - - /* * Move inode type and inode format specific information from the * on-disk inode to the in-core inode. For fifos, devs, and sockets @@ -352,40 +323,33 @@ xfs_iformat_local( } /* - * The file consists of a set of extents all - * of which fit into the on-disk inode. - * If there are few enough extents to fit into - * the if_inline_ext, then copy them there. - * Otherwise allocate a buffer for them and copy - * them into it. Either way, set if_extents - * to point at the extents. + * The file consists of a set of extents all of which fit into the on-disk + * inode. If there are few enough extents to fit into the if_inline_ext, then + * copy them there. Otherwise allocate a buffer for them and copy them into it. + * Either way, set if_extents to point at the extents. */ STATIC int xfs_iformat_extents( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) + struct xfs_inode *ip, + struct xfs_dinode *dip, + int whichfork) { - xfs_bmbt_rec_t *dp; - xfs_ifork_t *ifp; - int nex; - int size; - int i; - - ifp = XFS_IFORK_PTR(ip, whichfork); - nex = XFS_DFORK_NEXTENTS(dip, whichfork); - size = nex * (uint)sizeof(xfs_bmbt_rec_t); + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int nex = XFS_DFORK_NEXTENTS(dip, whichfork); + int size = nex * sizeof(xfs_bmbt_rec_t); + struct xfs_bmbt_rec *dp; + int i; /* - * If the number of extents is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. + * If the number of extents is unreasonable, then something is wrong and + * we just bail out rather than crash in kmem_alloc() or memcpy() below. */ - if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", (unsigned long long) ip->i_ino, nex); XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); + mp, dip); return -EFSCORRUPTED; } @@ -400,22 +364,17 @@ xfs_iformat_extents( ifp->if_bytes = size; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); - xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); for (i = 0; i < nex; i++, dp++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); ep->l0 = get_unaligned_be64(&dp->l0); ep->l1 = get_unaligned_be64(&dp->l1); + if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } } XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); - if (whichfork != XFS_DATA_FORK || - XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) - if (unlikely(xfs_check_nostate_extents( - ifp, 0, nex))) { - XFS_ERROR_REPORT("xfs_iformat_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - return -EFSCORRUPTED; - } } ifp->if_flags |= XFS_IFEXTENTS; return 0; @@ -518,7 +477,6 @@ xfs_iread_extents( xfs_iext_destroy(ifp); return error; } - xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); ifp->if_flags |= XFS_IFEXTENTS; return 0; } @@ -837,6 +795,9 @@ xfs_iextents_copy( copied = 0; for (i = 0; i < nrecs; i++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + + ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep)); + start_block = xfs_bmbt_get_startblock(ep); if (isnullstartblock(start_block)) { /* @@ -852,7 +813,6 @@ xfs_iextents_copy( copied++; } ASSERT(copied != 0); - xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); return (copied * (uint)sizeof(xfs_bmbt_rec_t)); } diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 3a8cc7139912..06cfb93c2ef9 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2001,14 +2001,14 @@ xfs_rmap_query_range_helper( /* Find all rmaps between two keys. */ int xfs_rmap_query_range( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *low_rec, - struct xfs_rmap_irec *high_rec, - xfs_rmap_query_range_fn fn, - void *priv) + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *low_rec, + struct xfs_rmap_irec *high_rec, + xfs_rmap_query_range_fn fn, + void *priv) { - union xfs_btree_irec low_brec; - union xfs_btree_irec high_brec; + union xfs_btree_irec low_brec; + union xfs_btree_irec high_brec; struct xfs_rmap_query_range_info query; low_brec.r = *low_rec; @@ -2019,6 +2019,20 @@ xfs_rmap_query_range( xfs_rmap_query_range_helper, &query); } +/* Find all rmaps. */ +int +xfs_rmap_query_all( + struct xfs_btree_cur *cur, + xfs_rmap_query_range_fn fn, + void *priv) +{ + struct xfs_rmap_query_range_info query; + + query.priv = priv; + query.fn = fn; + return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query); +} + /* Clean up after calling xfs_rmap_finish_one. */ void xfs_rmap_finish_one_cleanup( @@ -2291,3 +2305,31 @@ xfs_rmap_free_extent( return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); } + +/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ +int +xfs_rmap_compare( + const struct xfs_rmap_irec *a, + const struct xfs_rmap_irec *b) +{ + __u64 oa; + __u64 ob; + + oa = xfs_rmap_irec_offset_pack(a); + ob = xfs_rmap_irec_offset_pack(b); + + if (a->rm_startblock < b->rm_startblock) + return -1; + else if (a->rm_startblock > b->rm_startblock) + return 1; + else if (a->rm_owner < b->rm_owner) + return -1; + else if (a->rm_owner > b->rm_owner) + return 1; + else if (oa < ob) + return -1; + else if (oa > ob) + return 1; + else + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 789930599339..98f908fea103 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -162,6 +162,8 @@ typedef int (*xfs_rmap_query_range_fn)( int xfs_rmap_query_range(struct xfs_btree_cur *cur, struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec, xfs_rmap_query_range_fn fn, void *priv); +int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn, + void *priv); enum xfs_rmap_intent_type { XFS_RMAP_MAP, @@ -212,5 +214,7 @@ int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, struct xfs_rmap_irec *irec, int *stat); +int xfs_rmap_compare(const struct xfs_rmap_irec *a, + const struct xfs_rmap_irec *b); #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index ea45584a9913..e47b99e59f60 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1016,3 +1016,73 @@ xfs_rtfree_extent( } return 0; } + +/* Find all the free records within a given range. */ +int +xfs_rtalloc_query_range( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *low_rec, + struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, + void *priv) +{ + struct xfs_rtalloc_rec rec; + struct xfs_mount *mp = tp->t_mountp; + xfs_rtblock_t rtstart; + xfs_rtblock_t rtend; + xfs_rtblock_t rem; + int is_free; + int error = 0; + + if (low_rec->ar_startblock > high_rec->ar_startblock) + return -EINVAL; + else if (low_rec->ar_startblock == high_rec->ar_startblock) + return 0; + + /* Iterate the bitmap, looking for discrepancies. */ + rtstart = low_rec->ar_startblock; + rem = high_rec->ar_startblock - rtstart; + while (rem) { + /* Is the first block free? */ + error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, + &is_free); + if (error) + break; + + /* How long does the extent go for? */ + error = xfs_rtfind_forw(mp, tp, rtstart, + high_rec->ar_startblock - 1, &rtend); + if (error) + break; + + if (is_free) { + rec.ar_startblock = rtstart; + rec.ar_blockcount = rtend - rtstart + 1; + + error = fn(tp, &rec, priv); + if (error) + break; + } + + rem -= rtend - rtstart + 1; + rtstart = rtend + 1; + } + + return error; +} + +/* Find all the free records. */ +int +xfs_rtalloc_query_all( + struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, + void *priv) +{ + struct xfs_rtalloc_rec keys[2]; + + keys[0].ar_startblock = 0; + keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks; + keys[0].ar_blockcount = keys[1].ar_blockcount = 0; + + return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); +} diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 7917f6e44286..d787c677d2a3 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -21,8 +21,20 @@ /* * Components of space reservations. */ + +/* Worst case number of rmaps that can be held in a block. */ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) + +/* Adding one rmap could split every level up to the top of the tree. */ +#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels) + +/* Blocks we might need to add "b" rmaps to a tree. */ +#define XFS_NRMAPADD_SPACE_RES(mp, b)\ + (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ + XFS_RMAPADD_SPACE_RES(mp)) + #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) #define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) @@ -30,13 +42,12 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) + +/* Blocks we might need to add "b" mappings & rmappings to a file. */ #define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\ - (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ - XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ - XFS_EXTENTADD_SPACE_RES(mp,w) + \ - ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ - XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ - (mp)->m_rmap_maxlevels) + (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \ + XFS_NRMAPADD_SPACE_RES((mp), (b))) + #define XFS_DAENTER_1B(mp,w) \ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 05eca126c688..09af0f7cd55e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -111,11 +111,11 @@ xfs_finish_page_writeback( bsize = bh->b_size; do { + if (off > end) + break; next = bh->b_this_page; if (off < bvec->bv_offset) goto next_bh; - if (off > end) - break; bh->b_end_io(bh, !error); next_bh: off += bsize; @@ -1261,8 +1261,8 @@ xfs_get_blocks( if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, - ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN - : XFS_IO_OVERWRITE, &imap); + imap.br_state == XFS_EXT_UNWRITTEN ? + XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); @@ -1276,9 +1276,7 @@ xfs_get_blocks( * For unwritten extents do not report a disk address in the buffered * read case (treat as if we're reading into a hole). */ - if (imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(&imap)) + if (xfs_bmap_is_real_extent(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); /* diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 9bf57c76623b..d419d23fa214 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -34,6 +34,8 @@ #include "xfs_bmap.h" #include "xfs_icache.h" #include "xfs_trace.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" kmem_zone_t *xfs_bui_zone; @@ -215,6 +217,7 @@ void xfs_bui_release( struct xfs_bui_log_item *buip) { + ASSERT(atomic_read(&buip->bui_refcount) > 0); if (atomic_dec_and_test(&buip->bui_refcount)) { xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); xfs_bui_item_free(buip); @@ -446,7 +449,8 @@ xfs_bui_recover( return -EIO; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); if (error) return error; budp = xfs_trans_get_bud(tp, buip); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 8795e9cd867c..2b954308a1d6 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -448,10 +448,9 @@ xfs_getbmap_adjust_shared( next_map->br_blockcount = 0; /* Only written data blocks can be shared. */ - if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK || - map->br_startblock == DELAYSTARTBLOCK || - map->br_startblock == HOLESTARTBLOCK || - ISUNWRITTEN(map)) + if (!xfs_is_reflink_inode(ip) || + whichfork != XFS_DATA_FORK || + !xfs_bmap_is_real_extent(map)) return 0; agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); @@ -904,9 +903,9 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) } /* - * This is called by xfs_inactive to free any blocks beyond eof - * when the link count isn't zero and by xfs_dm_punch_hole() when - * punching a hole to EOF. + * This is called to free any blocks beyond eof. The caller must hold + * IOLOCK_EXCL unless we are in the inode reclaim path and have the only + * reference to the inode. */ int xfs_free_eofblocks( @@ -921,8 +920,6 @@ xfs_free_eofblocks( struct xfs_bmbt_irec imap; struct xfs_mount *mp = ip->i_mount; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - /* * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. @@ -1209,11 +1206,8 @@ xfs_adjust_extent_unmap_boundaries( return error; if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); + mod = do_mod(imap.br_startblock, mp->m_sb.sb_rextsize); if (mod) *startoffset_fsb += mp->m_sb.sb_rextsize - mod; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ca09061369cb..62fa39276a24 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1079,6 +1079,8 @@ void xfs_buf_unlock( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1815,6 +1817,28 @@ error: } /* + * Cancel a delayed write list. + * + * Remove each buffer from the list, clear the delwri queue flag and drop the + * associated buffer reference. + */ +void +xfs_buf_delwri_cancel( + struct list_head *list) +{ + struct xfs_buf *bp; + + while (!list_empty(list)) { + bp = list_first_entry(list, struct xfs_buf, b_list); + + xfs_buf_lock(bp); + bp->b_flags &= ~_XBF_DELWRI_Q; + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } +} + +/* * Add a buffer to the delayed write list. * * This queues a buffer for writeout if it hasn't already been. Note that diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 3c867e5a63e1..8d1d44f87ce9 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -291,7 +291,6 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); @@ -330,6 +329,7 @@ extern void *xfs_buf_offset(struct xfs_buf *, size_t); extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ +extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index ad9396e516f6..20b7a5c6eb2f 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -394,6 +394,7 @@ xfs_dir2_leaf_readbuf( /* * Do we need more readahead? + * Each loop tries to process 1 full dir blk; last may be partial. */ blk_start_plug(&plug); for (mip->ra_index = mip->ra_offset = i = 0; @@ -404,7 +405,8 @@ xfs_dir2_leaf_readbuf( * Read-ahead a contiguous directory block. */ if (i > mip->ra_current && - map[mip->ra_index].br_blockcount >= geo->fsbcount) { + (map[mip->ra_index].br_blockcount - mip->ra_offset) >= + geo->fsbcount) { xfs_dir3_data_readahead(dp, map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(dp->i_mount, @@ -425,14 +427,19 @@ xfs_dir2_leaf_readbuf( } /* - * Advance offset through the mapping table. + * Advance offset through the mapping table, processing a full + * dir block even if it is fragmented into several extents. + * But stop if we have consumed all valid mappings, even if + * it's not yet a full directory block. */ - for (j = 0; j < geo->fsbcount; j += length ) { + for (j = 0; + j < geo->fsbcount && mip->ra_index < mip->map_valid; + j += length ) { /* * The rest of this extent but not more than a dir * block. */ - length = min_t(int, geo->fsbcount, + length = min_t(int, geo->fsbcount - j, map[mip->ra_index].br_blockcount - mip->ra_offset); mip->ra_offset += length; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index d796ffac7296..6a05d278da64 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -132,6 +132,11 @@ next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) goto out_del_cursor; + + if (fatal_signal_pending(current)) { + error = -ERESTARTSYS; + goto out_del_cursor; + } } out_del_cursor: @@ -196,8 +201,11 @@ xfs_ioc_trim( for (agno = start_agno; agno <= end_agno; agno++) { error = xfs_trim_extents(mp, agno, start, end, minlen, &blocks_trimmed); - if (error) + if (error) { last_error = error; + if (error == -ERESTARTSYS) + break; + } } if (last_error) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d7bc14906af8..44f8c5451210 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -290,6 +290,7 @@ void xfs_efi_release( struct xfs_efi_log_item *efip) { + ASSERT(atomic_read(&efip->efi_refcount) > 0); if (atomic_dec_and_test(&efip->efi_refcount)) { xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c new file mode 100644 index 000000000000..3683819887a5 --- /dev/null +++ b/fs/xfs/xfs_fsmap.c @@ -0,0 +1,940 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_rmap.h" +#include "xfs_alloc.h" +#include "xfs_bit.h" +#include <linux/fsmap.h> +#include "xfs_fsmap.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_rtalloc.h" + +/* Convert an xfs_fsmap to an fsmap. */ +void +xfs_fsmap_from_internal( + struct fsmap *dest, + struct xfs_fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = BBTOB(src->fmr_physical); + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = BBTOB(src->fmr_offset); + dest->fmr_length = BBTOB(src->fmr_length); + dest->fmr_reserved[0] = 0; + dest->fmr_reserved[1] = 0; + dest->fmr_reserved[2] = 0; +} + +/* Convert an fsmap to an xfs_fsmap. */ +void +xfs_fsmap_to_internal( + struct xfs_fsmap *dest, + struct fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = BTOBBT(src->fmr_physical); + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = BTOBBT(src->fmr_offset); + dest->fmr_length = BTOBBT(src->fmr_length); +} + +/* Convert an fsmap owner into an rmapbt owner. */ +static int +xfs_fsmap_owner_to_rmap( + struct xfs_rmap_irec *dest, + struct xfs_fsmap *src) +{ + if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) { + dest->rm_owner = src->fmr_owner; + return 0; + } + + switch (src->fmr_owner) { + case 0: /* "lowest owner id possible" */ + case -1ULL: /* "highest owner id possible" */ + dest->rm_owner = 0; + break; + case XFS_FMR_OWN_FREE: + dest->rm_owner = XFS_RMAP_OWN_NULL; + break; + case XFS_FMR_OWN_UNKNOWN: + dest->rm_owner = XFS_RMAP_OWN_UNKNOWN; + break; + case XFS_FMR_OWN_FS: + dest->rm_owner = XFS_RMAP_OWN_FS; + break; + case XFS_FMR_OWN_LOG: + dest->rm_owner = XFS_RMAP_OWN_LOG; + break; + case XFS_FMR_OWN_AG: + dest->rm_owner = XFS_RMAP_OWN_AG; + break; + case XFS_FMR_OWN_INOBT: + dest->rm_owner = XFS_RMAP_OWN_INOBT; + break; + case XFS_FMR_OWN_INODES: + dest->rm_owner = XFS_RMAP_OWN_INODES; + break; + case XFS_FMR_OWN_REFC: + dest->rm_owner = XFS_RMAP_OWN_REFC; + break; + case XFS_FMR_OWN_COW: + dest->rm_owner = XFS_RMAP_OWN_COW; + break; + case XFS_FMR_OWN_DEFECTIVE: /* not implemented */ + /* fall through */ + default: + return -EINVAL; + } + return 0; +} + +/* Convert an rmapbt owner into an fsmap owner. */ +static int +xfs_fsmap_owner_from_rmap( + struct xfs_fsmap *dest, + struct xfs_rmap_irec *src) +{ + dest->fmr_flags = 0; + if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) { + dest->fmr_owner = src->rm_owner; + return 0; + } + dest->fmr_flags |= FMR_OF_SPECIAL_OWNER; + + switch (src->rm_owner) { + case XFS_RMAP_OWN_FS: + dest->fmr_owner = XFS_FMR_OWN_FS; + break; + case XFS_RMAP_OWN_LOG: + dest->fmr_owner = XFS_FMR_OWN_LOG; + break; + case XFS_RMAP_OWN_AG: + dest->fmr_owner = XFS_FMR_OWN_AG; + break; + case XFS_RMAP_OWN_INOBT: + dest->fmr_owner = XFS_FMR_OWN_INOBT; + break; + case XFS_RMAP_OWN_INODES: + dest->fmr_owner = XFS_FMR_OWN_INODES; + break; + case XFS_RMAP_OWN_REFC: + dest->fmr_owner = XFS_FMR_OWN_REFC; + break; + case XFS_RMAP_OWN_COW: + dest->fmr_owner = XFS_FMR_OWN_COW; + break; + case XFS_RMAP_OWN_NULL: /* "free" */ + dest->fmr_owner = XFS_FMR_OWN_FREE; + break; + default: + return -EFSCORRUPTED; + } + return 0; +} + +/* getfsmap query state */ +struct xfs_getfsmap_info { + struct xfs_fsmap_head *head; + xfs_fsmap_format_t formatter; /* formatting fn */ + void *format_arg; /* format buffer */ + struct xfs_buf *agf_bp; /* AGF, for refcount queries */ + xfs_daddr_t next_daddr; /* next daddr we expect */ + u64 missing_owner; /* owner of holes */ + u32 dev; /* device id */ + xfs_agnumber_t agno; /* AG number, if applicable */ + struct xfs_rmap_irec low; /* low rmap key */ + struct xfs_rmap_irec high; /* high rmap key */ + bool last; /* last extent? */ +}; + +/* Associate a device with a getfsmap handler. */ +struct xfs_getfsmap_dev { + u32 dev; + int (*fn)(struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info); +}; + +/* Compare two getfsmap device handlers. */ +static int +xfs_getfsmap_dev_compare( + const void *p1, + const void *p2) +{ + const struct xfs_getfsmap_dev *d1 = p1; + const struct xfs_getfsmap_dev *d2 = p2; + + return d1->dev - d2->dev; +} + +/* Decide if this mapping is shared. */ +STATIC int +xfs_getfsmap_is_shared( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_rmap_irec *rec, + bool *stat) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error; + + *stat = false; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + /* rt files will have agno set to NULLAGNUMBER */ + if (info->agno == NULLAGNUMBER) + return 0; + + /* Are there any shared blocks here? */ + flen = 0; + cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, + info->agno, NULL); + + error = xfs_refcount_find_shared(cur, rec->rm_startblock, + rec->rm_blockcount, &fbno, &flen, false); + + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (error) + return error; + + *stat = flen > 0; + return 0; +} + +/* + * Format a reverse mapping for getfsmap, having translated rm_startblock + * into the appropriate daddr units. + */ +STATIC int +xfs_getfsmap_helper( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_rmap_irec *rec, + xfs_daddr_t rec_daddr) +{ + struct xfs_fsmap fmr; + struct xfs_mount *mp = tp->t_mountp; + bool shared; + int error; + + if (fatal_signal_pending(current)) + return -EINTR; + + /* + * Filter out records that start before our startpoint, if the + * caller requested that. + */ + if (xfs_rmap_compare(rec, &info->low) < 0) { + rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (info->next_daddr < rec_daddr) + info->next_daddr = rec_daddr; + return XFS_BTREE_QUERY_RANGE_CONTINUE; + } + + /* Are we just counting mappings? */ + if (info->head->fmh_count == 0) { + if (rec_daddr > info->next_daddr) + info->head->fmh_entries++; + + if (info->last) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + info->head->fmh_entries++; + + rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (info->next_daddr < rec_daddr) + info->next_daddr = rec_daddr; + return XFS_BTREE_QUERY_RANGE_CONTINUE; + } + + /* + * If the record starts past the last physical block we saw, + * then we've found a gap. Report the gap as being owned by + * whatever the caller specified is the missing owner. + */ + if (rec_daddr > info->next_daddr) { + if (info->head->fmh_entries >= info->head->fmh_count) + return XFS_BTREE_QUERY_RANGE_ABORT; + + fmr.fmr_device = info->dev; + fmr.fmr_physical = info->next_daddr; + fmr.fmr_owner = info->missing_owner; + fmr.fmr_offset = 0; + fmr.fmr_length = rec_daddr - info->next_daddr; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + error = info->formatter(&fmr, info->format_arg); + if (error) + return error; + info->head->fmh_entries++; + } + + if (info->last) + goto out; + + /* Fill out the extent we found */ + if (info->head->fmh_entries >= info->head->fmh_count) + return XFS_BTREE_QUERY_RANGE_ABORT; + + trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); + + fmr.fmr_device = info->dev; + fmr.fmr_physical = rec_daddr; + error = xfs_fsmap_owner_from_rmap(&fmr, rec); + if (error) + return error; + fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset); + fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (rec->rm_flags & XFS_RMAP_UNWRITTEN) + fmr.fmr_flags |= FMR_OF_PREALLOC; + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) + fmr.fmr_flags |= FMR_OF_ATTR_FORK; + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + fmr.fmr_flags |= FMR_OF_EXTENT_MAP; + if (fmr.fmr_flags == 0) { + error = xfs_getfsmap_is_shared(tp, info, rec, &shared); + if (error) + return error; + if (shared) + fmr.fmr_flags |= FMR_OF_SHARED; + } + error = info->formatter(&fmr, info->format_arg); + if (error) + return error; + info->head->fmh_entries++; + +out: + rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (info->next_daddr < rec_daddr) + info->next_daddr = rec_daddr; + return XFS_BTREE_QUERY_RANGE_CONTINUE; +} + +/* Transform a rmapbt irec into a fsmap */ +STATIC int +xfs_getfsmap_datadev_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_getfsmap_info *info = priv; + xfs_fsblock_t fsb; + xfs_daddr_t rec_daddr; + + fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock); + rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); + + return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); +} + +/* Transform a rtbitmap "record" into a fsmap */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_getfsmap_info *info = priv; + struct xfs_rmap_irec irec; + xfs_daddr_t rec_daddr; + + rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock); + + irec.rm_startblock = rec->ar_startblock; + irec.rm_blockcount = rec->ar_blockcount; + irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ + irec.rm_offset = 0; + irec.rm_flags = 0; + + return xfs_getfsmap_helper(tp, info, &irec, rec_daddr); +} + +/* Transform a bnobt irec into a fsmap */ +STATIC int +xfs_getfsmap_datadev_bnobt_helper( + struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *rec, + void *priv) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_getfsmap_info *info = priv; + struct xfs_rmap_irec irec; + xfs_daddr_t rec_daddr; + + rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno, + rec->ar_startblock); + + irec.rm_startblock = rec->ar_startblock; + irec.rm_blockcount = rec->ar_blockcount; + irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ + irec.rm_offset = 0; + irec.rm_flags = 0; + + return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr); +} + +/* Set rmap flags based on the getfsmap flags */ +static void +xfs_getfsmap_set_irec_flags( + struct xfs_rmap_irec *irec, + struct xfs_fsmap *fmr) +{ + irec->rm_flags = 0; + if (fmr->fmr_flags & FMR_OF_ATTR_FORK) + irec->rm_flags |= XFS_RMAP_ATTR_FORK; + if (fmr->fmr_flags & FMR_OF_EXTENT_MAP) + irec->rm_flags |= XFS_RMAP_BMBT_BLOCK; + if (fmr->fmr_flags & FMR_OF_PREALLOC) + irec->rm_flags |= XFS_RMAP_UNWRITTEN; +} + +/* Execute a getfsmap query against the log device. */ +STATIC int +xfs_getfsmap_logdev( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rmap_irec rmap; + int error; + + /* Set up search keys */ + info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, keys); + if (error) + return error; + info->low.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + + error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1); + if (error) + return error; + info->high.rm_startblock = -1U; + info->high.rm_owner = ULLONG_MAX; + info->high.rm_offset = ULLONG_MAX; + info->high.rm_blockcount = 0; + info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; + info->missing_owner = XFS_FMR_OWN_FREE; + + trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high); + + if (keys[0].fmr_physical > 0) + return 0; + + /* Fabricate an rmap entry for the external log device. */ + rmap.rm_startblock = 0; + rmap.rm_blockcount = mp->m_sb.sb_logblocks; + rmap.rm_owner = XFS_RMAP_OWN_LOG; + rmap.rm_offset = 0; + rmap.rm_flags = 0; + + return xfs_getfsmap_helper(tp, info, &rmap, 0); +} + +/* Execute a getfsmap query against the realtime device. */ +STATIC int +__xfs_getfsmap_rtdev( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + int (*query_fn)(struct xfs_trans *, + struct xfs_getfsmap_info *), + struct xfs_getfsmap_info *info) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_fsblock_t start_fsb; + xfs_fsblock_t end_fsb; + xfs_daddr_t eofs; + int error = 0; + + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); + end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical); + + /* Set up search keys */ + info->low.rm_startblock = start_fsb; + error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); + if (error) + return error; + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); + info->low.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + + info->high.rm_startblock = end_fsb; + error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); + if (error) + return error; + info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); + info->high.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); + + trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high); + + return query_fn(tp, info); +} + +/* Actually query the realtime bitmap. */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info) +{ + struct xfs_rtalloc_rec alow; + struct xfs_rtalloc_rec ahigh; + int error; + + xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + + alow.ar_startblock = info->low.rm_startblock; + ahigh.ar_startblock = info->high.rm_startblock; + error = xfs_rtalloc_query_range(tp, &alow, &ahigh, + xfs_getfsmap_rtdev_rtbitmap_helper, info); + if (error) + goto err; + + /* Report any gaps at the end of the rtbitmap */ + info->last = true; + error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info); + if (error) + goto err; +err: + xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + return error; +} + +/* Execute a getfsmap query against the realtime device rtbitmap. */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + info->missing_owner = XFS_FMR_OWN_UNKNOWN; + return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query, + info); +} + +/* Execute a getfsmap query against the regular data device. */ +STATIC int +__xfs_getfsmap_datadev( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info, + int (*query_fn)(struct xfs_trans *, + struct xfs_getfsmap_info *, + struct xfs_btree_cur **, + void *), + void *priv) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *bt_cur = NULL; + xfs_fsblock_t start_fsb; + xfs_fsblock_t end_fsb; + xfs_agnumber_t start_ag; + xfs_agnumber_t end_ag; + xfs_daddr_t eofs; + int error = 0; + + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical); + end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical); + + /* + * Convert the fsmap low/high keys to AG based keys. Initialize + * low to the fsmap low key and max out the high key to the end + * of the AG. + */ + info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); + if (error) + return error; + info->low.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + + info->high.rm_startblock = -1U; + info->high.rm_owner = ULLONG_MAX; + info->high.rm_offset = ULLONG_MAX; + info->high.rm_blockcount = 0; + info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; + + start_ag = XFS_FSB_TO_AGNO(mp, start_fsb); + end_ag = XFS_FSB_TO_AGNO(mp, end_fsb); + + /* Query each AG */ + for (info->agno = start_ag; info->agno <= end_ag; info->agno++) { + /* + * Set the AG high key from the fsmap high key if this + * is the last AG that we're querying. + */ + if (info->agno == end_ag) { + info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp, + end_fsb); + info->high.rm_offset = XFS_BB_TO_FSBT(mp, + keys[1].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); + if (error) + goto err; + xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); + } + + if (bt_cur) { + xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR); + bt_cur = NULL; + xfs_trans_brelse(tp, info->agf_bp); + info->agf_bp = NULL; + } + + error = xfs_alloc_read_agf(mp, tp, info->agno, 0, + &info->agf_bp); + if (error) + goto err; + + trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, info->agno, + &info->high); + + error = query_fn(tp, info, &bt_cur, priv); + if (error) + goto err; + + /* + * Set the AG low key to the start of the AG prior to + * moving on to the next AG. + */ + if (info->agno == start_ag) { + info->low.rm_startblock = 0; + info->low.rm_owner = 0; + info->low.rm_offset = 0; + info->low.rm_flags = 0; + } + } + + /* Report any gap at the end of the AG */ + info->last = true; + error = query_fn(tp, info, &bt_cur, priv); + if (error) + goto err; + +err: + if (bt_cur) + xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR); + if (info->agf_bp) { + xfs_trans_brelse(tp, info->agf_bp); + info->agf_bp = NULL; + } + + return error; +} + +/* Actually query the rmap btree. */ +STATIC int +xfs_getfsmap_datadev_rmapbt_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_btree_cur **curpp, + void *priv) +{ + /* Report any gap at the end of the last AG. */ + if (info->last) + return xfs_getfsmap_datadev_helper(*curpp, &info->high, info); + + /* Allocate cursor for this AG and query_range it. */ + *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp, + info->agno); + return xfs_rmap_query_range(*curpp, &info->low, &info->high, + xfs_getfsmap_datadev_helper, info); +} + +/* Execute a getfsmap query against the regular data device rmapbt. */ +STATIC int +xfs_getfsmap_datadev_rmapbt( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + info->missing_owner = XFS_FMR_OWN_FREE; + return __xfs_getfsmap_datadev(tp, keys, info, + xfs_getfsmap_datadev_rmapbt_query, NULL); +} + +/* Actually query the bno btree. */ +STATIC int +xfs_getfsmap_datadev_bnobt_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_btree_cur **curpp, + void *priv) +{ + struct xfs_alloc_rec_incore *key = priv; + + /* Report any gap at the end of the last AG. */ + if (info->last) + return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info); + + /* Allocate cursor for this AG and query_range it. */ + *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp, + info->agno, XFS_BTNUM_BNO); + key->ar_startblock = info->low.rm_startblock; + key[1].ar_startblock = info->high.rm_startblock; + return xfs_alloc_query_range(*curpp, key, &key[1], + xfs_getfsmap_datadev_bnobt_helper, info); +} + +/* Execute a getfsmap query against the regular data device's bnobt. */ +STATIC int +xfs_getfsmap_datadev_bnobt( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + struct xfs_alloc_rec_incore akeys[2]; + + info->missing_owner = XFS_FMR_OWN_UNKNOWN; + return __xfs_getfsmap_datadev(tp, keys, info, + xfs_getfsmap_datadev_bnobt_query, &akeys[0]); +} + +/* Do we recognize the device? */ +STATIC bool +xfs_getfsmap_is_valid_device( + struct xfs_mount *mp, + struct xfs_fsmap *fm) +{ + if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || + fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev)) + return true; + if (mp->m_logdev_targp && + fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev)) + return true; + if (mp->m_rtdev_targp && + fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev)) + return true; + return false; +} + +/* Ensure that the low key is less than the high key. */ +STATIC bool +xfs_getfsmap_check_keys( + struct xfs_fsmap *low_key, + struct xfs_fsmap *high_key) +{ + if (low_key->fmr_device > high_key->fmr_device) + return false; + if (low_key->fmr_device < high_key->fmr_device) + return true; + + if (low_key->fmr_physical > high_key->fmr_physical) + return false; + if (low_key->fmr_physical < high_key->fmr_physical) + return true; + + if (low_key->fmr_owner > high_key->fmr_owner) + return false; + if (low_key->fmr_owner < high_key->fmr_owner) + return true; + + if (low_key->fmr_offset > high_key->fmr_offset) + return false; + if (low_key->fmr_offset < high_key->fmr_offset) + return true; + + return false; +} + +#define XFS_GETFSMAP_DEVS 3 +/* + * Get filesystem's extents as described in head, and format for + * output. Calls formatter to fill the user's buffer until all + * extents are mapped, until the passed-in head->fmh_count slots have + * been filled, or until the formatter short-circuits the loop, if it + * is tracking filled-in extents on its own. + * + * Key to Confusion + * ---------------- + * There are multiple levels of keys and counters at work here: + * xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in; + * these reflect fs-wide sector addrs. + * dkeys -- fmh_keys used to query each device; + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. + * xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this + * is how we detect gaps in the fsmap + records and report them. + * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from + * dkeys; used to query the metadata. + */ +int +xfs_getfsmap( + struct xfs_mount *mp, + struct xfs_fsmap_head *head, + xfs_fsmap_format_t formatter, + void *arg) +{ + struct xfs_trans *tp = NULL; + struct xfs_fsmap dkeys[2]; /* per-dev keys */ + struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; + struct xfs_getfsmap_info info = { NULL }; + int i; + int error = 0; + + if (head->fmh_iflags & ~FMH_IF_VALID) + return -EINVAL; + if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) || + !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) + return -EINVAL; + + head->fmh_entries = 0; + + /* Set up our device handlers. */ + memset(handlers, 0, sizeof(handlers)); + handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + handlers[0].fn = xfs_getfsmap_datadev_rmapbt; + else + handlers[0].fn = xfs_getfsmap_datadev_bnobt; + if (mp->m_logdev_targp != mp->m_ddev_targp) { + handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); + handlers[1].fn = xfs_getfsmap_logdev; + } + if (mp->m_rtdev_targp) { + handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); + handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; + } + + xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev), + xfs_getfsmap_dev_compare); + + /* + * To continue where we left off, we allow userspace to use the + * last mapping from a previous call as the low key of the next. + * This is identified by a non-zero length in the low key. We + * have to increment the low key in this scenario to ensure we + * don't return the same mapping again, and instead return the + * very next mapping. + * + * If the low key mapping refers to file data, the same physical + * blocks could be mapped to several other files/offsets. + * According to rmapbt record ordering, the minimal next + * possible record for the block range is the next starting + * offset in the same inode. Therefore, bump the file offset to + * continue the search appropriately. For all other low key + * mapping types (attr blocks, metadata), bump the physical + * offset as there can be no other mapping for the same physical + * block range. + */ + dkeys[0] = head->fmh_keys[0]; + if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) { + dkeys[0].fmr_physical += dkeys[0].fmr_length; + dkeys[0].fmr_owner = 0; + if (dkeys[0].fmr_offset) + return -EINVAL; + } else + dkeys[0].fmr_offset += dkeys[0].fmr_length; + dkeys[0].fmr_length = 0; + memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap)); + + if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) + return -EINVAL; + + info.next_daddr = head->fmh_keys[0].fmr_physical + + head->fmh_keys[0].fmr_length; + info.formatter = formatter; + info.format_arg = arg; + info.head = head; + + /* For each device we support... */ + for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { + /* Is this device within the range the user asked for? */ + if (!handlers[i].fn) + continue; + if (head->fmh_keys[0].fmr_device > handlers[i].dev) + continue; + if (head->fmh_keys[1].fmr_device < handlers[i].dev) + break; + + /* + * If this device number matches the high key, we have + * to pass the high key to the handler to limit the + * query results. If the device number exceeds the + * low key, zero out the low key so that we get + * everything from the beginning. + */ + if (handlers[i].dev == head->fmh_keys[1].fmr_device) + dkeys[1] = head->fmh_keys[1]; + if (handlers[i].dev > head->fmh_keys[0].fmr_device) + memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + break; + + info.dev = handlers[i].dev; + info.last = false; + info.agno = NULLAGNUMBER; + error = handlers[i].fn(tp, dkeys, &info); + if (error) + break; + xfs_trans_cancel(tp); + tp = NULL; + info.next_daddr = 0; + } + + if (tp) + xfs_trans_cancel(tp); + head->fmh_oflags = FMH_OF_DEV_T; + return error; +} diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h new file mode 100644 index 000000000000..0b9bf822595c --- /dev/null +++ b/fs/xfs/xfs_fsmap.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_FSMAP_H__ +#define __XFS_FSMAP_H__ + +struct fsmap; + +/* internal fsmap representation */ +struct xfs_fsmap { + dev_t fmr_device; /* device id */ + uint32_t fmr_flags; /* mapping flags */ + uint64_t fmr_physical; /* device offset of segment */ + uint64_t fmr_owner; /* owner id */ + xfs_fileoff_t fmr_offset; /* file offset of segment */ + xfs_filblks_t fmr_length; /* length of segment, blocks */ +}; + +struct xfs_fsmap_head { + uint32_t fmh_iflags; /* control flags */ + uint32_t fmh_oflags; /* output flags */ + unsigned int fmh_count; /* # of entries in array incl. input */ + unsigned int fmh_entries; /* # of entries filled in (output). */ + + struct xfs_fsmap fmh_keys[2]; /* low and high keys */ +}; + +void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src); +void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src); + +/* fsmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *); + +int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head, + xfs_fsmap_format_t formatter, void *arg); + +#endif /* __XFS_FSMAP_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3531f8f72fa5..f61c84f8e31a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -262,6 +262,22 @@ xfs_inode_clear_reclaim_tag( xfs_perag_clear_reclaim_tag(pag); } +static void +xfs_inew_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (!xfs_iflags_test(ip, XFS_INEW)) + break; + schedule(); + } while (true); + finish_wait(wq, &wait.wait); +} + /* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store @@ -366,14 +382,17 @@ xfs_iget_cache_hit( error = xfs_reinit_inode(mp, inode); if (error) { + bool wake; /* * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ rcu_read_lock(); spin_lock(&ip->i_flags_lock); - + wake = !!__xfs_iflags_test(ip, XFS_INEW); ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + if (wake) + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); ASSERT(ip->i_flags & XFS_IRECLAIMABLE); trace_xfs_iget_reclaim_fail(ip); goto out_error; @@ -623,9 +642,11 @@ out_error_or_again: STATIC int xfs_inode_ag_walk_grab( - struct xfs_inode *ip) + struct xfs_inode *ip, + int flags) { struct inode *inode = VFS_I(ip); + bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); ASSERT(rcu_read_lock_held()); @@ -643,7 +664,8 @@ xfs_inode_ag_walk_grab( goto out_unlock_noent; /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || + __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) goto out_unlock_noent; spin_unlock(&ip->i_flags_lock); @@ -671,7 +693,8 @@ xfs_inode_ag_walk( void *args), int flags, void *args, - int tag) + int tag, + int iter_flags) { uint32_t first_index; int last_error = 0; @@ -713,7 +736,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_inode_ag_walk_grab(ip)) + if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) batch[i] = NULL; /* @@ -741,6 +764,9 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; + if ((iter_flags & XFS_AGITER_INEW_WAIT) && + xfs_iflags_test(batch[i], XFS_INEW)) + xfs_inew_wait(batch[i]); error = execute(batch[i], flags, args); IRELE(batch[i]); if (error == -EAGAIN) { @@ -820,12 +846,13 @@ xfs_cowblocks_worker( } int -xfs_inode_ag_iterator( +xfs_inode_ag_iterator_flags( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, - void *args) + void *args, + int iter_flags) { struct xfs_perag *pag; int error = 0; @@ -835,7 +862,8 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, + iter_flags); xfs_perag_put(pag); if (error) { last_error = error; @@ -847,6 +875,17 @@ xfs_inode_ag_iterator( } int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args) +{ + return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); +} + +int xfs_inode_ag_iterator_tag( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, @@ -863,7 +902,8 @@ xfs_inode_ag_iterator_tag( ag = 0; while ((pag = xfs_perag_get_tag(mp, ag, tag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, + 0); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 8a7c849b4dea..9183f77958ef 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -48,6 +48,11 @@ struct xfs_eofblocks { #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 +/* + * flags for AG inode iterator + */ +#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */ + int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -79,6 +84,9 @@ void xfs_cowblocks_worker(struct work_struct *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); +int xfs_inode_ag_iterator_flags(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args, int iter_flags); int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, int tag); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7605d8396596..ec9826c56500 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1906,12 +1906,13 @@ xfs_inactive( * force is true because we are evicting an inode from the * cache. Post-eof blocks must be freed, lest we end up with * broken free space accounting. + * + * Note: don't bother with iolock here since lockdep complains + * about acquiring it in reclaim context. We have the only + * reference to the inode at this point anyways. */ - if (xfs_can_free_eofblocks(ip, true)) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (xfs_can_free_eofblocks(ip, true)) xfs_free_eofblocks(ip); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - } return; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 10dcf27b4c85..10e89fcb49d7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -216,7 +216,8 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ -#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define __XFS_INEW_BIT 3 /* inode has just been allocated */ +#define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ @@ -464,6 +465,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(VFS_I(ip)); + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); } static inline void xfs_setup_existing_inode(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d90e7811ccdd..08cb7d1a4a3a 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -731,22 +731,27 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. */ if (need_ail) { - struct xfs_log_item *log_items[need_ail]; - int i = 0; + bool mlip_changed = false; + + /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->xa_lock); for (blip = lip; blip; blip = blip->li_bio_list) { - iip = INODE_ITEM(blip); - if (iip->ili_logged && - blip->li_lsn == iip->ili_flush_lsn) { - log_items[i++] = blip; - } - ASSERT(i <= need_ail); + if (INODE_ITEM(blip)->ili_logged && + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) + mlip_changed |= xfs_ail_delete_one(ailp, blip); } - /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ - xfs_trans_ail_delete_bulk(ailp, log_items, i, - SHUTDOWN_CORRUPT_INCORE); - } + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + } + spin_unlock(&ailp->xa_lock); + + if (mlip_changed) + xfs_log_space_wake(ailp->xa_mount); + } /* * clean up and unlock the flush lock now we are done. We can clear the diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2fd7fdf5438f..6190697603c9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -41,6 +41,9 @@ #include "xfs_trans.h" #include "xfs_pnfs.h" #include "xfs_acl.h" +#include "xfs_btree.h" +#include <linux/fsmap.h> +#include "xfs_fsmap.h" #include <linux/capability.h> #include <linux/cred.h> @@ -1543,10 +1546,11 @@ xfs_ioc_getbmap( unsigned int cmd, void __user *arg) { - struct getbmapx bmx; + struct getbmapx bmx = { 0 }; int error; - if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + /* struct getbmap is a strict subset of struct getbmapx. */ + if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags))) return -EFAULT; if (bmx.bmv_count < 2) @@ -1608,6 +1612,84 @@ xfs_ioc_getbmapx( return 0; } +struct getfsmap_info { + struct xfs_mount *mp; + struct fsmap_head __user *data; + unsigned int idx; + __u32 last_flags; +}; + +STATIC int +xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv) +{ + struct getfsmap_info *info = priv; + struct fsmap fm; + + trace_xfs_getfsmap_mapping(info->mp, xfm); + + info->last_flags = xfm->fmr_flags; + xfs_fsmap_from_internal(&fm, xfm); + if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm, + sizeof(struct fsmap))) + return -EFAULT; + + return 0; +} + +STATIC int +xfs_ioc_getfsmap( + struct xfs_inode *ip, + struct fsmap_head __user *arg) +{ + struct getfsmap_info info = { NULL }; + struct xfs_fsmap_head xhead = {0}; + struct fsmap_head head; + bool aborted = false; + int error; + + if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) + return -EFAULT; + if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || + memchr_inv(head.fmh_keys[0].fmr_reserved, 0, + sizeof(head.fmh_keys[0].fmr_reserved)) || + memchr_inv(head.fmh_keys[1].fmr_reserved, 0, + sizeof(head.fmh_keys[1].fmr_reserved))) + return -EINVAL; + + xhead.fmh_iflags = head.fmh_iflags; + xhead.fmh_count = head.fmh_count; + xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]); + xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]); + + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); + trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]); + + info.mp = ip->i_mount; + info.data = arg; + error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + error = 0; + aborted = true; + } else if (error) + return error; + + /* If we didn't abort, set the "last" flag in the last fmx */ + if (!aborted && info.idx) { + info.last_flags |= FMR_OF_LAST; + if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags, + &info.last_flags, sizeof(info.last_flags))) + return -EFAULT; + } + + /* copy back header */ + head.fmh_entries = xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) + return -EFAULT; + + return 0; +} + int xfs_ioc_swapext( xfs_swapext_t *sxp) @@ -1788,6 +1870,9 @@ xfs_file_ioctl( case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(ip, arg); + case FS_IOC_GETFSMAP: + return xfs_ioc_getfsmap(ip, arg); + case XFS_IOC_FD_TO_HANDLE: case XFS_IOC_PATH_TO_HANDLE: case XFS_IOC_PATH_TO_FSHANDLE: { diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 7c49938c5aed..fa0bc4d46065 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -20,6 +20,7 @@ #include <linux/mount.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/fsmap.h> #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" @@ -554,6 +555,7 @@ xfs_file_compat_ioctl( case XFS_IOC_GOINGDOWN: case XFS_IOC_ERROR_INJECTION: case XFS_IOC_ERROR_CLEARALL: + case FS_IOC_GETFSMAP: return xfs_file_ioctl(filp, cmd, p); #ifndef BROKEN_X86_ALIGNMENT /* These are handled fine if no alignment issues */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 288ee5b840d7..a63f61c256bd 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -240,7 +240,7 @@ xfs_iomap_write_direct( */ if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; - if (ISUNWRITTEN(imap)) { + if (imap->br_state == XFS_EXT_UNWRITTEN) { tflags |= XFS_TRANS_RESERVE; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } @@ -945,7 +945,7 @@ static inline bool imap_needs_alloc(struct inode *inode, return !nimaps || imap->br_startblock == HOLESTARTBLOCK || imap->br_startblock == DELAYSTARTBLOCK || - (IS_DAX(inode) && ISUNWRITTEN(imap)); + (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); } static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) @@ -976,6 +976,7 @@ xfs_file_iomap_begin( int nimaps = 1, error = 0; bool shared = false, trimmed = false; unsigned lockmode; + struct block_device *bdev; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1063,6 +1064,14 @@ xfs_file_iomap_begin( } xfs_bmbt_to_iomap(ip, iomap, &imap); + + /* optionally associate a dax device with the iomap bdev */ + bdev = iomap->bdev; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; + if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; @@ -1140,6 +1149,7 @@ xfs_file_iomap_end( unsigned flags, struct iomap *iomap) { + put_dax(iomap->dax_dev); if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length, written, iomap); @@ -1170,10 +1180,10 @@ xfs_xattr_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - lockmode = xfs_ilock_data_map_shared(ip); + lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { error = -ENOENT; goto out_unlock; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 592fdf7111cb..044fb0e15390 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -212,88 +212,6 @@ static inline kgid_t xfs_gid_to_kgid(__uint32_t gid) #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() - -/* Move the kernel do_div definition off to one side */ - -#if defined __i386__ -/* For ia32 we need to pull some tricks to get past various versions - * of the compiler which do not like us using do_div in the middle - * of large functions. - */ -static inline __u32 xfs_do_div(void *a, __u32 b, int n) -{ - __u32 mod; - - switch (n) { - case 4: - mod = *(__u32 *)a % b; - *(__u32 *)a = *(__u32 *)a / b; - return mod; - case 8: - { - unsigned long __upper, __low, __high, __mod; - __u64 c = *(__u64 *)a; - __upper = __high = c >> 32; - __low = c; - if (__high) { - __upper = __high % (b); - __high = __high / (b); - } - asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); - asm("":"=A" (c):"a" (__low),"d" (__high)); - *(__u64 *)a = c; - return __mod; - } - } - - /* NOTREACHED */ - return 0; -} - -/* Side effect free 64 bit mod operation */ -static inline __u32 xfs_do_mod(void *a, __u32 b, int n) -{ - switch (n) { - case 4: - return *(__u32 *)a % b; - case 8: - { - unsigned long __upper, __low, __high, __mod; - __u64 c = *(__u64 *)a; - __upper = __high = c >> 32; - __low = c; - if (__high) { - __upper = __high % (b); - __high = __high / (b); - } - asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); - asm("":"=A" (c):"a" (__low),"d" (__high)); - return __mod; - } - } - - /* NOTREACHED */ - return 0; -} -#else -static inline __u32 xfs_do_div(void *a, __u32 b, int n) -{ - __u32 mod; - - switch (n) { - case 4: - mod = *(__u32 *)a % b; - *(__u32 *)a = *(__u32 *)a / b; - return mod; - case 8: - mod = do_div(*(__u64 *)a, b); - return mod; - } - - /* NOTREACHED */ - return 0; -} - /* Side effect free 64 bit mod operation */ static inline __u32 xfs_do_mod(void *a, __u32 b, int n) { @@ -310,10 +228,7 @@ static inline __u32 xfs_do_mod(void *a, __u32 b, int n) /* NOTREACHED */ return 0; } -#endif -#undef do_div -#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) #define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b1469f0a91a6..3731f13f63e9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1293,7 +1293,7 @@ void xfs_log_work_queue( struct xfs_mount *mp) { - queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work, msecs_to_jiffies(xfs_syncd_centisecs * 10)); } @@ -1852,7 +1852,7 @@ xlog_sync( */ if (log->l_badcrc_factor && (prandom_u32() % log->l_badcrc_factor == 0)) { - iclog->ic_header.h_crc &= 0xAAAAAAAA; + iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_state |= XLOG_STATE_IOABORT; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4a98762ec8b4..cd0b077deb35 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3796,7 +3796,7 @@ xlog_recover_bud_pass2( * This routine is called when an inode create format structure is found in a * committed transaction in the log. It's purpose is to initialise the inodes * being allocated on disk. This requires us to get inode cluster buffers that - * match the range to be intialised, stamped with inode templates and written + * match the range to be initialised, stamped with inode templates and written * by delayed write so that subsequent modifications will hit the cached buffer * and only need writing out at the end of recovery. */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 688ebff1f663..2eaf81859166 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -73,6 +73,10 @@ xfs_uuid_mount( uuid_t *uuid = &mp->m_sb.sb_uuid; int hole, i; + /* Publish UUID in struct super_block */ + BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t)); + memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t)); + if (mp->m_flags & XFS_MOUNT_NOUUID) return 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 6db6fd6b82b0..9fa312a41c93 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -183,6 +183,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; struct workqueue_struct *m_eofblocks_workqueue; + struct workqueue_struct *m_sync_workqueue; /* * Generation of the filesysyem layout. This is incremented by each @@ -312,7 +313,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, static inline xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) { - xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d); do_div(ld, mp->m_sb.sb_agblocks); return (xfs_agnumber_t) ld; } @@ -320,7 +321,7 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) static inline xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) { - xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d); return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index b669b123287b..5fe6e70b88ef 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -851,8 +851,8 @@ xfs_qm_reset_dqcounts( * started afresh by xfs_qm_quotacheck. */ #ifdef DEBUG - j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - do_div(j, sizeof(xfs_dqblk_t)); + j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) / + sizeof(xfs_dqblk_t); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif dqb = bp->b_addr; @@ -1384,12 +1384,7 @@ xfs_qm_quotacheck( mp->m_qflags |= flags; error_return: - while (!list_empty(&buffer_list)) { - struct xfs_buf *bp = - list_first_entry(&buffer_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); - xfs_buf_relse(bp); - } + xfs_buf_delwri_cancel(&buffer_list); if (error) { xfs_warn(mp, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 475a3882a81f..9cb5c381b01c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); + xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL, + XFS_AGITER_INEW_WAIT); } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 6e4c7446c3d4..96fe209b5eb6 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -221,6 +221,7 @@ void xfs_cui_release( struct xfs_cui_log_item *cuip) { + ASSERT(atomic_read(&cuip->cui_refcount) > 0); if (atomic_dec_and_test(&cuip->cui_refcount)) { xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); xfs_cui_item_free(cuip); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 4a84c5ea266d..ffe6fe7a7eb5 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -206,11 +206,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_reflink_inode(ip) || - ISUNWRITTEN(irec) || - irec->br_startblock == HOLESTARTBLOCK || - irec->br_startblock == DELAYSTARTBLOCK || - isnullstartblock(irec->br_startblock)) { + if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { *shared = false; return 0; } @@ -709,8 +705,22 @@ xfs_reflink_end_cow( offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); - /* Start a rolling transaction to switch the mappings */ - resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + /* + * Start a rolling transaction to switch the mappings. We're + * unlikely ever to have to remap 16T worth of single-block + * extents, so just cap the worst case extent count to 2^32-1. + * Stick a warning in just in case, and avoid 64-bit division. + */ + BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX); + if (end_fsb - offset_fsb > UINT_MAX) { + error = -EFSCORRUPTED; + xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE); + ASSERT(0); + goto out; + } + resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount, + (unsigned int)(end_fsb - offset_fsb), + XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, resblks, 0, 0, &tp); if (error) @@ -1045,12 +1055,12 @@ xfs_reflink_remap_extent( xfs_off_t new_isize) { struct xfs_mount *mp = ip->i_mount; + bool real_extent = xfs_bmap_is_real_extent(irec); struct xfs_trans *tp; xfs_fsblock_t firstfsb; unsigned int resblks; struct xfs_defer_ops dfops; struct xfs_bmbt_irec uirec; - bool real_extent; xfs_filblks_t rlen; xfs_filblks_t unmap_len; xfs_off_t newlen; @@ -1059,11 +1069,6 @@ xfs_reflink_remap_extent( unmap_len = irec->br_startoff + irec->br_blockcount - destoff; trace_xfs_reflink_punch_range(ip, destoff, unmap_len); - /* Only remap normal extents. */ - real_extent = (irec->br_startblock != HOLESTARTBLOCK && - irec->br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(irec)); - /* No reflinking if we're low on space */ if (real_extent) { error = xfs_reflink_ag_has_free_space(mp, @@ -1359,9 +1364,7 @@ xfs_reflink_dirty_extents( goto out; if (nmaps == 0) break; - if (map[0].br_startblock == HOLESTARTBLOCK || - map[0].br_startblock == DELAYSTARTBLOCK || - ISUNWRITTEN(&map[0])) + if (!xfs_bmap_is_real_extent(&map[0])) goto next; map[1] = map[0]; @@ -1435,9 +1438,7 @@ xfs_reflink_clear_inode_flag( return error; if (nmaps == 0) break; - if (map.br_startblock == HOLESTARTBLOCK || - map.br_startblock == DELAYSTARTBLOCK || - ISUNWRITTEN(&map)) + if (!xfs_bmap_is_real_extent(&map)) goto next; agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 73c827831551..f3b139c9aa16 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -243,6 +243,7 @@ void xfs_rui_release( struct xfs_rui_log_item *ruip) { + ASSERT(atomic_read(&ruip->rui_refcount) > 0); if (atomic_dec_and_test(&ruip->rui_refcount)) { xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); xfs_rui_item_free(ruip); diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 51dd3c726608..f13133e6f19f 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -23,6 +23,16 @@ struct xfs_mount; struct xfs_trans; +struct xfs_rtalloc_rec { + xfs_rtblock_t ar_startblock; + xfs_rtblock_t ar_blockcount; +}; + +typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *rec, + void *priv); + #ifdef CONFIG_XFS_RT /* * Function prototypes for exported functions. @@ -118,13 +128,21 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); - - +int xfs_rtalloc_query_range(struct xfs_trans *tp, + struct xfs_rtalloc_rec *low_rec, + struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, + void *priv); +int xfs_rtalloc_query_all(struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, + void *priv); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) +# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) +# define xfs_rtalloc_query_all(t,f,p) (ENOSYS) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 685c042a120f..47d239dcf3f4 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -877,8 +877,15 @@ xfs_init_mount_workqueues( if (!mp->m_eofblocks_workqueue) goto out_destroy_log; + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, + mp->m_fsname); + if (!mp->m_sync_workqueue) + goto out_destroy_eofb; + return 0; +out_destroy_eofb: + destroy_workqueue(mp->m_eofblocks_workqueue); out_destroy_log: destroy_workqueue(mp->m_log_workqueue); out_destroy_reclaim: @@ -899,6 +906,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_sync_workqueue); destroy_workqueue(mp->m_eofblocks_workqueue); destroy_workqueue(mp->m_log_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 7f17ae6d709a..5d95fe348294 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -47,6 +47,7 @@ #include "xfs_inode_item.h" #include "xfs_bmap_btree.h" #include "xfs_filestream.h" +#include "xfs_fsmap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 383ac227ce2c..7c5a16528d8b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -40,6 +40,8 @@ struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; struct xfs_refcount_irec; +struct xfs_fsmap; +struct xfs_rmap_irec; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -2190,7 +2192,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno %u agbno %u len %u\n", + TP_printk("dev %d:%d agno %u agbno %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2253,8 +2255,8 @@ DECLARE_EVENT_CLASS(xfs_defer_class, TP_STRUCT__entry( __field(dev_t, dev) __field(void *, dop) - __field(bool, committed) - __field(bool, low) + __field(char, committed) + __field(char, low) ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; @@ -2262,7 +2264,7 @@ DECLARE_EVENT_CLASS(xfs_defer_class, __entry->committed = dop->dop_committed; __entry->low = dop->dop_low; ), - TP_printk("dev %d:%d ops %p committed %d low %d\n", + TP_printk("dev %d:%d ops %p committed %d low %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dop, __entry->committed, @@ -2279,8 +2281,8 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class, TP_STRUCT__entry( __field(dev_t, dev) __field(void *, dop) - __field(bool, committed) - __field(bool, low) + __field(char, committed) + __field(char, low) __field(int, error) ), TP_fast_assign( @@ -2290,7 +2292,7 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class, __entry->low = dop->dop_low; __entry->error = error; ), - TP_printk("dev %d:%d ops %p committed %d low %d err %d\n", + TP_printk("dev %d:%d ops %p committed %d low %d err %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dop, __entry->committed, @@ -2309,7 +2311,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __field(dev_t, dev) __field(int, type) __field(void *, intent) - __field(bool, committed) + __field(char, committed) __field(int, nr) ), TP_fast_assign( @@ -2319,7 +2321,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n", + TP_printk("dev %d:%d optype %d intent %p committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->intent, @@ -2614,7 +2616,8 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class, __entry->asked = r ? r->ar_asked : 0; __entry->len = len; ), - TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n", + TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u " + "resv %u ask %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->resv, @@ -2667,7 +2670,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, __entry->agbno = agbno; __entry->dir = dir; ), - TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n", + TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2700,7 +2703,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n", + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -2735,7 +2738,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, __entry->refcount = irec->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n", + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -2776,7 +2779,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, __entry->i2_refcount = i2->rc_refcount; ), TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u\n", + "agbno %u len %u refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -2822,7 +2825,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->agbno = agbno; ), TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u @ agbno %u\n", + "agbno %u len %u refcount %u @ agbno %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -2875,7 +2878,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, ), TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " "agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u\n", + "agbno %u len %u refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -3001,31 +3004,6 @@ DEFINE_EVENT(xfs_inode_error_class, name, \ unsigned long caller_ip), \ TP_ARGS(ip, error, caller_ip)) -/* reflink allocator */ -TRACE_EVENT(xfs_bmap_remap_alloc, - TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno, - xfs_extlen_t len), - TP_ARGS(ip, fsbno, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fsblock_t, fsbno) - __field(xfs_extlen_t, len) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->fsbno = fsbno; - __entry->len = len; - ), - TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->fsbno, - __entry->len) -); -DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error); - /* reflink tracepoint classes */ /* two-file io tracepoint class */ @@ -3227,7 +3205,7 @@ TRACE_EVENT(xfs_ioctl_clone, ), TP_printk("dev %d:%d " "ino 0x%lx isize 0x%llx -> " - "ino 0x%lx isize 0x%llx\n", + "ino 0x%lx isize 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->src_ino, __entry->src_isize, @@ -3267,6 +3245,88 @@ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); +/* fsmap traces */ +DECLARE_EVENT_CLASS(xfs_fsmap_class, + TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, + struct xfs_rmap_irec *rmap), + TP_ARGS(mp, keydev, agno, rmap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_agnumber_t, agno) + __field(xfs_fsblock_t, bno) + __field(xfs_filblks_t, len) + __field(__uint64_t, owner) + __field(__uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(keydev); + __entry->agno = agno; + __entry->bno = rmap->rm_startblock; + __entry->len = rmap->rm_blockcount; + __entry->owner = rmap->rm_owner; + __entry->offset = rmap->rm_offset; + __entry->flags = rmap->rm_flags; + ), + TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->agno, + __entry->bno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +) +#define DEFINE_FSMAP_EVENT(name) \ +DEFINE_EVENT(xfs_fsmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \ + struct xfs_rmap_irec *rmap), \ + TP_ARGS(mp, keydev, agno, rmap)) +DEFINE_FSMAP_EVENT(xfs_fsmap_low_key); +DEFINE_FSMAP_EVENT(xfs_fsmap_high_key); +DEFINE_FSMAP_EVENT(xfs_fsmap_mapping); + +DECLARE_EVENT_CLASS(xfs_getfsmap_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), + TP_ARGS(mp, fsmap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_daddr_t, block) + __field(xfs_daddr_t, len) + __field(__uint64_t, owner) + __field(__uint64_t, offset) + __field(__uint64_t, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(fsmap->fmr_device); + __entry->block = fsmap->fmr_physical; + __entry->len = fsmap->fmr_length; + __entry->owner = fsmap->fmr_owner; + __entry->offset = fsmap->fmr_offset; + __entry->flags = fsmap->fmr_flags; + ), + TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->block, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +) +#define DEFINE_GETFSMAP_EVENT(name) \ +DEFINE_EVENT(xfs_getfsmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), \ + TP_ARGS(mp, fsmap)) +DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key); +DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key); +DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f5969c8274fc..2011620008de 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -263,6 +263,28 @@ xfs_trans_alloc( } /* + * Create an empty transaction with no reservation. This is a defensive + * mechanism for routines that query metadata without actually modifying + * them -- if the metadata being queried is somehow cross-linked (think a + * btree block pointer that points higher in the tree), we risk deadlock. + * However, blocks grabbed as part of a transaction can be re-grabbed. + * The verifiers will notice the corrupt block and the operation will fail + * back to userspace without deadlocking. + * + * Note the zero-length reservation; this transaction MUST be cancelled + * without any dirty data. + */ +int +xfs_trans_alloc_empty( + struct xfs_mount *mp, + struct xfs_trans **tpp) +{ + struct xfs_trans_res resv = {0}; + + return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp); +} + +/* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. * For now, just store the change in the transaction structure. @@ -1012,17 +1034,14 @@ xfs_trans_cancel( * chunk we've been working on and get a new transaction to continue. */ int -__xfs_trans_roll( +xfs_trans_roll( struct xfs_trans **tpp, - struct xfs_inode *dp, - int *committed) + struct xfs_inode *dp) { struct xfs_trans *trans; struct xfs_trans_res tres; int error; - *committed = 0; - /* * Ensure that the inode is always logged. */ @@ -1048,7 +1067,6 @@ __xfs_trans_roll( if (error) return error; - *committed = 1; trans = *tpp; /* @@ -1071,12 +1089,3 @@ __xfs_trans_roll( xfs_trans_ijoin(trans, dp, 0); return 0; } - -int -xfs_trans_roll( - struct xfs_trans **tpp, - struct xfs_inode *dp) -{ - int committed; - return __xfs_trans_roll(tpp, dp, &committed); -} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 1646f659b60f..a07acbf0bd8a 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -158,6 +158,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_alloc_empty(struct xfs_mount *mp, + struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, @@ -226,7 +228,6 @@ int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t, struct xfs_owner_info *); int xfs_trans_commit(struct xfs_trans *); -int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d6c9c3e9e02b..9056c0f34a3c 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk( } } -/* - * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL +bool +xfs_ail_delete_one( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_log_item *mlip = xfs_ail_min(ailp); + + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + + return mlip == lip; +} + +/** + * Remove a log items from the AIL * * @xfs_trans_ail_delete_bulk takes an array of log items that all need to * removed from the AIL. The caller is already holding the AIL lock, and done @@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk( * before returning. */ void -xfs_trans_ail_delete_bulk( +xfs_trans_ail_delete( struct xfs_ail *ailp, - struct xfs_log_item **log_items, - int nr_items, + struct xfs_log_item *lip, int shutdown_type) __releases(ailp->xa_lock) { - xfs_log_item_t *mlip; - int mlip_changed = 0; - int i; + struct xfs_mount *mp = ailp->xa_mount; + bool mlip_changed; - mlip = xfs_ail_min(ailp); - - for (i = 0; i < nr_items; i++) { - struct xfs_log_item *lip = log_items[i]; - if (!(lip->li_flags & XFS_LI_IN_AIL)) { - struct xfs_mount *mp = ailp->xa_mount; - - spin_unlock(&ailp->xa_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, - "%s: attempting to delete a log item that is not in the AIL", - __func__); - xfs_force_shutdown(mp, shutdown_type); - } - return; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, shutdown_type); } - - trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); - xfs_ail_delete(ailp, lip); - lip->li_flags &= ~XFS_LI_IN_AIL; - lip->li_lsn = 0; - if (mlip == lip) - mlip_changed = 1; + return; } + mlip_changed = xfs_ail_delete_one(ailp, lip); if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); if (list_empty(&ailp->xa_ail)) wake_up_all(&ailp->xa_empty); - spin_unlock(&ailp->xa_lock); + } + spin_unlock(&ailp->xa_lock); + if (mlip_changed) xfs_log_space_wake(ailp->xa_mount); - } else { - spin_unlock(&ailp->xa_lock); - } } int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 49931b72da8a..d91706c56c63 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -106,18 +106,9 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, - struct xfs_log_item **log_items, int nr_items, - int shutdown_type) - __releases(ailp->xa_lock); -static inline void -xfs_trans_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip, - int shutdown_type) __releases(ailp->xa_lock) -{ - xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); -} +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->xa_lock); static inline void xfs_trans_ail_remove( |