summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_addr.c5
-rw-r--r--fs/9p/vfs_file.c14
-rw-r--r--fs/adfs/file.c8
-rw-r--r--fs/affs/file.c8
-rw-r--r--fs/afs/file.c8
-rw-r--r--fs/afs/internal.h3
-rw-r--r--fs/afs/write.c11
-rw-r--r--fs/aio.c84
-rw-r--r--fs/attr.c8
-rw-r--r--fs/bfs/file.c8
-rw-r--r--fs/block_dev.c40
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c7
-rw-r--r--fs/btrfs/backref.c41
-rw-r--r--fs/btrfs/backref.h8
-rw-r--r--fs/btrfs/btrfs_inode.h2
-rw-r--r--fs/btrfs/check-integrity.c5
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/ctree.c104
-rw-r--r--fs/btrfs/ctree.h156
-rw-r--r--fs/btrfs/delayed-inode.c7
-rw-r--r--fs/btrfs/delayed-ref.c39
-rw-r--r--fs/btrfs/delayed-ref.h24
-rw-r--r--fs/btrfs/dev-replace.c2
-rw-r--r--fs/btrfs/disk-io.c113
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c613
-rw-r--r--fs/btrfs/extent_io.c440
-rw-r--r--fs/btrfs/extent_io.h6
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file-item.c80
-rw-r--r--fs/btrfs/file.c203
-rw-r--r--fs/btrfs/free-space-cache.c420
-rw-r--r--fs/btrfs/inode-map.c2
-rw-r--r--fs/btrfs/inode.c381
-rw-r--r--fs/btrfs/ioctl.c543
-rw-r--r--fs/btrfs/locking.c80
-rw-r--r--fs/btrfs/lzo.c14
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/qgroup.c939
-rw-r--r--fs/btrfs/qgroup.h107
-rw-r--r--fs/btrfs/reada.c9
-rw-r--r--fs/btrfs/relocation.c21
-rw-r--r--fs/btrfs/root-tree.c2
-rw-r--r--fs/btrfs/scrub.c28
-rw-r--r--fs/btrfs/send.c290
-rw-r--r--fs/btrfs/super.c13
-rw-r--r--fs/btrfs/sysfs.c50
-rw-r--r--fs/btrfs/tests/btrfs-tests.c97
-rw-r--r--fs/btrfs/tests/btrfs-tests.h9
-rw-r--r--fs/btrfs/tests/inode-tests.c35
-rw-r--r--fs/btrfs/tests/qgroup-tests.c470
-rw-r--r--fs/btrfs/transaction.c115
-rw-r--r--fs/btrfs/transaction.h1
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c49
-rw-r--r--fs/btrfs/tree-log.h16
-rw-r--r--fs/btrfs/volumes.c158
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/btrfs/zlib.c26
-rw-r--r--fs/ceph/acl.c6
-rw-r--r--fs/ceph/addr.c21
-rw-r--r--fs/ceph/caps.c246
-rw-r--r--fs/ceph/export.c2
-rw-r--r--fs/ceph/file.c185
-rw-r--r--fs/ceph/inode.c247
-rw-r--r--fs/ceph/mds_client.c9
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/super.h13
-rw-r--r--fs/cifs/cifsfs.c55
-rw-r--r--fs/cifs/cifsfs.h12
-rw-r--r--fs/cifs/file.c81
-rw-r--r--fs/dcache.c4
-rw-r--r--fs/direct-io.c164
-rw-r--r--fs/dlm/lowcomms.c5
-rw-r--r--fs/ecryptfs/file.c13
-rw-r--r--fs/eventpoll.c4
-rw-r--r--fs/exec.c7
-rw-r--r--fs/exofs/file.c10
-rw-r--r--fs/exofs/inode.c2
-rw-r--r--fs/ext2/file.c10
-rw-r--r--fs/ext2/inode.c10
-rw-r--r--fs/ext3/file.c10
-rw-r--r--fs/ext3/inode.c48
-rw-r--r--fs/ext4/ext4.h3
-rw-r--r--fs/ext4/file.c35
-rw-r--r--fs/ext4/indirect.c14
-rw-r--r--fs/ext4/inode.c24
-rw-r--r--fs/f2fs/data.c17
-rw-r--r--fs/f2fs/file.c10
-rw-r--r--fs/fat/file.c8
-rw-r--r--fs/fat/inode.c12
-rw-r--r--fs/file.c11
-rw-r--r--fs/file_table.c6
-rw-r--r--fs/fuse/cuse.c8
-rw-r--r--fs/fuse/file.c154
-rw-r--r--fs/fuse/fuse_i.h5
-rw-r--r--fs/gfs2/aops.c11
-rw-r--r--fs/gfs2/file.c30
-rw-r--r--fs/hfs/inode.c16
-rw-r--r--fs/hfsplus/inode.c15
-rw-r--r--fs/hostfs/hostfs_kern.c8
-rw-r--r--fs/hpfs/file.c8
-rw-r--r--fs/inode.c10
-rw-r--r--fs/jbd/revoke.c12
-rw-r--r--fs/jffs2/file.c8
-rw-r--r--fs/jfs/file.c10
-rw-r--r--fs/jfs/inode.c8
-rw-r--r--fs/lockd/clnt4xdr.c2
-rw-r--r--fs/lockd/clntxdr.c2
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/lockd/svcsubs.c3
-rw-r--r--fs/lockd/xdr.c2
-rw-r--r--fs/logfs/file.c8
-rw-r--r--fs/minix/file.c8
-rw-r--r--fs/namei.c11
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/blocklayout/blocklayout.c38
-rw-r--r--fs/nfs/direct.c439
-rw-r--r--fs/nfs/file.c65
-rw-r--r--fs/nfs/filelayout/Makefile5
-rw-r--r--fs/nfs/filelayout/filelayout.c (renamed from fs/nfs/nfs4filelayout.c)203
-rw-r--r--fs/nfs/filelayout/filelayout.h (renamed from fs/nfs/nfs4filelayout.h)2
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c (renamed from fs/nfs/nfs4filelayoutdev.c)6
-rw-r--r--fs/nfs/getroot.c3
-rw-r--r--fs/nfs/inode.c26
-rw-r--r--fs/nfs/internal.h39
-rw-r--r--fs/nfs/nfs2xdr.c14
-rw-r--r--fs/nfs/nfs3proc.c21
-rw-r--r--fs/nfs/nfs3xdr.c16
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4file.c13
-rw-r--r--fs/nfs/nfs4proc.c58
-rw-r--r--fs/nfs/nfs4state.c6
-rw-r--r--fs/nfs/nfs4trace.h8
-rw-r--r--fs/nfs/nfs4xdr.c19
-rw-r--r--fs/nfs/objlayout/objio_osd.c24
-rw-r--r--fs/nfs/objlayout/objlayout.c24
-rw-r--r--fs/nfs/objlayout/objlayout.h8
-rw-r--r--fs/nfs/pagelist.c633
-rw-r--r--fs/nfs/pnfs.c166
-rw-r--r--fs/nfs/pnfs.h30
-rw-r--r--fs/nfs/proc.c21
-rw-r--r--fs/nfs/read.c414
-rw-r--r--fs/nfs/super.c27
-rw-r--r--fs/nfs/write.c588
-rw-r--r--fs/nfsd/acl.h2
-rw-r--r--fs/nfsd/auth.c5
-rw-r--r--fs/nfsd/export.c88
-rw-r--r--fs/nfsd/export.h110
-rw-r--r--fs/nfsd/fault_inject.c15
-rw-r--r--fs/nfsd/idmap.h4
-rw-r--r--fs/nfsd/nfs2acl.c12
-rw-r--r--fs/nfsd/nfs3acl.c6
-rw-r--r--fs/nfsd/nfs3xdr.c27
-rw-r--r--fs/nfsd/nfs4acl.c12
-rw-r--r--fs/nfsd/nfs4idmap.c42
-rw-r--r--fs/nfsd/nfs4proc.c180
-rw-r--r--fs/nfsd/nfs4state.c349
-rw-r--r--fs/nfsd/nfs4xdr.c1934
-rw-r--r--fs/nfsd/nfscache.c17
-rw-r--r--fs/nfsd/nfsctl.c1
-rw-r--r--fs/nfsd/nfsd.h17
-rw-r--r--fs/nfsd/nfsfh.c25
-rw-r--r--fs/nfsd/nfsfh.h59
-rw-r--r--fs/nfsd/nfssvc.c6
-rw-r--r--fs/nfsd/nfsxdr.c15
-rw-r--r--fs/nfsd/state.h5
-rw-r--r--fs/nfsd/stats.c1
-rw-r--r--fs/nfsd/stats.h43
-rw-r--r--fs/nfsd/vfs.c155
-rw-r--r--fs/nfsd/vfs.h10
-rw-r--r--fs/nfsd/xdr4.h23
-rw-r--r--fs/nilfs2/file.c8
-rw-r--r--fs/nilfs2/inode.c9
-rw-r--r--fs/ntfs/file.c9
-rw-r--r--fs/ocfs2/aops.c7
-rw-r--r--fs/ocfs2/cluster/tcp.c31
-rw-r--r--fs/ocfs2/file.c138
-rw-r--r--fs/omfs/file.c8
-rw-r--r--fs/open.c6
-rw-r--r--fs/pipe.c145
-rw-r--r--fs/quota/quota.c14
-rw-r--r--fs/ramfs/file-mmu.c10
-rw-r--r--fs/ramfs/file-nommu.c10
-rw-r--r--fs/read_write.c108
-rw-r--r--fs/reiserfs/bitmap.c259
-rw-r--r--fs/reiserfs/dir.c156
-rw-r--r--fs/reiserfs/do_balan.c2449
-rw-r--r--fs/reiserfs/file.c100
-rw-r--r--fs/reiserfs/fix_node.c1008
-rw-r--r--fs/reiserfs/hashes.c15
-rw-r--r--fs/reiserfs/ibalance.c271
-rw-r--r--fs/reiserfs/inode.c1216
-rw-r--r--fs/reiserfs/ioctl.c27
-rw-r--r--fs/reiserfs/item_ops.c108
-rw-r--r--fs/reiserfs/journal.c1339
-rw-r--r--fs/reiserfs/lbalance.c501
-rw-r--r--fs/reiserfs/namei.c513
-rw-r--r--fs/reiserfs/objectid.c101
-rw-r--r--fs/reiserfs/prints.c176
-rw-r--r--fs/reiserfs/reiserfs.h1921
-rw-r--r--fs/reiserfs/resize.c75
-rw-r--r--fs/reiserfs/stree.c884
-rw-r--r--fs/reiserfs/super.c552
-rw-r--r--fs/reiserfs/tail_conversion.c161
-rw-r--r--fs/reiserfs/xattr.c70
-rw-r--r--fs/reiserfs/xattr.h3
-rw-r--r--fs/reiserfs/xattr_acl.c38
-rw-r--r--fs/romfs/mmap-nommu.c4
-rw-r--r--fs/splice.c195
-rw-r--r--fs/sysv/file.c8
-rw-r--r--fs/ubifs/budget.c1
-rw-r--r--fs/ubifs/debug.c4
-rw-r--r--fs/ubifs/file.c31
-rw-r--r--fs/ubifs/io.c18
-rw-r--r--fs/ubifs/shrinker.c1
-rw-r--r--fs/ubifs/super.c5
-rw-r--r--fs/ubifs/tnc.c5
-rw-r--r--fs/ubifs/ubifs.h11
-rw-r--r--fs/udf/file.c19
-rw-r--r--fs/udf/inode.c10
-rw-r--r--fs/ufs/file.c8
-rw-r--r--fs/xfs/xfs_ag.h36
-rw-r--r--fs/xfs/xfs_alloc.c19
-rw-r--r--fs/xfs/xfs_alloc_btree.c1
-rw-r--r--fs/xfs/xfs_aops.c72
-rw-r--r--fs/xfs/xfs_attr.c343
-rw-r--r--fs/xfs/xfs_attr_leaf.c184
-rw-r--r--fs/xfs/xfs_attr_leaf.h3
-rw-r--r--fs/xfs/xfs_attr_list.c1
-rw-r--r--fs/xfs/xfs_attr_remote.c58
-rw-r--r--fs/xfs/xfs_bit.h7
-rw-r--r--fs/xfs/xfs_bmap.c211
-rw-r--r--fs/xfs/xfs_bmap.h4
-rw-r--r--fs/xfs/xfs_bmap_btree.c9
-rw-r--r--fs/xfs/xfs_bmap_btree.h2
-rw-r--r--fs/xfs/xfs_bmap_util.c18
-rw-r--r--fs/xfs/xfs_bmap_util.h13
-rw-r--r--fs/xfs/xfs_btree.c56
-rw-r--r--fs/xfs/xfs_btree.h5
-rw-r--r--fs/xfs/xfs_buf.c17
-rw-r--r--fs/xfs/xfs_buf.h9
-rw-r--r--fs/xfs/xfs_buf_item.c5
-rw-r--r--fs/xfs/xfs_da_btree.c114
-rw-r--r--fs/xfs/xfs_da_btree.h26
-rw-r--r--fs/xfs/xfs_da_format.c36
-rw-r--r--fs/xfs/xfs_da_format.h154
-rw-r--r--fs/xfs/xfs_dir2.c136
-rw-r--r--fs/xfs/xfs_dir2.h30
-rw-r--r--fs/xfs/xfs_dir2_block.c97
-rw-r--r--fs/xfs/xfs_dir2_data.c83
-rw-r--r--fs/xfs/xfs_dir2_leaf.c202
-rw-r--r--fs/xfs/xfs_dir2_node.c190
-rw-r--r--fs/xfs/xfs_dir2_priv.h142
-rw-r--r--fs/xfs/xfs_dir2_readdir.c155
-rw-r--r--fs/xfs/xfs_dir2_sf.c39
-rw-r--r--fs/xfs/xfs_dquot.c59
-rw-r--r--fs/xfs/xfs_dquot.h2
-rw-r--r--fs/xfs/xfs_dquot_buf.c5
-rw-r--r--fs/xfs/xfs_file.c121
-rw-r--r--fs/xfs/xfs_filestream.c684
-rw-r--r--fs/xfs/xfs_filestream.h34
-rw-r--r--fs/xfs/xfs_format.h14
-rw-r--r--fs/xfs/xfs_fs.h1
-rw-r--r--fs/xfs/xfs_fsops.c49
-rw-r--r--fs/xfs/xfs_ialloc.c704
-rw-r--r--fs/xfs/xfs_ialloc.h2
-rw-r--r--fs/xfs/xfs_ialloc_btree.c69
-rw-r--r--fs/xfs/xfs_ialloc_btree.h3
-rw-r--r--fs/xfs/xfs_icache.c12
-rw-r--r--fs/xfs/xfs_icache.h6
-rw-r--r--fs/xfs/xfs_inode.c178
-rw-r--r--fs/xfs/xfs_inode.h5
-rw-r--r--fs/xfs/xfs_inode_buf.c17
-rw-r--r--fs/xfs/xfs_inode_fork.c3
-rw-r--r--fs/xfs/xfs_inode_fork.h3
-rw-r--r--fs/xfs/xfs_inode_item.c32
-rw-r--r--fs/xfs/xfs_ioctl.c16
-rw-r--r--fs/xfs/xfs_ioctl32.c5
-rw-r--r--fs/xfs/xfs_iomap.c2
-rw-r--r--fs/xfs/xfs_iops.c20
-rw-r--r--fs/xfs/xfs_itable.c6
-rw-r--r--fs/xfs/xfs_log.c11
-rw-r--r--fs/xfs/xfs_log.h19
-rw-r--r--fs/xfs/xfs_log_cil.c57
-rw-r--r--fs/xfs/xfs_log_recover.c11
-rw-r--r--fs/xfs/xfs_log_rlimit.c2
-rw-r--r--fs/xfs/xfs_mount.c45
-rw-r--r--fs/xfs/xfs_mount.h12
-rw-r--r--fs/xfs/xfs_mru_cache.c151
-rw-r--r--fs/xfs/xfs_mru_cache.h31
-rw-r--r--fs/xfs/xfs_qm.c217
-rw-r--r--fs/xfs/xfs_qm_syscalls.c6
-rw-r--r--fs/xfs/xfs_quota_defs.h2
-rw-r--r--fs/xfs/xfs_quotaops.c29
-rw-r--r--fs/xfs/xfs_rtbitmap.c1
-rw-r--r--fs/xfs/xfs_sb.c12
-rw-r--r--fs/xfs/xfs_sb.h235
-rw-r--r--fs/xfs/xfs_shared.h2
-rw-r--r--fs/xfs/xfs_stats.c1
-rw-r--r--fs/xfs/xfs_stats.h18
-rw-r--r--fs/xfs/xfs_super.c22
-rw-r--r--fs/xfs/xfs_symlink.c3
-rw-r--r--fs/xfs/xfs_symlink_remote.c1
-rw-r--r--fs/xfs/xfs_trace.c1
-rw-r--r--fs/xfs/xfs_trace.h59
-rw-r--r--fs/xfs/xfs_trans.c2
-rw-r--r--fs/xfs/xfs_trans_ail.c5
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/xfs/xfs_trans_resv.c56
-rw-r--r--fs/xfs/xfs_trans_space.h12
-rw-r--r--fs/xfs/xfs_types.h2
314 files changed, 18560 insertions, 13438 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index c71e88602ff4..cc1cfae726b3 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -259,8 +259,7 @@ static int v9fs_launder_page(struct page *page)
*
*/
static ssize_t
-v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t pos, unsigned long nr_segs)
+v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
/*
* FIXME
@@ -269,7 +268,7 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
*/
p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
iocb->ki_filp->f_path.dentry->d_name.name,
- (long long)pos, nr_segs);
+ (long long)pos, iter->nr_segs);
return -EINVAL;
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 96e550760699..520c11c2dcca 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -692,7 +692,7 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
{
if (filp->f_flags & O_DIRECT)
return v9fs_direct_read(filp, data, count, offset);
- return do_sync_read(filp, data, count, offset);
+ return new_sync_read(filp, data, count, offset);
}
/**
@@ -760,7 +760,7 @@ err_out:
buff_write:
mutex_unlock(&inode->i_mutex);
- return do_sync_write(filp, data, count, offsetp);
+ return new_sync_write(filp, data, count, offsetp);
}
/**
@@ -778,7 +778,7 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
if (filp->f_flags & O_DIRECT)
return v9fs_direct_write(filp, data, count, offset);
- return do_sync_write(filp, data, count, offset);
+ return new_sync_write(filp, data, count, offset);
}
@@ -847,8 +847,8 @@ const struct file_operations v9fs_cached_file_operations = {
.llseek = generic_file_llseek,
.read = v9fs_cached_file_read,
.write = v9fs_cached_file_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock,
@@ -860,8 +860,8 @@ const struct file_operations v9fs_cached_file_operations_dotl = {
.llseek = generic_file_llseek,
.read = v9fs_cached_file_read,
.write = v9fs_cached_file_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock_dotl,
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index a36da5382b40..07c9edce5aa7 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -23,12 +23,12 @@
const struct file_operations adfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0270303388ee..a7fe57d2cd9a 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -27,10 +27,10 @@ static int affs_file_release(struct inode *inode, struct file *filp);
const struct file_operations affs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = affs_file_open,
.release = affs_file_release,
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 66d50fe2ee45..932ce07948b3 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -31,10 +31,10 @@ const struct file_operations afs_file_operations = {
.open = afs_open,
.release = afs_release,
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = afs_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = afs_file_write,
.mmap = generic_file_readonly_mmap,
.splice_read = generic_file_splice_read,
.fsync = afs_fsync,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 590b55f46d61..71d5982312f3 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -747,8 +747,7 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
-extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
+extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_writeback_all(struct afs_vnode *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index a890db4b9898..ab6adfd52516 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -625,15 +625,14 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
/*
* write to an AFS file
*/
-ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
ssize_t result;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(from);
- _enter("{%x.%u},{%zu},%lu,",
- vnode->fid.vid, vnode->fid.vnode, count, nr_segs);
+ _enter("{%x.%u},{%zu},",
+ vnode->fid.vid, vnode->fid.vnode, count);
if (IS_SWAPFILE(&vnode->vfs_inode)) {
printk(KERN_INFO
@@ -644,7 +643,7 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (!count)
return 0;
- result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ result = generic_file_write_iter(iocb, from);
if (IS_ERR_VALUE(result)) {
_leave(" = %zd", result);
return result;
diff --git a/fs/aio.c b/fs/aio.c
index a0ed6c7d2cd2..4f078c054b41 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -477,7 +477,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);
-static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
+static int kiocb_cancel(struct kiocb *kiocb)
{
kiocb_cancel_fn *old, *cancel;
@@ -538,7 +538,7 @@ static void free_ioctx_users(struct percpu_ref *ref)
struct kiocb, ki_list);
list_del_init(&req->ki_list);
- kiocb_cancel(ctx, req);
+ kiocb_cancel(req);
}
spin_unlock_irq(&ctx->ctx_lock);
@@ -727,42 +727,42 @@ err:
* when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
-static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
struct completion *requests_done)
{
- if (!atomic_xchg(&ctx->dead, 1)) {
- struct kioctx_table *table;
+ struct kioctx_table *table;
- spin_lock(&mm->ioctx_lock);
- rcu_read_lock();
- table = rcu_dereference(mm->ioctx_table);
+ if (atomic_xchg(&ctx->dead, 1))
+ return -EINVAL;
- WARN_ON(ctx != table->table[ctx->id]);
- table->table[ctx->id] = NULL;
- rcu_read_unlock();
- spin_unlock(&mm->ioctx_lock);
- /* percpu_ref_kill() will do the necessary call_rcu() */
- wake_up_all(&ctx->wait);
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
- /*
- * It'd be more correct to do this in free_ioctx(), after all
- * the outstanding kiocbs have finished - but by then io_destroy
- * has already returned, so io_setup() could potentially return
- * -EAGAIN with no ioctxs actually in use (as far as userspace
- * could tell).
- */
- aio_nr_sub(ctx->max_reqs);
+ WARN_ON(ctx != table->table[ctx->id]);
+ table->table[ctx->id] = NULL;
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
- if (ctx->mmap_size)
- vm_munmap(ctx->mmap_base, ctx->mmap_size);
+ /* percpu_ref_kill() will do the necessary call_rcu() */
+ wake_up_all(&ctx->wait);
- ctx->requests_done = requests_done;
- percpu_ref_kill(&ctx->users);
- } else {
- if (requests_done)
- complete(requests_done);
- }
+ /*
+ * It'd be more correct to do this in free_ioctx(), after all
+ * the outstanding kiocbs have finished - but by then io_destroy
+ * has already returned, so io_setup() could potentially return
+ * -EAGAIN with no ioctxs actually in use (as far as userspace
+ * could tell).
+ */
+ aio_nr_sub(ctx->max_reqs);
+
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
+
+ ctx->requests_done = requests_done;
+ percpu_ref_kill(&ctx->users);
+ return 0;
}
/* wait_on_sync_kiocb:
@@ -1219,21 +1219,23 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
if (likely(NULL != ioctx)) {
struct completion requests_done =
COMPLETION_INITIALIZER_ONSTACK(requests_done);
+ int ret;
/* Pass requests_done to kill_ioctx() where it can be set
* in a thread-safe way. If we try to set it here then we have
* a race condition if two io_destroy() called simultaneously.
*/
- kill_ioctx(current->mm, ioctx, &requests_done);
+ ret = kill_ioctx(current->mm, ioctx, &requests_done);
percpu_ref_put(&ioctx->users);
/* Wait until all IO for the context are done. Otherwise kernel
* keep using user-space buffers even if user thinks the context
* is destroyed.
*/
- wait_for_completion(&requests_done);
+ if (!ret)
+ wait_for_completion(&requests_done);
- return 0;
+ return ret;
}
pr_debug("EINVAL: io_destroy: invalid context id\n");
return -EINVAL;
@@ -1241,6 +1243,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
unsigned long, loff_t);
+typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
int rw, char __user *buf,
@@ -1298,7 +1301,9 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
int rw;
fmode_t mode;
aio_rw_op *rw_op;
+ rw_iter_op *iter_op;
struct iovec inline_vec, *iovec = &inline_vec;
+ struct iov_iter iter;
switch (opcode) {
case IOCB_CMD_PREAD:
@@ -1306,6 +1311,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
mode = FMODE_READ;
rw = READ;
rw_op = file->f_op->aio_read;
+ iter_op = file->f_op->read_iter;
goto rw_common;
case IOCB_CMD_PWRITE:
@@ -1313,12 +1319,13 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
mode = FMODE_WRITE;
rw = WRITE;
rw_op = file->f_op->aio_write;
+ iter_op = file->f_op->write_iter;
goto rw_common;
rw_common:
if (unlikely(!(file->f_mode & mode)))
return -EBADF;
- if (!rw_op)
+ if (!rw_op && !iter_op)
return -EINVAL;
ret = (opcode == IOCB_CMD_PREADV ||
@@ -1347,7 +1354,12 @@ rw_common:
if (rw == WRITE)
file_start_write(file);
- ret = rw_op(req, iovec, nr_segs, req->ki_pos);
+ if (iter_op) {
+ iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
+ ret = iter_op(req, &iter);
+ } else {
+ ret = rw_op(req, iovec, nr_segs, req->ki_pos);
+ }
if (rw == WRITE)
file_end_write(file);
@@ -1585,7 +1597,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
kiocb = lookup_kiocb(ctx, iocb, key);
if (kiocb)
- ret = kiocb_cancel(ctx, kiocb);
+ ret = kiocb_cancel(kiocb);
else
ret = -EINVAL;
diff --git a/fs/attr.c b/fs/attr.c
index 5d4e59d56e85..6530ced19697 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -50,14 +50,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
if ((ia_valid & ATTR_UID) &&
(!uid_eq(current_fsuid(), inode->i_uid) ||
!uid_eq(attr->ia_uid, inode->i_uid)) &&
- !inode_capable(inode, CAP_CHOWN))
+ !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
return -EPERM;
/* Make sure caller can chgrp. */
if ((ia_valid & ATTR_GID) &&
(!uid_eq(current_fsuid(), inode->i_uid) ||
(!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
- !inode_capable(inode, CAP_CHOWN))
+ !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
return -EPERM;
/* Make sure a caller can chmod. */
@@ -67,7 +67,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
/* Also check the setgid bit! */
if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
inode->i_gid) &&
- !inode_capable(inode, CAP_FSETID))
+ !capable_wrt_inode_uidgid(inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
@@ -160,7 +160,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
umode_t mode = attr->ia_mode;
if (!in_group_p(inode->i_gid) &&
- !inode_capable(inode, CAP_FSETID))
+ !capable_wrt_inode_uidgid(inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index ae2892218335..e7f88ace1a25 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -23,10 +23,10 @@
const struct file_operations bfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 83fba15cc394..6d7274619bf9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -165,14 +165,15 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
}
static ssize_t
-blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
- nr_segs, blkdev_get_block, NULL, NULL, 0);
+ return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
+ offset, blkdev_get_block,
+ NULL, NULL, 0);
}
int __sync_blockdev(struct block_device *bdev, int wait)
@@ -1571,43 +1572,38 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
* Does not take i_mutex for the write and thus is not for general purpose
* use.
*/
-ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct blk_plug plug;
ssize_t ret;
- BUG_ON(iocb->ki_pos != pos);
-
blk_start_plug(&plug);
- ret = __generic_file_aio_write(iocb, iov, nr_segs);
+ ret = __generic_file_write_iter(iocb, from);
if (ret > 0) {
ssize_t err;
-
- err = generic_write_sync(file, pos, ret);
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
if (err < 0)
ret = err;
}
blk_finish_plug(&plug);
return ret;
}
-EXPORT_SYMBOL_GPL(blkdev_aio_write);
+EXPORT_SYMBOL_GPL(blkdev_write_iter);
-static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *bd_inode = file->f_mapping->host;
loff_t size = i_size_read(bd_inode);
+ loff_t pos = iocb->ki_pos;
if (pos >= size)
return 0;
size -= pos;
- if (size < iocb->ki_nbytes)
- nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
- return generic_file_aio_read(iocb, iov, nr_segs, pos);
+ iov_iter_truncate(to, size);
+ return generic_file_read_iter(iocb, to);
}
/*
@@ -1639,10 +1635,10 @@ const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = blkdev_aio_read,
- .aio_write = blkdev_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = blkdev_read_iter,
+ .write_iter = blkdev_write_iter,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
@@ -1650,7 +1646,7 @@ const struct file_operations def_blk_fops = {
.compat_ioctl = compat_blkdev_ioctl,
#endif
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index f341a98031d2..6d1d0b93b1aa 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -16,4 +16,4 @@ btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
- tests/extent-io-tests.o tests/inode-tests.o
+ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index ff9b3995d453..9a0124a95851 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -79,13 +79,6 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
const char *name;
char *value = NULL;
- if (acl) {
- ret = posix_acl_valid(acl);
- if (ret < 0)
- return ret;
- ret = 0;
- }
-
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 10db21fa0926..e25564bfcb46 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -900,7 +900,11 @@ again:
goto out;
BUG_ON(ret == 0);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (trans && likely(trans->type != __TRANS_DUMMY)) {
+#else
if (trans) {
+#endif
/*
* look if there are updates for this ref queued and lock the
* head
@@ -984,11 +988,12 @@ again:
goto out;
}
if (ref->count && ref->parent) {
- if (extent_item_pos && !ref->inode_list) {
+ if (extent_item_pos && !ref->inode_list &&
+ ref->level == 0) {
u32 bsz;
struct extent_buffer *eb;
bsz = btrfs_level_size(fs_info->extent_root,
- info_level);
+ ref->level);
eb = read_tree_block(fs_info->extent_root,
ref->parent, bsz, 0);
if (!eb || !extent_buffer_uptodate(eb)) {
@@ -1404,9 +1409,10 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
* returns <0 on error
*/
static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
- struct btrfs_extent_item *ei, u32 item_size,
- struct btrfs_extent_inline_ref **out_eiref,
- int *out_type)
+ struct btrfs_key *key,
+ struct btrfs_extent_item *ei, u32 item_size,
+ struct btrfs_extent_inline_ref **out_eiref,
+ int *out_type)
{
unsigned long end;
u64 flags;
@@ -1416,19 +1422,26 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
/* first call */
flags = btrfs_extent_flags(eb, ei);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- info = (struct btrfs_tree_block_info *)(ei + 1);
- *out_eiref =
- (struct btrfs_extent_inline_ref *)(info + 1);
+ if (key->type == BTRFS_METADATA_ITEM_KEY) {
+ /* a skinny metadata extent */
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(ei + 1);
+ } else {
+ WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(info + 1);
+ }
} else {
*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
}
*ptr = (unsigned long)*out_eiref;
- if ((void *)*ptr >= (void *)ei + item_size)
+ if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
return -ENOENT;
}
end = (unsigned long)ei + item_size;
- *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
+ *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
*ptr += btrfs_extent_inline_ref_size(*out_type);
@@ -1447,8 +1460,8 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
* <0 on error.
*/
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
- struct btrfs_extent_item *ei, u32 item_size,
- u64 *out_root, u8 *out_level)
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level)
{
int ret;
int type;
@@ -1459,8 +1472,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
return 1;
while (1) {
- ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
- &eiref, &type);
+ ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
+ &eiref, &type);
if (ret < 0)
return ret;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index a910b27a8ad9..86fc20fec282 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -40,8 +40,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
u64 *flags);
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
- struct btrfs_extent_item *ei, u32 item_size,
- u64 *out_root, u8 *out_level);
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level);
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
u64 extent_item_objectid,
@@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots);
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2256e9cceec5..4794923c410c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -284,4 +284,6 @@ static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
&BTRFS_I(inode)->runtime_flags);
}
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
+
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 0e8388e72d8d..ce92ae30250f 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1093,6 +1093,7 @@ leaf_item_out_of_bounce_error:
next_stack =
btrfsic_stack_frame_alloc();
if (NULL == next_stack) {
+ sf->error = -1;
btrfsic_release_block_ctx(
&sf->
next_block_ctx);
@@ -1190,8 +1191,10 @@ continue_with_current_node_stack_frame:
sf->next_block_ctx.datav[0];
next_stack = btrfsic_stack_frame_alloc();
- if (NULL == next_stack)
+ if (NULL == next_stack) {
+ sf->error = -1;
goto one_stack_frame_backwards;
+ }
next_stack->i = -1;
next_stack->block = sf->next_block;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d43c544d3b68..92371c414228 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -887,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -1;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
start, len, pages,
@@ -923,7 +923,7 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -ENOMEM;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
disk_start,
@@ -945,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -ENOMEM;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
dest_page, start_byte,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1bcfcdb23cf4..aeab453b8e24 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -224,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
static void add_root_to_dirty_list(struct btrfs_root *root)
{
spin_lock(&root->fs_info->trans_lock);
- if (root->track_dirty && list_empty(&root->dirty_list)) {
+ if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
+ list_empty(&root->dirty_list)) {
list_add(&root->dirty_list,
&root->fs_info->dirty_cowonly_roots);
}
@@ -246,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int level;
struct btrfs_disk_key disk_key;
- WARN_ON(root->ref_cows && trans->transid !=
- root->fs_info->running_transaction->transid);
- WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->last_trans);
level = btrfs_header_level(buf);
if (level == 0)
@@ -354,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
}
/*
- * Increment the upper half of tree_mod_seq, set lower half zero.
- *
- * Must be called with fs_info->tree_mod_seq_lock held.
- */
-static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
-{
- u64 seq = atomic64_read(&fs_info->tree_mod_seq);
- seq &= 0xffffffff00000000ull;
- seq += 1ull << 32;
- atomic64_set(&fs_info->tree_mod_seq, seq);
- return seq;
-}
-
-/*
- * Increment the lower half of tree_mod_seq.
- *
- * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
- * are generated should not technically require a spin lock here. (Rationale:
- * incrementing the minor while incrementing the major seq number is between its
- * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
- * just returns a unique sequence number as usual.) We have decided to leave
- * that requirement in here and rethink it once we notice it really imposes a
- * problem on some workload.
+ * Pull a new tree mod seq number for our operation.
*/
-static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
- * return the last minor in the previous major tree_mod_seq number
- */
-u64 btrfs_tree_mod_seq_prev(u64 seq)
-{
- return (seq & 0xffffffff00000000ull) - 1ull;
-}
-
-/*
* This adds a new blocker to the tree mod log's blocker list if the @elem
* passed does not already have a sequence number set. So when a caller expects
* to record tree modifications, it should ensure to set elem->seq to zero
@@ -402,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq)
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem)
{
- u64 seq;
-
tree_mod_log_write_lock(fs_info);
spin_lock(&fs_info->tree_mod_seq_lock);
if (!elem->seq) {
- elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
- seq = btrfs_inc_tree_mod_seq_minor(fs_info);
spin_unlock(&fs_info->tree_mod_seq_lock);
tree_mod_log_write_unlock(fs_info);
- return seq;
+ return elem->seq;
}
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -487,9 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
BUG_ON(!tm);
- spin_lock(&fs_info->tree_mod_seq_lock);
- tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
tm_root = &fs_info->tree_mod_log;
new = &tm_root->rb_node;
@@ -997,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
* snapshot and the block was not allocated by tree relocation,
* we know the block is not shared.
*/
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
buf != root->node && buf != root->commit_root &&
(btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item) ||
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
return 1;
#endif
@@ -1146,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_assert_tree_locked(buf);
- WARN_ON(root->ref_cows && trans->transid !=
- root->fs_info->running_transaction->transid);
- WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -1193,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return ret;
}
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret)
return ret;
@@ -1538,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
/* ensure we can see the force_cow */
smp_rmb();
@@ -1556,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
!(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
- !root->force_cow)
+ !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
return 0;
return 1;
}
@@ -5125,7 +5097,17 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
return ret;
btrfs_item_key(path->nodes[0], &found_key, 0);
ret = comp_keys(&found_key, &key);
- if (ret < 0)
+ /*
+ * We might have had an item with the previous key in the tree right
+ * before we released our path. And after we released our path, that
+ * item might have been pushed to the first slot (0) of the leaf we
+ * were holding due to a tree balance. Alternatively, an item with the
+ * previous key can exist as the only element of a leaf (big fat item).
+ * Therefore account for these 2 cases, so that our callers (like
+ * btrfs_previous_item) don't miss an existing item with a key matching
+ * the previous key we computed above.
+ */
+ if (ret <= 0)
return 0;
return 1;
}
@@ -5736,6 +5718,24 @@ again:
ret = 0;
goto done;
}
+ /*
+ * So the above check misses one case:
+ * - after releasing the path above, someone has removed the item that
+ * used to be at the very end of the block, and a balance between leaves
+ * puts another one with a bigger key.offset in its place.
+ *
+ * This one should be returned as well, or we can get leaf corruption
+ * later (esp. in __btrfs_drop_extents()).
+ *
+ * And a bit more explanation about this check,
+ * with ret > 0, the key isn't found, the path points to the slot
+ * where it should be inserted, so the path->slots[0] item must be the
+ * bigger one.
+ */
+ if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
+ ret = 0;
+ goto done;
+ }
while (level < BTRFS_MAX_LEVEL) {
if (!path->nodes[level]) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ba6b88528dc7..be91397f4e92 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
#include <asm/kmap_types.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
+#include <linux/workqueue.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -756,6 +757,12 @@ struct btrfs_dir_item {
#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
+/*
+ * Internal in-memory flag that a subvolume has been marked for deletion but
+ * is still visible as a directory
+ */
+#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48)
+
struct btrfs_root_item {
struct btrfs_inode_item inode;
__le64 generation;
@@ -840,7 +847,10 @@ struct btrfs_disk_balance_args {
/* BTRFS_BALANCE_ARGS_* */
__le64 flags;
- __le64 unused[8];
+ /* BTRFS_BALANCE_ARGS_LIMIT value */
+ __le64 limit;
+
+ __le64 unused[7];
} __attribute__ ((__packed__));
/*
@@ -1113,6 +1123,12 @@ struct btrfs_qgroup_limit_item {
__le64 rsv_excl;
} __attribute__ ((__packed__));
+/* For raid type sysfs entries */
+struct raid_kobject {
+ int raid_type;
+ struct kobject kobj;
+};
+
struct btrfs_space_info {
spinlock_t lock;
@@ -1163,7 +1179,7 @@ struct btrfs_space_info {
wait_queue_head_t wait;
struct kobject kobj;
- struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES];
+ struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};
#define BTRFS_BLOCK_RSV_GLOBAL 1
@@ -1243,11 +1259,19 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
+ u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
u64 sectorsize;
u64 cache_generation;
+ /*
+ * It is used only for delayed data space allocation, because only
+ * data space allocation and the related metadata update can be done
+ * across transactions.
+ */
+ struct rw_semaphore data_rwsem;
+
/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
@@ -1313,6 +1337,8 @@ struct btrfs_stripe_hash_table {
#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+void btrfs_init_async_reclaim_work(struct work_struct *work);
+
/* fs_info */
struct reloc_control;
struct btrfs_device;
@@ -1534,6 +1560,9 @@ struct btrfs_fs_info {
*/
struct btrfs_workqueue *fixup_workers;
struct btrfs_workqueue *delayed_workers;
+
+ /* the extent workers do delayed refs on the extent allocation tree */
+ struct btrfs_workqueue *extent_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
int thread_pool_size;
@@ -1636,7 +1665,10 @@ struct btrfs_fs_info {
/* holds configuration and tracking. Protected by qgroup_lock */
struct rb_root qgroup_tree;
+ struct rb_root qgroup_op_tree;
spinlock_t qgroup_lock;
+ spinlock_t qgroup_op_lock;
+ atomic_t qgroup_op_seq;
/*
* used to avoid frequently calling ulist_alloc()/ulist_free()
@@ -1688,6 +1720,9 @@ struct btrfs_fs_info {
struct semaphore uuid_tree_rescan_sem;
unsigned int update_uuid_tree_gen:1;
+
+ /* Used to reclaim the metadata space in the background. */
+ struct work_struct async_reclaim_work;
};
struct btrfs_subvolume_writers {
@@ -1696,6 +1731,26 @@ struct btrfs_subvolume_writers {
};
/*
+ * The state of btrfs root
+ */
+/*
+ * btrfs_record_root_in_trans is a multi-step process,
+ * and it can race with the balancing code. But the
+ * race is very small, and only the first time the root
+ * is added to each transaction. So IN_TRANS_SETUP
+ * is used to tell us when more checks are required
+ */
+#define BTRFS_ROOT_IN_TRANS_SETUP 0
+#define BTRFS_ROOT_REF_COWS 1
+#define BTRFS_ROOT_TRACK_DIRTY 2
+#define BTRFS_ROOT_IN_RADIX 3
+#define BTRFS_ROOT_DUMMY_ROOT 4
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5
+#define BTRFS_ROOT_DEFRAG_RUNNING 6
+#define BTRFS_ROOT_FORCE_COW 7
+#define BTRFS_ROOT_MULTI_LOG_TASKS 8
+
+/*
* in ram representation of the tree. extent_root is used for all allocations
* and for the extent tree extent_root root.
*/
@@ -1706,6 +1761,7 @@ struct btrfs_root {
struct btrfs_root *log_root;
struct btrfs_root *reloc_root;
+ unsigned long state;
struct btrfs_root_item root_item;
struct btrfs_key root_key;
struct btrfs_fs_info *fs_info;
@@ -1740,7 +1796,6 @@ struct btrfs_root {
/* Just be updated when the commit succeeds. */
int last_log_commit;
pid_t log_start_pid;
- bool log_multiple_pids;
u64 objectid;
u64 last_trans;
@@ -1760,23 +1815,13 @@ struct btrfs_root {
u64 highest_objectid;
- /* btrfs_record_root_in_trans is a multi-step process,
- * and it can race with the balancing code. But the
- * race is very small, and only the first time the root
- * is added to each transaction. So in_trans_setup
- * is used to tell us when more checks are required
- */
- unsigned long in_trans_setup;
- int ref_cows;
- int track_dirty;
- int in_radix;
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- int dummy_root;
+ u64 alloc_bytenr;
#endif
+
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
- int defrag_running;
char *name;
/* the dirty list is only used by non-reference counted roots */
@@ -1790,7 +1835,6 @@ struct btrfs_root {
spinlock_t orphan_lock;
atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
- int orphan_item_inserted;
int orphan_cleanup_state;
spinlock_t inode_lock;
@@ -1808,8 +1852,6 @@ struct btrfs_root {
*/
dev_t anon_dev;
- int force_cow;
-
spinlock_t root_item_lock;
atomic_t refs;
@@ -2788,6 +2830,11 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
+static inline bool btrfs_root_dead(struct btrfs_root *root)
+{
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
/* struct btrfs_root_backup */
BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
tree_root, 64);
@@ -2897,6 +2944,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
cpu->vend = le64_to_cpu(disk->vend);
cpu->target = le64_to_cpu(disk->target);
cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
}
static inline void
@@ -2914,6 +2962,7 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
disk->vend = cpu_to_le64(cpu->vend);
disk->target = cpu_to_le64(cpu->target);
disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
}
/* struct btrfs_super_block */
@@ -3236,6 +3285,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ unsigned long count, int wait);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
@@ -3273,11 +3324,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data);
+ struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow);
+ struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow);
+ struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
@@ -3285,9 +3336,10 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int for_cow);
+ u64 owner, u64 offset, int no_quota);
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
+ int delalloc);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3297,7 +3349,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset, int for_cow);
+ u64 root_objectid, u64 owner, u64 offset, int no_quota);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -3385,7 +3437,6 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int __get_raid_index(u64 flags);
-
int btrfs_start_nocow_write(struct btrfs_root *root);
void btrfs_end_nocow_write(struct btrfs_root *root);
/* ctree.c */
@@ -3561,7 +3612,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
-u64 btrfs_tree_mod_seq_prev(u64 seq);
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
/* root-item.c */
@@ -3708,6 +3758,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em);
+
/* inode.c */
struct btrfs_delalloc_work {
struct inode *inode;
@@ -4069,52 +4125,6 @@ void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err);
-/* qgroup.c */
-struct qgroup_update {
- struct list_head list;
- struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op;
-};
-
-int btrfs_quota_enable(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_quota_disable(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
-void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid,
- char *name);
-int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid);
-int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid,
- struct btrfs_qgroup_limit *limit);
-int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
-void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
-struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
- struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
-
static inline int is_fstree(u64 rootid)
{
if (rootid == BTRFS_FS_TREE_OBJECTID ||
@@ -4131,6 +4141,8 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl);
#endif
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 33e561a84013..da775bfdebc9 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -149,8 +149,8 @@ again:
spin_lock(&root->inode_lock);
ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
if (ret == -EEXIST) {
- kmem_cache_free(delayed_node_cache, node);
spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
radix_tree_preload_end();
goto again;
}
@@ -267,14 +267,17 @@ static void __btrfs_release_delayed_node(
mutex_unlock(&delayed_node->mutex);
if (atomic_dec_and_test(&delayed_node->refs)) {
+ bool free = false;
struct btrfs_root *root = delayed_node->root;
spin_lock(&root->inode_lock);
if (atomic_read(&delayed_node->refs) == 0) {
radix_tree_delete(&root->delayed_nodes_tree,
delayed_node->inode_id);
- kmem_cache_free(delayed_node_cache, delayed_node);
+ free = true;
}
spin_unlock(&root->inode_lock);
+ if (free)
+ kmem_cache_free(delayed_node_cache, delayed_node);
}
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 31299646024d..6d16bea94e1c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
return -1;
if (ref1->type > ref2->type)
return 1;
+ if (ref1->no_quota > ref2->no_quota)
+ return 1;
+ if (ref1->no_quota < ref2->no_quota)
+ return -1;
/* merging of sequenced refs is not allowed */
if (compare_seq) {
if (ref1->seq < ref2->seq)
@@ -635,7 +639,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_delayed_ref_node *ref, u64 bytenr,
u64 num_bytes, u64 parent, u64 ref_root, int level,
- int action, int for_cow)
+ int action, int no_quota)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
@@ -645,6 +649,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
delayed_refs = &trans->transaction->delayed_refs;
/* first set the basic ref node struct up */
@@ -655,9 +661,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
-
- if (need_ref_seq(for_cow, ref_root))
- seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+ ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -697,7 +701,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_delayed_ref_node *ref, u64 bytenr,
u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
- u64 offset, int action, int for_cow)
+ u64 offset, int action, int no_quota)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
@@ -709,6 +713,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
+
/* first set the basic ref node struct up */
atomic_set(&ref->refs, 1);
ref->bytenr = bytenr;
@@ -717,9 +724,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
-
- if (need_ref_seq(for_cow, ref_root))
- seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+ ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -762,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow)
+ int no_quota)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ no_quota = 0;
+
BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
@@ -793,10 +801,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action,
- for_cow);
+ no_quota);
spin_unlock(&delayed_refs->lock);
- if (need_ref_seq(for_cow, ref_root))
- btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
return 0;
}
@@ -810,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow)
+ int no_quota)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ no_quota = 0;
+
BUG_ON(extent_op && !extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
@@ -841,10 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
- action, for_cow);
+ action, no_quota);
spin_unlock(&delayed_refs->lock);
- if (need_ref_seq(for_cow, ref_root))
- btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
return 0;
}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 4ba9b93022ff..a764e2340d48 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
+ unsigned int no_quota:1;
/* is this node still in the rbtree? */
unsigned int is_head:1;
unsigned int in_tree:1;
@@ -196,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow);
+ int no_quota);
int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow);
+ int no_quota);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
@@ -231,25 +232,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
u64 seq);
/*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
- if (for_cow)
- return 0;
-
- if (rootid == BTRFS_FS_TREE_OBJECTID)
- return 1;
-
- if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
- return 1;
-
- return 0;
-}
-
-/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
*/
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9f2290509aca..2af6e66fe788 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -313,7 +313,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
if (btrfs_fs_incompat(fs_info, RAID56)) {
btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
- return -EINVAL;
+ return -EOPNOTSUPP;
}
switch (args->start.cont_reading_from_srcdev_mode) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 983314932af3..8bb4aa19898f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -49,6 +49,7 @@
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
+#include "qgroup.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -1109,6 +1110,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return alloc_test_extent_buffer(root->fs_info, bytenr,
+ blocksize);
+#endif
return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
}
@@ -1201,10 +1207,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->nodesize = nodesize;
root->leafsize = leafsize;
root->stripesize = stripesize;
- root->ref_cows = 0;
- root->track_dirty = 0;
- root->in_radix = 0;
- root->orphan_item_inserted = 0;
+ root->state = 0;
root->orphan_cleanup_state = 0;
root->objectid = objectid;
@@ -1265,7 +1268,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
else
root->defrag_trans_start = 0;
init_completion(&root->kobj_unregister);
- root->defrag_running = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
@@ -1290,7 +1292,8 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
if (!root)
return ERR_PTR(-ENOMEM);
__setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
- root->dummy_root = 1;
+ set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
+ root->alloc_bytenr = 0;
return root;
}
@@ -1341,8 +1344,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
root->commit_root = btrfs_root_node(root);
- root->track_dirty = 1;
-
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
root->root_item.flags = 0;
root->root_item.byte_limit = 0;
@@ -1371,6 +1373,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
fail:
if (leaf) {
btrfs_tree_unlock(leaf);
+ free_extent_buffer(root->commit_root);
free_extent_buffer(leaf);
}
kfree(root);
@@ -1396,13 +1399,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+
/*
+ * DON'T set REF_COWS for log trees
+ *
* log trees do not get reference counted because they go away
* before a real commit is actually done. They do store pointers
* to file data extents, and those reference counts still get
* updated (along with back refs to the log tree).
*/
- root->ref_cows = 0;
leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
BTRFS_TREE_LOG_OBJECTID, NULL,
@@ -1536,7 +1541,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
return root;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- root->ref_cows = 1;
+ set_bit(BTRFS_ROOT_REF_COWS, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1606,7 +1611,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
(unsigned long)root->root_key.objectid,
root);
if (ret == 0)
- root->in_radix = 1;
+ set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
@@ -1662,7 +1667,7 @@ again:
if (ret < 0)
goto fail;
if (ret == 0)
- root->orphan_item_inserted = 1;
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
@@ -2064,6 +2069,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->readahead_workers);
btrfs_destroy_workqueue(fs_info->flush_workers);
btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
+ btrfs_destroy_workqueue(fs_info->extent_workers);
}
static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2090,7 +2096,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_root_extent_buffers(info->chunk_root);
}
-static void del_fs_roots(struct btrfs_fs_info *fs_info)
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
int ret;
struct btrfs_root *gang[8];
@@ -2101,7 +2107,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
- if (gang[0]->in_radix) {
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
btrfs_drop_and_free_fs_root(fs_info, gang[0]);
} else {
free_extent_buffer(gang[0]->node);
@@ -2221,6 +2227,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->free_chunk_lock);
spin_lock_init(&fs_info->tree_mod_seq_lock);
spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
@@ -2246,6 +2253,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
+ atomic_set(&fs_info->qgroup_op_seq, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
@@ -2291,6 +2299,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
init_waitqueue_head(&fs_info->balance_wait_q);
+ btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
@@ -2354,6 +2363,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->qgroup_lock);
mutex_init(&fs_info->qgroup_ioctl_lock);
fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_op_tree = RB_ROOT;
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
fs_info->qgroup_seq = 1;
fs_info->quota_enabled = 0;
@@ -2577,6 +2587,10 @@ int open_ctree(struct super_block *sb,
btrfs_alloc_workqueue("readahead", flags, max_active, 2);
fs_info->qgroup_rescan_workers =
btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+ fs_info->extent_workers =
+ btrfs_alloc_workqueue("extent-refs", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 8);
if (!(fs_info->workers && fs_info->delalloc_workers &&
fs_info->submit_workers && fs_info->flush_workers &&
@@ -2586,6 +2600,7 @@ int open_ctree(struct super_block *sb,
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->readahead_workers &&
fs_info->fixup_workers && fs_info->delayed_workers &&
+ fs_info->fixup_workers && fs_info->extent_workers &&
fs_info->qgroup_rescan_workers)) {
err = -ENOMEM;
goto fail_sb_buffer;
@@ -2693,7 +2708,7 @@ retry_root_backup:
ret = PTR_ERR(extent_root);
goto recovery_tree_root;
}
- extent_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
fs_info->extent_root = extent_root;
location.objectid = BTRFS_DEV_TREE_OBJECTID;
@@ -2702,7 +2717,7 @@ retry_root_backup:
ret = PTR_ERR(dev_root);
goto recovery_tree_root;
}
- dev_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
fs_info->dev_root = dev_root;
btrfs_init_devices_late(fs_info);
@@ -2712,13 +2727,13 @@ retry_root_backup:
ret = PTR_ERR(csum_root);
goto recovery_tree_root;
}
- csum_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
fs_info->csum_root = csum_root;
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
quota_root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(quota_root)) {
- quota_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
fs_info->quota_enabled = 1;
fs_info->pending_quota_state = 1;
fs_info->quota_root = quota_root;
@@ -2733,7 +2748,7 @@ retry_root_backup:
create_uuid_tree = true;
check_uuid_tree = false;
} else {
- uuid_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
fs_info->uuid_root = uuid_root;
create_uuid_tree = false;
check_uuid_tree =
@@ -2966,7 +2981,7 @@ fail_qgroup:
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
btrfs_cleanup_transaction(fs_info->tree_root);
- del_fs_roots(fs_info);
+ btrfs_free_fs_roots(fs_info);
fail_cleaner:
kthread_stop(fs_info->cleaner_kthread);
@@ -3501,8 +3516,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
btrfs_free_log(NULL, root);
- __btrfs_remove_free_space_cache(root->free_ino_pinned);
- __btrfs_remove_free_space_cache(root->free_ino_ctl);
+ if (root->free_ino_pinned)
+ __btrfs_remove_free_space_cache(root->free_ino_pinned);
+ if (root->free_ino_ctl)
+ __btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
}
@@ -3533,28 +3550,51 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
u64 root_objectid = 0;
struct btrfs_root *gang[8];
- int i;
- int ret;
+ int i = 0;
+ int err = 0;
+ unsigned int ret = 0;
+ int index;
while (1) {
+ index = srcu_read_lock(&fs_info->subvol_srcu);
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
- if (!ret)
+ if (!ret) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
break;
-
+ }
root_objectid = gang[ret - 1]->root_key.objectid + 1;
+
for (i = 0; i < ret; i++) {
- int err;
+ /* Avoid grabbing roots in dead_roots */
+ if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+ gang[i] = NULL;
+ continue;
+ }
+ /* grab all the search results for later use */
+ gang[i] = btrfs_grab_fs_root(gang[i]);
+ }
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
root_objectid = gang[i]->root_key.objectid;
err = btrfs_orphan_cleanup(gang[i]);
if (err)
- return err;
+ break;
+ btrfs_put_fs_root(gang[i]);
}
root_objectid++;
}
- return 0;
+
+ /* release the uncleaned roots due to error */
+ for (; i < ret; i++) {
+ if (gang[i])
+ btrfs_put_fs_root(gang[i]);
+ }
+ return err;
}
int btrfs_commit_super(struct btrfs_root *root)
@@ -3603,6 +3643,8 @@ int close_ctree(struct btrfs_root *root)
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
+ cancel_work_sync(&fs_info->async_reclaim_work);
+
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
@@ -3627,12 +3669,17 @@ int close_ctree(struct btrfs_root *root)
btrfs_sysfs_remove_one(fs_info);
- del_fs_roots(fs_info);
+ btrfs_free_fs_roots(fs_info);
btrfs_put_block_group_cache(fs_info);
btrfs_free_block_groups(fs_info);
+ /*
+ * we must make sure there are no read requests left to
+ * submit after we stop all the workers.
+ */
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
free_root_pointers(fs_info, 1);
@@ -3709,6 +3756,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
buf->len,
root->fs_info->dirty_metadata_batch);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+ btrfs_print_leaf(root, buf);
+ ASSERT(0);
+ }
+#endif
}
static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 53059df350f8..23ce3ceba0a9 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,6 +68,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
int btrfs_init_fs_root(struct btrfs_root *root);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *key,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5590af92094b..99c253918208 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -26,16 +26,16 @@
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
-#include "ctree.h"
+#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
-#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"
#include "sysfs.h"
+#include "qgroup.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extra_op);
+ struct btrfs_delayed_extent_op *extra_op,
+ int no_quota);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei);
@@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins);
+ int level, struct btrfs_key *ins,
+ int no_quota);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 flags,
int force);
@@ -103,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level,
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve);
+ u64 num_bytes, int reserve,
+ int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
@@ -1271,7 +1274,7 @@ fail:
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- int refs_to_drop)
+ int refs_to_drop, int *last_ref)
{
struct btrfs_key key;
struct btrfs_extent_data_ref *ref1 = NULL;
@@ -1307,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
if (num_refs == 0) {
ret = btrfs_del_item(trans, root, path);
+ *last_ref = 1;
} else {
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
@@ -1764,7 +1768,8 @@ void update_inline_extent_backref(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int *last_ref)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1808,6 +1813,7 @@ void update_inline_extent_backref(struct btrfs_root *root,
else
btrfs_set_shared_data_ref_count(leaf, sref, refs);
} else {
+ *last_ref = 1;
size = btrfs_extent_inline_ref_size(type);
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = (unsigned long)iref;
@@ -1839,7 +1845,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
if (ret == 0) {
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
update_inline_extent_backref(root, path, iref,
- refs_to_add, extent_op);
+ refs_to_add, extent_op, NULL);
} else if (ret == -ENOENT) {
setup_inline_extent_backref(root, path, iref, parent,
root_objectid, owner, offset,
@@ -1872,17 +1878,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
- int refs_to_drop, int is_data)
+ int refs_to_drop, int is_data, int *last_ref)
{
int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
if (iref) {
update_inline_extent_backref(root, path, iref,
- -refs_to_drop, NULL);
+ -refs_to_drop, NULL, last_ref);
} else if (is_data) {
- ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
+ last_ref);
} else {
+ *last_ref = 1;
ret = btrfs_del_item(trans, root, path);
}
return ret;
@@ -1946,7 +1954,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset, int for_cow)
+ u64 root_objectid, u64 owner, u64 offset,
+ int no_quota)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1958,12 +1967,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL, for_cow);
+ BTRFS_ADD_DELAYED_REF, NULL, no_quota);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL, for_cow);
+ BTRFS_ADD_DELAYED_REF, NULL, no_quota);
}
return ret;
}
@@ -1973,31 +1982,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
+ int no_quota,
struct btrfs_delayed_extent_op *extent_op)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
+ struct btrfs_key key;
u64 refs;
int ret;
+ enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
+ no_quota = 1;
+
path->reada = 1;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
- ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
- path, bytenr, num_bytes, parent,
+ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
+ bytenr, num_bytes, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
- if (ret != -EAGAIN)
+ if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
+ goto out;
+ /*
+ * Ok we were able to insert an inline extent and it appears to be a new
+ * reference, deal with the qgroup accounting.
+ */
+ if (!ret && !no_quota) {
+ ASSERT(root->fs_info->quota_enabled);
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
+ type = BTRFS_QGROUP_OPER_ADD_SHARED;
+ btrfs_release_path(path);
+
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ bytenr, num_bytes, type, 0);
goto out;
+ }
+ /*
+ * Ok we had -EAGAIN which means we didn't have space to insert an
+ * inline extent ref, so just update the reference count and add a
+ * normal backref.
+ */
leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, item);
+ if (refs)
+ type = BTRFS_QGROUP_OPER_ADD_SHARED;
btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
@@ -2005,9 +2047,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
+ if (!no_quota) {
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ bytenr, num_bytes, type, 0);
+ if (ret)
+ goto out;
+ }
+
path->reada = 1;
path->leave_spinning = 1;
-
/* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
path, bytenr, parent, root_objectid,
@@ -2041,8 +2089,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
parent = ref->parent;
- else
- ref_root = ref->root;
+ ref_root = ref->root;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
if (extent_op)
@@ -2056,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
node->num_bytes, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op);
+ node->no_quota, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
ret = __btrfs_free_extent(trans, root, node->bytenr,
node->num_bytes, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op);
+ extent_op, node->no_quota);
} else {
BUG();
}
@@ -2199,8 +2246,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
parent = ref->parent;
- else
- ref_root = ref->root;
+ ref_root = ref->root;
ins.objectid = node->bytenr;
if (skinny_metadata) {
@@ -2218,15 +2264,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent, ref_root,
extent_op->flags_to_set,
&extent_op->key,
- ref->level, &ins);
+ ref->level, &ins,
+ node->no_quota);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, 1, node->no_quota,
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
ret = __btrfs_free_extent(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, 1, extent_op,
+ node->no_quota);
} else {
BUG();
}
@@ -2574,42 +2623,6 @@ static u64 find_middle(struct rb_root *root)
}
#endif
-int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct qgroup_update *qgroup_update;
- int ret = 0;
-
- if (list_empty(&trans->qgroup_ref_list) !=
- !trans->delayed_ref_elem.seq) {
- /* list without seq or seq without list */
- btrfs_err(fs_info,
- "qgroup accounting update error, list is%s empty, seq is %#x.%x",
- list_empty(&trans->qgroup_ref_list) ? "" : " not",
- (u32)(trans->delayed_ref_elem.seq >> 32),
- (u32)trans->delayed_ref_elem.seq);
- BUG();
- }
-
- if (!trans->delayed_ref_elem.seq)
- return 0;
-
- while (!list_empty(&trans->qgroup_ref_list)) {
- qgroup_update = list_first_entry(&trans->qgroup_ref_list,
- struct qgroup_update, list);
- list_del(&qgroup_update->list);
- if (!ret)
- ret = btrfs_qgroup_account_ref(
- trans, fs_info, qgroup_update->node,
- qgroup_update->extent_op);
- kfree(qgroup_update);
- }
-
- btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
-
- return ret;
-}
-
static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
{
u64 num_bytes;
@@ -2662,15 +2675,94 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
u64 num_entries =
atomic_read(&trans->transaction->delayed_refs.num_entries);
u64 avg_runtime;
+ u64 val;
smp_mb();
avg_runtime = fs_info->avg_delayed_ref_runtime;
+ val = num_entries * avg_runtime;
if (num_entries * avg_runtime >= NSEC_PER_SEC)
return 1;
+ if (val >= NSEC_PER_SEC / 2)
+ return 2;
return btrfs_check_space_for_delayed_refs(trans, root);
}
+struct async_delayed_refs { /* one request to run delayed refs from the extent_workers queue */
+ struct btrfs_root *root; /* root handed to the transaction calls; filled with fs_info->tree_root */
+ int count; /* passed as the count argument of btrfs_run_delayed_refs() */
+ int error; /* starts at 0; holds the first error hit by the worker */
+ int sync; /* non-zero: submitter waits on ->wait and frees the request itself */
+ struct completion wait; /* completed by the worker when ->sync is set */
+ struct btrfs_work work; /* work item queued on fs_info->extent_workers */
+};
+
+static void delayed_ref_async_start(struct btrfs_work *work) /* work callback set up by btrfs_async_run_delayed_refs() */
+{
+ struct async_delayed_refs *async;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ async = container_of(work, struct async_delayed_refs, work); /* recover the request that embeds this work item */
+
+ trans = btrfs_join_transaction(async->root);
+ if (IS_ERR(trans)) {
+ async->error = PTR_ERR(trans);
+ goto done;
+ }
+
+ /*
+ * trans->sync means that when we call end_transaction, we won't
+ * wait on delayed refs
+ */
+ trans->sync = true;
+ ret = btrfs_run_delayed_refs(trans, async->root, async->count);
+ if (ret)
+ async->error = ret;
+
+ ret = btrfs_end_transaction(trans, async->root); /* the handle is ended even if the run above failed */
+ if (ret && !async->error) /* keep the earlier error if there was one */
+ async->error = ret;
+done:
+ if (async->sync)
+ complete(&async->wait); /* the waiting submitter reads ->error and frees the request */
+ else
+ kfree(async); /* nobody is waiting, so the worker frees the request */
+}
+
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ unsigned long count, int wait) /* returns 0, -ENOMEM, or (only when @wait is set) the worker's error */
+{
+ struct async_delayed_refs *async;
+ int ret;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return -ENOMEM;
+
+ async->root = root->fs_info->tree_root; /* the worker always uses the tree root, not @root */
+ async->count = count; /* NOTE(review): unsigned long is narrowed to the int field here */
+ async->error = 0;
+ if (wait)
+ async->sync = 1;
+ else
+ async->sync = 0;
+ init_completion(&async->wait);
+
+ btrfs_init_work(&async->work, delayed_ref_async_start,
+ NULL, NULL);
+
+ btrfs_queue_work(root->fs_info->extent_workers, &async->work);
+
+ if (wait) { /* synchronous mode: block until the worker completes, then free the request here */
+ wait_for_completion(&async->wait);
+ ret = async->error;
+ kfree(async);
+ return ret;
+ }
+ return 0; /* asynchronous mode: the worker frees the request and its error is not reported */
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2698,8 +2790,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
- btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
-
delayed_refs = &trans->transaction->delayed_refs;
if (count == 0) {
count = atomic_read(&delayed_refs->num_entries) * 2;
@@ -2758,6 +2848,9 @@ again:
goto again;
}
out:
+ ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
+ if (ret)
+ return ret;
assert_qgroups_uptodate(trans);
return 0;
}
@@ -2964,7 +3057,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc, int for_cow)
+ int full_backref, int inc, int no_quota)
{
u64 bytenr;
u64 num_bytes;
@@ -2979,11 +3072,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
u64, u64, u64, u64, u64, u64, int);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
- if (!root->ref_cows && level == 0)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
return 0;
if (inc)
@@ -3014,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset, for_cow);
+ key.offset, no_quota);
if (ret)
goto fail;
} else {
@@ -3022,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_level_size(root, level - 1);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
- for_cow);
+ no_quota);
if (ret)
goto fail;
}
@@ -3033,15 +3130,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow)
+ struct extent_buffer *buf, int full_backref, int no_quota)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow)
+ struct extent_buffer *buf, int full_backref, int no_quota)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3164,7 +3261,8 @@ again:
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(root, SPACE_CACHE)) {
+ !btrfs_test_opt(root, SPACE_CACHE) ||
+ block_group->delalloc_bytes) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
@@ -3401,10 +3499,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
return ret;
}
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&found->block_groups[i]);
- kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype);
- }
init_rwsem(&found->groups_sem);
spin_lock_init(&found->lock);
found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
@@ -4204,6 +4300,104 @@ static int flush_space(struct btrfs_root *root,
return ret;
}
+
+static inline u64
+btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
+ struct btrfs_space_info *space_info) /* returns how many bytes to try to reclaim, 0 if none */
+{
+ u64 used;
+ u64 expected;
+ u64 to_reclaim;
+
+ to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
+ 16 * 1024 * 1024); /* 1MiB per online CPU, at most 16MiB */
+ spin_lock(&space_info->lock);
+ if (can_overcommit(root, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL)) {
+ to_reclaim = 0; /* can_overcommit() accepts this amount: report nothing to reclaim */
+ goto out;
+ }
+
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (can_overcommit(root, space_info, 1024 * 1024,
+ BTRFS_RESERVE_FLUSH_ALL))
+ expected = div_factor_fine(space_info->total_bytes, 95); /* presumably 95% of total_bytes - confirm div_factor_fine() */
+ else
+ expected = div_factor_fine(space_info->total_bytes, 90); /* lower target when 1MiB cannot be overcommitted */
+
+ if (used > expected)
+ to_reclaim = used - expected; /* only the excess over the target */
+ else
+ to_reclaim = 0;
+ to_reclaim = min(to_reclaim, space_info->bytes_may_use +
+ space_info->bytes_reserved); /* capped at bytes_may_use + bytes_reserved */
+out:
+ spin_unlock(&space_info->lock);
+
+ return to_reclaim;
+}
+
+static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
+ struct btrfs_fs_info *fs_info, u64 used) /* caller supplies @used; no lock is taken here */
+{
+ return (used >= div_factor_fine(space_info->total_bytes, 98) && /* presumably 98% of total_bytes - confirm div_factor_fine() */
+ !btrfs_fs_closing(fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+}
+
+static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
+ struct btrfs_fs_info *fs_info) /* returns 1 if need_do_async_reclaim() holds for the current counters, else 0 */
+{
+ u64 used;
+
+ spin_lock(&space_info->lock); /* the counters are summed under the space_info lock */
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (need_do_async_reclaim(space_info, fs_info, used)) {
+ spin_unlock(&space_info->lock);
+ return 1;
+ }
+ spin_unlock(&space_info->lock);
+
+ return 0;
+}
+
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work) /* work function bound by btrfs_init_async_reclaim_work() */
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 to_reclaim;
+ int flush_state;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); /* @work is embedded in fs_info */
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ if (!to_reclaim)
+ return;
+
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ do { /* step through the flush states up to COMMIT_TRANS, stopping once reclaim is no longer needed */
+ flush_space(fs_info->fs_root, space_info, to_reclaim,
+ to_reclaim, flush_state); /* return value is ignored */
+ flush_state++;
+ if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+ return;
+ } while (flush_state <= COMMIT_TRANS);
+
+ if (btrfs_need_do_async_reclaim(space_info, fs_info))
+ queue_work(system_unbound_wq, work); /* still needed after all states: queue this work again */
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work) /* binds @work to btrfs_async_reclaim_metadata_space() */
+{
+ INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
/**
* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
* @root - the root we're allocating for
@@ -4311,8 +4505,13 @@ again:
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
flushing = true;
space_info->flush = 1;
+ } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ used += orig_bytes;
+ if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+ !work_busy(&root->fs_info->async_reclaim_work))
+ queue_work(system_unbound_wq,
+ &root->fs_info->async_reclaim_work);
}
-
spin_unlock(&space_info->lock);
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
@@ -4369,7 +4568,7 @@ static struct btrfs_block_rsv *get_block_rsv(
{
struct btrfs_block_rsv *block_rsv = NULL;
- if (root->ref_cows)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
block_rsv = trans->block_rsv;
if (root == root->fs_info->csum_root && trans->adding_csums)
@@ -5416,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* @cache: The cache we are manipulating
* @num_bytes: The number of bytes in question
* @reserve: One of the reservation enums
+ * @delalloc: The blocks are allocated for the delalloc write
*
* This is called by the allocator when it reserves space, or by somebody who is
* freeing space that was never actually used on disk. For example if you
@@ -5434,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* succeeds.
*/
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve)
+ u64 num_bytes, int reserve, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
@@ -5453,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
num_bytes, 0);
space_info->bytes_may_use -= num_bytes;
}
+
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
}
} else {
if (cache->ro)
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -5621,7 +5827,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -5637,9 +5844,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int num_to_del = 1;
u32 item_size;
u64 refs;
+ int last_ref = 0;
+ enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
+ if (!info->quota_enabled || !is_fstree(root_objectid))
+ no_quota = 1;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -5687,7 +5899,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(iref);
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop,
- is_data);
+ is_data, &last_ref);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5806,7 +6018,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
btrfs_err(info, "trying to drop %d refs but we only have %Lu "
- "for bytenr %Lu\n", refs_to_drop, refs, bytenr);
+ "for bytenr %Lu", refs_to_drop, refs, bytenr);
ret = -EINVAL;
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5814,6 +6026,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs -= refs_to_drop;
if (refs > 0) {
+ type = BTRFS_QGROUP_OPER_SUB_SHARED;
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
/*
@@ -5829,7 +6042,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop,
- is_data);
+ is_data, &last_ref);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5850,6 +6063,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
+ last_ref = 1;
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
@@ -5872,6 +6086,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
}
+ btrfs_release_path(path);
+
+ /* Deal with the quota accounting */
+ if (!ret && last_ref && !no_quota) {
+ int mod_seq = 0;
+
+ if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+ type == BTRFS_QGROUP_OPER_SUB_SHARED)
+ mod_seq = 1;
+
+ ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
+ bytenr, num_bytes, type,
+ mod_seq);
+ }
out:
btrfs_free_path(path);
return ret;
@@ -5987,7 +6215,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+ btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
}
@@ -6008,11 +6236,15 @@ out:
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int for_cow)
+ u64 owner, u64 offset, int no_quota)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
/*
@@ -6028,13 +6260,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL, for_cow);
+ BTRFS_DROP_DELAYED_REF, NULL, no_quota);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner,
offset, BTRFS_DROP_DELAYED_REF,
- NULL, for_cow);
+ NULL, no_quota);
}
return ret;
}
@@ -6142,6 +6374,70 @@ enum btrfs_loop_type {
LOOP_NO_EMPTY_SIZE = 3,
};
+/*
+ * Take the block group's data_rwsem for reading, but only when the
+ * allocation is a delalloc one; otherwise this is a no-op.
+ */
+static inline void
+btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+		       int delalloc)
+{
+	if (!delalloc)
+		return;
+
+	down_read(&cache->data_rwsem);
+}
+
+/*
+ * Call btrfs_get_block_group() on the block group and, for delalloc
+ * allocations, additionally take its data_rwsem for reading.
+ */
+static inline void
+btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+		       int delalloc)
+{
+	btrfs_get_block_group(cache);
+	if (!delalloc)
+		return;
+
+	down_read(&cache->data_rwsem);
+}
+
+/*
+ * Look up the block group that @cluster currently points at and lock it
+ * for an allocation.
+ *
+ * Every return path leaves cluster->refill_lock held, including the NULL
+ * return taken when the cluster has no block group.
+ *
+ * When the returned group is @block_group itself, nothing extra is taken.
+ * Otherwise btrfs_get_block_group() has been called on the returned group
+ * and, if @delalloc is set, its data_rwsem is held for reading.
+ */
+static struct btrfs_block_group_cache *
+btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+		   struct btrfs_free_cluster *cluster,
+		   int delalloc)
+{
+	struct btrfs_block_group_cache *bg;
+
+	spin_lock(&cluster->refill_lock);
+	for (;;) {
+		bg = cluster->block_group;
+		if (!bg)
+			return NULL;
+
+		if (bg == block_group)
+			return bg;
+
+		btrfs_get_block_group(bg);
+
+		/* the rwsem is only needed for delalloc; try the fast path */
+		if (!delalloc || down_read_trylock(&bg->data_rwsem))
+			return bg;
+
+		/*
+		 * Contended: wait for the rwsem with the spinlock dropped,
+		 * then retake the spinlock and make sure the cluster still
+		 * points at the group we just locked.
+		 */
+		spin_unlock(&cluster->refill_lock);
+		down_read(&bg->data_rwsem);
+		spin_lock(&cluster->refill_lock);
+
+		if (bg == cluster->block_group)
+			return bg;
+
+		/* the cluster moved on while we waited; undo and retry */
+		up_read(&bg->data_rwsem);
+		btrfs_put_block_group(bg);
+	}
+}
+
+/*
+ * Release a block group: for delalloc allocations drop the read side of
+ * its data_rwsem first, then call btrfs_put_block_group() on it.
+ */
+static inline void
+btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+			  int delalloc)
+{
+	if (delalloc)
+		up_read(&cache->data_rwsem);
+
+	btrfs_put_block_group(cache);
+}
+
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
@@ -6156,7 +6452,7 @@ enum btrfs_loop_type {
static noinline int find_free_extent(struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
- u64 flags)
+ u64 flags, int delalloc)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -6244,6 +6540,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
up_read(&space_info->groups_sem);
} else {
index = get_block_group_index(block_group);
+ btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -6258,7 +6555,7 @@ search:
u64 offset;
int cached;
- btrfs_get_block_group(block_group);
+ btrfs_grab_block_group(block_group, delalloc);
search_start = block_group->key.objectid;
/*
@@ -6306,16 +6603,16 @@ have_block_group:
* the refill lock keeps out other
* people trying to start a new cluster
*/
- spin_lock(&last_ptr->refill_lock);
- used_block_group = last_ptr->block_group;
- if (used_block_group != block_group &&
- (!used_block_group ||
- used_block_group->ro ||
- !block_group_bits(used_block_group, flags)))
+ used_block_group = btrfs_lock_cluster(block_group,
+ last_ptr,
+ delalloc);
+ if (!used_block_group)
goto refill_cluster;
- if (used_block_group != block_group)
- btrfs_get_block_group(used_block_group);
+ if (used_block_group != block_group &&
+ (used_block_group->ro ||
+ !block_group_bits(used_block_group, flags)))
+ goto release_cluster;
offset = btrfs_alloc_from_cluster(used_block_group,
last_ptr,
@@ -6329,16 +6626,15 @@ have_block_group:
used_block_group,
search_start, num_bytes);
if (used_block_group != block_group) {
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group,
+ delalloc);
block_group = used_block_group;
}
goto checks;
}
WARN_ON(last_ptr->block_group != used_block_group);
- if (used_block_group != block_group)
- btrfs_put_block_group(used_block_group);
-refill_cluster:
+release_cluster:
/* If we are on LOOP_NO_EMPTY_SIZE, we can't
* set up a new clusters, so lets just skip it
* and let the allocator find whatever block
@@ -6355,8 +6651,10 @@ refill_cluster:
* succeeding in the unclustered
* allocation. */
if (loop >= LOOP_NO_EMPTY_SIZE &&
- last_ptr->block_group != block_group) {
+ used_block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
+ btrfs_release_block_group(used_block_group,
+ delalloc);
goto unclustered_alloc;
}
@@ -6366,6 +6664,10 @@ refill_cluster:
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ if (used_block_group != block_group)
+ btrfs_release_block_group(used_block_group,
+ delalloc);
+refill_cluster:
if (loop >= LOOP_NO_EMPTY_SIZE) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
@@ -6473,7 +6775,7 @@ checks:
BUG_ON(offset > search_start);
ret = btrfs_update_reserved_bytes(block_group, num_bytes,
- alloc_type);
+ alloc_type, delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
@@ -6485,13 +6787,13 @@ checks:
trace_btrfs_reserve_extent(orig_root, block_group,
search_start, num_bytes);
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
}
up_read(&space_info->groups_sem);
@@ -6514,8 +6816,14 @@ loop:
loop++;
if (loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
+ int exist = 0;
+
+ trans = current->journal_info;
+ if (trans)
+ exist = 1;
+ else
+ trans = btrfs_join_transaction(root);
- trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
@@ -6532,7 +6840,8 @@ loop:
root, ret);
else
ret = 0;
- btrfs_end_transaction(trans, root);
+ if (!exist)
+ btrfs_end_transaction(trans, root);
if (ret)
goto out;
}
@@ -6597,7 +6906,7 @@ again:
int btrfs_reserve_extent(struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data)
+ struct btrfs_key *ins, int is_data, int delalloc)
{
bool final_tried = false;
u64 flags;
@@ -6607,7 +6916,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
- flags);
+ flags, delalloc);
if (ret == -ENOSPC) {
if (!final_tried && ins->offset) {
@@ -6632,7 +6941,8 @@ again:
}
static int __btrfs_free_reserved_extent(struct btrfs_root *root,
- u64 start, u64 len, int pin)
+ u64 start, u64 len,
+ int pin, int delalloc)
{
struct btrfs_block_group_cache *cache;
int ret = 0;
@@ -6651,7 +6961,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
pin_down_extent(root, cache, start, len, 1);
else {
btrfs_add_free_space(cache, start, len);
- btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+ btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
}
btrfs_put_block_group(cache);
@@ -6661,15 +6971,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
}
int btrfs_free_reserved_extent(struct btrfs_root *root,
- u64 start, u64 len)
+ u64 start, u64 len, int delalloc)
{
- return __btrfs_free_reserved_extent(root, start, len, 0);
+ return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
}
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len)
{
- return __btrfs_free_reserved_extent(root, start, len, 1);
+ return __btrfs_free_reserved_extent(root, start, len, 1, 0);
}
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -6733,6 +7043,13 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
+ /* Always set parent to 0 here since it's exclusive anyway. */
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ ins->objectid, ins->offset,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret)
+ return ret;
+
ret = update_block_group(root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -6747,7 +7064,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins)
+ int level, struct btrfs_key *ins,
+ int no_quota)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6757,6 +7075,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
u32 size = sizeof(*extent_item) + sizeof(*iref);
+ u64 num_bytes = ins->offset;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
@@ -6790,6 +7109,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ num_bytes = root->leafsize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, key);
@@ -6811,6 +7131,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
+ if (!no_quota) {
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ ins->objectid, num_bytes,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret)
+ return ret;
+ }
+
ret = update_block_group(root, ins->objectid, root->leafsize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -6866,7 +7194,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
return -EINVAL;
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
- RESERVE_ALLOC_NO_ACCOUNT);
+ RESERVE_ALLOC_NO_ACCOUNT, 0);
BUG_ON(ret); /* logic error */
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -6994,12 +7322,21 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
+ buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
+ blocksize, level);
+ if (!IS_ERR(buf))
+ root->alloc_bytenr += blocksize;
+ return buf;
+ }
+#endif
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(root, blocksize, blocksize,
- empty_size, hint, &ins, 0);
+ empty_size, hint, &ins, 0, 0);
if (ret) {
unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
@@ -7735,7 +8072,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
- if (root->in_radix) {
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
} else {
free_extent_buffer(root->node);
@@ -8327,8 +8664,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
list_del(&space_info->list);
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
struct kobject *kobj;
- kobj = &space_info->block_group_kobjs[i];
- if (kobj->parent) {
+ kobj = space_info->block_group_kobjs[i];
+ space_info->block_group_kobjs[i] = NULL;
+ if (kobj) {
kobject_del(kobj);
kobject_put(kobj);
}
@@ -8352,17 +8690,26 @@ static void __link_block_group(struct btrfs_space_info *space_info,
up_write(&space_info->groups_sem);
if (first) {
- struct kobject *kobj = &space_info->block_group_kobjs[index];
+ struct raid_kobject *rkobj;
int ret;
- kobject_get(&space_info->kobj); /* put in release */
- ret = kobject_add(kobj, &space_info->kobj, "%s",
- get_raid_name(index));
+ rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj)
+ goto out_err;
+ rkobj->raid_type = index;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+ "%s", get_raid_name(index));
if (ret) {
- pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
- kobject_put(&space_info->kobj);
+ kobject_put(&rkobj->kobj);
+ goto out_err;
}
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
}
+
+ return;
+out_err:
+ pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
}
static struct btrfs_block_group_cache *
@@ -8392,6 +8739,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
start);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
+ init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->new_bg_list);
@@ -8611,7 +8959,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
extent_root = root->fs_info->extent_root;
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
cache = btrfs_create_block_group_cache(root, chunk_offset, size);
if (!cache)
@@ -8697,6 +9045,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = root->fs_info->tree_root;
struct btrfs_key key;
struct inode *inode;
+ struct kobject *kobj = NULL;
int ret;
int index;
int factor;
@@ -8796,11 +9145,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
*/
list_del_init(&block_group->list);
if (list_empty(&block_group->space_info->block_groups[index])) {
- kobject_del(&block_group->space_info->block_group_kobjs[index]);
- kobject_put(&block_group->space_info->block_group_kobjs[index]);
+ kobj = block_group->space_info->block_group_kobjs[index];
+ block_group->space_info->block_group_kobjs[index] = NULL;
clear_avail_alloc_bits(root->fs_info, block_group->flags);
}
up_write(&block_group->space_info->groups_sem);
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_block_group_cache_done(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4cd0ac983f91..a389820d158b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1693,6 +1693,7 @@ again:
* shortening the size of the delalloc range we're searching
*/
free_extent_state(cached_state);
+ cached_state = NULL;
if (!loops) {
max_bytes = PAGE_CACHE_SIZE;
loops = 1;
@@ -2353,7 +2354,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
int uptodate = (err == 0);
struct extent_io_tree *tree;
- int ret;
+ int ret = 0;
tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -2367,6 +2368,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
if (!uptodate) {
ClearPageUptodate(page);
SetPageError(page);
+ ret = ret < 0 ? ret : -EIO;
+ mapping_set_error(page->mapping, ret);
}
return 0;
}
@@ -3098,143 +3101,130 @@ static noinline void update_nr_written(struct page *page,
}
/*
- * the writepage semantics are similar to regular writepage. extent
- * records are inserted to lock ranges in the tree, and as dirty areas
- * are found, they are marked writeback. Then the lock bits are removed
- * and the end_io handler clears the writeback ranges
+ * helper for __extent_writepage, doing all of the delayed allocation setup.
+ *
+ * This returns 1 if our fill_delalloc function did all the work required
+ * to write the page (copy into inline extent). In this case the IO has
+ * been started and the page is already unlocked.
+ *
+ * This returns 0 if all went well (page still locked)
+ * This returns < 0 if there were errors (page still locked)
*/
-static int __extent_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+static noinline_for_stack int writepage_delalloc(struct inode *inode,
+		struct page *page, struct writeback_control *wbc,
+		struct extent_page_data *epd,
+		u64 delalloc_start,
+		unsigned long *nr_written)
+{
+	struct extent_io_tree *tree = epd->tree;
+	const u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+	u64 delalloc_end = 0;
+	u64 pages_to_write = 0;
+	int page_started = 0;
+
+	/* extent already locked by the caller, or no fill_delalloc hook */
+	if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
+		return 0;
+
+	while (delalloc_end < page_end) {
+		u64 nr_found;
+		int err;
+
+		nr_found = find_lock_delalloc_range(inode, tree, page,
+						    &delalloc_start,
+						    &delalloc_end,
+						    128 * 1024 * 1024);
+		if (nr_found) {
+			err = tree->ops->fill_delalloc(inode, page,
+						       delalloc_start,
+						       delalloc_end,
+						       &page_started,
+						       nr_written);
+			if (err) {
+				SetPageError(page);
+				/*
+				 * fill_delalloc should return < 0 on error,
+				 * but just in case: a value > 0 from this
+				 * helper means the IO was started, so never
+				 * hand back a positive value unless things
+				 * are going well.
+				 */
+				return err < 0 ? err : -EIO;
+			}
+			/*
+			 * delalloc_end is already one less than the total
+			 * length, so we don't subtract one from
+			 * PAGE_CACHE_SIZE
+			 */
+			pages_to_write += (delalloc_end - delalloc_start +
+					   PAGE_CACHE_SIZE) >>
+					  PAGE_CACHE_SHIFT;
+		}
+		/* continue the search right after the range just examined */
+		delalloc_start = delalloc_end + 1;
+	}
+
+	if (wbc->nr_to_write < pages_to_write) {
+		int thresh = 8192;
+
+		if (pages_to_write < thresh * 2)
+			thresh = pages_to_write;
+		wbc->nr_to_write = min_t(u64, pages_to_write, thresh);
+	}
+
+	/* page still locked: the caller carries on with the write */
+	if (!page_started)
+		return 0;
+
+	/*
+	 * The fill_delalloc hook already unlocked the page and started
+	 * the IO.  With the page unlocked we can't update the mapping's
+	 * writeback index, so just update nr_to_write.
+	 */
+	wbc->nr_to_write -= *nr_written;
+	return 1;
+}
+
+/*
+ * helper for __extent_writepage. This calls the writepage start hooks,
+ * and does the loop to map the page into extents and bios.
+ *
+ * We return 1 if the IO is started and the page is unlocked,
+ * 0 if all went well (page still locked)
+ * < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+ struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ loff_t i_size,
+ unsigned long nr_written,
+ int write_flags, int *nr_ret)
{
- struct inode *inode = page->mapping->host;
- struct extent_page_data *epd = data;
struct extent_io_tree *tree = epd->tree;
u64 start = page_offset(page);
- u64 delalloc_start;
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
u64 cur = start;
u64 extent_offset;
- u64 last_byte = i_size_read(inode);
u64 block_start;
u64 iosize;
sector_t sector;
struct extent_state *cached_state = NULL;
struct extent_map *em;
struct block_device *bdev;
- int ret;
- int nr = 0;
size_t pg_offset = 0;
size_t blocksize;
- loff_t i_size = i_size_read(inode);
- unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
- u64 nr_delalloc;
- u64 delalloc_end;
- int page_started;
- int compressed;
- int write_flags;
- unsigned long nr_written = 0;
- bool fill_delalloc = true;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
- write_flags = WRITE_SYNC;
- else
- write_flags = WRITE;
-
- trace___extent_writepage(page, inode, wbc);
-
- WARN_ON(!PageLocked(page));
-
- ClearPageError(page);
-
- pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
- if (page->index > end_index ||
- (page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
- unlock_page(page);
- return 0;
- }
-
- if (page->index == end_index) {
- char *userpage;
-
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0,
- PAGE_CACHE_SIZE - pg_offset);
- kunmap_atomic(userpage);
- flush_dcache_page(page);
- }
- pg_offset = 0;
-
- set_page_extent_mapped(page);
-
- if (!tree->ops || !tree->ops->fill_delalloc)
- fill_delalloc = false;
-
- delalloc_start = start;
- delalloc_end = 0;
- page_started = 0;
- if (!epd->extent_locked && fill_delalloc) {
- u64 delalloc_to_write = 0;
- /*
- * make sure the wbc mapping index is at least updated
- * to this page.
- */
- update_nr_written(page, wbc, 0);
-
- while (delalloc_end < page_end) {
- nr_delalloc = find_lock_delalloc_range(inode, tree,
- page,
- &delalloc_start,
- &delalloc_end,
- 128 * 1024 * 1024);
- if (nr_delalloc == 0) {
- delalloc_start = delalloc_end + 1;
- continue;
- }
- ret = tree->ops->fill_delalloc(inode, page,
- delalloc_start,
- delalloc_end,
- &page_started,
- &nr_written);
- /* File system has been set read-only */
- if (ret) {
- SetPageError(page);
- goto done;
- }
- /*
- * delalloc_end is already one less than the total
- * length, so we don't subtract one from
- * PAGE_CACHE_SIZE
- */
- delalloc_to_write += (delalloc_end - delalloc_start +
- PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
- delalloc_start = delalloc_end + 1;
- }
- if (wbc->nr_to_write < delalloc_to_write) {
- int thresh = 8192;
-
- if (delalloc_to_write < thresh * 2)
- thresh = delalloc_to_write;
- wbc->nr_to_write = min_t(u64, delalloc_to_write,
- thresh);
- }
+ int ret = 0;
+ int nr = 0;
+ bool compressed;
- /* did the fill delalloc function already unlock and start
- * the IO?
- */
- if (page_started) {
- ret = 0;
- /*
- * we've unlocked the page, so we can't update
- * the mapping's writeback index, just update
- * nr_to_write.
- */
- wbc->nr_to_write -= nr_written;
- goto done_unlocked;
- }
- }
if (tree->ops && tree->ops->writepage_start_hook) {
ret = tree->ops->writepage_start_hook(page, start,
page_end);
@@ -3244,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
wbc->pages_skipped++;
else
redirty_page_for_writepage(wbc, page);
+
update_nr_written(page, wbc, nr_written);
unlock_page(page);
- ret = 0;
+ ret = 1;
goto done_unlocked;
}
}
@@ -3258,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
update_nr_written(page, wbc, nr_written + 1);
end = page_end;
- if (last_byte <= start) {
+ if (i_size <= start) {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
page_end, NULL, 1);
@@ -3268,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
blocksize = inode->i_sb->s_blocksize;
while (cur <= end) {
- if (cur >= last_byte) {
+ u64 em_end;
+ if (cur >= i_size) {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
page_end, NULL, 1);
@@ -3278,13 +3270,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
end - cur + 1, 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
+ ret = PTR_ERR_OR_ZERO(em);
break;
}
extent_offset = cur - em->start;
- BUG_ON(extent_map_end(em) <= cur);
+ em_end = extent_map_end(em);
+ BUG_ON(em_end <= cur);
BUG_ON(end < cur);
- iosize = min(extent_map_end(em) - cur, end - cur + 1);
+ iosize = min(em_end - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
sector = (em->block_start + extent_offset) >> 9;
bdev = em->bdev;
@@ -3320,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset += iosize;
continue;
}
- /* leave this out until we have a page_mkwrite call */
- if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
- EXTENT_DIRTY, 0, NULL)) {
- cur = cur + iosize;
- pg_offset += iosize;
- continue;
- }
if (tree->ops && tree->ops->writepage_io_hook) {
ret = tree->ops->writepage_io_hook(page, cur,
@@ -3337,7 +3324,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (ret) {
SetPageError(page);
} else {
- unsigned long max_nr = end_index + 1;
+ unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
@@ -3359,17 +3346,94 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
nr++;
}
done:
+ *nr_ret = nr;
+
+done_unlocked:
+
+ /* drop our reference on any cached states */
+ free_extent_state(cached_state);
+ return ret;
+}
+
+/*
+ * the writepage semantics are similar to regular writepage. extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback. Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct inode *inode = page->mapping->host;
+ struct extent_page_data *epd = data;
+ u64 start = page_offset(page);
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ int ret;
+ int nr = 0;
+ size_t pg_offset = 0;
+ loff_t i_size = i_size_read(inode);
+ unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+ int write_flags;
+ unsigned long nr_written = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ write_flags = WRITE_SYNC;
+ else
+ write_flags = WRITE;
+
+ trace___extent_writepage(page, inode, wbc);
+
+ WARN_ON(!PageLocked(page));
+
+ ClearPageError(page);
+
+ pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+ if (page->index > end_index ||
+ (page->index == end_index && !pg_offset)) {
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (page->index == end_index) {
+ char *userpage;
+
+ userpage = kmap_atomic(page);
+ memset(userpage + pg_offset, 0,
+ PAGE_CACHE_SIZE - pg_offset);
+ kunmap_atomic(userpage);
+ flush_dcache_page(page);
+ }
+
+ pg_offset = 0;
+
+ set_page_extent_mapped(page);
+
+ ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
+ if (ret == 1)
+ goto done_unlocked;
+ if (ret)
+ goto done;
+
+ ret = __extent_writepage_io(inode, page, wbc, epd,
+ i_size, nr_written, write_flags, &nr);
+ if (ret == 1)
+ goto done_unlocked;
+
+done:
if (nr == 0) {
/* make sure the mapping tag for page dirty gets cleared */
set_page_writeback(page);
end_page_writeback(page);
}
+ if (PageError(page)) {
+ ret = ret < 0 ? ret : -EIO;
+ end_extent_writepage(page, ret, start, page_end);
+ }
unlock_page(page);
+ return ret;
done_unlocked:
-
- /* drop our reference on any cached states */
- free_extent_state(cached_state);
return 0;
}
@@ -3385,9 +3449,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
TASK_UNINTERRUPTIBLE);
}
-static int lock_extent_buffer_for_io(struct extent_buffer *eb,
- struct btrfs_fs_info *fs_info,
- struct extent_page_data *epd)
+static noinline_for_stack int
+lock_extent_buffer_for_io(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct extent_page_data *epd)
{
unsigned long i, num_pages;
int flush = 0;
@@ -3492,7 +3557,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
bio_put(bio);
}
-static int write_one_eb(struct extent_buffer *eb,
+static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct btrfs_fs_info *fs_info,
struct writeback_control *wbc,
struct extent_page_data *epd)
@@ -3690,6 +3755,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
+ int err = 0;
int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
@@ -3776,8 +3842,8 @@ retry:
unlock_page(page);
ret = 0;
}
- if (ret)
- done = 1;
+ if (!err && ret < 0)
+ err = ret;
/*
* the filesystem may choose to bump up nr_to_write.
@@ -3789,7 +3855,7 @@ retry:
pagevec_release(&pvec);
cond_resched();
}
- if (!scanned && !done) {
+ if (!scanned && !done && !err) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
@@ -3799,7 +3865,7 @@ retry:
goto retry;
}
btrfs_add_delayed_iput(inode);
- return ret;
+ return err;
}
static void flush_epd_write_bio(struct extent_page_data *epd)
@@ -4543,6 +4609,53 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
return NULL;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/*
+ * Sanity-test helper: return the extent buffer found for @start, creating
+ * a dummy buffer and inserting it into fs_info->buffer_radix when none is
+ * found.
+ *
+ * Returns NULL if the dummy buffer cannot be allocated or if
+ * radix_tree_preload() fails.  If the insert hits -EEXIST and a lookup then
+ * finds a buffer, our dummy buffer is released and the found one is returned.
+ */
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+					       u64 start, unsigned long len)
+{
+	struct extent_buffer *eb;
+	struct extent_buffer *exists = NULL;
+	int ret;
+
+	eb = find_extent_buffer(fs_info, start);
+	if (eb)
+		return eb;
+
+	eb = alloc_dummy_extent_buffer(start, len);
+	if (!eb)
+		return NULL;
+	eb->fs_info = fs_info;
+
+	for (;;) {
+		ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+		if (ret)
+			goto free_eb;
+
+		spin_lock(&fs_info->buffer_lock);
+		ret = radix_tree_insert(&fs_info->buffer_radix,
+					start >> PAGE_CACHE_SHIFT, eb);
+		spin_unlock(&fs_info->buffer_lock);
+		radix_tree_preload_end();
+
+		if (ret != -EEXIST)
+			break;
+
+		/*
+		 * The slot is taken: use the buffer that is there if the
+		 * lookup finds one, otherwise try the insert again.
+		 */
+		exists = find_extent_buffer(fs_info, start);
+		if (exists)
+			goto free_eb;
+	}
+
+	check_buffer_tree_ref(eb);
+	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+	/*
+	 * We will free dummy extent buffers if they come into
+	 * free_extent_buffer with a ref count of 2, but if we are using this
+	 * we want the buffers to stay in memory until we're done with them,
+	 * so bump the ref count again.
+	 */
+	atomic_inc(&eb->refs);
+	return eb;
+
+free_eb:
+	btrfs_release_extent_buffer(eb);
+	return exists;
+}
+#endif
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len)
{
@@ -4955,6 +5068,43 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
}
}
+/*
+ * Copy @len bytes, beginning @start bytes into extent buffer @eb, to the
+ * user-space buffer @dstv, one backing page at a time.
+ *
+ * Returns 0 on success, or -EFAULT as soon as copy_to_user() reports a
+ * fault; the copy stops at that point.
+ */
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
+			       unsigned long start,
+			       unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char __user *dst = (char __user *)dstv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	int ret = 0;
+
+	WARN_ON(start > eb->len);
+	/*
+	 * @start and @len are relative to the beginning of the buffer, so
+	 * the end of the request must be checked against eb->len.  Checking
+	 * against eb->start + eb->len mixes a relative offset with the
+	 * buffer's absolute start and lets out-of-range requests through
+	 * without a warning.
+	 */
+	WARN_ON(start + len > eb->len);
+
+	/* byte offset of the first requested byte within page i */
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+	while (len > 0) {
+		page = extent_buffer_page(eb, i);
+
+		/* never copy past the end of the current page */
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+		kaddr = page_address(page);
+		if (copy_to_user(dst, kaddr + offset, cur)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		dst += cur;
+		len -= cur;
+		/* every page after the first is read from its beginning */
+		offset = 0;
+		i++;
+	}
+
+	return ret;
+}
+
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
unsigned long min_len, char **map,
unsigned long *map_start,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c488b45237bf..ccc264e7bde1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -158,7 +158,6 @@ struct extent_buffer {
* to unlock
*/
wait_queue_head_t read_lock_wq;
- wait_queue_head_t lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
@@ -304,6 +303,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
+ unsigned long start,
+ unsigned long len);
void write_extent_buffer(struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
@@ -350,5 +352,7 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes);
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len);
#endif
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1874aee69c86..225302b39afb 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -75,6 +75,8 @@ void free_extent_map(struct extent_map *em)
if (atomic_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ kfree(em->bdev);
kmem_cache_free(extent_map_cache, em);
}
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index e7fd8a56a140..b2991fd8583e 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -15,6 +15,7 @@
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
+#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
struct extent_map {
struct rb_node rb_node;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 127555b29f58..f46cfe45d686 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -281,10 +281,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
found:
csum += count * csum_size;
nblocks -= count;
+ bio_index += count;
while (count--) {
disk_bytenr += bvec->bv_len;
offset += bvec->bv_len;
- bio_index++;
bvec++;
}
}
@@ -750,7 +750,7 @@ again:
int slot = path->slots[0] + 1;
/* we didn't find a csum item, insert one */
nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems - 1) {
+ if (!nritems || (path->slots[0] >= nritems - 1)) {
ret = btrfs_next_leaf(root, path);
if (ret == 1)
found_next = 1;
@@ -885,3 +885,79 @@ out:
fail_unlock:
goto out;
}
+
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline, /* when true, an inline extent is never flagged as compressed */
+ struct extent_map *em)
+{ /* Fill @em from the file extent item @fi found at path->slots[0] of leaf path->nodes[0]. */
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int slot = path->slots[0];
+ struct btrfs_key key;
+ u64 extent_start, extent_end;
+ u64 bytenr;
+ u8 type = btrfs_file_extent_type(leaf, fi);
+ int compress_type = btrfs_file_extent_compression(leaf, fi);
+
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_start = key.offset; /* the item key's offset is used as the start of the extent */
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ extent_end = extent_start +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ size_t size;
+ size = btrfs_file_extent_inline_len(leaf, slot, fi);
+ extent_end = ALIGN(extent_start + size, root->sectorsize);
+ } /* any other type leaves extent_end unset; it is not read for such types below */
+
+ em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ em->orig_start = extent_start -
+ btrfs_file_extent_offset(leaf, fi);
+ em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (bytenr == 0) { /* a zero disk bytenr is mapped as a hole; block_len and flags are not set */
+ em->block_start = EXTENT_MAP_HOLE;
+ return;
+ }
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ em->block_start = bytenr; /* compressed: the item's offset field is not added */
+ em->block_len = em->orig_block_len;
+ } else {
+ bytenr += btrfs_file_extent_offset(leaf, fi); /* uncompressed: add the item's offset field */
+ em->block_start = bytenr;
+ em->block_len = em->len;
+ if (type == BTRFS_FILE_EXTENT_PREALLOC)
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ }
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ em->block_start = EXTENT_MAP_INLINE;
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ /*
+ * Initialize orig_start and block_len with the same values
+ * as in inode.c:btrfs_get_extent().
+ */
+ em->orig_start = EXTENT_MAP_HOLE;
+ em->block_len = (u64)-1;
+ if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ }
+ } else { /* unknown type: only logged; em->start, em->len and em->block_start stay untouched */
+ btrfs_err(root->fs_info,
+ "unknown file extent item type %d, inode %llu, offset %llu, root %llu",
+ type, btrfs_ino(inode), extent_start,
+ root->root_key.objectid);
+ }
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 74272a3f9d9b..1f2b99cb55ea 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -40,6 +40,7 @@
#include "tree-log.h"
#include "locking.h"
#include "volumes.h"
+#include "qgroup.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -447,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
write_bytes -= copied;
total_copied += copied;
- /* Return to btrfs_file_aio_write to fault page */
+ /* Return to btrfs_file_write_iter to fault page */
if (unlikely(copied == 0))
break;
@@ -715,7 +716,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int recow;
int ret;
int modify_tree = -1;
- int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
+ int update_refs;
int found = 0;
int leafs_visited = 0;
@@ -725,6 +726,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
modify_tree = 0;
+ update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -781,6 +784,18 @@ next_slot:
extent_end = search_start;
}
+ /*
+ * Don't skip extent items representing 0 byte lengths. They
+ * used to be created (bug) if while punching holes we hit
+ * -ENOSPC condition. So if we find one here, just ensure we
+ * delete it, otherwise we would insert a new file extent item
+ * with the same key (offset) as that 0 bytes length file
+ * extent item in the call to setup_items_for_insert() later
+ * in this function.
+ */
+ if (extent_end == key.offset && extent_end >= search_start)
+ goto delete_extent_item;
+
if (extent_end <= search_start) {
path->slots[0]++;
goto next_slot;
@@ -836,7 +851,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset, 0);
+ start - extent_offset, 1);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
@@ -894,6 +909,7 @@ next_slot:
* | ------ extent ------ |
*/
if (start <= key.offset && end >= extent_end) {
+delete_extent_item:
if (del_nr == 0) {
del_slot = path->slots[0];
del_nr = 1;
@@ -1192,7 +1208,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset, 1);
BUG_ON(ret); /* -ENOMEM */
if (split == start) {
@@ -1659,27 +1675,22 @@ again:
}
static ssize_t __btrfs_direct_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos,
- size_t count, size_t ocount)
+ struct iov_iter *from,
+ loff_t pos)
{
struct file *file = iocb->ki_filp;
- struct iov_iter i;
ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
- count, ocount);
+ written = generic_file_direct_write(iocb, from, pos);
- if (written < 0 || written == count)
+ if (written < 0 || !iov_iter_count(from))
return written;
pos += written;
- count -= written;
- iov_iter_init(&i, iov, nr_segs, count, written);
- written_buffered = __btrfs_buffered_write(file, &i, pos);
+ written_buffered = __btrfs_buffered_write(file, from, pos);
if (written_buffered < 0) {
err = written_buffered;
goto out;
@@ -1714,9 +1725,8 @@ static void update_time_for_write(struct inode *inode)
inode_inc_iversion(inode);
}
-static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -1725,18 +1735,12 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
u64 end_pos;
ssize_t num_written = 0;
ssize_t err = 0;
- size_t count, ocount;
+ size_t count = iov_iter_count(from);
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+ loff_t pos = iocb->ki_pos;
mutex_lock(&inode->i_mutex);
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
- count = ocount;
-
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err) {
@@ -1749,6 +1753,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
+ iov_iter_truncate(from, count);
+
err = file_remove_suid(file);
if (err) {
mutex_unlock(&inode->i_mutex);
@@ -1790,14 +1796,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (unlikely(file->f_flags & O_DIRECT)) {
- num_written = __btrfs_direct_write(iocb, iov, nr_segs,
- pos, count, ocount);
+ num_written = __btrfs_direct_write(iocb, from, pos);
} else {
- struct iov_iter i;
-
- iov_iter_init(&i, iov, nr_segs, count, num_written);
-
- num_written = __btrfs_buffered_write(file, &i, pos);
+ num_written = __btrfs_buffered_write(file, from, pos);
if (num_written > 0)
iocb->ki_pos = pos + num_written;
}
@@ -2010,8 +2011,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!full_sync) {
ret = btrfs_wait_ordered_range(inode, start,
end - start + 1);
- if (ret)
+ if (ret) {
+ btrfs_end_transaction(trans, root);
goto out;
+ }
}
ret = btrfs_commit_transaction(trans, root);
} else {
@@ -2169,6 +2172,37 @@ out:
return 0;
}
+/*
+ * Ask btrfs_get_extent() for the extent map of the range [*start, *start + *len).
+ * If that map is a hole (block_start == EXTENT_MAP_HOLE), move *start to the
+ * end of the hole, set *len to what is left of the range past it (0 if the hole
+ * reaches the range end) and return 1. Otherwise return 0, or a negative errno.
+ */
+static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
+{
+ struct extent_map *em;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM; /* a NULL map is reported as out of memory */
+ else
+ ret = PTR_ERR(em);
+ return ret;
+ }
+
+ /* Hole or vacuum extent (only exists in no-hole mode) */
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ ret = 1;
+ *len = em->start + em->len > *start + *len ? /* hole ends past the range: nothing left */
+ 0 : *start + *len - em->start - em->len;
+ *start = em->start + em->len; /* resume right after the hole */
+ }
+ free_extent_map(em);
+ return ret;
+}
+
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2176,25 +2210,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
struct btrfs_path *path;
struct btrfs_block_rsv *rsv;
struct btrfs_trans_handle *trans;
- u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
- u64 lockend = round_down(offset + len,
- BTRFS_I(inode)->root->sectorsize) - 1;
- u64 cur_offset = lockstart;
+ u64 lockstart;
+ u64 lockend;
+ u64 tail_start;
+ u64 tail_len;
+ u64 orig_start = offset;
+ u64 cur_offset;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
u64 drop_end;
int ret = 0;
int err = 0;
int rsv_count;
- bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
- ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+ bool same_page;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
- u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
+ u64 ino_size;
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
return ret;
mutex_lock(&inode->i_mutex);
+ ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ /* Already in a large hole */
+ ret = 0;
+ goto out_only_mutex;
+ }
+
+ lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize);
+ lockend = round_down(offset + len,
+ BTRFS_I(inode)->root->sectorsize) - 1;
+ same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+ ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+
/*
* We needn't truncate any page which is beyond the end of the file
* because we are sure there is no data there.
@@ -2206,8 +2257,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (same_page && len < PAGE_CACHE_SIZE) {
if (offset < ino_size)
ret = btrfs_truncate_page(inode, offset, len, 0);
- mutex_unlock(&inode->i_mutex);
- return ret;
+ goto out_only_mutex;
}
/* zero back part of the first page */
@@ -2219,12 +2269,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
}
- /* zero the front end of the last page */
- if (offset + len < ino_size) {
- ret = btrfs_truncate_page(inode, offset + len, 0, 1);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- return ret;
+ /* Check the aligned pages after the first unaligned page.
+ * If offset != orig_start, the first unaligned page and
+ * possibly several following pages are already in a hole,
+ * so the extra check can be skipped. */
+ if (offset == orig_start) {
+ /* after truncate page, check hole again */
+ len = offset + len - lockstart;
+ offset = lockstart;
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ ret = 0;
+ goto out_only_mutex;
+ }
+ lockstart = offset;
+ }
+
+ /* Check the tail unaligned part is in a hole */
+ tail_start = lockend + 1;
+ tail_len = offset + len - tail_start;
+ if (tail_len) {
+ ret = find_first_non_hole(inode, &tail_start, &tail_len);
+ if (unlikely(ret < 0))
+ goto out_only_mutex;
+ if (!ret) {
+ /* zero the front end of the last page */
+ if (tail_start + tail_len < ino_size) {
+ ret = btrfs_truncate_page(inode,
+ tail_start + tail_len, 0, 1);
+ if (ret)
+ goto out_only_mutex;
+ }
}
}
@@ -2250,9 +2327,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if ((!ordered ||
(ordered->file_offset + ordered->len <= lockstart ||
ordered->file_offset > lockend)) &&
- !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_UPTODATE, 0,
- cached_state)) {
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
@@ -2300,6 +2375,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
BUG_ON(ret);
trans->block_rsv = rsv;
+ cur_offset = lockstart;
+ len = lockend - cur_offset;
while (cur_offset < lockend) {
ret = __btrfs_drop_extents(trans, root, inode, path,
cur_offset, lockend + 1,
@@ -2340,6 +2417,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
rsv, min_size);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
+
+ ret = find_first_non_hole(inode, &cur_offset, &len);
+ if (unlikely(ret < 0))
+ break;
+ if (ret && !len) {
+ ret = 0;
+ break;
+ }
}
if (ret) {
@@ -2348,7 +2433,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
trans->block_rsv = &root->fs_info->trans_block_rsv;
- if (cur_offset < ino_size) {
+ /*
+ * Don't insert file hole extent item if it's for a range beyond eof
+ * (because it's useless) or if it represents a 0 bytes range (when
+ * cur_offset == drop_end).
+ */
+ if (cur_offset < ino_size && cur_offset < drop_end) {
ret = fill_holes(trans, inode, path, cur_offset, drop_end);
if (ret) {
err = ret;
@@ -2373,6 +2463,7 @@ out_free:
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
+out_only_mutex:
mutex_unlock(&inode->i_mutex);
if (ret && !err)
err = ret;
@@ -2634,11 +2725,11 @@ out:
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
- .aio_write = btrfs_file_aio_write,
+ .write_iter = btrfs_file_write_iter,
.mmap = btrfs_file_mmap,
.open = generic_file_open,
.release = btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 73f3de7a083c..2b0a627cb5f9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -274,18 +274,32 @@ struct io_ctl {
};
static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
- struct btrfs_root *root)
+ struct btrfs_root *root, int write)
{
+ int num_pages;
+ int check_crcs = 0;
+
+ num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+ check_crcs = 1;
+
+ /* Make sure we can fit our crcs into the first page */
+ if (write && check_crcs &&
+ (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+ return -ENOSPC;
+
memset(io_ctl, 0, sizeof(struct io_ctl));
- io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
- GFP_NOFS);
+
+ io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
if (!io_ctl->pages)
return -ENOMEM;
+
+ io_ctl->num_pages = num_pages;
io_ctl->root = root;
- if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
- io_ctl->check_crcs = 1;
+ io_ctl->check_crcs = check_crcs;
+
return 0;
}
@@ -666,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
generation = btrfs_free_space_generation(leaf, header);
btrfs_release_path(path);
+ if (!BTRFS_I(inode)->generation) {
+ btrfs_info(root->fs_info,
+ "The free space cache file (%llu) is invalid. skip it\n",
+ offset);
+ return 0;
+ }
+
if (BTRFS_I(inode)->generation != generation) {
btrfs_err(root->fs_info,
"free space inode generation (%llu) "
@@ -677,7 +698,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
if (!num_entries)
return 0;
- ret = io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root, 0);
if (ret)
return ret;
@@ -831,7 +852,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
if (!matched) {
__btrfs_remove_free_space_cache(ctl);
- btrfs_err(fs_info, "block group %llu has wrong amount of free space",
+ btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
block_group->key.objectid);
ret = -1;
}
@@ -843,7 +864,7 @@ out:
spin_unlock(&block_group->lock);
ret = 0;
- btrfs_err(fs_info, "failed to load free space cache for block group %llu",
+ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
block_group->key.objectid);
}
@@ -851,90 +872,44 @@ out:
return ret;
}
-/**
- * __btrfs_write_out_cache - write out cached info to an inode
- * @root - the root the inode belongs to
- * @ctl - the free space cache we are going to write out
- * @block_group - the block_group for this cache if it belongs to a block_group
- * @trans - the trans handle
- * @path - the path to use
- * @offset - the offset for the key we'll insert
- *
- * This function writes out a free space cache struct to disk for quick recovery
- * on mount. This will return 0 if it was successfull in writing the cache out,
- * and -1 if it was not.
- */
-static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
- struct btrfs_free_space_ctl *ctl,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u64 offset)
+static noinline_for_stack
+int write_cache_extent_entries(struct io_ctl *io_ctl,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ int *entries, int *bitmaps,
+ struct list_head *bitmap_list)
{
- struct btrfs_free_space_header *header;
- struct extent_buffer *leaf;
- struct rb_node *node;
- struct list_head *pos, *n;
- struct extent_state *cached_state = NULL;
- struct btrfs_free_cluster *cluster = NULL;
- struct extent_io_tree *unpin = NULL;
- struct io_ctl io_ctl;
- struct list_head bitmap_list;
- struct btrfs_key key;
- u64 start, extent_start, extent_end, len;
- int entries = 0;
- int bitmaps = 0;
int ret;
- int err = -1;
-
- INIT_LIST_HEAD(&bitmap_list);
-
- if (!i_size_read(inode))
- return -1;
-
- ret = io_ctl_init(&io_ctl, inode, root);
- if (ret)
- return -1;
+ struct btrfs_free_cluster *cluster = NULL;
+ struct rb_node *node = rb_first(&ctl->free_space_offset);
/* Get the cluster for this block_group if it exists */
- if (block_group && !list_empty(&block_group->cluster_list))
+ if (block_group && !list_empty(&block_group->cluster_list)) {
cluster = list_entry(block_group->cluster_list.next,
struct btrfs_free_cluster,
block_group_list);
+ }
- /* Lock all pages first so we can lock the extent safely. */
- io_ctl_prepare_pages(&io_ctl, inode, 0);
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state);
-
- node = rb_first(&ctl->free_space_offset);
if (!node && cluster) {
node = rb_first(&cluster->root);
cluster = NULL;
}
- /* Make sure we can fit our crcs into the first page */
- if (io_ctl.check_crcs &&
- (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
- goto out_nospc;
-
- io_ctl_set_generation(&io_ctl, trans->transid);
-
/* Write out the extent entries */
while (node) {
struct btrfs_free_space *e;
e = rb_entry(node, struct btrfs_free_space, offset_index);
- entries++;
+ *entries += 1;
- ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
+ ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
e->bitmap);
if (ret)
- goto out_nospc;
+ goto fail;
if (e->bitmap) {
- list_add_tail(&e->list, &bitmap_list);
- bitmaps++;
+ list_add_tail(&e->list, bitmap_list);
+ *bitmaps += 1;
}
node = rb_next(node);
if (!node && cluster) {
@@ -942,136 +917,289 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
cluster = NULL;
}
}
+ return 0;
+fail:
+ return -ENOSPC;
+}
+
+static noinline_for_stack int
+update_cache_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path, u64 offset,
+ int entries, int bitmaps)
+{ /* Store @entries, @bitmaps and trans->transid in the free space header item keyed by @offset; 0 on success, -1 on failure. */
+ struct btrfs_key key;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0) { /* search failed: clear DIRTY/DELALLOC over the whole cache file range before bailing out */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+ GFP_NOFS);
+ goto fail;
+ }
+ leaf = path->nodes[0];
+ if (ret > 0) { /* no exact match: step back one slot and check that item instead */
+ struct btrfs_key found_key;
+ ASSERT(path->slots[0]);
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
+ found_key.offset != offset) { /* previous item is not our header: same cleanup, then fail */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+ NULL, GFP_NOFS);
+ btrfs_release_path(path);
+ goto fail;
+ }
+ }
+
+ BTRFS_I(inode)->generation = trans->transid; /* in-memory inode gets the same generation as the header below */
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ btrfs_set_free_space_entries(leaf, header, entries);
+ btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
+ btrfs_set_free_space_generation(leaf, header, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ return 0;
+
+fail:
+ return -1;
+}
+
+static noinline_for_stack int
+write_pinned_extent_entries(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct io_ctl *io_ctl,
+ int *entries)
+{
+ u64 start, extent_start, extent_end, len;
+ struct extent_io_tree *unpin = NULL;
+ int ret;
+
+ if (!block_group)
+ return 0;
/*
* We want to add any pinned extents to our free space cache
* so we don't leak the space
- */
-
- /*
+ *
* We shouldn't have switched the pinned extents yet so this is the
* right one
*/
unpin = root->fs_info->pinned_extents;
- if (block_group)
- start = block_group->key.objectid;
+ start = block_group->key.objectid;
- while (block_group && (start < block_group->key.objectid +
- block_group->key.offset)) {
+ while (start < block_group->key.objectid + block_group->key.offset) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL);
- if (ret) {
- ret = 0;
- break;
- }
+ if (ret)
+ return 0;
/* This pinned extent is out of our range */
if (extent_start >= block_group->key.objectid +
block_group->key.offset)
- break;
+ return 0;
extent_start = max(extent_start, start);
extent_end = min(block_group->key.objectid +
block_group->key.offset, extent_end + 1);
len = extent_end - extent_start;
- entries++;
- ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
+ *entries += 1;
+ ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
if (ret)
- goto out_nospc;
+ return -ENOSPC;
start = extent_end;
}
+ return 0;
+}
+
+static noinline_for_stack int
+write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+ int ret;
+
/* Write out the bitmaps */
- list_for_each_safe(pos, n, &bitmap_list) {
+ list_for_each_safe(pos, n, bitmap_list) {
struct btrfs_free_space *entry =
list_entry(pos, struct btrfs_free_space, list);
- ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
+ ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
if (ret)
- goto out_nospc;
+ return -ENOSPC;
list_del_init(&entry->list);
}
- /* Zero out the rest of the pages just to make sure */
- io_ctl_zero_remaining_pages(&io_ctl);
-
- ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
- 0, i_size_read(inode), &cached_state);
- io_ctl_drop_pages(&io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ return 0;
+}
- if (ret)
- goto out;
+static int flush_dirty_cache(struct inode *inode)
+{
+ int ret;
ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
- if (ret) {
+ if (ret)
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
GFP_NOFS);
- goto out;
- }
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
- key.type = 0;
+ return ret;
+}
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
- GFP_NOFS);
- goto out;
+static void noinline_for_stack
+cleanup_write_cache_enospc(struct inode *inode,
+ struct io_ctl *io_ctl,
+ struct extent_state **cached_state,
+ struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+
+ list_for_each_safe(pos, n, bitmap_list) {
+ struct btrfs_free_space *entry =
+ list_entry(pos, struct btrfs_free_space, list);
+ list_del_init(&entry->list);
}
- leaf = path->nodes[0];
- if (ret > 0) {
- struct btrfs_key found_key;
- ASSERT(path->slots[0]);
- path->slots[0]--;
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
- found_key.offset != offset) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
- NULL, GFP_NOFS);
- btrfs_release_path(path);
+ io_ctl_drop_pages(io_ctl);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, cached_state,
+ GFP_NOFS);
+}
+
+/**
+ * __btrfs_write_out_cache - write out cached info to an inode
+ * @root - the root the inode belongs to
+ * @ctl - the free space cache we are going to write out
+ * @block_group - the block_group for this cache if it belongs to a block_group
+ * @trans - the trans handle
+ * @path - the path to use
+ * @offset - the offset for the key we'll insert
+ *
+ * This function writes out a free space cache struct to disk for quick recovery
+ * on mount. This will return 0 if it was successful in writing the cache out,
+ * and a non-zero value (-1 or a negative errno) if it was not.
+ */
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 offset)
+{
+ struct extent_state *cached_state = NULL;
+ struct io_ctl io_ctl;
+ LIST_HEAD(bitmap_list);
+ int entries = 0;
+ int bitmaps = 0;
+ int ret;
+
+ if (!i_size_read(inode))
+ return -1;
+
+ ret = io_ctl_init(&io_ctl, inode, root, 1);
+ if (ret)
+ return -1;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+ down_write(&block_group->data_rwsem);
+ spin_lock(&block_group->lock);
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ up_write(&block_group->data_rwsem);
+ BTRFS_I(inode)->generation = 0;
+ ret = 0;
goto out;
}
+ spin_unlock(&block_group->lock);
}
- BTRFS_I(inode)->generation = trans->transid;
- header = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_free_space_header);
- btrfs_set_free_space_entries(leaf, header, entries);
- btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
- btrfs_set_free_space_generation(leaf, header, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(path);
+ /* Lock all pages first so we can lock the extent safely. */
+ io_ctl_prepare_pages(&io_ctl, inode, 0);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ 0, &cached_state);
+
+ io_ctl_set_generation(&io_ctl, trans->transid);
+
+ /* Write out the extent entries in the free space cache */
+ ret = write_cache_extent_entries(&io_ctl, ctl,
+ block_group, &entries, &bitmaps,
+ &bitmap_list);
+ if (ret)
+ goto out_nospc;
+
+ /*
+ * Some spaces that are freed in the current transaction are pinned,
+ * they will be added into free space cache after the transaction is
+ * committed, we shouldn't lose them.
+ */
+ ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
+ if (ret)
+ goto out_nospc;
+
+ /* At last, we write out all the bitmaps. */
+ ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ if (ret)
+ goto out_nospc;
+
+ /* Zero out the rest of the pages just to make sure */
+ io_ctl_zero_remaining_pages(&io_ctl);
+
+ /* Everything is written out, now we dirty the pages in the file. */
+ ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ 0, i_size_read(inode), &cached_state);
+ if (ret)
+ goto out_nospc;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+ /*
+ * Release the pages and unlock the extent, we will flush
+ * them out later
+ */
+ io_ctl_drop_pages(&io_ctl);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+ /* Flush the dirty pages in the cache file. */
+ ret = flush_dirty_cache(inode);
+ if (ret)
+ goto out;
- err = 0;
+ /* Update the cache item to tell everyone this cache file is valid. */
+ ret = update_cache_item(trans, root, inode, path, offset,
+ entries, bitmaps);
out:
io_ctl_free(&io_ctl);
- if (err) {
+ if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
- return err;
+ return ret;
out_nospc:
- list_for_each_safe(pos, n, &bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
- list_del_init(&entry->list);
- }
- io_ctl_drop_pages(&io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+
goto out;
}
@@ -1091,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
return 0;
}
+
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
spin_unlock(&block_group->lock);
inode = lookup_free_space_inode(root, block_group, path);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 86935f5ae291..888fbe19079f 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -174,7 +174,7 @@ static void start_caching(struct btrfs_root *root)
BTRFS_LAST_FREE_OBJECTID - objectid + 1);
}
- tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
+ tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
root->root_key.objectid);
if (IS_ERR(tsk)) {
btrfs_warn(root->fs_info, "failed to start inode caching task");
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5a3b8371772e..3668048e16f8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -125,7 +125,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* the btree. The caller should have done a btrfs_drop_extents so that
* no overlapping inline items exist in the btree
*/
-static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
+static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path, int extent_inserted,
struct btrfs_root *root, struct inode *inode,
u64 start, size_t size, size_t compressed_size,
@@ -693,7 +693,7 @@ retry:
ret = btrfs_reserve_extent(root,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, alloc_hint, &ins, 1);
+ 0, alloc_hint, &ins, 1, 1);
if (ret) {
int i;
@@ -794,7 +794,7 @@ retry:
out:
return ret;
out_free_reserve:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
@@ -917,7 +917,7 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
- &ins, 1);
+ &ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -995,7 +995,7 @@ out:
return ret;
out_reserve:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
extent_clear_unlock_delalloc(inode, start, end, locked_page,
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
@@ -2599,6 +2599,21 @@ out_kfree:
return NULL;
}
+/* subtract @len from the delalloc_bytes counter of the block group that
+ * is looked up for @start, holding that block group's spinlock */
+static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+					 u64 start, u64 len)
+{
+	struct btrfs_block_group_cache *bg =
+		btrfs_lookup_block_group(root->fs_info, start);
+
+	ASSERT(bg);
+	spin_lock(&bg->lock);
+	bg->delalloc_bytes -= len;
+	spin_unlock(&bg->lock);
+	btrfs_put_block_group(bg);
+}
+
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
@@ -2678,6 +2693,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans = NULL;
goto out_unlock;
}
+
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -2697,6 +2713,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
+ if (!ret)
+ btrfs_release_delalloc_bytes(root,
+ ordered_extent->start,
+ ordered_extent->disk_len);
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -2749,7 +2769,7 @@ out:
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(root, ordered_extent->start,
- ordered_extent->disk_len);
+ ordered_extent->disk_len, 1);
}
@@ -2947,14 +2967,15 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
root->orphan_block_rsv = NULL;
spin_unlock(&root->orphan_lock);
- if (root->orphan_item_inserted &&
+ if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
btrfs_root_refs(&root->root_item) > 0) {
ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
if (ret)
btrfs_abort_transaction(trans, root, ret);
else
- root->orphan_item_inserted = 0;
+ clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ &root->state);
}
if (block_rsv) {
@@ -3271,7 +3292,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
btrfs_block_rsv_release(root, root->orphan_block_rsv,
(u64)-1);
- if (root->orphan_block_rsv || root->orphan_item_inserted) {
+ if (root->orphan_block_rsv ||
+ test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
btrfs_end_transaction(trans, root);
@@ -3473,7 +3495,7 @@ cache_acl:
ret = btrfs_load_inode_props(inode, path);
if (ret)
btrfs_err(root->fs_info,
- "error loading props for ino %llu (root %llu): %d\n",
+ "error loading props for ino %llu (root %llu): %d",
btrfs_ino(inode),
root->root_key.objectid, ret);
}
@@ -3998,7 +4020,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* not block aligned since we will be keeping the last block of the
* extent just the way it is.
*/
- if (root->ref_cows || root == root->fs_info->tree_root)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root)
btrfs_drop_extent_cache(inode, ALIGN(new_size,
root->sectorsize), (u64)-1, 0);
@@ -4091,7 +4114,9 @@ search_again:
extent_num_bytes);
num_dec = (orig_num_bytes -
extent_num_bytes);
- if (root->ref_cows && extent_start != 0)
+ if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state) &&
+ extent_start != 0)
inode_sub_bytes(inode, num_dec);
btrfs_mark_buffer_dirty(leaf);
} else {
@@ -4105,7 +4130,8 @@ search_again:
num_dec = btrfs_file_extent_num_bytes(leaf, fi);
if (extent_start != 0) {
found_extent = 1;
- if (root->ref_cows)
+ if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state))
inode_sub_bytes(inode, num_dec);
}
}
@@ -4120,10 +4146,9 @@ search_again:
btrfs_file_extent_other_encoding(leaf, fi) == 0) {
u32 size = new_size - found_key.offset;
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
inode_sub_bytes(inode, item_end + 1 -
new_size);
- }
/*
* update the ram bytes to properly reflect
@@ -4133,7 +4158,8 @@ search_again:
size =
btrfs_file_extent_calc_inline_size(size);
btrfs_truncate_item(root, path, size, 1);
- } else if (root->ref_cows) {
+ } else if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state)) {
inode_sub_bytes(inode, item_end + 1 -
found_key.offset);
}
@@ -4155,8 +4181,9 @@ delete:
} else {
break;
}
- if (found_extent && (root->ref_cows ||
- root == root->fs_info->tree_root)) {
+ if (found_extent &&
+ (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root)) {
btrfs_set_path_blocking(path);
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
@@ -5168,8 +5195,7 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
static void btrfs_dentry_release(struct dentry *dentry)
{
- if (dentry->d_fsdata)
- kfree(dentry->d_fsdata);
+ kfree(dentry->d_fsdata);
}
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
@@ -5553,6 +5579,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
+ int nitems = name ? 2 : 1;
unsigned long ptr;
int ret;
@@ -5572,7 +5599,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
*/
inode->i_ino = objectid;
- if (dir) {
+ if (dir && name) {
trace_btrfs_inode_request(dir);
ret = btrfs_set_inode_index(dir, index);
@@ -5581,6 +5608,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
iput(inode);
return ERR_PTR(ret);
}
+ } else if (dir) {
+ *index = 0;
}
/*
* index_cnt is ignored for everything but a dir,
@@ -5605,21 +5634,24 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
key[0].offset = 0;
- /*
- * Start new inodes with an inode_ref. This is slightly more
- * efficient for small numbers of hard links since they will
- * be packed into one item. Extended refs will kick in if we
- * add more hard links than can fit in the ref item.
- */
- key[1].objectid = objectid;
- btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
- key[1].offset = ref_objectid;
-
sizes[0] = sizeof(struct btrfs_inode_item);
- sizes[1] = name_len + sizeof(*ref);
+
+ if (name) {
+ /*
+ * Start new inodes with an inode_ref. This is slightly more
+ * efficient for small numbers of hard links since they will
+ * be packed into one item. Extended refs will kick in if we
+ * add more hard links than can fit in the ref item.
+ */
+ key[1].objectid = objectid;
+ btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+ key[1].offset = ref_objectid;
+
+ sizes[1] = name_len + sizeof(*ref);
+ }
path->leave_spinning = 1;
- ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
+ ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
if (ret != 0)
goto fail;
@@ -5632,12 +5664,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
- ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
- struct btrfs_inode_ref);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
- btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
- ptr = (unsigned long)(ref + 1);
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ if (name) {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ }
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
@@ -5673,7 +5707,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
return inode;
fail:
- if (dir)
+ if (dir && name)
BTRFS_I(dir)->index_cnt--;
btrfs_free_path(path);
iput(inode);
@@ -5958,6 +5992,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
err = btrfs_update_inode(trans, root, inode);
if (err)
goto fail;
+ if (inode->i_nlink == 1) {
+ /*
+ * If new hard link count is 1, it's a file created
+ * with open(2) O_TMPFILE flag.
+ */
+ err = btrfs_orphan_del(trans, inode);
+ if (err)
+ goto fail;
+ }
d_instantiate(dentry, inode);
btrfs_log_new_name(trans, inode, NULL, parent);
}
@@ -6086,16 +6129,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
ret = btrfs_decompress(compress_type, tmp, page,
extent_offset, inline_size, max_size);
- if (ret) {
- char *kaddr = kmap_atomic(page);
- unsigned long copy_size = min_t(u64,
- PAGE_CACHE_SIZE - pg_offset,
- max_size - extent_offset);
- memset(kaddr + pg_offset, 0, copy_size);
- kunmap_atomic(kaddr);
- }
kfree(tmp);
- return 0;
+ return ret;
}
/*
@@ -6113,7 +6148,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
{
int ret;
int err = 0;
- u64 bytenr;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
@@ -6127,7 +6161,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_trans_handle *trans = NULL;
- int compress_type;
+ const bool new_inline = !page || create;
again:
read_lock(&em_tree->lock);
@@ -6201,7 +6235,6 @@ again:
found_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- compress_type = btrfs_file_extent_compression(leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
@@ -6236,32 +6269,10 @@ next:
goto not_found_em;
}
- em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
+ btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
+
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- em->start = extent_start;
- em->len = extent_end - extent_start;
- em->orig_start = extent_start -
- btrfs_file_extent_offset(leaf, item);
- em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
- item);
- bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
- if (bytenr == 0) {
- em->block_start = EXTENT_MAP_HOLE;
- goto insert;
- }
- if (compress_type != BTRFS_COMPRESS_NONE) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- em->block_start = bytenr;
- em->block_len = em->orig_block_len;
- } else {
- bytenr += btrfs_file_extent_offset(leaf, item);
- em->block_start = bytenr;
- em->block_len = em->len;
- if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
- }
goto insert;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
@@ -6270,12 +6281,8 @@ next:
size_t extent_offset;
size_t copy_size;
- em->block_start = EXTENT_MAP_INLINE;
- if (!page || create) {
- em->start = extent_start;
- em->len = extent_end - extent_start;
+ if (new_inline)
goto out;
- }
size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
extent_offset = page_offset(page) + pg_offset - extent_start;
@@ -6285,10 +6292,6 @@ next:
em->len = ALIGN(copy_size, root->sectorsize);
em->orig_block_len = em->len;
em->orig_start = em->start;
- if (compress_type) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
if (create == 0 && !PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
@@ -6296,7 +6299,10 @@ next:
ret = uncompress_inline(path, inode, page,
pg_offset,
extent_offset, item);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ err = ret;
+ goto out;
+ }
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6332,8 +6338,6 @@ next:
set_extent_uptodate(io_tree, em->start,
extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
- } else {
- WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
}
not_found:
em->start = start;
@@ -6550,21 +6554,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
alloc_hint = get_extent_allocation_hint(inode, start, len);
ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
- alloc_hint, &ins, 1);
+ alloc_hint, &ins, 1, 1);
if (ret)
return ERR_PTR(ret);
em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
ins.offset, ins.offset, ins.offset, 0);
if (IS_ERR(em)) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
return em;
}
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
free_extent_map(em);
return ERR_PTR(ret);
}
@@ -6717,6 +6721,76 @@ out:
return ret;
}
+/*
+ * Return true if @inode's page cache holds a page whose index falls in the
+ * byte range [@start, @end]; @end is the last byte and may equal @start.
+ */
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
+{
+	struct radix_tree_root *root = &inode->i_mapping->page_tree;
+	bool found = false;
+	void **pagep = NULL;
+	struct page *page = NULL;
+	/*
+	 * Page indexes are unsigned long; an int truncates at 2^31 pages.
+	 */
+	unsigned long start_idx = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_idx = end >> PAGE_CACHE_SHIFT;
+
+	rcu_read_lock();
+
+	/* Most of the code in this while loop is lifted from
+	 * find_get_page. It's been modified to begin searching from a
+	 * page and return just the first page found in that range. If the
+	 * found idx is less than or equal to the end idx then we know that
+	 * a page exists. If no pages are found or if those pages are
+	 * outside of the range then we're fine (yay!) */
+	while (page == NULL &&
+	       radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
+		page = radix_tree_deref_slot(pagep);
+		if (unlikely(!page))
+			break;
+
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				page = NULL;
+				continue;
+			}
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so return it without
+			 * attempting to raise page count.
+			 */
+			page = NULL;
+			break; /* TODO: Is this relevant for this use case? */
+		}
+
+		if (!page_cache_get_speculative(page)) {
+			page = NULL;
+			continue;
+		}
+
+		/*
+		 * Has the page moved?
+		 * This is part of the lockless pagecache protocol. See
+		 * include/linux/pagemap.h for details.
+		 */
+		if (unlikely(page != *pagep)) {
+			page_cache_release(page);
+			page = NULL;
+		}
+	}
+
+	if (page) {
+		if (page->index <= end_idx)
+			found = true;
+		page_cache_release(page);
+	}
+
+	rcu_read_unlock();
+	return found;
+}
+
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
struct extent_state **cached_state, int writing)
{
@@ -6741,10 +6815,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* invalidate needs to happen so that reads after a write do not
* get stale data.
*/
- if (!ordered && (!writing ||
- !test_range_bit(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, EXTENT_UPTODATE, 0,
- *cached_state)))
+ if (!ordered &&
+ (!writing ||
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)))
break;
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@ -7383,7 +7456,7 @@ free_ordered:
if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
btrfs_free_reserved_extent(root, ordered->start,
- ordered->disk_len);
+ ordered->disk_len, 1);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
@@ -7391,39 +7464,30 @@ free_ordered:
}
static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ const struct iov_iter *iter, loff_t offset)
{
int seg;
int i;
- size_t size;
- unsigned long addr;
unsigned blocksize_mask = root->sectorsize - 1;
ssize_t retval = -EINVAL;
- loff_t end = offset;
if (offset & blocksize_mask)
goto out;
- /* Check the memory alignment. Blocks cannot straddle pages */
- for (seg = 0; seg < nr_segs; seg++) {
- addr = (unsigned long)iov[seg].iov_base;
- size = iov[seg].iov_len;
- end += size;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
- goto out;
-
- /* If this is a write we don't need to check anymore */
- if (rw & WRITE)
- continue;
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ goto out;
- /*
- * Check to make sure we don't have duplicate iov_base's in this
- * iovec, if so return EINVAL, otherwise we'll get csum errors
- * when reading back.
- */
- for (i = seg + 1; i < nr_segs; i++) {
- if (iov[seg].iov_base == iov[i].iov_base)
+ /* If this is a write we don't need to check anymore */
+ if (rw & WRITE)
+ return 0;
+ /*
+ * Check to make sure we don't have duplicate iov_base's in this
+ * iovec, if so return EINVAL, otherwise we'll get csum errors
+ * when reading back.
+ */
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
goto out;
}
}
@@ -7433,8 +7497,7 @@ out:
}
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -7444,8 +7507,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
bool relock = false;
ssize_t ret;
- if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
- offset, nr_segs))
+ if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
return 0;
atomic_inc(&inode->i_dio_count);
@@ -7457,7 +7519,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
* we need to flush the dirty pages again to make absolutely sure
* that any outstanding dirty pages are on disk.
*/
- count = iov_length(iov, nr_segs);
+ count = iov_iter_count(iter);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags))
filemap_fdatawrite_range(inode->i_mapping, offset, count);
@@ -7484,7 +7546,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
ret = __blockdev_direct_IO(rw, iocb, inode,
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ iter, offset, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, flags);
if (rw & WRITE) {
if (ret < 0 && ret != -EIOCBQUEUED)
@@ -7992,7 +8054,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
if (err)
btrfs_err(new_root->fs_info,
- "error inheriting subvolume %llu properties: %d\n",
+ "error inheriting subvolume %llu properties: %d",
new_root->root_key.objectid, err);
err = btrfs_update_inode(trans, new_root, inode);
@@ -8311,7 +8373,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = 0ULL;
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
/* force full log commit if subvolume involved. */
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
} else {
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
@@ -8765,7 +8827,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
cur_bytes = max(cur_bytes, min_size);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
- *alloc_hint, &ins, 1);
+ *alloc_hint, &ins, 1, 0);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -8779,7 +8841,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
BTRFS_FILE_EXTENT_PREALLOC);
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid,
- ins.offset);
+ ins.offset, 0);
btrfs_abort_transaction(trans, root, ret);
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -8889,6 +8951,66 @@ static int btrfs_permission(struct inode *inode, int mask)
return generic_permission(inode, mask);
}
+static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ u64 objectid;
+ u64 index;
+ int ret = 0;
+
+ /*
+ * 5 units required for adding orphan entry
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_find_free_ino(root, &objectid);
+ if (ret)
+ goto out;
+
+ inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ btrfs_ino(dir), objectid, mode, &index);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ goto out;
+ }
+
+ ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ if (ret)
+ goto out;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto out;
+
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+ ret = btrfs_orphan_add(trans, inode);
+ if (ret)
+ goto out;
+
+ d_tmpfile(dentry, inode);
+ mark_inode_dirty(inode);
+
+out:
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ iput(inode);
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
+
+ return ret;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -8909,6 +9031,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
+ .tmpfile = btrfs_tmpfile,
};
static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3f52bb7a58d2..0d321c23069a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -58,6 +58,7 @@
#include "dev-replace.h"
#include "props.h"
#include "sysfs.h"
+#include "qgroup.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -638,7 +639,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_trans_handle *trans;
int ret;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
atomic_inc(&root->will_be_snapshoted);
@@ -711,6 +712,35 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto fail;
+ /*
+ * If orphan cleanup did remove any orphans, it means the tree was
+ * modified and therefore the commit root is not the same as the
+ * current root anymore. This is a problem, because send uses the
+ * commit root and therefore can see inode items that don't exist
+ * in the current root anymore, and for example make calls to
+ * btrfs_iget, which will do tree lookups based on the current root
+ * and not on the commit root. Those lookups will fail, returning a
+ * -ESTALE error, and making send fail with that error. So make sure
+ * a send does not see any orphans we have just removed, and that it
+ * will see the same inodes regardless of whether a transaction
+ * commit happened before it started (meaning that the commit root
+ * will be the same as the current root) or not.
+ */
+ if (readonly && pending_snapshot->snap->node !=
+ pending_snapshot->snap->commit_root) {
+ trans = btrfs_join_transaction(pending_snapshot->snap);
+ if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
+ ret = PTR_ERR(trans);
+ goto fail;
+ }
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans,
+ pending_snapshot->snap);
+ if (ret)
+ goto fail;
+ }
+ }
+
inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@@ -1502,11 +1532,12 @@ static noinline int btrfs_ioctl_resize(struct file *file,
sizestr = vol_args->name;
devstr = strchr(sizestr, ':');
if (devstr) {
- char *end;
sizestr = devstr + 1;
*devstr = '\0';
devstr = vol_args->name;
- devid = simple_strtoull(devstr, &end, 10);
+ ret = kstrtoull(devstr, 10, &devid);
+ if (ret)
+ goto out_free;
if (!devid) {
ret = -EINVAL;
goto out_free;
@@ -1562,7 +1593,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = old_size - new_size;
} else if (mod > 0) {
if (new_size > ULLONG_MAX - old_size) {
- ret = -EINVAL;
+ ret = -ERANGE;
goto out_free;
}
new_size = old_size + new_size;
@@ -1926,7 +1957,8 @@ static noinline int copy_to_sk(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- char *buf,
+ size_t *buf_size,
+ char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
{
@@ -1958,13 +1990,25 @@ static noinline int copy_to_sk(struct btrfs_root *root,
if (!key_in_sk(key, sk))
continue;
- if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+ if (sizeof(sh) + item_len > *buf_size) {
+ if (*num_found) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * return one empty item back for v1, which does not
+ * handle -EOVERFLOW
+ */
+
+ *buf_size = sizeof(sh) + item_len;
item_len = 0;
+ ret = -EOVERFLOW;
+ }
- if (sizeof(sh) + item_len + *sk_offset >
- BTRFS_SEARCH_ARGS_BUFSIZE) {
+ if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
ret = 1;
- goto overflow;
+ goto out;
}
sh.objectid = key->objectid;
@@ -1974,20 +2018,33 @@ static noinline int copy_to_sk(struct btrfs_root *root,
sh.transid = found_transid;
/* copy search result header */
- memcpy(buf + *sk_offset, &sh, sizeof(sh));
+ if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
*sk_offset += sizeof(sh);
if (item_len) {
- char *p = buf + *sk_offset;
+ char __user *up = ubuf + *sk_offset;
/* copy the item */
- read_extent_buffer(leaf, p,
- item_off, item_len);
+ if (read_extent_buffer_to_user(leaf, up,
+ item_off, item_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
*sk_offset += item_len;
}
(*num_found)++;
- if (*num_found >= sk->nr_items)
- break;
+ if (ret) /* -EOVERFLOW from above */
+ goto out;
+
+ if (*num_found >= sk->nr_items) {
+ ret = 1;
+ goto out;
+ }
}
advance_key:
ret = 0;
@@ -2002,22 +2059,37 @@ advance_key:
key->objectid++;
} else
ret = 1;
-overflow:
+out:
+ /*
+ * 0: all items from this leaf copied, continue with next
+ * 1: * more items can be copied, but unused buffer is too small
+ * * all items were found
+ * Either way, it stops the loop which iterates to the next
+ * leaf
+ * -EOVERFLOW: item was too large for buffer
+ * -EFAULT: could not copy extent buffer back to userspace
+ */
return ret;
}
static noinline int search_ioctl(struct inode *inode,
- struct btrfs_ioctl_search_args *args)
+ struct btrfs_ioctl_search_key *sk,
+ size_t *buf_size,
+ char __user *ubuf)
{
struct btrfs_root *root;
struct btrfs_key key;
struct btrfs_path *path;
- struct btrfs_ioctl_search_key *sk = &args->key;
struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
int ret;
int num_found = 0;
unsigned long sk_offset = 0;
+ if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
+ *buf_size = sizeof(struct btrfs_ioctl_search_header);
+ return -EOVERFLOW;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2051,14 +2123,15 @@ static noinline int search_ioctl(struct inode *inode,
ret = 0;
goto err;
}
- ret = copy_to_sk(root, path, &key, sk, args->buf,
+ ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
- if (ret || num_found >= sk->nr_items)
+ if (ret)
break;
}
- ret = 0;
+ if (ret > 0)
+ ret = 0;
err:
sk->nr_items = num_found;
btrfs_free_path(path);
@@ -2068,22 +2141,73 @@ err:
static noinline int btrfs_ioctl_tree_search(struct file *file,
void __user *argp)
{
- struct btrfs_ioctl_search_args *args;
- struct inode *inode;
- int ret;
+ struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_key sk;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- args = memdup_user(argp, sizeof(*args));
- if (IS_ERR(args))
- return PTR_ERR(args);
+ uargs = (struct btrfs_ioctl_search_args __user *)argp;
+
+ if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
+ return -EFAULT;
+
+ buf_size = sizeof(uargs->buf);
inode = file_inode(file);
- ret = search_ioctl(inode, args);
- if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+
+ /*
+ * In the original implementation an overflow is handled by returning a
+ * search header with a len of zero, so reset ret.
+ */
+ if (ret == -EOVERFLOW)
+ ret = 0;
+
+ if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
ret = -EFAULT;
- kfree(args);
+ return ret;
+}
+
+/*
+ * Tree search ioctl (v2): results go to the caller-sized user buffer at
+ * uarg->buf, capped at 16MB.  On -EOVERFLOW buf_size is copied back.
+ */
+static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+					       void __user *argp)
+{
+	struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
+	struct btrfs_ioctl_search_args_v2 args;
+	int ret;
+	size_t buf_size;
+	const size_t buf_limit = 16 * 1024 * 1024;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* copy search header and buffer size */
+	if (copy_from_user(&args, uarg, sizeof(args)))
+		return -EFAULT;
+
+	buf_size = args.buf_size;
+	if (buf_size < sizeof(struct btrfs_ioctl_search_header))
+		return -EOVERFLOW;
+
+	/* limit result size to 16MB */
+	if (buf_size > buf_limit)
+		buf_size = buf_limit;
+
+	ret = search_ioctl(file_inode(file), &args.key, &buf_size,
+			   (char __user *)(&uarg->buf[0]));
+	if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
+		ret = -EFAULT;
+	else if (ret == -EOVERFLOW &&
+		 copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
+		ret = -EFAULT;
+
 	return ret;
 }
@@ -2219,6 +2343,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv block_rsv;
+ u64 root_flags;
u64 qgroup_reserved;
int namelen;
int ret;
@@ -2240,6 +2365,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out;
+
err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
if (err == -EINTR)
goto out_drop_write;
@@ -2301,6 +2427,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
}
mutex_lock(&inode->i_mutex);
+
+ /*
+ * Don't allow deleting a subvolume with send in progress. This is
+ * inside the i_mutex so the error handling that has to drop the bit
+ * again is not run concurrently.
+ */
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ if (dest->send_in_progress == 0) {
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ } else {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(root->fs_info,
+ "Attempt to delete subvolume %llu during send",
+ dest->root_key.objectid);
+ err = -EPERM;
+ goto out_dput;
+ }
+
err = d_invalidate(dentry);
if (err)
goto out_unlock;
@@ -2346,7 +2493,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_item.drop_level = 0;
btrfs_set_root_refs(&dest->root_item, 0);
- if (!xchg(&dest->orphan_item_inserted, 1)) {
+ if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
root->fs_info->tree_root,
dest->root_key.objectid);
@@ -2389,11 +2536,19 @@ out_release:
out_up_write:
up_write(&root->fs_info->subvol_sem);
out_unlock:
+ if (err) {
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ }
mutex_unlock(&inode->i_mutex);
if (!err) {
shrink_dcache_sb(root->fs_info->sb);
btrfs_invalidate_inodes(dest);
d_delete(dentry);
+ ASSERT(dest->send_in_progress == 0);
/* the last ref */
if (dest->cache_inode) {
@@ -2557,9 +2712,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
int ret = 0;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
if (!fi_args)
return -ENOMEM;
@@ -2574,6 +2726,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
}
mutex_unlock(&fs_devices->device_list_mutex);
+ fi_args->nodesize = root->fs_info->super_copy->nodesize;
+ fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
+ fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
+
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
@@ -2589,9 +2745,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
int ret = 0;
char *s_uuid = NULL;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
@@ -2669,10 +2822,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
ordered = btrfs_lookup_first_ordered_extent(inode,
off + len - 1);
- if (!ordered &&
+ if ((!ordered ||
+ ordered->file_offset + ordered->len <= off ||
+ ordered->file_offset >= off + len) &&
!test_range_bit(&BTRFS_I(inode)->io_tree, off,
- off + len - 1, EXTENT_DELALLOC, 0, NULL))
+ off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
break;
+ }
unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
if (ordered)
btrfs_put_ordered_extent(ordered);
@@ -2912,6 +3070,126 @@ out:
return ret;
}
+/*
+ * Check whether @root already holds a reference on the extent at disk bytenr
+ * @disko, so the caller can decide whether quota accounting is needed for a
+ * new reference.
+ *
+ * Returns 1 if quotas are disabled or if @root->objectid is among the roots
+ * that btrfs_find_all_roots() reports for @disko, 0 if it is not, and a
+ * negative errno if the root lookup fails. btrfs_clone() passes the result on
+ * as the no_quota argument of btrfs_inc_extent_ref().
+ */
+static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		     u64 disko)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct seq_list seq_elem = {};
+	struct ulist_iterator iter;
+	struct ulist_node *node;
+	struct ulist *roots;
+	int found;
+
+	if (!fs_info->quota_enabled)
+		return 1;
+
+	/* Hold a tree mod sequence number for the duration of the lookup. */
+	btrfs_get_tree_mod_seq(fs_info, &seq_elem);
+	found = btrfs_find_all_roots(trans, fs_info, disko, seq_elem.seq,
+				     &roots);
+	if (found >= 0) {
+		found = 0;
+		ULIST_ITER_INIT(&iter);
+		while ((node = ulist_next(roots, &iter)) != NULL) {
+			if (node->val == root->objectid) {
+				found = 1;
+				break;
+			}
+		}
+		ulist_free(roots);
+	}
+	btrfs_put_tree_mod_seq(fs_info, &seq_elem);
+
+	return found;
+}
+
+/*
+ * Finish one step of a clone: bump the inode version and timestamps, grow
+ * i_size if the cloned range extends the file, write the inode item and end
+ * the transaction.
+ *
+ * @endoff is the end of the range just cloned into @inode. It is clamped to
+ * @destoff + @olen because extents are rounded up to the block size at eof
+ * when choosing what to clone, but the file size must not be rounded up.
+ *
+ * The transaction is ended on every path: if btrfs_update_inode() fails the
+ * transaction is aborted and ended and that error is returned, otherwise the
+ * result of btrfs_end_transaction() is returned.
+ */
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+				     struct inode *inode,
+				     u64 endoff,
+				     const u64 destoff,
+				     const u64 olen)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	const u64 clone_end = destoff + olen;
+	int err;
+
+	inode_inc_iversion(inode);
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	if (endoff > clone_end)
+		endoff = clone_end;
+	if (endoff > inode->i_size)
+		btrfs_i_size_write(inode, endoff);
+
+	err = btrfs_update_inode(trans, root, inode);
+	if (err) {
+		btrfs_abort_transaction(trans, root, err);
+		btrfs_end_transaction(trans, root);
+		return err;
+	}
+
+	return btrfs_end_transaction(trans, root);
+}
+
+/*
+ * Refresh the in-memory extent map tree of @inode after a clone step.
+ *
+ * If @fi is non-NULL the mapping is built from that file extent item (read
+ * through @path); otherwise a hole mapping of @hole_len bytes starting at
+ * @hole_offset is inserted. A cached mapping that collides with the new one
+ * is dropped and the insertion is retried.
+ *
+ * BTRFS_INODE_NEEDS_FULL_SYNC is set on the inode when the mapping cannot be
+ * allocated or inserted, and when @fi describes an inline extent.
+ */
+static void clone_update_extent_map(struct inode *inode,
+				    const struct btrfs_trans_handle *trans,
+				    const struct btrfs_path *path,
+				    struct btrfs_file_extent_item *fi,
+				    const u64 hole_offset,
+				    const u64 hole_len)
+{
+	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *map;
+	int err;
+
+	map = alloc_extent_map();
+	if (!map) {
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+		return;
+	}
+
+	if (!fi) {
+		/* No extent item: describe the range as a hole. */
+		map->start = hole_offset;
+		map->len = hole_len;
+		map->ram_bytes = map->len;
+		map->orig_start = hole_offset;
+		map->block_start = EXTENT_MAP_HOLE;
+		map->block_len = 0;
+		map->orig_block_len = 0;
+		map->compress_type = BTRFS_COMPRESS_NONE;
+		map->generation = trans->transid;
+	} else {
+		btrfs_extent_item_to_extent_map(inode, path, fi, false, map);
+		map->generation = -1;
+		if (btrfs_file_extent_type(path->nodes[0], fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+	}
+
+	/* Retry for as long as an existing mapping is in the way. */
+	do {
+		write_lock(&tree->lock);
+		err = add_extent_mapping(tree, map, 1);
+		write_unlock(&tree->lock);
+		if (err == -EEXIST)
+			btrfs_drop_extent_cache(inode, map->start,
+						map->start + map->len - 1, 0);
+	} while (err == -EEXIST);
+	free_extent_map(map);
+
+	if (unlikely(err))
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+}
+
/**
* btrfs_clone() - clone a range from inode file to another
*
@@ -2924,7 +3202,8 @@ out:
* @destoff: Offset within @inode to start clone
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
- u64 off, u64 olen, u64 olen_aligned, u64 destoff)
+ const u64 off, const u64 olen, const u64 olen_aligned,
+ const u64 destoff)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path = NULL;
@@ -2935,7 +3214,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u32 nritems;
int slot;
int ret;
- u64 len = olen_aligned;
+ int no_quota;
+ const u64 len = olen_aligned;
+ u64 last_disko = 0;
+ u64 last_dest_end = destoff;
ret = -ENOMEM;
buf = vmalloc(btrfs_level_size(root, 0));
@@ -2952,7 +3234,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
/* clone data */
key.objectid = btrfs_ino(src);
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
+ key.offset = off;
while (1) {
/*
@@ -2964,9 +3246,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
0, 0);
if (ret < 0)
goto out;
+ /*
+ * First search, if no extent item that starts at offset off was
+ * found but the previous item is an extent item, it's possible
+ * it might overlap our target range, therefore process it.
+ */
+ if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0] - 1);
+ if (key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
+ no_quota = 1;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
if (ret < 0)
@@ -2991,7 +3285,7 @@ process_slot:
u64 disko = 0, diskl = 0;
u64 datao = 0, datal = 0;
u8 comp;
- u64 endoff;
+ u64 drop_start;
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
@@ -3012,10 +3306,16 @@ process_slot:
extent);
}
- if (key.offset + datal <= off ||
- key.offset >= off + len - 1) {
+ /*
+ * The first search might have left us at an extent
+ * item that ends before our target range's start, can
+ * happen if we have holes and NO_HOLES feature enabled.
+ */
+ if (key.offset + datal <= off) {
path->slots[0]++;
goto process_slot;
+ } else if (key.offset >= off + len) {
+ break;
}
size = btrfs_item_size_nr(leaf, slot);
@@ -3034,6 +3334,18 @@ process_slot:
new_key.offset = destoff;
/*
+ * Deal with a hole that doesn't have an extent item
+ * that represents it (NO_HOLES feature enabled).
+ * This hole is either in the middle of the cloning
+ * range or at the beginning (fully overlaps it or
+ * partially overlaps it).
+ */
+ if (new_key.offset != last_dest_end)
+ drop_start = last_dest_end;
+ else
+ drop_start = new_key.offset;
+
+ /*
* 1 - adjusting old extent (we may have to split it)
* 1 - add new extent
* 1 - inode update
@@ -3051,18 +3363,18 @@ process_slot:
* | ------------- extent ------------- |
*/
- /* substract range b */
+ /* subtract range b */
if (key.offset + datal > off + len)
datal = off + len - key.offset;
- /* substract range a */
+ /* subtract range a */
if (off > key.offset) {
datao += off - key.offset;
datal -= off - key.offset;
}
ret = btrfs_drop_extents(trans, root, inode,
- new_key.offset,
+ drop_start,
new_key.offset + datal,
1);
if (ret) {
@@ -3099,6 +3411,28 @@ process_slot:
datao);
btrfs_set_file_extent_num_bytes(leaf, extent,
datal);
+
+ /*
+ * We need to look up the roots that point at
+ * this bytenr and see if the new root does. If
+ * it does not we need to make sure we update
+ * quotas appropriately.
+ */
+ if (disko && root != BTRFS_I(src)->root &&
+ disko != last_disko) {
+ no_quota = check_ref(trans, root,
+ disko);
+ if (no_quota < 0) {
+ btrfs_abort_transaction(trans,
+ root,
+ ret);
+ btrfs_end_transaction(trans,
+ root);
+ ret = no_quota;
+ goto out;
+ }
+ }
+
if (disko) {
inode_add_bytes(inode, datal);
ret = btrfs_inc_extent_ref(trans, root,
@@ -3106,7 +3440,7 @@ process_slot:
root->root_key.objectid,
btrfs_ino(inode),
new_key.offset - datao,
- 0);
+ no_quota);
if (ret) {
btrfs_abort_transaction(trans,
root,
@@ -3141,7 +3475,7 @@ process_slot:
aligned_end = ALIGN(new_key.offset + datal,
root->sectorsize);
ret = btrfs_drop_extents(trans, root, inode,
- new_key.offset,
+ drop_start,
aligned_end,
1);
if (ret) {
@@ -3174,40 +3508,69 @@ process_slot:
btrfs_item_ptr_offset(leaf, slot),
size);
inode_add_bytes(inode, datal);
+ extent = btrfs_item_ptr(leaf, slot,
+ struct btrfs_file_extent_item);
}
+ /* If we have an implicit hole (NO_HOLES feature). */
+ if (drop_start < new_key.offset)
+ clone_update_extent_map(inode, trans,
+ path, NULL, drop_start,
+ new_key.offset - drop_start);
+
+ clone_update_extent_map(inode, trans, path,
+ extent, 0, 0);
+
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-
- /*
- * we round up to the block size at eof when
- * determining which extents to clone above,
- * but shouldn't round up the file size
- */
- endoff = new_key.offset + datal;
- if (endoff > destoff+olen)
- endoff = destoff+olen;
- if (endoff > inode->i_size)
- btrfs_i_size_write(inode, endoff);
-
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- btrfs_end_transaction(trans, root);
+ last_dest_end = new_key.offset + datal;
+ ret = clone_finish_inode_update(trans, inode,
+ last_dest_end,
+ destoff, olen);
+ if (ret)
goto out;
- }
- ret = btrfs_end_transaction(trans, root);
+ if (new_key.offset + datal >= destoff + len)
+ break;
}
btrfs_release_path(path);
key.offset++;
}
ret = 0;
+ if (last_dest_end < destoff + len) {
+ /*
+ * We have an implicit hole (NO_HOLES feature is enabled) that
+ * fully or partially overlaps our cloning range at its end.
+ */
+ btrfs_release_path(path);
+
+ /*
+ * 1 - remove extent(s)
+ * 1 - inode update
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_drop_extents(trans, root, inode,
+ last_dest_end, destoff + len, 1);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ ret = clone_finish_inode_update(trans, inode, destoff + len,
+ destoff, olen);
+ if (ret)
+ goto out;
+ clone_update_extent_map(inode, trans, path, NULL, last_dest_end,
+ destoff + len - last_dest_end);
+ }
+
out:
- btrfs_release_path(path);
btrfs_free_path(path);
vfree(buf);
return ret;
@@ -3319,15 +3682,41 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
goto out_unlock;
}
- /* truncate page cache pages from target inode range */
- truncate_inode_pages_range(&inode->i_data, destoff,
- PAGE_CACHE_ALIGN(destoff + len) - 1);
+ /*
+ * Lock the target range too. Right after we replace the file extent
+ * items in the fs tree (which now point to the cloned data), we might
+ * have a worker replace them with extent items relative to a write
+ * operation that was issued before this clone operation (i.e. confront
+ * with inode.c:btrfs_finish_ordered_io).
+ */
+ if (same_inode) {
+ u64 lock_start = min_t(u64, off, destoff);
+ u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
- lock_extent_range(src, off, len);
+ lock_extent_range(src, lock_start, lock_len);
+ } else {
+ lock_extent_range(src, off, len);
+ lock_extent_range(inode, destoff, len);
+ }
ret = btrfs_clone(src, inode, off, olen, len, destoff);
- unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+ if (same_inode) {
+ u64 lock_start = min_t(u64, off, destoff);
+ u64 lock_end = max_t(u64, off, destoff) + len - 1;
+
+ unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
+ } else {
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
+ destoff + len - 1);
+ }
+ /*
+ * Truncate page cache pages so that future reads will see the cloned
+ * data immediately and not the previous data.
+ */
+ truncate_inode_pages_range(&inode->i_data, destoff,
+ PAGE_CACHE_ALIGN(destoff + len) - 1);
out_unlock:
if (!same_inode) {
if (inode < src) {
@@ -4902,6 +5291,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_TREE_SEARCH:
return btrfs_ioctl_tree_search(file, argp);
+ case BTRFS_IOC_TREE_SEARCH_V2:
+ return btrfs_ioctl_tree_search_v2(file, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(file, argp);
case BTRFS_IOC_INO_PATHS:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 01277b8f2373..5665d2149249 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,14 +33,14 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
*/
void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
- }
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
if (rw == BTRFS_WRITE_LOCK) {
if (atomic_read(&eb->blocking_writers) == 0) {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -65,14 +65,15 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
- }
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+
if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_writers) != 1);
write_lock(&eb->lock);
@@ -99,6 +100,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
again:
+ BUG_ON(!atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner);
+
read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers) &&
current->pid == eb->lock_owner) {
@@ -132,7 +136,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
if (atomic_read(&eb->blocking_writers))
return 0;
- read_lock(&eb->lock);
+ if (!read_trylock(&eb->lock))
+ return 0;
+
if (atomic_read(&eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
@@ -151,7 +157,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers))
return 0;
- write_lock(&eb->lock);
+
+ if (!write_trylock(&eb->lock))
+ return 0;
+
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers)) {
write_unlock(&eb->lock);
@@ -168,14 +177,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
@@ -189,14 +199,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
@@ -244,6 +255,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
BUG_ON(blockers > 1);
btrfs_assert_tree_locked(eb);
+ eb->lock_owner = 0;
atomic_dec(&eb->write_locks);
if (blockers) {
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index b47f669aca75..dfad8514f0da 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -143,7 +143,7 @@ static int lzo_compress_pages(struct list_head *ws,
if (ret != LZO_E_OK) {
printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
- ret = -1;
+ ret = -EIO;
goto out;
}
@@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws,
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -208,7 +208,7 @@ static int lzo_compress_pages(struct list_head *ws,
/* we're making it bigger, give up */
if (tot_in > 8192 && tot_in < tot_out) {
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -335,7 +335,7 @@ cont:
break;
if (page_in_index + 1 >= total_pages_in) {
- ret = -1;
+ ret = -EIO;
goto done;
}
@@ -358,7 +358,7 @@ cont:
kunmap(pages_in[page_in_index - 1]);
if (ret != LZO_E_OK) {
printk(KERN_WARNING "BTRFS: decompress failed\n");
- ret = -1;
+ ret = -EIO;
break;
}
@@ -402,12 +402,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
printk(KERN_WARNING "BTRFS: decompress failed!\n");
- ret = -1;
+ ret = -EIO;
goto out;
}
if (out_len < start_byte) {
- ret = -1;
+ ret = -EIO;
goto out;
}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a94b05f72869..e12441c7cf1d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
- "%llu\n", offset);
+ "%llu", offset);
}
/*
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 2cf905877aaf..98cb6b2630f9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -32,6 +32,7 @@
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
+#include "qgroup.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -84,8 +85,8 @@ struct btrfs_qgroup {
/*
* temp variables for accounting operations
*/
- u64 tag;
- u64 refcnt;
+ u64 old_refcnt;
+ u64 new_refcnt;
};
/*
@@ -98,6 +99,9 @@ struct btrfs_qgroup_list {
struct btrfs_qgroup *member;
};
+/*
+ * Round-trip a qgroup pointer through the u64 aux field of a ulist node.
+ * The argument is parenthesized so the casts apply to the whole expression
+ * even when a caller passes something other than a plain identifier.
+ */
+#define ptr_to_u64(x) ((u64)(uintptr_t)(x))
+#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)(x))
+
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
@@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info,
return -ENOENT;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/*
+ * Sanity-test helper: check that qgroup @qgroupid exists and that its rfer
+ * and excl counters equal @rfer and @excl.
+ *
+ * Returns 0 on a match, -EINVAL if the qgroup is not found or either counter
+ * differs.
+ */
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+			       u64 rfer, u64 excl)
+{
+	struct btrfs_qgroup *qg = find_qgroup_rb(fs_info, qgroupid);
+
+	if (!qg || qg->rfer != rfer || qg->excl != excl)
+		return -EINVAL;
+
+	return 0;
+}
+#endif
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
+ return 0;
+#endif
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -669,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
int ret;
int slot;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
key.objectid = 0;
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroup->qgroupid;
@@ -1174,33 +1201,198 @@ out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
+/*
+ * Three-way comparison of two qgroup operations; insert_qgroup_oper() uses it
+ * to order fs_info->qgroup_op_tree.
+ *
+ * Keys are compared in this order: bytenr, seq, ref_root, type. Returns -1 if
+ * @oper1 sorts before @oper2, 1 if it sorts after, and 0 if all four keys
+ * are equal.
+ */
+static int comp_oper(struct btrfs_qgroup_operation *oper1,
+		     struct btrfs_qgroup_operation *oper2)
+{
+	if (oper1->bytenr < oper2->bytenr)
+		return -1;
+	if (oper1->bytenr > oper2->bytenr)
+		return 1;
+	if (oper1->seq < oper2->seq)
+		return -1;
+	/* A larger seq must sort after, otherwise the order is inconsistent. */
+	if (oper1->seq > oper2->seq)
+		return 1;
+	if (oper1->ref_root < oper2->ref_root)
+		return -1;
+	if (oper1->ref_root > oper2->ref_root)
+		return 1;
+	if (oper1->type < oper2->type)
+		return -1;
+	if (oper1->type > oper2->type)
+		return 1;
+	return 0;
+}
+
+/*
+ * Insert @oper into fs_info->qgroup_op_tree, ordered by comp_oper().
+ *
+ * The tree is walked and modified under fs_info->qgroup_op_lock. Returns 0
+ * on success, or -EEXIST if an operation that compares equal is already in
+ * the tree, in which case @oper is not linked.
+ */
+static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
+			      struct btrfs_qgroup_operation *oper)
+{
+	struct rb_node **link;
+	struct rb_node *parent = NULL;
+	struct btrfs_qgroup_operation *entry;
+	int order;
+	int ret = 0;
+
+	spin_lock(&fs_info->qgroup_op_lock);
+	link = &fs_info->qgroup_op_tree.rb_node;
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct btrfs_qgroup_operation, n);
+		order = comp_oper(entry, oper);
+		if (order == 0) {
+			ret = -EEXIST;
+			goto unlock;
+		}
+		/* Existing entry sorts first: descend right, otherwise left. */
+		link = order < 0 ? &parent->rb_right : &parent->rb_left;
+	}
+	rb_link_node(&oper->n, parent, link);
+	rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
+unlock:
+	spin_unlock(&fs_info->qgroup_op_lock);
+	return ret;
+}
/*
- * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
- * the modification into a list that's later used by btrfs_end_transaction to
- * pass the recorded modifications on to btrfs_qgroup_account_ref.
+ * Record a quota operation for processing later on.
+ * @trans: the transaction we are adding the delayed op to.
+ * @fs_info: the fs_info for this fs.
+ * @ref_root: the root of the reference we are acting on.
+ * @bytenr: the bytenr we are acting on.
+ * @num_bytes: the number of bytes in the reference.
+ * @type: the type of operation this is.
+ * @mod_seq: do we need to get a sequence number for looking up roots.
+ *
+ * We just add it to our trans qgroup_ref_list and carry on and process these
+ * operations in order at some later point. If the reference root isn't a fs
+ * root then we don't bother with doing anything.
+ *
+ * Returns 0 on success or when there is nothing to record, -ENOMEM if the
+ * operation cannot be allocated, or the error from insert_qgroup_oper().
+ *
+ * MUST BE HOLDING THE REF LOCK.
*/
int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 bytenr, u64 num_bytes,
+ enum btrfs_qgroup_operation_type type, int mod_seq)
{
- struct qgroup_update *u;
+ struct btrfs_qgroup_operation *oper;
+ int ret;
+
+ /* Only fs tree roots are recorded, and only while quotas are enabled. */
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ return 0;
- BUG_ON(!trans->delayed_ref_elem.seq);
- u = kmalloc(sizeof(*u), GFP_NOFS);
- if (!u)
+ oper = kmalloc(sizeof(*oper), GFP_NOFS);
+ if (!oper)
return -ENOMEM;
- u->node = node;
- u->extent_op = extent_op;
- list_add_tail(&u->list, &trans->qgroup_ref_list);
+ oper->ref_root = ref_root;
+ oper->bytenr = bytenr;
+ oper->num_bytes = num_bytes;
+ oper->type = type;
+ /* Each operation takes the next value of fs_info->qgroup_op_seq. */
+ oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
+ INIT_LIST_HEAD(&oper->elem.list);
+ oper->elem.seq = 0;
+ ret = insert_qgroup_oper(fs_info, oper);
+ if (ret) {
+ /* Shouldn't happen so have an assert for developers */
+ ASSERT(0);
+ kfree(oper);
+ return ret;
+ }
+ /* Queue on the transaction only after the tree insertion succeeded. */
+ list_add_tail(&oper->list, &trans->qgroup_ref_list);
+
+ /* Take a tree mod sequence number only if the caller asked for one. */
+ if (mod_seq)
+ btrfs_get_tree_mod_seq(fs_info, &oper->elem);
return 0;
}
-static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- u64 seq)
+/*
+ * The easy accounting: if we are adding/removing the only ref for an extent
+ * then this qgroup and all of its parent qgroups get their reference and
+ * exclusive counts adjusted by oper->num_bytes.
+ *
+ * The counters are updated under fs_info->qgroup_lock. Returns 0 on success,
+ * including when there is no quota root or no qgroup for oper->ref_root,
+ * -ENOMEM if the temporary ulist cannot be allocated, or a negative error
+ * from ulist_add().
+ */
+static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+				  struct btrfs_qgroup_operation *oper)
+{
+	struct btrfs_qgroup *qgroup;
+	struct ulist *tmp;
+	struct btrfs_qgroup_list *glist;
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	int sign = 0;
+	int ret = 0;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+
+	spin_lock(&fs_info->qgroup_lock);
+	if (!fs_info->quota_root)
+		goto out;
+	qgroup = find_qgroup_rb(fs_info, oper->ref_root);
+	if (!qgroup)
+		goto out;
+	switch (oper->type) {
+	case BTRFS_QGROUP_OPER_ADD_EXCL:
+		sign = 1;
+		break;
+	case BTRFS_QGROUP_OPER_SUB_EXCL:
+		sign = -1;
+		break;
+	default:
+		ASSERT(0);
+	}
+	qgroup->rfer += sign * oper->num_bytes;
+	qgroup->rfer_cmpr += sign * oper->num_bytes;
+
+	/* Warn if a removal would take excl below zero. */
+	WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
+	qgroup->excl += sign * oper->num_bytes;
+	qgroup->excl_cmpr += sign * oper->num_bytes;
+
+	qgroup_dirty(fs_info, qgroup);
+
+	/* Get all of the parent groups that contain this qgroup */
+	list_for_each_entry(glist, &qgroup->groups, next_group) {
+		ret = ulist_add(tmp, glist->group->qgroupid,
+				ptr_to_u64(glist->group), GFP_ATOMIC);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* Iterate all of the parents and adjust their reference counts */
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(tmp, &uiter))) {
+		qgroup = u64_to_ptr(unode->aux);
+		qgroup->rfer += sign * oper->num_bytes;
+		qgroup->rfer_cmpr += sign * oper->num_bytes;
+		/*
+		 * Check before the subtraction, as for the leaf qgroup above;
+		 * checking afterwards compares the already reduced value.
+		 */
+		WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
+		qgroup->excl += sign * oper->num_bytes;
+		qgroup->excl_cmpr += sign * oper->num_bytes;
+		qgroup_dirty(fs_info, qgroup);
+
+		/* Add any parents of the parents */
+		list_for_each_entry(glist, &qgroup->groups, next_group) {
+			ret = ulist_add(tmp, glist->group->qgroupid,
+					ptr_to_u64(glist->group), GFP_ATOMIC);
+			if (ret < 0)
+				goto out;
+		}
+	}
+	ret = 0;
+out:
+	spin_unlock(&fs_info->qgroup_lock);
+	ulist_free(tmp);
+	return ret;
+}
+
+/*
+ * Walk all of the roots that pointed to our bytenr and adjust their refcnts
+ * properly.
+ */
+static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
+ u64 root_to_skip, struct ulist *tmp,
+ struct ulist *roots, struct ulist *qgroups,
+ u64 seq, int *old_roots, int rescan)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -1211,256 +1403,551 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
+ /* We don't count our current root here */
+ if (unode->val == root_to_skip)
+ continue;
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
+ /*
+ * We could have a pending removal of this same ref so we may
+ * not have actually found our ref root when doing
+ * btrfs_find_all_roots, so we need to keep track of how many
+ * old roots we find in case we removed ours and added a
+ * different one at the same time. I don't think this could
+ * happen in practice but that sort of thinking leads to pain
+ * and suffering and to the dark side.
+ */
+ (*old_roots)++;
ulist_reinit(tmp);
- /* XXX id not needed */
- ret = ulist_add(tmp, qg->qgroupid,
- (u64)(uintptr_t)qg, GFP_ATOMIC);
+ ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
if (ret < 0)
return ret;
ULIST_ITER_INIT(&tmp_uiter);
while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
- if (qg->refcnt < seq)
- qg->refcnt = seq + 1;
+ qg = u64_to_ptr(tmp_unode->aux);
+ /*
+ * We use this sequence number to keep from having to
+ * run the whole list and 0 out the refcnt every time.
+ * We basically use sequence as the known 0 count and
+ * then add 1 every time we see a qgroup. This is how we
+ * get how many of the roots actually point up to the
+ * upper level qgroups in order to determine exclusive
+ * counts.
+ *
+ * For rescan we want to set old_refcnt to seq so our
+ * exclusive calculations end up correct.
+ */
+ if (rescan)
+ qg->old_refcnt = seq;
+ else if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
else
- ++qg->refcnt;
+ qg->old_refcnt++;
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
ret = ulist_add(tmp, glist->group->qgroupid,
- (u64)(uintptr_t)glist->group,
+ ptr_to_u64(glist->group),
GFP_ATOMIC);
if (ret < 0)
return ret;
}
}
}
+ return 0;
+}
+
+/*
+ * We need to walk forward in our operation tree and account for any roots that
+ * were deleted after we made this operation.
+ *
+ * @oper is the operation being processed; only operations that follow it in
+ * fs_info->qgroup_op_tree and share its bytenr are examined. @tmp is
+ * reinitialized and used as the work list, @qgroups collects every qgroup
+ * touched, @seq is the base value that counts as a zero refcnt, and
+ * *@old_roots is bumped once per removed root whose qgroup was not yet in
+ * @qgroups.
+ *
+ * Returns 0 on success or a negative error from ulist_add().
+ */
+static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper,
+ struct ulist *tmp,
+ struct ulist *qgroups, u64 seq,
+ int *old_roots)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_operation *tmp_oper;
+ struct rb_node *n;
+ int ret;
+
+ ulist_reinit(tmp);
+ /*
+ * We only walk forward in the tree since we're only interested in
+ * removals that happened _after_ our operation.
+ */
+ /* qgroup_op_lock is held only while stepping to the next node. */
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = rb_next(&oper->n);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ if (!n)
+ return 0;
+ tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+ /* Stop at the first operation that refers to a different bytenr. */
+ while (tmp_oper->bytenr == oper->bytenr) {
+ /*
+ * If it's not a removal we don't care, additions work out
+ * properly with our refcnt tracking.
+ */
+ if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
+ tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
+ goto next;
+ qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
+ if (!qg)
+ goto next;
+ ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret) {
+ if (ret < 0)
+ return ret;
+ /*
+ * We only want to increase old_roots if this qgroup is
+ * not already in the list of qgroups. If it is already
+ * there then that means it must have been re-added or
+ * the delete will be discarded because we had an
+ * existing ref that we haven't looked up yet. In this
+ * case we don't want to increase old_roots. So if ret
+ * == 1 then we know that this is the first time we've
+ * seen this qgroup and we can bump the old_roots.
+ */
+ (*old_roots)++;
+ ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+next:
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = rb_next(&tmp_oper->n);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ if (!n)
+ break;
+ tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+ }
+
+ /* Ok now process the qgroups we found */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+ /* A refcnt below seq counts as zero, so it restarts at seq + 1. */
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ /* Queue the parent qgroups so they are counted as well. */
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
return 0;
}
-static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- u64 seq, int sgn, u64 num_bytes,
- struct btrfs_qgroup *qgroup)
+/*
+ * Account the reference described by @oper for @qgroup and all of its parent
+ * qgroups: new_refcnt is bumped for BTRFS_QGROUP_OPER_ADD_SHARED, old_refcnt
+ * for any other operation type. Every qgroup visited is also added to
+ * @qgroups. @tmp is reinitialized and used as the work list, and @seq is the
+ * base value that counts as a zero refcnt.
+ *
+ * Returns 0 on success or a negative error from ulist_add().
+ */
+static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper,
+ struct btrfs_qgroup *qgroup,
+ struct ulist *tmp, struct ulist *qgroups,
+ u64 seq)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
- struct btrfs_qgroup_list *glist;
int ret;
ulist_reinit(tmp);
- ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
+ ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
+ GFP_ATOMIC);
if (ret < 0)
return ret;
-
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(tmp, &uiter))) {
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
- if (qg->refcnt < seq) {
- /* not visited by step 1 */
- qg->rfer += sgn * num_bytes;
- qg->rfer_cmpr += sgn * num_bytes;
- if (roots->nnodes == 0) {
- qg->excl += sgn * num_bytes;
- qg->excl_cmpr += sgn * num_bytes;
- }
- qgroup_dirty(fs_info, qg);
- }
- WARN_ON(qg->tag >= seq);
- qg->tag = seq;
+ struct btrfs_qgroup_list *glist;
+ qg = u64_to_ptr(unode->aux);
+ /* A refcnt below seq counts as zero, so it restarts at seq + 1. */
+ if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ } else {
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+ }
+ /* Queue the parent qgroups so they are counted as well. */
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group, GFP_ATOMIC);
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
if (ret < 0)
return ret;
}
}
-
return 0;
}
-static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- u64 seq, int sgn, u64 num_bytes)
+/*
+ * This adjusts the counters for all referenced qgroups if need be.
+ */
+static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
+ u64 root_to_skip, u64 num_bytes,
+ struct ulist *qgroups, u64 seq,
+ int old_roots, int new_roots, int rescan)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
- struct ulist_node *tmp_unode;
- struct ulist_iterator tmp_uiter;
- int ret;
+ u64 cur_new_count, cur_old_count;
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- qg = find_qgroup_rb(fs_info, unode->val);
- if (!qg)
- continue;
+ while ((unode = ulist_next(qgroups, &uiter))) {
+ bool dirty = false;
- ulist_reinit(tmp);
- ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
- if (ret < 0)
- return ret;
+ qg = u64_to_ptr(unode->aux);
+ /*
+ * Wasn't referenced before but is now, add to the reference
+ * counters.
+ */
+ if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+ qg->rfer += num_bytes;
+ qg->rfer_cmpr += num_bytes;
+ dirty = true;
+ }
- ULIST_ITER_INIT(&tmp_uiter);
- while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
- struct btrfs_qgroup_list *glist;
+ /*
+ * Was referenced before but isn't now, subtract from the
+ * reference counters.
+ */
+ if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+ qg->rfer -= num_bytes;
+ qg->rfer_cmpr -= num_bytes;
+ dirty = true;
+ }
- qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
- if (qg->tag == seq)
- continue;
+ if (qg->old_refcnt < seq)
+ cur_old_count = 0;
+ else
+ cur_old_count = qg->old_refcnt - seq;
+ if (qg->new_refcnt < seq)
+ cur_new_count = 0;
+ else
+ cur_new_count = qg->new_refcnt - seq;
- if (qg->refcnt - seq == roots->nnodes) {
- qg->excl -= sgn * num_bytes;
- qg->excl_cmpr -= sgn * num_bytes;
- qgroup_dirty(fs_info, qg);
- }
+ /*
+ * If our refcount was the same as the roots previously but our
+ * new count isn't the same as the number of roots now then we
+ * went from having an exclusive reference on this range to not.
+ */
+ if (old_roots && cur_old_count == old_roots &&
+ (cur_new_count != new_roots || new_roots == 0)) {
+ WARN_ON(cur_new_count != new_roots && new_roots == 0);
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group,
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
+ /*
+ * If we didn't reference all the roots before but now we do we
+ * have an exclusive reference to this range.
+ */
+ if ((!old_roots || (old_roots && cur_old_count != old_roots))
+ && cur_new_count == new_roots) {
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
}
- }
+ if (dirty)
+ qgroup_dirty(fs_info, qg);
+ }
return 0;
}
/*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
+ * If we removed a data extent and there were other references for that bytenr
+ * then we need to lookup all referenced roots to make sure we still don't
+ * reference this bytenr. If we do then we can just discard this operation.
*/
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op)
+static int check_existing_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
{
- struct btrfs_root *quota_root;
- u64 ref_root;
- struct btrfs_qgroup *qgroup;
struct ulist *roots = NULL;
- u64 seq;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
int ret = 0;
- int sgn;
- if (!fs_info->quota_enabled)
- return 0;
-
- BUG_ON(!fs_info->quota_root);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+ oper->elem.seq, &roots);
+ if (ret < 0)
+ return ret;
+ ret = 0;
- if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
- node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- struct btrfs_delayed_tree_ref *ref;
- ref = btrfs_delayed_node_to_tree_ref(node);
- ref_root = ref->root;
- } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
- node->type == BTRFS_SHARED_DATA_REF_KEY) {
- struct btrfs_delayed_data_ref *ref;
- ref = btrfs_delayed_node_to_data_ref(node);
- ref_root = ref->root;
- } else {
- BUG();
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ if (unode->val == oper->ref_root) {
+ ret = 1;
+ break;
+ }
}
+ ulist_free(roots);
+ btrfs_put_tree_mod_seq(fs_info, &oper->elem);
- if (!is_fstree(ref_root)) {
- /*
- * non-fs-trees are not being accounted
- */
- return 0;
- }
+ return ret;
+}
- switch (node->action) {
- case BTRFS_ADD_DELAYED_REF:
- case BTRFS_ADD_DELAYED_EXTENT:
- sgn = 1;
- seq = btrfs_tree_mod_seq_prev(node->seq);
- break;
- case BTRFS_DROP_DELAYED_REF:
- sgn = -1;
- seq = node->seq;
- break;
- case BTRFS_UPDATE_DELAYED_HEAD:
- return 0;
- default:
- BUG();
- }
+/*
+ * If we share a reference across multiple roots then we may need to adjust
+ * various qgroups referenced and exclusive counters. The basic premise is this
+ *
+ * 1) We have seq to represent a 0 count. Instead of looping through all of the
+ * qgroups and resetting their refcount to 0 we just constantly bump this
+ * sequence number to act as the base reference count. This means that if
+ * anybody is equal to or below this sequence they were never referenced. We
+ * jack this sequence up by the number of roots we found each time in order to
+ * make sure we don't have any overlap.
+ *
+ * 2) We first search all the roots that reference the area _except_ the root
+ * we're acting on currently. This makes up the old_refcnt of all the qgroups
+ * before.
+ *
+ * 3) We walk all of the qgroups referenced by the root we are currently acting
+ * on, and will either adjust old_refcnt in the case of a removal or the
+ * new_refcnt in the case of an addition.
+ *
+ * 4) Finally we walk all the qgroups that are referenced by this range
+ * including the root we are acting on currently. We will adjust the counters
+ * based on the number of roots we had and will have after this operation.
+ *
+ * Take this example as an illustration
+ *
+ * [qgroup 1/0]
+ * / | \
+ * [qg 0/0] [qg 0/1] [qg 0/2]
+ * \ | /
+ * [ extent ]
+ *
+ * Say we are adding a reference that is covered by qg 0/0. The first step
+ * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
+ * old_roots being 2. Because it is adding new_roots will be 1. We then go
+ * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
+ * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
+ * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
+ * reference and thus must add the size to the referenced bytes. Everything
+ * else is the same so nothing else changes.
+ */
+static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *roots = NULL;
+ struct ulist *qgroups, *tmp;
+ struct btrfs_qgroup *qgroup;
+ struct seq_list elem = {};
+ u64 seq;
+ int old_roots = 0;
+ int new_roots = 0;
+ int ret = 0;
- mutex_lock(&fs_info->qgroup_rescan_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
- mutex_unlock(&fs_info->qgroup_rescan_lock);
+ if (oper->elem.seq) {
+ ret = check_existing_refs(trans, fs_info, oper);
+ if (ret < 0)
+ return ret;
+ if (ret)
return 0;
- }
}
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- /*
- * the delayed ref sequence number we pass depends on the direction of
- * the operation. for add operations, we pass
- * tree_mod_log_prev_seq(node->seq) to skip
- * the delayed ref's current sequence number, because we need the state
- * of the tree before the add operation. for delete operations, we pass
- * (node->seq) to include the delayed ref's current sequence number,
- * because we need the state of the tree after the delete operation.
- */
- ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
- if (ret < 0)
- return ret;
-
- spin_lock(&fs_info->qgroup_lock);
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups)
+ return -ENOMEM;
- quota_root = fs_info->quota_root;
- if (!quota_root)
- goto unlock;
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp) {
+ ulist_free(qgroups);
+ return -ENOMEM;
+ }
- qgroup = find_qgroup_rb(fs_info, ref_root);
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
+ &roots);
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ if (ret < 0) {
+ ulist_free(qgroups);
+ ulist_free(tmp);
+ return ret;
+ }
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, oper->ref_root);
if (!qgroup)
- goto unlock;
+ goto out;
+ seq = fs_info->qgroup_seq;
/*
- * step 1: for each old ref, visit all nodes once and inc refcnt
+ * So roots is the list of all the roots currently pointing at the
+ * bytenr, including the ref we are adding if we are adding, or not if
+ * we are removing a ref. So we pass in the ref_root to skip that root
+ * in our calculations. We set old_refcnt and new_refcnt cause who the
+ * hell knows what everything looked like before, and it doesn't matter
+ * except...
*/
- ulist_reinit(fs_info->qgroup_ulist);
- seq = fs_info->qgroup_seq;
- fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+ ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
+ seq, &old_roots, 0);
+ if (ret < 0)
+ goto out;
- ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
- seq);
- if (ret)
- goto unlock;
+ /*
+ * Now adjust the refcounts of the qgroups that care about this
+ * reference, either the old_count in the case of removal or new_count
+ * in the case of an addition.
+ */
+ ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
+ seq);
+ if (ret < 0)
+ goto out;
/*
- * step 2: walk from the new root
+ * ...in the case of removals. If we had a removal before we got around
+ * to processing this operation then we need to find that guy and count
+ * his references as if they really existed so we don't end up screwing
+ * up the exclusive counts. Then whenever we go to process the delete
+ * everything will be grand and we can account for whatever exclusive
+ * changes need to be made there. We also have to pass in old_roots so
+ * we have an accurate count of the roots as it pertains to this
+ * operation's view of the world.
*/
- ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
- seq, sgn, node->num_bytes, qgroup);
- if (ret)
- goto unlock;
+ ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
+ &old_roots);
+ if (ret < 0)
+ goto out;
/*
- * step 3: walk again from old refs
+ * We are adding our root, need to adjust up the number of roots,
+ * otherwise old_roots is the number of roots we want.
*/
- ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
- seq, sgn, node->num_bytes);
- if (ret)
- goto unlock;
+ if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+ new_roots = old_roots + 1;
+ } else {
+ new_roots = old_roots;
+ old_roots++;
+ }
+ fs_info->qgroup_seq += old_roots + 1;
-unlock:
+
+ /*
+ * And now the magic happens, bless Arne for having a pretty elegant
+ * solution for this.
+ */
+ qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
+ qgroups, seq, old_roots, new_roots, 0);
+out:
spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(qgroups);
ulist_free(roots);
+ ulist_free(tmp);
+ return ret;
+}
+
+/*
+ * btrfs_qgroup_account is called for each qgroup operation queued on the
+ * transaction. Extents that a running rescan has not reached yet are skipped;
+ * otherwise the operation is dispatched on oper->type to either the exclusive
+ * or the shared accounting helper.
+ */
+static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ int ret = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ BUG_ON(!fs_info->quota_root);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ ASSERT(is_fstree(oper->ref_root));
+
+ switch (oper->type) {
+ case BTRFS_QGROUP_OPER_ADD_EXCL:
+ case BTRFS_QGROUP_OPER_SUB_EXCL:
+ ret = qgroup_excl_accounting(fs_info, oper);
+ break;
+ case BTRFS_QGROUP_OPER_ADD_SHARED:
+ case BTRFS_QGROUP_OPER_SUB_SHARED:
+ ret = qgroup_shared_accounting(trans, fs_info, oper);
+ break;
+ default:
+ ASSERT(0);
+ }
+ return ret;
+}
+
+/*
+ * Needs to be called every time we run delayed refs, even if there is an error
+ * in order to cleanup outstanding operations.
+ */
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_qgroup_operation *oper;
+ int ret = 0;
+ while (!list_empty(&trans->qgroup_ref_list)) {
+ oper = list_first_entry(&trans->qgroup_ref_list,
+ struct btrfs_qgroup_operation, list);
+ list_del_init(&oper->list);
+ if (!ret || !trans->aborted)
+ ret = btrfs_qgroup_account(trans, fs_info, oper);
+ spin_lock(&fs_info->qgroup_op_lock);
+ rb_erase(&oper->n, &fs_info->qgroup_op_tree);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+ kfree(oper);
+ }
return ret;
}
@@ -1629,8 +2116,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
srcgroup = find_qgroup_rb(fs_info, srcid);
if (!srcgroup)
goto unlock;
- dstgroup->rfer = srcgroup->rfer - level_size;
- dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
+
+ /*
+ * We call inherit after we clone the root in order to make sure
+ * our counts don't go crazy, so at this point the only
+ * difference between the two roots should be the root node.
+ */
+ dstgroup->rfer = srcgroup->rfer;
+ dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
+ dstgroup->excl = level_size;
+ dstgroup->excl_cmpr = level_size;
srcgroup->excl = level_size;
srcgroup->excl_cmpr = level_size;
qgroup_dirty(fs_info, dstgroup);
@@ -1734,7 +2229,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ qg = u64_to_ptr(unode->aux);
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qg->reserved + (s64)qg->rfer + num_bytes >
@@ -1766,7 +2261,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ qg = u64_to_ptr(unode->aux);
qg->reserved += num_bytes;
}
@@ -1812,7 +2307,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ qg = u64_to_ptr(unode->aux);
qg->reserved -= num_bytes;
@@ -1848,15 +2343,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans, struct ulist *tmp,
- struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans, struct ulist *qgroups,
+ struct ulist *tmp, struct extent_buffer *scratch_leaf)
{
struct btrfs_key found;
struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
struct seq_list tree_mod_seq_elem = {};
+ u64 num_bytes;
u64 seq;
+ int new_roots;
int slot;
int ret;
@@ -1897,8 +2392,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
mutex_unlock(&fs_info->qgroup_rescan_lock);
for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
- u64 num_bytes;
-
btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
if (found.type != BTRFS_EXTENT_ITEM_KEY &&
found.type != BTRFS_METADATA_ITEM_KEY)
@@ -1908,76 +2401,34 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
else
num_bytes = found.offset;
- ret = btrfs_find_all_roots(trans, fs_info, found.objectid,
- tree_mod_seq_elem.seq, &roots);
+ ulist_reinit(qgroups);
+ ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
+ &roots);
if (ret < 0)
goto out;
spin_lock(&fs_info->qgroup_lock);
seq = fs_info->qgroup_seq;
fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
- ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq);
- if (ret) {
+ new_roots = 0;
+ ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
+ seq, &new_roots, 1);
+ if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
ulist_free(roots);
goto out;
}
- /*
- * step2 of btrfs_qgroup_account_ref works from a single root,
- * we're doing all at once here.
- */
- ulist_reinit(tmp);
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- struct btrfs_qgroup *qg;
-
- qg = find_qgroup_rb(fs_info, unode->val);
- if (!qg)
- continue;
-
- ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg,
- GFP_ATOMIC);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
- goto out;
- }
- }
-
- /* this loop is similar to step 2 of btrfs_qgroup_account_ref */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_list *glist;
-
- qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
- qg->rfer += num_bytes;
- qg->rfer_cmpr += num_bytes;
- WARN_ON(qg->tag >= seq);
- if (qg->refcnt - seq == roots->nnodes) {
- qg->excl += num_bytes;
- qg->excl_cmpr += num_bytes;
- }
- qgroup_dirty(fs_info, qg);
-
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group,
- GFP_ATOMIC);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
- goto out;
- }
- }
+ ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
+ seq, 0, new_roots, 1);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ goto out;
}
-
spin_unlock(&fs_info->qgroup_lock);
ulist_free(roots);
- ret = 0;
}
-
out:
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
@@ -1990,13 +2441,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *tmp = NULL;
+ struct ulist *tmp = NULL, *qgroups = NULL;
struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
path = btrfs_alloc_path();
if (!path)
goto out;
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups)
+ goto out;
tmp = ulist_alloc(GFP_NOFS);
if (!tmp)
goto out;
@@ -2015,7 +2469,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = -EINTR;
} else {
err = qgroup_rescan_leaf(fs_info, path, trans,
- tmp, scratch_leaf);
+ qgroups, tmp, scratch_leaf);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2025,6 +2479,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
out:
kfree(scratch_leaf);
+ ulist_free(qgroups);
ulist_free(tmp);
btrfs_free_path(path);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
new file mode 100644
index 000000000000..5952ff1fbd7a
--- /dev/null
+++ b/fs/btrfs/qgroup.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_QGROUP__
+#define __BTRFS_QGROUP__
+
+/*
+ * A description of the operations, all of these operations only happen when we
+ * are adding the 1st reference for that subvolume in the case of adding space
+ * or on the last reference delete in the case of subtraction. The only
+ * exception is the last one, which is added for confusion.
+ *
+ * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
+ * one pointing at the bytes we are adding. This is called on the first
+ * allocation.
+ *
+ * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
+ * shared between subvols. This is called on the creation of a ref that already
+ * has refs from a different subvolume, so basically reflink.
+ *
+ * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
+ * one referencing the range.
+ *
+ * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares
+ * refs with other subvolumes.
+ */
+enum btrfs_qgroup_operation_type {
+ BTRFS_QGROUP_OPER_ADD_EXCL,
+ BTRFS_QGROUP_OPER_ADD_SHARED,
+ BTRFS_QGROUP_OPER_SUB_EXCL,
+ BTRFS_QGROUP_OPER_SUB_SHARED,
+};
+
+struct btrfs_qgroup_operation {
+ u64 ref_root;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 seq;
+ enum btrfs_qgroup_operation_type type;
+ struct seq_list elem;
+ struct rb_node n;
+ struct list_head list;
+};
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 bytenr, u64 num_bytes,
+ enum btrfs_qgroup_operation_type type,
+ int mod_seq);
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+ struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl);
+#endif
+
+#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 30947f923620..09230cf3a244 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -428,8 +428,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
continue;
}
if (!dev->bdev) {
- /* cannot read ahead on missing device */
- continue;
+ /*
+ * cannot read ahead on missing device, but for RAID5/6,
+ * REQ_GET_READ_MIRRORS returns 1. So don't skip missing
+ * device for such case.
+ */
+ if (nzones > 1)
+ continue;
}
if (dev_replace_is_ongoing &&
dev == fs_info->dev_replace.tgtdev) {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 7f92ab1daa87..65245a07275b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -337,7 +337,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
if (bnode->root)
fs_info = bnode->root->fs_info;
btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
- "found at offset %llu\n", bytenr);
+ "found at offset %llu", bytenr);
}
/*
@@ -528,7 +528,7 @@ static int should_ignore_root(struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
reloc_root = root->reloc_root;
@@ -610,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc,
root = read_fs_root(rc->extent_root->fs_info, root_objectid);
BUG_ON(IS_ERR(root));
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
generation != btrfs_root_generation(&root->root_item))
return NULL;
@@ -887,7 +887,7 @@ again:
goto out;
}
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
cur->cowonly = 1;
if (btrfs_root_level(&root->root_item) == cur->level) {
@@ -954,7 +954,8 @@ again:
upper->bytenr = eb->start;
upper->owner = btrfs_header_owner(eb);
upper->level = lower->level + 1;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state))
upper->cowonly = 1;
/*
@@ -1258,7 +1259,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
if (rb_node) {
btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
"for start=%llu while inserting into relocation "
- "tree\n", node->bytenr);
+ "tree", node->bytenr);
kfree(node);
return -EEXIST;
}
@@ -2441,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
next = walk_up_backref(next, edges, &index);
root = next->root;
BUG_ON(!root);
- BUG_ON(!root->ref_cows);
+ BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
record_reloc_root_in_trans(trans, root);
@@ -2506,7 +2507,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
BUG_ON(!root);
/* no other choice for non-references counted tree */
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return root;
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
@@ -2893,14 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
- if (!root || root->ref_cows) {
+ if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = reserve_metadata_space(trans, rc, node);
if (ret)
goto out;
}
if (root) {
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
BUG_ON(node->new_bytenr);
BUG_ON(!list_empty(&node->list));
btrfs_record_root_in_trans(trans, root);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 38bb47e7d6b1..360a728a639f 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -306,7 +306,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
break;
}
- root->orphan_item_inserted = 1;
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
if (err) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 0be77993378e..b6d198f5181e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -588,8 +588,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
- ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
- &ref_root, &ref_level);
+ ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
+ item_size, &ref_root,
+ &ref_level);
printk_in_rcu(KERN_WARNING
"BTRFS: %s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
@@ -717,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
out:
if (page)
put_page(page);
- if (inode)
- iput(inode);
+
+ iput(inode);
if (ret < 0)
return ret;
@@ -2724,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
- if (found_key.offset + length <= start) {
- key.offset = found_key.offset + length;
- btrfs_release_path(path);
- continue;
- }
+ if (found_key.offset + length <= start)
+ goto skip;
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2739,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* the chunk from going away while we scrub it
*/
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
+
+ /* some chunks are removed but not committed to disk yet,
+ * continue scrubbing */
+ if (!cache)
+ goto skip;
+
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
@@ -2801,7 +2801,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
-
+skip:
key.offset = found_key.offset + length;
btrfs_release_path(path);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 484aacac2c89..6528aa662181 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -975,7 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_dir_item *di;
struct btrfs_key di_key;
char *buf = NULL;
- const int buf_len = PATH_MAX;
+ int buf_len;
u32 name_len;
u32 data_len;
u32 cur;
@@ -985,6 +985,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
int num;
u8 type;
+ if (found_key->type == BTRFS_XATTR_ITEM_KEY)
+ buf_len = BTRFS_MAX_XATTR_SIZE(root);
+ else
+ buf_len = PATH_MAX;
+
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -1006,12 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
type = btrfs_dir_type(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- /*
- * Path too long
- */
- if (name_len + data_len > buf_len) {
- ret = -ENAMETOOLONG;
- goto out;
+ if (type == BTRFS_FT_XATTR) {
+ if (name_len > XATTR_NAME_MAX) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+ if (name_len + data_len > buf_len) {
+ ret = -E2BIG;
+ goto out;
+ }
+ } else {
+ /*
+ * Path too long
+ */
+ if (name_len + data_len > buf_len) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
}
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -1349,7 +1365,7 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = -EIO;
btrfs_err(sctx->send_root->fs_info, "did not find backref in "
"send_root. inode=%llu, offset=%llu, "
- "disk_byte=%llu found extent=%llu\n",
+ "disk_byte=%llu found extent=%llu",
ino, data_offset, disk_byte, found_key.objectid);
goto out;
}
@@ -1628,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
goto out;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+ if (key.type == BTRFS_ROOT_ITEM_KEY) {
+ ret = -ENOENT;
+ goto out;
+ }
*found_inode = key.objectid;
*found_type = btrfs_dir_type(path->nodes[0], di);
@@ -1693,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
goto out;
btrfs_release_path(path);
- ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL,
- NULL, NULL);
- if (ret < 0)
- goto out;
+ if (dir_gen) {
+ ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ }
*dir = parent_dir;
@@ -1712,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root,
int ret;
struct fs_path *tmp_name;
u64 tmp_dir;
- u64 tmp_dir_gen;
tmp_name = fs_path_alloc();
if (!tmp_name)
return -ENOMEM;
- ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
+ ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
if (ret < 0)
goto out;
@@ -2029,7 +2050,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
{
int ret;
int nce_ret;
- struct btrfs_path *path = NULL;
struct name_cache_entry *nce = NULL;
/*
@@ -2055,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
}
}
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
-
/*
* If the inode is not existent yet, add the orphan name and return 1.
* This should only happen for the parent dir that we determine in
@@ -2134,7 +2150,6 @@ out_cache:
name_cache_clean_unused(sctx);
out:
- btrfs_free_path(path);
return ret;
}
@@ -2945,7 +2960,9 @@ static void free_waiting_dir_move(struct send_ctx *sctx,
static int add_pending_dir_move(struct send_ctx *sctx,
u64 ino,
u64 ino_gen,
- u64 parent_ino)
+ u64 parent_ino,
+ struct list_head *new_refs,
+ struct list_head *deleted_refs)
{
struct rb_node **p = &sctx->pending_dir_moves.rb_node;
struct rb_node *parent = NULL;
@@ -2977,12 +2994,12 @@ static int add_pending_dir_move(struct send_ctx *sctx,
}
}
- list_for_each_entry(cur, &sctx->deleted_refs, list) {
+ list_for_each_entry(cur, deleted_refs, list) {
ret = dup_ref(cur, &pm->update_refs);
if (ret < 0)
goto out;
}
- list_for_each_entry(cur, &sctx->new_refs, list) {
+ list_for_each_entry(cur, new_refs, list) {
ret = dup_ref(cur, &pm->update_refs);
if (ret < 0)
goto out;
@@ -3025,6 +3042,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
return NULL;
}
+static int path_loop(struct send_ctx *sctx, struct fs_path *name,
+ u64 ino, u64 gen, u64 *ancestor_ino)
+{
+ int ret = 0;
+ u64 parent_inode = 0;
+ u64 parent_gen = 0;
+ u64 start_ino = ino;
+
+ *ancestor_ino = 0;
+ while (ino != BTRFS_FIRST_FREE_OBJECTID) {
+ fs_path_reset(name);
+
+ if (is_waiting_for_rm(sctx, ino))
+ break;
+ if (is_waiting_for_move(sctx, ino)) {
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret < 0)
+ break;
+ if (parent_inode == start_ino) {
+ ret = 1;
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ break;
+ }
+ ino = parent_inode;
+ gen = parent_gen;
+ }
+ return ret;
+}
+
static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
struct fs_path *from_path = NULL;
@@ -3036,6 +3095,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
struct waiting_dir_move *dm = NULL;
u64 rmdir_ino = 0;
int ret;
+ u64 ancestor = 0;
name = fs_path_alloc();
from_path = fs_path_alloc();
@@ -3054,34 +3114,33 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
if (ret < 0)
goto out;
- if (parent_ino == sctx->cur_ino) {
- /* child only renamed, not moved */
- ASSERT(parent_gen == sctx->cur_inode_gen);
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
- from_path);
- if (ret < 0)
- goto out;
- ret = fs_path_add_path(from_path, name);
- if (ret < 0)
- goto out;
- } else {
- /* child moved and maybe renamed too */
- sctx->send_progress = pm->ino;
- ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
+ ret = get_cur_path(sctx, parent_ino, parent_gen,
+ from_path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(from_path, name);
+ if (ret < 0)
+ goto out;
+
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+ if (ret) {
+ LIST_HEAD(deleted_refs);
+ ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
+ ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
+ &pm->update_refs, &deleted_refs);
if (ret < 0)
goto out;
- }
-
- fs_path_free(name);
- name = NULL;
-
- to_path = fs_path_alloc();
- if (!to_path) {
- ret = -ENOMEM;
+ if (rmdir_ino) {
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ dm->rmdir_ino = rmdir_ino;
+ }
goto out;
}
-
- sctx->send_progress = sctx->cur_ino + 1;
+ fs_path_reset(name);
+ to_path = name;
+ name = NULL;
ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
if (ret < 0)
goto out;
@@ -3205,127 +3264,74 @@ out:
static int wait_for_parent_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref)
{
- int ret;
+ int ret = 0;
u64 ino = parent_ref->dir;
u64 parent_ino_before, parent_ino_after;
- u64 old_gen;
struct fs_path *path_before = NULL;
struct fs_path *path_after = NULL;
int len1, len2;
- int register_upper_dirs;
- u64 gen;
-
- if (is_waiting_for_move(sctx, ino))
- return 1;
-
- if (parent_ref->dir <= sctx->cur_ino)
- return 0;
-
- ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
- NULL, NULL, NULL, NULL);
- if (ret == -ENOENT)
- return 0;
- else if (ret < 0)
- return ret;
-
- if (parent_ref->dir_gen != old_gen)
- return 0;
-
- path_before = fs_path_alloc();
- if (!path_before)
- return -ENOMEM;
-
- ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
- NULL, path_before);
- if (ret == -ENOENT) {
- ret = 0;
- goto out;
- } else if (ret < 0) {
- goto out;
- }
path_after = fs_path_alloc();
- if (!path_after) {
+ path_before = fs_path_alloc();
+ if (!path_after || !path_before) {
ret = -ENOMEM;
goto out;
}
- ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
- &gen, path_after);
- if (ret == -ENOENT) {
- ret = 0;
- goto out;
- } else if (ret < 0) {
- goto out;
- }
-
- len1 = fs_path_len(path_before);
- len2 = fs_path_len(path_after);
- if (parent_ino_before != parent_ino_after || len1 != len2 ||
- memcmp(path_before->start, path_after->start, len1)) {
- ret = 1;
- goto out;
- }
- ret = 0;
-
/*
- * Ok, our new most direct ancestor has a higher inode number but
- * wasn't moved/renamed. So maybe some of the new ancestors higher in
- * the hierarchy have an higher inode number too *and* were renamed
- * or moved - in this case we need to wait for the ancestor's rename
- * or move operation before we can do the move/rename for the current
- * inode.
+ * Our current directory inode may not yet be renamed/moved because some
+ * ancestor (immediate or not) has to be renamed/moved first. So find if
+ * such ancestor exists and make sure our own rename/move happens after
+ * that ancestor is processed.
*/
- register_upper_dirs = 0;
- ino = parent_ino_after;
-again:
- while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
- u64 parent_gen;
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = 1;
+ break;
+ }
fs_path_reset(path_before);
fs_path_reset(path_after);
ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
- &parent_gen, path_after);
+ NULL, path_after);
if (ret < 0)
goto out;
ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
NULL, path_before);
- if (ret == -ENOENT) {
- ret = 0;
- break;
- } else if (ret < 0) {
+ if (ret < 0 && ret != -ENOENT) {
goto out;
+ } else if (ret == -ENOENT) {
+ ret = 1;
+ break;
}
len1 = fs_path_len(path_before);
len2 = fs_path_len(path_after);
- if (parent_ino_before != parent_ino_after || len1 != len2 ||
- memcmp(path_before->start, path_after->start, len1)) {
+ if (ino > sctx->cur_ino &&
+ (parent_ino_before != parent_ino_after || len1 != len2 ||
+ memcmp(path_before->start, path_after->start, len1))) {
ret = 1;
- if (register_upper_dirs) {
- break;
- } else {
- register_upper_dirs = 1;
- ino = parent_ref->dir;
- gen = parent_ref->dir_gen;
- goto again;
- }
- } else if (register_upper_dirs) {
- ret = add_pending_dir_move(sctx, ino, gen,
- parent_ino_after);
- if (ret < 0 && ret != -EEXIST)
- goto out;
+ break;
}
-
ino = parent_ino_after;
- gen = parent_gen;
}
out:
fs_path_free(path_before);
fs_path_free(path_after);
+ if (ret == 1) {
+ ret = add_pending_dir_move(sctx,
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ ino,
+ &sctx->new_refs,
+ &sctx->deleted_refs);
+ if (!ret)
+ ret = 1;
+ }
+
return ret;
}
@@ -3486,10 +3492,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
- ret = add_pending_dir_move(sctx,
- sctx->cur_ino,
- sctx->cur_inode_gen,
- cur->dir);
*pending_move = 1;
} else {
ret = send_rename(sctx, valid_path,
@@ -5490,7 +5492,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
*/
if (root->send_in_progress < 0)
btrfs_err(root->fs_info,
- "send_in_progres unbalanced %d root %llu\n",
+ "send_in_progres unbalanced %d root %llu",
root->send_in_progress, root->root_key.objectid);
spin_unlock(&root->root_item_lock);
}
@@ -5518,7 +5520,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
/*
* The subvolume must remain read-only during send, protect against
- * making it RW.
+ * making it RW. This also protects against deletion.
*/
spin_lock(&send_root->root_item_lock);
send_root->send_in_progress++;
@@ -5578,6 +5580,15 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
}
sctx->send_root = send_root;
+ /*
+ * Unlikely but possible, if the subvolume is marked for deletion but
+ * is slow to remove the directory entry, send can still be started
+ */
+ if (btrfs_root_dead(sctx->send_root)) {
+ ret = -EPERM;
+ goto out;
+ }
+
sctx->clone_roots_cnt = arg->clone_sources_count;
sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
@@ -5667,7 +5678,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
spin_lock(&sctx->parent_root->root_item_lock);
sctx->parent_root->send_in_progress++;
- if (!btrfs_root_readonly(sctx->parent_root)) {
+ if (!btrfs_root_readonly(sctx->parent_root) ||
+ btrfs_root_dead(sctx->parent_root)) {
spin_unlock(&sctx->parent_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EPERM;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9601d25a4607..4662d92a4b73 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -511,7 +511,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
} else if (compress) {
if (!btrfs_test_opt(root, COMPRESS))
btrfs_info(root->fs_info,
- "btrfs: use %s compression\n",
+ "btrfs: use %s compression",
compress_type);
}
break;
@@ -580,8 +580,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
break;
case Opt_acl:
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
root->fs_info->sb->s_flags |= MS_POSIXACL;
break;
+#else
+ btrfs_err(root->fs_info,
+ "support for ACL not compiled in!");
+ ret = -EINVAL;
+ goto out;
+#endif
case Opt_noacl:
root->fs_info->sb->s_flags &= ~MS_POSIXACL;
break;
@@ -1413,6 +1420,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
* this also happens on 'umount -rf' or on shutdown, when
* the filesystem is busy.
*/
+ cancel_work_sync(&fs_info->async_reclaim_work);
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
@@ -1894,6 +1902,9 @@ static int btrfs_run_sanity_tests(void)
if (ret)
goto out;
ret = btrfs_test_inodes();
+ if (ret)
+ goto out;
+ ret = btrfs_test_qgroups();
out:
btrfs_destroy_test_fs();
return ret;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c5eb2143dc66..df39458f1487 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -254,6 +254,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
+#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
static ssize_t raid_bytes_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
@@ -266,7 +267,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
{
struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
struct btrfs_block_group_cache *block_group;
- int index = kobj - sinfo->block_group_kobjs;
+ int index = to_raid_kobj(kobj)->raid_type;
u64 val = 0;
down_read(&sinfo->groups_sem);
@@ -288,7 +289,7 @@ static struct attribute *raid_attributes[] = {
static void release_raid_kobj(struct kobject *kobj)
{
- kobject_put(kobj->parent);
+ kfree(to_raid_kobj(kobj));
}
struct kobj_type btrfs_raid_ktype = {
@@ -374,11 +375,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
struct btrfs_root *root = fs_info->fs_root;
int ret;
- if (len >= BTRFS_LABEL_SIZE) {
- pr_err("BTRFS: unable to set label with more than %d bytes\n",
- BTRFS_LABEL_SIZE - 1);
+ if (len >= BTRFS_LABEL_SIZE)
return -EINVAL;
- }
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans))
@@ -396,8 +394,48 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
}
BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
+static ssize_t btrfs_no_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ return -EPERM;
+}
+
+static ssize_t btrfs_nodesize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+}
+
+BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store);
+
+static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store);
+
+static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store);
+
static struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
+ BTRFS_ATTR_PTR(nodesize),
+ BTRFS_ATTR_PTR(sectorsize),
+ BTRFS_ATTR_PTR(clone_alignment),
NULL,
};
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 757ef00a75a4..9626252ee6b4 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -21,6 +21,9 @@
#include <linux/magic.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
static struct vfsmount *test_mnt = NULL;
@@ -72,3 +75,97 @@ void btrfs_destroy_test_fs(void)
kern_unmount(test_mnt);
unregister_filesystem(&test_type);
}
+
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
+{
+ struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
+ GFP_NOFS);
+
+ if (!fs_info)
+ return fs_info;
+ fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
+ GFP_NOFS);
+ if (!fs_info->fs_devices) {
+ kfree(fs_info);
+ return NULL;
+ }
+ fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
+ GFP_NOFS);
+ if (!fs_info->super_copy) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ if (init_srcu_struct(&fs_info->subvol_srcu)) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info->super_copy);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->qgroup_lock);
+ spin_lock_init(&fs_info->qgroup_op_lock);
+ spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->tree_mod_seq_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ mutex_init(&fs_info->qgroup_rescan_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
+ fs_info->running_transaction = NULL;
+ fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_ulist = NULL;
+ atomic64_set(&fs_info->tree_mod_seq, 0);
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ return fs_info;
+}
+
+static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+
+ spin_lock(&fs_info->buffer_lock);
+restart:
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+ struct extent_buffer *eb;
+
+ eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+ if (!eb)
+ continue;
+ /* Shouldn't happen but that kind of thinking creates CVE's */
+ if (radix_tree_exception(eb)) {
+ if (radix_tree_deref_retry(eb))
+ goto restart;
+ continue;
+ }
+ spin_unlock(&fs_info->buffer_lock);
+ free_extent_buffer_stale(eb);
+ spin_lock(&fs_info->buffer_lock);
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ btrfs_free_qgroup_config(fs_info);
+ btrfs_free_fs_roots(fs_info);
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+}
+
+void btrfs_free_dummy_root(struct btrfs_root *root)
+{
+ if (!root)
+ return;
+ if (root->node)
+ free_extent_buffer(root->node);
+ if (root->fs_info)
+ btrfs_free_dummy_fs_info(root->fs_info);
+ kfree(root);
+}
+
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 312560a9123d..fd3954224480 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -23,13 +23,18 @@
#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
+struct btrfs_root;
+
int btrfs_test_free_space_cache(void);
int btrfs_test_extent_buffer_operations(void);
int btrfs_test_extent_io(void);
int btrfs_test_inodes(void);
+int btrfs_test_qgroups(void);
int btrfs_init_test_fs(void);
void btrfs_destroy_test_fs(void);
struct inode *btrfs_new_test_inode(void);
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
+void btrfs_free_dummy_root(struct btrfs_root *root);
#else
static inline int btrfs_test_free_space_cache(void)
{
@@ -54,6 +59,10 @@ static inline int btrfs_test_inodes(void)
{
return 0;
}
+static inline int btrfs_test_qgroups(void)
+{
+ return 0;
+}
#endif
#endif
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 397d1f99a8eb..3ae0f5b8bb80 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -23,33 +23,6 @@
#include "../extent_io.h"
#include "../volumes.h"
-static struct btrfs_fs_info *alloc_dummy_fs_info(void)
-{
- struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
- GFP_NOFS);
- if (!fs_info)
- return fs_info;
- fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
- GFP_NOFS);
- if (!fs_info->fs_devices) {
- kfree(fs_info);
- return NULL;
- }
- return fs_info;
-}
-static void free_dummy_root(struct btrfs_root *root)
-{
- if (!root)
- return;
- if (root->fs_info) {
- kfree(root->fs_info->fs_devices);
- kfree(root->fs_info);
- }
- if (root->node)
- free_extent_buffer(root->node);
- kfree(root);
-}
-
static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
u64 ram_bytes, u64 offset, u64 disk_bytenr,
u64 disk_len, u32 type, u8 compression, int slot)
@@ -276,7 +249,7 @@ static noinline int test_btrfs_get_extent(void)
* We do this since btrfs_get_extent wants to assign em->bdev to
* root->fs_info->fs_devices->latest_bdev.
*/
- root->fs_info = alloc_dummy_fs_info();
+ root->fs_info = btrfs_alloc_dummy_fs_info();
if (!root->fs_info) {
test_msg("Couldn't allocate dummy fs info\n");
goto out;
@@ -837,7 +810,7 @@ out:
if (!IS_ERR(em))
free_extent_map(em);
iput(inode);
- free_dummy_root(root);
+ btrfs_free_dummy_root(root);
return ret;
}
@@ -864,7 +837,7 @@ static int test_hole_first(void)
goto out;
}
- root->fs_info = alloc_dummy_fs_info();
+ root->fs_info = btrfs_alloc_dummy_fs_info();
if (!root->fs_info) {
test_msg("Couldn't allocate dummy fs info\n");
goto out;
@@ -934,7 +907,7 @@ out:
if (!IS_ERR(em))
free_extent_map(em);
iput(inode);
- free_dummy_root(root);
+ btrfs_free_dummy_root(root);
return ret;
}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
new file mode 100644
index 000000000000..ec3dcb202357
--- /dev/null
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (C) 2013 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../transaction.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+
+static void init_dummy_trans(struct btrfs_trans_handle *trans)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->transid = 1;
+ INIT_LIST_HEAD(&trans->qgroup_ref_list);
+ trans->type = __TRANS_DUMMY;
+}
+
+static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_tree_block_info *block_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key ins;
+ u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ ins.objectid = bytenr;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
+ if (ret) {
+ test_msg("Couldn't insert ref %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, item, 1);
+ btrfs_set_extent_generation(leaf, item, 1);
+ btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ block_info = (struct btrfs_tree_block_info *)(item + 1);
+ btrfs_set_tree_block_level(leaf, block_info, 1);
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ if (parent > 0) {
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_SHARED_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_msg("Couldn't find extent ref\n");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs + 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
+ if (ret)
+ test_msg("Failed to insert backref\n");
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_msg("Didn't find our key %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_msg("Couldn't find extent ref\n");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs - 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_msg("Couldn't find backref %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int test_no_shared_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ test_msg("Qgroup basic add\n");
+ ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
+ if (ret) {
+ test_msg("Couldn't create a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't add space to a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Delayed qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = remove_extent_item(root, 4096, 4096);
+ if (ret)
+ return -EINVAL;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't remove space from the qgroup %d\n", ret);
+ return -EINVAL;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Add a ref for two different roots to make sure the shared value comes out
+ * right, also remove one of the roots and make sure the exclusive count is
+ * adjusted properly.
+ */
+static int test_multiple_refs(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ test_msg("Qgroup multiple refs test\n");
+
+ /* We have 5 created already from the previous test */
+ ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
+ if (ret) {
+ test_msg("Couldn't create a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't add space to a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Delayed qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = add_tree_ref(root, 4096, 4096, 0, 256);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+ if (ret) {
+ test_msg("Qgroup record ref failed %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = remove_extent_ref(root, 4096, 4096, 0, 256);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+ BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+ if (ret) {
+ test_msg("Qgroup record ref failed %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int btrfs_test_qgroups(void)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *tmp_root;
+ int ret = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ return PTR_ERR(root);
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Can't use bytenr 0, some things freak out
+ * *cough*backref walking code*cough*
+ */
+ root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
+ root->alloc_bytenr += 8192;
+
+ tmp_root = btrfs_alloc_dummy_root();
+ if (IS_ERR(tmp_root)) {
+ test_msg("Couldn't allocate a fs root\n");
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = 5;
+ root->fs_info->fs_root = tmp_root;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_msg("Couldn't insert fs root %d\n", ret);
+ goto out;
+ }
+
+ tmp_root = btrfs_alloc_dummy_root();
+ if (IS_ERR(tmp_root)) {
+ test_msg("Couldn't allocate a fs root\n");
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = 256;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_msg("Couldn't insert fs root %d\n", ret);
+ goto out;
+ }
+
+ /* We are using this root as our extent root */
+ root->fs_info->extent_root = root;
+
+ /*
+ * Some of the paths we test assume we have a filled out fs_info, so we
+ * just need to add the root in there so we don't panic.
+ */
+ root->fs_info->tree_root = root;
+ root->fs_info->quota_root = root;
+ root->fs_info->quota_enabled = 1;
+
+ test_msg("Running qgroup tests\n");
+ ret = test_no_shared_qgroup(root);
+ if (ret)
+ goto out;
+ ret = test_multiple_refs(root);
+out:
+ btrfs_free_dummy_root(root);
+ return ret;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7579f6d0b854..511839c04f11 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -31,6 +31,7 @@
#include "inode-map.h"
#include "volumes.h"
#include "dev-replace.h"
+#include "qgroup.h"
#define BTRFS_ROOT_TRANS_TAG 0
@@ -241,18 +242,19 @@ loop:
static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (root->ref_cows && root->last_trans < trans->transid) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ root->last_trans < trans->transid) {
WARN_ON(root == root->fs_info->extent_root);
WARN_ON(root->commit_root != root->node);
/*
- * see below for in_trans_setup usage rules
+ * see below for IN_TRANS_SETUP usage rules
* we have the reloc mutex held now, so there
* is only one writer in this function
*/
- root->in_trans_setup = 1;
+ set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
- /* make sure readers find in_trans_setup before
+ /* make sure readers find IN_TRANS_SETUP before
* they find our root->last_trans update
*/
smp_wmb();
@@ -279,7 +281,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
* But, we have to set root->last_trans before we
* init the relocation root, otherwise, we trip over warnings
* in ctree.c. The solution used here is to flag ourselves
- * with root->in_trans_setup. When this is 1, we're still
+ * with root IN_TRANS_SETUP. When this is 1, we're still
* fixing up the reloc trees and everyone must wait.
*
* When this is zero, they can trust root->last_trans and fly
@@ -288,8 +290,8 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
* done before we pop in the zero below
*/
btrfs_init_reloc_root(trans, root);
- smp_wmb();
- root->in_trans_setup = 0;
+ smp_mb__before_atomic();
+ clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
}
return 0;
}
@@ -298,16 +300,16 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
/*
- * see record_root_in_trans for comments about in_trans_setup usage
+ * see record_root_in_trans for comments about IN_TRANS_SETUP usage
* and barriers
*/
smp_rmb();
if (root->last_trans == trans->transid &&
- !root->in_trans_setup)
+ !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
return 0;
mutex_lock(&root->fs_info->reloc_mutex);
@@ -365,7 +367,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
{
if (!root->fs_info->reloc_ctl ||
- !root->ref_cows ||
+ !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
root->reloc_root)
return false;
@@ -695,6 +697,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
unsigned long cur = trans->delayed_ref_updates;
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
+ int must_run_delayed_refs = 0;
if (trans->use_count > 1) {
trans->use_count--;
@@ -702,14 +705,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
return 0;
}
- /*
- * do the qgroup accounting as early as possible
- */
- err = btrfs_delayed_refs_qgroup_accounting(trans, info);
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
+ trans->delayed_ref_updates = 0;
+ if (!trans->sync) {
+ must_run_delayed_refs =
+ btrfs_should_throttle_delayed_refs(trans, root);
+ cur = max_t(unsigned long, cur, 32);
+
+ /*
+ * don't make the caller wait if they are from a NOLOCK
+ * or ATTACH transaction, it will deadlock with commit
+ */
+ if (must_run_delayed_refs == 1 &&
+ (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
+ must_run_delayed_refs = 2;
+ }
+
if (trans->qgroup_reserved) {
/*
* the same root has to be passed here between start_transaction
@@ -719,16 +735,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
trans->qgroup_reserved = 0;
}
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, root);
-
- trans->delayed_ref_updates = 0;
- if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
- cur = max_t(unsigned long, cur, 32);
- trans->delayed_ref_updates = 0;
- btrfs_run_delayed_refs(trans, root, cur);
- }
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -778,6 +784,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
assert_qgroups_uptodate(trans);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ if (must_run_delayed_refs) {
+ btrfs_async_run_delayed_refs(root, cur,
+ must_run_delayed_refs == 1);
+ }
return err;
}
@@ -1049,8 +1059,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_save_ino_cache(root, trans);
/* see comments in should_cow_block() */
- root->force_cow = 0;
- smp_wmb();
+ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_mb__after_atomic();
if (root->commit_root != root->node) {
list_add_tail(&root->dirty_list,
@@ -1081,7 +1091,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
int ret;
- if (xchg(&root->defrag_running, 1))
+ if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
return 0;
while (1) {
@@ -1104,7 +1114,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
break;
}
}
- root->defrag_running = 0;
+ clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
return ret;
}
@@ -1168,12 +1178,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto no_free_objectid;
}
- pending->error = btrfs_qgroup_inherit(trans, fs_info,
- root->root_key.objectid,
- objectid, pending->inherit);
- if (pending->error)
- goto no_free_objectid;
-
key.objectid = objectid;
key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1270,8 +1274,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
+ /*
+ * We need to flush delayed refs in order to make sure all of our quota
+ * operations have been done before we call btrfs_qgroup_inherit.
+ */
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ root->root_key.objectid,
+ objectid, pending->inherit);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
/* see comments in should_cow_block() */
- root->force_cow = 1;
+ set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_wmb();
btrfs_set_root_node(new_root_item, tmp);
@@ -1598,12 +1620,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
* them now so that they hinder processing of more delayed refs
* as little as possible.
*/
- if (ret) {
- btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
- return ret;
- }
-
- ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
if (ret)
return ret;
@@ -1984,19 +2000,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
}
root = list_first_entry(&fs_info->dead_roots,
struct btrfs_root, root_list);
- /*
- * Make sure root is not involved in send,
- * if we fail with first root, we return
- * directly rather than continue.
- */
- spin_lock(&root->root_item_lock);
- if (root->send_in_progress) {
- spin_unlock(&fs_info->trans_lock);
- spin_unlock(&root->root_item_lock);
- return 0;
- }
- spin_unlock(&root->root_item_lock);
-
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index b57b924e8e03..7dd558ed0716 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -69,6 +69,7 @@ struct btrfs_transaction {
#define __TRANS_ATTACH (1U << 10)
#define __TRANS_JOIN (1U << 11)
#define __TRANS_JOIN_NOLOCK (1U << 12)
+#define __TRANS_DUMMY (1U << 13)
#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 76928ca97741..a63719cc9578 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -49,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
- if (root->ref_cows == 0)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
goto out;
if (btrfs_test_opt(root, SSD))
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e2f45fc02610..9e1f2cd5e67a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -20,13 +20,11 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
-#include "ctree.h"
-#include "transaction.h"
+#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
-#include "tree-log.h"
#include "hash.h"
/* magic values for the inode_only field in btrfs_log_inode:
@@ -144,17 +142,15 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
if (root->log_root) {
- if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
- trans->transid) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
ret = -EAGAIN;
goto out;
}
-
if (!root->log_start_pid) {
root->log_start_pid = current->pid;
- root->log_multiple_pids = false;
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
} else if (root->log_start_pid != current->pid) {
- root->log_multiple_pids = true;
+ set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
atomic_inc(&root->log_batch);
@@ -181,7 +177,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
if (ret)
goto out;
}
- root->log_multiple_pids = false;
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
@@ -2500,7 +2496,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
- if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
+ if (!btrfs_test_opt(root, SSD) &&
+ test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
@@ -2511,8 +2508,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
/* bail out if we need to do a full commit */
- if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
- trans->transid) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
ret = -EAGAIN;
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
@@ -2533,8 +2529,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
- ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
- trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -2577,8 +2572,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
list_del_init(&root_log_ctx.list);
blk_finish_plug(&plug);
- ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
- trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, root, ret);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2622,8 +2617,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
- if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
- trans->transid) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
@@ -2637,8 +2631,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
if (ret) {
- ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
- trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2667,8 +2660,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
if (ret) {
- ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
- trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
btrfs_abort_transaction(trans, root, ret);
goto out_wake_log_root;
}
@@ -2886,7 +2878,7 @@ fail:
out_unlock:
mutex_unlock(&BTRFS_I(dir)->log_mutex);
if (ret == -ENOSPC) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
} else if (ret < 0)
btrfs_abort_transaction(trans, root, ret);
@@ -2919,7 +2911,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
if (ret == -ENOSPC) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
} else if (ret < 0 && ret != -ENOENT)
btrfs_abort_transaction(trans, root, ret);
@@ -4130,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
* make sure any commits to the log are forced
* to be full commits
*/
- root->fs_info->last_trans_log_full_commit =
- trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 1;
break;
}
@@ -4177,6 +4168,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
+ /*
+ * The previous transaction commit has not completed yet, so we need
+ * to do a full commit ourselves.
+ */
if (root->fs_info->last_trans_log_full_commit >
root->fs_info->last_trans_committed) {
ret = 1;
@@ -4246,7 +4241,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
end_trans:
dput(old_parent);
if (ret < 0) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 1;
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 91b145fce333..7f5b41bd5373 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
#ifndef __TREE_LOG_
#define __TREE_LOG_
+#include "ctree.h"
+#include "transaction.h"
+
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
@@ -35,6 +38,19 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
INIT_LIST_HEAD(&ctx->list);
}
+/*
+ * Flag the transaction that @trans belongs to as needing a full commit by
+ * storing its transid in fs_info->last_trans_log_full_commit.  The store
+ * goes through ACCESS_ONCE() because readers check the field locklessly.
+ */
+static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
+					     struct btrfs_trans_handle *trans)
+{
+	const u64 transid = trans->transid;
+
+	ACCESS_ONCE(fs_info->last_trans_log_full_commit) = transid;
+}
+
+/*
+ * Return 1 when fs_info->last_trans_log_full_commit equals the transid of
+ * @trans, i.e. a full commit was requested for this transaction, and 0
+ * otherwise.  The field is sampled exactly once via ACCESS_ONCE().
+ */
+static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
+					      struct btrfs_trans_handle *trans)
+{
+	const u64 marked = ACCESS_ONCE(fs_info->last_trans_log_full_commit);
+
+	if (marked == trans->transid)
+		return 1;
+	return 0;
+}
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_log_ctx *ctx);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 49d7fab73360..c83b24251e53 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1452,6 +1452,22 @@ out:
return ret;
}
+/*
+ * Update the ctime/mtime of the device node at @path_name.
+ *
+ * Mainly used so that ctime/mtime based probing such as libblkid notices
+ * that the device changed.  This is best effort: if the path cannot be
+ * opened, nothing is updated and no error is reported to the caller.
+ */
+static void update_dev_time(char *path_name)
+{
+	struct file *filp;
+
+	filp = filp_open(path_name, O_RDWR, 0);
+	/* filp_open() reports failure with an ERR_PTR(), never with NULL */
+	if (IS_ERR(filp))
+		return;
+	file_update_time(filp);
+	filp_close(filp, NULL);
+}
+
static int btrfs_rm_dev_item(struct btrfs_root *root,
struct btrfs_device *device)
{
@@ -1674,11 +1690,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
struct btrfs_fs_devices *fs_devices;
fs_devices = root->fs_info->fs_devices;
while (fs_devices) {
- if (fs_devices->seed == cur_devices)
+ if (fs_devices->seed == cur_devices) {
+ fs_devices->seed = cur_devices->seed;
break;
+ }
fs_devices = fs_devices->seed;
}
- fs_devices->seed = cur_devices->seed;
cur_devices->seed = NULL;
lock_chunks(root);
__btrfs_close_devices(cur_devices);
@@ -1694,20 +1711,55 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
* remove it from the devices list and zero out the old super
*/
if (clear_super && disk_super) {
+ u64 bytenr;
+ int i;
+
/* make sure this device isn't detected as part of
* the FS anymore
*/
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
set_buffer_dirty(bh);
sync_dirty_buffer(bh);
+
+ /* clear the mirror copies of the super block on the disk
+ * being removed; the 0th copy has been taken care of above
+ * and the loop below takes care of the rest
+ */
+ for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ i_size_read(bdev->bd_inode))
+ break;
+
+ brelse(bh);
+ bh = __bread(bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (!bh)
+ continue;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+ continue;
+ }
+ memset(&disk_super->magic, 0,
+ sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ }
}
ret = 0;
- /* Notify udev that device has changed */
- if (bdev)
+ if (bdev) {
+ /* Notify udev that device has changed */
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
+ }
+
error_brelse:
brelse(bh);
if (bdev)
@@ -1883,7 +1935,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
- fs_devices->total_devices = 0;
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
@@ -2146,6 +2197,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_commit_transaction(trans, root);
}
+ /* Update ctime/mtime for libblkid */
+ update_dev_time(device_path);
return ret;
error_trans:
@@ -2490,9 +2543,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- kfree(map);
- em->bdev = NULL;
-
/* once for the tree */
free_extent_map(em);
/* once for us */
@@ -2922,6 +2972,16 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
}
+ /*
+ * limited by count, must be the last filter
+ */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
+ if (bargs->limit == 0)
+ return 0;
+ else
+ bargs->limit--;
+ }
+
return 1;
}
@@ -2944,6 +3004,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
int ret;
int enospc_errors = 0;
bool counting = true;
+ u64 limit_data = bctl->data.limit;
+ u64 limit_meta = bctl->meta.limit;
+ u64 limit_sys = bctl->sys.limit;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
@@ -2982,6 +3045,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
memset(&bctl->stat, 0, sizeof(bctl->stat));
spin_unlock(&fs_info->balance_lock);
again:
+ if (!counting) {
+ bctl->data.limit = limit_data;
+ bctl->meta.limit = limit_meta;
+ bctl->sys.limit = limit_sys;
+ }
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -3881,7 +3949,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
u8 *ptr;
array_size = btrfs_super_sys_array_size(super_copy);
- if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ if (array_size + item_size + sizeof(disk_key)
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
return -EFBIG;
ptr = super_copy->sys_chunk_array + array_size;
@@ -3986,6 +4055,16 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
+/*
+ * Stripe-count limit derived from the leaf size of root @r: the leaf data
+ * area minus one struct btrfs_item and one struct btrfs_chunk, divided by
+ * the size of a stripe, plus one.
+ * NOTE(review): the "+ 1" presumably accounts for a stripe that is already
+ * part of struct btrfs_chunk - confirm against ctree.h.
+ */
+#define BTRFS_MAX_DEVS(r)						\
+	((BTRFS_LEAF_DATA_SIZE(r) -					\
+	  (sizeof(struct btrfs_item) + sizeof(struct btrfs_chunk))) /	\
+	 sizeof(struct btrfs_stripe) + 1)
+
+/*
+ * Stripe-count limit for system chunks, derived from the fixed size of the
+ * system chunk array: the array size minus two disk keys and two chunk
+ * headers, divided by the size of a stripe, plus one.
+ * NOTE(review): why room for two key/chunk pairs is reserved is not visible
+ * here - confirm against the sys_chunk_array users.
+ */
+#define BTRFS_MAX_DEVS_SYS_CHUNK					\
+	((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE -				\
+	  2 * (sizeof(struct btrfs_disk_key) +				\
+	       sizeof(struct btrfs_chunk))) /				\
+	 sizeof(struct btrfs_stripe) + 1)
+
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 start,
u64 type)
@@ -4035,6 +4114,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = 1024 * 1024 * 1024;
max_chunk_size = 10 * max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
@@ -4042,11 +4123,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
else
max_stripe_size = 256 * 1024 * 1024;
max_chunk_size = max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = 32 * 1024 * 1024;
max_chunk_size = 2 * max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
} else {
- btrfs_err(info, "invalid chunk type 0x%llx requested\n",
+ btrfs_err(info, "invalid chunk type 0x%llx requested",
type);
BUG_ON(1);
}
@@ -4213,9 +4298,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em = alloc_extent_map();
if (!em) {
+ kfree(map);
ret = -ENOMEM;
goto error;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = start;
em->len = num_bytes;
@@ -4258,7 +4345,6 @@ error_del_extent:
/* One for the tree reference */
free_extent_map(em);
error:
- kfree(map);
kfree(devices_info);
return ret;
}
@@ -4294,7 +4380,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
if (em->start != chunk_offset || em->len != chunk_size) {
btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
- " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+ " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
chunk_size, em->start, em->len);
free_extent_map(em);
return -EINVAL;
@@ -4470,7 +4556,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
write_unlock(&tree->map_tree.lock);
if (!em)
break;
- kfree(em->bdev);
/* once for us */
free_extent_map(em);
/* once for the tree */
@@ -4496,14 +4581,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* and exit, so return 1 so the callers don't try to use other copies.
*/
if (!em) {
- btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical,
+ btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
logical+len);
return 1;
}
if (em->start > logical || em->start + em->len < logical) {
btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
- "%Lu-%Lu\n", logical, logical+len, em->start,
+ "%Lu-%Lu", logical, logical+len, em->start,
em->start + em->len);
free_extent_map(em);
return 1;
@@ -4684,7 +4769,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (em->start > logical || em->start + em->len < logical) {
btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
- "found %Lu-%Lu\n", logical, em->start,
+ "found %Lu-%Lu", logical, em->start,
em->start + em->len);
free_extent_map(em);
return -EINVAL;
@@ -5274,6 +5359,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
+/*
+ * Complete @bio with status @err and then free @bbio.
+ *
+ * When BTRFS_BIO_ORIG_BIO_SUBMITTED is set in bbio->flags the bio is ended
+ * with bio_endio_nodec(), otherwise with plain bio_endio().
+ * NOTE(review): presumably the _nodec variant is needed because a bio that
+ * was itself submitted has already had its remaining count dropped -
+ * confirm against the block layer.
+ */
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+{
+	const bool orig_submitted =
+		bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED;
+
+	if (unlikely(!orig_submitted)) {
+		bio_endio(bio, err);
+	} else {
+		bio_endio_nodec(bio, err);
+	}
+
+	/* @bbio is only released once the bio has been completed */
+	kfree(bbio);
+}
+
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
@@ -5314,12 +5408,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
bio = bbio->orig_bio;
}
- /*
- * We have original bio now. So increment bi_remaining to
- * account for it in endio
- */
- atomic_inc(&bio->bi_remaining);
-
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -5336,9 +5424,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
set_bit(BIO_UPTODATE, &bio->bi_flags);
err = 0;
}
- kfree(bbio);
- bio_endio(bio, err);
+ btrfs_end_bbio(bbio, bio, err);
} else if (!is_orig_bio) {
bio_put(bio);
}
@@ -5501,12 +5588,15 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ /* Should be the original bio. */
+ WARN_ON(bio != bbio->orig_bio);
+
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- kfree(bbio);
- bio_endio(bio, -EIO);
+
+ btrfs_end_bbio(bbio, bio, -EIO);
}
}
@@ -5593,6 +5683,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
+ bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
}
submit_stripe_bio(root, bbio, bio,
@@ -5734,6 +5825,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
return -ENOMEM;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = logical;
em->len = length;
@@ -5758,7 +5850,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
uuid, NULL);
if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -5766,7 +5857,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev =
add_missing_dev(root, devid, uuid);
if (!map->stripes[i].dev) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -6058,10 +6148,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list)
- device->dev_root = fs_info->dev_root;
- mutex_unlock(&fs_devices->device_list_mutex);
+ while (fs_devices) {
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->dev_root = fs_info->dev_root;
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ fs_devices = fs_devices->seed;
+ }
}
static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 80754f9dd3df..2aaa00c47816 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -190,11 +190,14 @@ struct btrfs_bio_stripe {
struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1
+
struct btrfs_bio {
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
bio_end_io_t *end_io;
struct bio *orig_bio;
+ unsigned long flags;
void *private;
atomic_t error;
int max_errors;
@@ -255,6 +258,7 @@ struct map_lookup {
#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
+#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
/*
* Profile changing flags. When SOFT is set we won't relocate chunk if
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 8e57191950cb..4f196314c0c1 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -98,7 +98,7 @@ static int zlib_compress_pages(struct list_head *ws,
if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
printk(KERN_WARNING "BTRFS: deflateInit failed\n");
- ret = -1;
+ ret = -EIO;
goto out;
}
@@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws,
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
- ret = -1;
+ ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
@@ -128,7 +128,7 @@ static int zlib_compress_pages(struct list_head *ws,
printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
zlib_deflateEnd(&workspace->def_strm);
- ret = -1;
+ ret = -EIO;
goto out;
}
@@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,
if (workspace->def_strm.total_in > 8192 &&
workspace->def_strm.total_in <
workspace->def_strm.total_out) {
- ret = -1;
+ ret = -EIO;
goto out;
}
/* we need another page for writing out. Test this
@@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws,
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
- ret = -1;
+ ret = -E2BIG;
goto out;
}
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
- ret = -1;
+ ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
@@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws,
zlib_deflateEnd(&workspace->def_strm);
if (ret != Z_STREAM_END) {
- ret = -1;
+ ret = -EIO;
goto out;
}
if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -253,7 +253,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
printk(KERN_WARNING "BTRFS: inflateInit failed\n");
- return -1;
+ return -EIO;
}
while (workspace->inf_strm.total_in < srclen) {
ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
@@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
}
}
if (ret != Z_STREAM_END)
- ret = -1;
+ ret = -EIO;
else
ret = 0;
done:
@@ -337,7 +337,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
printk(KERN_WARNING "BTRFS: inflateInit failed\n");
- return -1;
+ return -EIO;
}
while (bytes_left > 0) {
@@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
total_out = workspace->inf_strm.total_out;
if (total_out == buf_start) {
- ret = -1;
+ ret = -EIO;
break;
}
@@ -382,7 +382,7 @@ next:
}
if (ret != Z_STREAM_END && bytes_left != 0)
- ret = -1;
+ ret = -EIO;
else
ret = 0;
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 21887d63dad5..469f2e8657e8 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -104,12 +104,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
struct dentry *dentry;
- if (acl) {
- ret = posix_acl_valid(acl);
- if (ret < 0)
- goto out;
- }
-
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 65a30e817dd8..90b3954d48ed 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -211,18 +211,15 @@ static int readpage_nounlock(struct file *filp, struct page *page)
SetPageError(page);
ceph_fscache_readpage_cancel(inode, page);
goto out;
- } else {
- if (err < PAGE_CACHE_SIZE) {
- /* zero fill remainder of page */
- zero_user_segment(page, err, PAGE_CACHE_SIZE);
- } else {
- flush_dcache_page(page);
- }
}
- SetPageUptodate(page);
+ if (err < PAGE_CACHE_SIZE)
+ /* zero fill remainder of page */
+ zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ else
+ flush_dcache_page(page);
- if (err >= 0)
- ceph_readpage_to_fscache(inode, page);
+ SetPageUptodate(page);
+ ceph_readpage_to_fscache(inode, page);
out:
return err < 0 ? err : 0;
@@ -1187,8 +1184,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
* never get called.
*/
static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t pos, unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t pos)
{
WARN_ON(1);
return -EINVAL;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c561b628ebce..1fde164b74b5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -221,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
return 0;
}
-static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx)
+struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx)
{
struct ceph_cap *cap = NULL;
@@ -508,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* it is < 0. (This is so we can atomically add the cap and add an
* open file reference to it.)
*/
-int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned seq, unsigned mseq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation)
+void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned seq, unsigned mseq, u64 realmino, int flags,
+ struct ceph_cap **new_cap)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *new_cap = NULL;
struct ceph_cap *cap;
int mds = session->s_mds;
int actual_wanted;
@@ -531,20 +530,10 @@ int ceph_add_cap(struct inode *inode,
if (fmode >= 0)
wanted |= ceph_caps_for_mode(fmode);
-retry:
- spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
- if (new_cap) {
- cap = new_cap;
- new_cap = NULL;
- } else {
- spin_unlock(&ci->i_ceph_lock);
- new_cap = get_cap(mdsc, caps_reservation);
- if (new_cap == NULL)
- return -ENOMEM;
- goto retry;
- }
+ cap = *new_cap;
+ *new_cap = NULL;
cap->issued = 0;
cap->implemented = 0;
@@ -562,9 +551,6 @@ retry:
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
} else {
- if (new_cap)
- ceph_put_cap(mdsc, new_cap);
-
/*
* auth mds of the inode changed. we received the cap export
* message, but still haven't received the cap import message.
@@ -626,7 +612,6 @@ retry:
ci->i_auth_cap = cap;
cap->mds_wanted = wanted;
}
- ci->i_cap_exporting_issued = 0;
} else {
WARN_ON(ci->i_auth_cap == cap);
}
@@ -648,9 +633,6 @@ retry:
if (fmode >= 0)
__ceph_get_fmode(ci, fmode);
- spin_unlock(&ci->i_ceph_lock);
- wake_up_all(&ci->i_cap_wq);
- return 0;
}
/*
@@ -685,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
*/
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
- int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+ int have = ci->i_snap_caps;
struct ceph_cap *cap;
struct rb_node *p;
@@ -900,7 +882,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
*/
static int __ceph_is_any_caps(struct ceph_inode_info *ci)
{
- return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued;
+ return !RB_EMPTY_ROOT(&ci->i_caps);
}
int ceph_is_any_caps(struct inode *inode)
@@ -2397,32 +2379,30 @@ static void invalidate_aliases(struct inode *inode)
* actually be a revocation if it specifies a smaller cap set.)
*
* caller holds s_mutex and i_ceph_lock, we drop both.
- *
- * return value:
- * 0 - ok
- * 1 - check_caps on auth cap only (writeback)
- * 2 - check_caps (ack revoke)
*/
-static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+static void handle_cap_grant(struct ceph_mds_client *mdsc,
+ struct inode *inode, struct ceph_mds_caps *grant,
+ void *snaptrace, int snaptrace_len,
+ struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap,
- struct ceph_buffer *xattr_buf)
- __releases(ci->i_ceph_lock)
+ struct ceph_cap *cap, int issued)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
int seq = le32_to_cpu(grant->seq);
int newcaps = le32_to_cpu(grant->caps);
- int issued, implemented, used, wanted, dirty;
+ int used, wanted, dirty;
u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size);
struct timespec mtime, atime, ctime;
int check_caps = 0;
- int wake = 0;
- int writeback = 0;
- int queue_invalidate = 0;
- int deleted_inode = 0;
- int queue_revalidate = 0;
+ bool wake = 0;
+ bool writeback = 0;
+ bool queue_trunc = 0;
+ bool queue_invalidate = 0;
+ bool queue_revalidate = 0;
+ bool deleted_inode = 0;
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, mds, seq, ceph_cap_string(newcaps));
@@ -2466,16 +2446,13 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
/* side effects now are allowed */
-
- issued = __ceph_caps_issued(ci, &implemented);
- issued |= implemented | __ceph_caps_dirty(ci);
-
cap->cap_gen = session->s_cap_gen;
cap->seq = seq;
__check_cap_issue(ci, cap, newcaps);
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+ (issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(grant->mode);
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
@@ -2484,7 +2461,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
from_kgid(&init_user_ns, inode->i_gid));
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
+ if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+ (issued & CEPH_CAP_LINK_EXCL) == 0) {
set_nlink(inode, le32_to_cpu(grant->nlink));
if (inode->i_nlink == 0 &&
(newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
@@ -2511,30 +2489,35 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
queue_revalidate = 1;
- /* size/ctime/mtime/atime? */
- ceph_fill_file_size(inode, issued,
- le32_to_cpu(grant->truncate_seq),
- le64_to_cpu(grant->truncate_size), size);
- ceph_decode_timespec(&mtime, &grant->mtime);
- ceph_decode_timespec(&atime, &grant->atime);
- ceph_decode_timespec(&ctime, &grant->ctime);
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
- &atime);
-
-
- /* file layout may have changed */
- ci->i_layout = grant->layout;
-
- /* max size increase? */
- if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
- dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
- ci->i_max_size = max_size;
- if (max_size >= ci->i_wanted_max_size) {
- ci->i_wanted_max_size = 0; /* reset */
- ci->i_requested_max_size = 0;
+ if (newcaps & CEPH_CAP_ANY_RD) {
+ /* ctime/mtime/atime? */
+ ceph_decode_timespec(&mtime, &grant->mtime);
+ ceph_decode_timespec(&atime, &grant->atime);
+ ceph_decode_timespec(&ctime, &grant->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(grant->time_warp_seq),
+ &ctime, &mtime, &atime);
+ }
+
+ if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
+ /* file layout may have changed */
+ ci->i_layout = grant->layout;
+ /* size/truncate_seq? */
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(grant->truncate_seq),
+ le64_to_cpu(grant->truncate_size),
+ size);
+ /* max size increase? */
+ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
+ dout("max_size %lld -> %llu\n",
+ ci->i_max_size, max_size);
+ ci->i_max_size = max_size;
+ if (max_size >= ci->i_wanted_max_size) {
+ ci->i_wanted_max_size = 0; /* reset */
+ ci->i_requested_max_size = 0;
+ }
+ wake = 1;
}
- wake = 1;
}
/* check cap bits */
@@ -2595,6 +2578,23 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
spin_unlock(&ci->i_ceph_lock);
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace,
+ snaptrace + snaptrace_len, false);
+ downgrade_write(&mdsc->snap_rwsem);
+ kick_flushing_inode_caps(mdsc, session, inode);
+ up_read(&mdsc->snap_rwsem);
+ if (newcaps & ~issued)
+ wake = 1;
+ }
+
+ if (queue_trunc) {
+ ceph_queue_vmtruncate(inode);
+ ceph_queue_revalidate(inode);
+ } else if (queue_revalidate)
+ ceph_queue_revalidate(inode);
+
if (writeback)
/*
* queue inode for writeback: we can't actually call
@@ -2606,8 +2606,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_queue_invalidate(inode);
if (deleted_inode)
invalidate_aliases(inode);
- if (queue_revalidate)
- ceph_queue_revalidate(inode);
if (wake)
wake_up_all(&ci->i_cap_wq);
@@ -2784,7 +2782,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *tsession = NULL;
- struct ceph_cap *cap, *tcap;
+ struct ceph_cap *cap, *tcap, *new_cap = NULL;
struct ceph_inode_info *ci = ceph_inode(inode);
u64 t_cap_id;
unsigned mseq = le32_to_cpu(ex->migrate_seq);
@@ -2807,7 +2805,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
retry:
spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
- if (!cap)
+ if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
goto out_unlock;
if (target < 0) {
@@ -2846,15 +2844,14 @@ retry:
}
__ceph_remove_cap(cap, false);
goto out_unlock;
- }
-
- if (tsession) {
- int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
- spin_unlock(&ci->i_ceph_lock);
+ } else if (tsession) {
/* add placeholder for the export tagert */
+ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
- t_seq - 1, t_mseq, (u64)-1, flag, NULL);
- goto retry;
+ t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
}
spin_unlock(&ci->i_ceph_lock);
@@ -2873,6 +2870,7 @@ retry:
SINGLE_DEPTH_NESTING);
}
ceph_add_cap_releases(mdsc, tsession);
+ new_cap = ceph_get_cap(mdsc, NULL);
} else {
WARN_ON(1);
tsession = NULL;
@@ -2887,24 +2885,27 @@ out_unlock:
mutex_unlock(&tsession->s_mutex);
ceph_put_mds_session(tsession);
}
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
}
/*
- * Handle cap IMPORT. If there are temp bits from an older EXPORT,
- * clean them up.
+ * Handle cap IMPORT.
*
- * caller holds s_mutex.
+ * caller holds s_mutex. acquires i_ceph_lock
*/
static void handle_cap_import(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *im,
struct ceph_mds_cap_peer *ph,
struct ceph_mds_session *session,
- void *snaptrace, int snaptrace_len)
+ struct ceph_cap **target_cap, int *old_issued)
+ __acquires(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *cap;
+ struct ceph_cap *cap, *ocap, *new_cap = NULL;
int mds = session->s_mds;
- unsigned issued = le32_to_cpu(im->caps);
+ int issued;
+ unsigned caps = le32_to_cpu(im->caps);
unsigned wanted = le32_to_cpu(im->wanted);
unsigned seq = le32_to_cpu(im->seq);
unsigned mseq = le32_to_cpu(im->migrate_seq);
@@ -2924,40 +2925,52 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
inode, ci, mds, mseq, peer);
+retry:
spin_lock(&ci->i_ceph_lock);
- cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
- if (cap && cap->cap_id == p_cap_id) {
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ if (!new_cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ new_cap = ceph_get_cap(mdsc, NULL);
+ goto retry;
+ }
+ cap = new_cap;
+ } else {
+ if (new_cap) {
+ ceph_put_cap(mdsc, new_cap);
+ new_cap = NULL;
+ }
+ }
+
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+
+ ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+ realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
+
+ ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
+ if (ocap && ocap->cap_id == p_cap_id) {
dout(" remove export cap %p mds%d flags %d\n",
- cap, peer, ph->flags);
+ ocap, peer, ph->flags);
if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
- (cap->seq != le32_to_cpu(ph->seq) ||
- cap->mseq != le32_to_cpu(ph->mseq))) {
+ (ocap->seq != le32_to_cpu(ph->seq) ||
+ ocap->mseq != le32_to_cpu(ph->mseq))) {
pr_err("handle_cap_import: mismatched seq/mseq: "
"ino (%llx.%llx) mds%d seq %d mseq %d "
"importer mds%d has peer seq %d mseq %d\n",
- ceph_vinop(inode), peer, cap->seq,
- cap->mseq, mds, le32_to_cpu(ph->seq),
+ ceph_vinop(inode), peer, ocap->seq,
+ ocap->mseq, mds, le32_to_cpu(ph->seq),
le32_to_cpu(ph->mseq));
}
- ci->i_cap_exporting_issued = cap->issued;
- __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
}
/* make sure we re-request max_size, if necessary */
ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
- spin_unlock(&ci->i_ceph_lock);
-
- down_write(&mdsc->snap_rwsem);
- ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
- false);
- downgrade_write(&mdsc->snap_rwsem);
- ceph_add_cap(inode, session, cap_id, -1,
- issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
- NULL /* no caps context */);
- kick_flushing_inode_caps(mdsc, session, inode);
- up_read(&mdsc->snap_rwsem);
+ *old_issued = issued;
+ *target_cap = cap;
}
/*
@@ -2977,7 +2990,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_mds_caps *h;
struct ceph_mds_cap_peer *peer = NULL;
int mds = session->s_mds;
- int op;
+ int op, issued;
u32 seq, mseq;
struct ceph_vino vino;
u64 cap_id;
@@ -3069,7 +3082,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_IMPORT:
handle_cap_import(mdsc, inode, h, peer, session,
- snaptrace, snaptrace_len);
+ &cap, &issued);
+ handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len,
+ msg->middle, session, cap, issued);
+ goto done_unlocked;
}
/* the rest require a cap */
@@ -3086,8 +3102,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
switch (op) {
case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT:
- case CEPH_CAP_OP_IMPORT:
- handle_cap_grant(inode, h, session, cap, msg->middle);
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+ handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle,
+ session, cap, issued);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 00d6af6a32ec..8d7d782f4382 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -169,7 +169,7 @@ static struct dentry *__get_parent(struct super_block *sb,
return dentry;
}
-struct dentry *ceph_get_parent(struct dentry *child)
+static struct dentry *ceph_get_parent(struct dentry *child)
{
/* don't re-export snaps */
if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 88a6df4cbe6d..302085100c28 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -418,7 +418,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
struct page **pages;
u64 off = iocb->ki_pos;
int num_pages, ret;
- size_t len = i->count;
+ size_t len = iov_iter_count(i);
dout("sync_read on file %p %llu~%u %s\n", file, off,
(unsigned)len,
@@ -436,25 +436,26 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
if (file->f_flags & O_DIRECT) {
while (iov_iter_count(i)) {
- void __user *data = i->iov[0].iov_base + i->iov_offset;
- size_t len = i->iov[0].iov_len - i->iov_offset;
+ size_t start;
+ ssize_t n;
- num_pages = calc_pages_for((unsigned long)data, len);
- pages = ceph_get_direct_page_vector(data,
- num_pages, true);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
+ n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
+ if (n < 0)
+ return n;
- ret = striped_read(inode, off, len,
+ num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ ret = striped_read(inode, off, n,
pages, num_pages, checkeof,
- 1, (unsigned long)data & ~PAGE_MASK);
+ 1, start);
+
ceph_put_page_vector(pages, num_pages, true);
if (ret <= 0)
break;
off += ret;
iov_iter_advance(i, ret);
- if (ret < len)
+ if (ret < n)
break;
}
} else {
@@ -466,25 +467,14 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
num_pages, checkeof, 0, 0);
if (ret > 0) {
int l, k = 0;
- size_t left = len = ret;
+ size_t left = ret;
while (left) {
- void __user *data = i->iov[0].iov_base
- + i->iov_offset;
- l = min(i->iov[0].iov_len - i->iov_offset,
- left);
-
- ret = ceph_copy_page_vector_to_user(&pages[k],
- data, off,
- l);
- if (ret > 0) {
- iov_iter_advance(i, ret);
- left -= ret;
- off += ret;
- k = calc_pages_for(iocb->ki_pos,
- len - left + 1) - 1;
- BUG_ON(k >= num_pages && left);
- } else
+ int copy = min_t(size_t, PAGE_SIZE, left);
+ l = copy_page_to_iter(pages[k++], 0, copy, i);
+ off += l;
+ left -= l;
+ if (l < copy)
break;
}
}
@@ -541,8 +531,7 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
* objects, rollback on failure, etc.)
*/
static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, size_t count)
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -556,11 +545,10 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
int written = 0;
int flags;
int check_caps = 0;
- int page_align;
int ret;
struct timespec mtime = CURRENT_TIME;
loff_t pos = iocb->ki_pos;
- struct iov_iter i;
+ size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
@@ -582,13 +570,10 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
- iov_iter_init(&i, iov, nr_segs, count, 0);
-
- while (iov_iter_count(&i) > 0) {
- void __user *data = i.iov->iov_base + i.iov_offset;
- u64 len = i.iov->iov_len - i.iov_offset;
-
- page_align = (unsigned long)data & ~PAGE_MASK;
+ while (iov_iter_count(from) > 0) {
+ u64 len = iov_iter_single_seg_count(from);
+ size_t start;
+ ssize_t n;
snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
@@ -604,20 +589,21 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
break;
}
- num_pages = calc_pages_for(page_align, len);
- pages = ceph_get_direct_page_vector(data, num_pages, false);
- if (IS_ERR(pages)) {
- ret = PTR_ERR(pages);
- goto out;
+ n = iov_iter_get_pages_alloc(from, &pages, len, &start);
+ if (unlikely(n < 0)) {
+ ret = n;
+ ceph_osdc_put_request(req);
+ break;
}
+ num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* throw out any page cache pages in this range. this
* may block.
*/
truncate_inode_pages_range(inode->i_mapping, pos,
- (pos+len) | (PAGE_CACHE_SIZE-1));
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
+ (pos+n) | (PAGE_CACHE_SIZE-1));
+ osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
false, false);
/* BUG_ON(vino.snap != CEPH_NOSNAP); */
@@ -629,22 +615,20 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
ceph_put_page_vector(pages, num_pages, false);
-out:
ceph_osdc_put_request(req);
- if (ret == 0) {
- pos += len;
- written += len;
- iov_iter_advance(&i, (size_t)len);
-
- if (pos > i_size_read(inode)) {
- check_caps = ceph_inode_set_size(inode, pos);
- if (check_caps)
- ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY,
- NULL);
- }
- } else
+ if (ret)
break;
+ pos += n;
+ written += n;
+ iov_iter_advance(from, n);
+
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
}
if (ret != -EOLDSNAPC && written > 0) {
@@ -662,8 +646,7 @@ out:
* correct atomic write, we should e.g. take write locks on all
* objects, rollback on failure, etc.)
*/
-static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, size_t count)
+static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -681,7 +664,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
int ret;
struct timespec mtime = CURRENT_TIME;
loff_t pos = iocb->ki_pos;
- struct iov_iter i;
+ size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
@@ -703,9 +686,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
CEPH_OSD_FLAG_WRITE |
CEPH_OSD_FLAG_ACK;
- iov_iter_init(&i, iov, nr_segs, count, 0);
-
- while ((len = iov_iter_count(&i)) > 0) {
+ while ((len = iov_iter_count(from)) > 0) {
size_t left;
int n;
@@ -737,13 +718,12 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
left = len;
for (n = 0; n < num_pages; n++) {
size_t plen = min_t(size_t, left, PAGE_SIZE);
- ret = iov_iter_copy_from_user(pages[n], &i, 0, plen);
+ ret = copy_page_from_iter(pages[n], 0, plen, from);
if (ret != plen) {
ret = -EFAULT;
break;
}
left -= ret;
- iov_iter_advance(&i, ret);
}
if (ret < 0) {
@@ -796,8 +776,7 @@ out:
*
* Hmm, the sync read case isn't actually async... should it be?
*/
-static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *filp = iocb->ki_filp;
struct ceph_file_info *fi = filp->private_data;
@@ -823,40 +802,20 @@ again:
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_filp->f_flags & O_DIRECT) ||
(fi->flags & CEPH_F_SYNC)) {
- struct iov_iter i;
dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
- if (!read) {
- ret = generic_segment_checks(iov, &nr_segs,
- &len, VERIFY_WRITE);
- if (ret)
- goto out;
- }
-
- iov_iter_init(&i, iov, nr_segs, len, read);
-
/* hmm, this isn't really async... */
- ret = ceph_sync_read(iocb, &i, &checkeof);
+ ret = ceph_sync_read(iocb, to, &checkeof);
} else {
- /*
- * We can't modify the content of iov,
- * so we only read from beginning.
- */
- if (read) {
- iocb->ki_pos = pos;
- len = iocb->ki_nbytes;
- read = 0;
- }
dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)len,
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
- ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ ret = generic_file_read_iter(iocb, to);
}
-out:
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
ceph_put_cap_refs(ci, got);
@@ -872,6 +831,7 @@ out:
", reading more\n", iocb->ki_pos,
inode->i_size);
+ iov_iter_advance(to, ret);
read += ret;
len -= ret;
checkeof = 0;
@@ -895,8 +855,7 @@ out:
*
* If we are near ENOSPC, write synchronously.
*/
-static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct ceph_file_info *fi = file->private_data;
@@ -904,18 +863,15 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
- ssize_t count, written = 0;
+ ssize_t count = iov_iter_count(from), written = 0;
int err, want, got;
+ loff_t pos = iocb->ki_pos;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
mutex_lock(&inode->i_mutex);
- err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
- if (err)
- goto out;
-
/* We can write back this queue in page reclaim */
current->backing_dev_info = file->f_mapping->backing_dev_info;
@@ -925,6 +881,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
+ iov_iter_truncate(from, count);
err = file_remove_suid(file);
if (err)
@@ -956,23 +913,26 @@ retry_snap:
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ struct iov_iter data;
mutex_unlock(&inode->i_mutex);
+ /* we might need to revert back to that point */
+ data = *from;
if (file->f_flags & O_DIRECT)
- written = ceph_sync_direct_write(iocb, iov,
- nr_segs, count);
+ written = ceph_sync_direct_write(iocb, &data);
else
- written = ceph_sync_write(iocb, iov, nr_segs, count);
+ written = ceph_sync_write(iocb, &data);
if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n",
inode, ceph_vinop(inode),
- pos, (unsigned)iov->iov_len);
+ pos, (unsigned)count);
mutex_lock(&inode->i_mutex);
goto retry_snap;
}
+ if (written > 0)
+ iov_iter_advance(from, written);
} else {
loff_t old_size = inode->i_size;
- struct iov_iter from;
/*
* No need to acquire the i_truncate_mutex. Because
* the MDS revokes Fwb caps before sending truncate
@@ -980,8 +940,7 @@ retry_snap:
* are pending vmtruncate. So write and vmtruncate
* can not run at the same time
*/
- iov_iter_init(&from, iov, nr_segs, count, 0);
- written = generic_perform_write(file, &from, pos);
+ written = generic_perform_write(file, from, pos);
if (likely(written >= 0))
iocb->ki_pos = pos + written;
if (inode->i_size > old_size)
@@ -999,7 +958,7 @@ retry_snap:
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ inode, ceph_vinop(inode), pos, (unsigned)count,
ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
@@ -1276,16 +1235,16 @@ const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
.llseek = ceph_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = ceph_aio_read,
- .aio_write = ceph_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = ceph_read_iter,
+ .write_iter = ceph_write_iter,
.mmap = ceph_mmap,
.fsync = ceph_fsync,
.lock = ceph_lock,
.flock = ceph_flock,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = ceph_ioctl,
.compat_ioctl = ceph_ioctl,
.fallocate = ceph_fallocate,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e4fff9ff1c27..04c89c266cec 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -10,6 +10,7 @@
#include <linux/writeback.h>
#include <linux/vmalloc.h>
#include <linux/posix_acl.h>
+#include <linux/random.h>
#include "super.h"
#include "mds_client.h"
@@ -179,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
* specified, copy the frag delegation info to the caller if
* it is present.
*/
-u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
- struct ceph_inode_frag *pfrag,
- int *found)
+static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
{
u32 t = ceph_frag_make(0, 0);
struct ceph_inode_frag *frag;
@@ -191,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
if (found)
*found = 0;
- mutex_lock(&ci->i_fragtree_mutex);
while (1) {
WARN_ON(!ceph_frag_contains_value(t, v));
frag = __ceph_find_frag(ci, t);
@@ -220,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
}
dout("choose_frag(%x) = %x\n", v, t);
- mutex_unlock(&ci->i_fragtree_mutex);
return t;
}
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
+{
+ u32 ret;
+ mutex_lock(&ci->i_fragtree_mutex);
+ ret = __ceph_choose_frag(ci, v, pfrag, found);
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return ret;
+}
+
/*
* Process dirfrag (delegation) info from the mds. Include leaf
* fragment in tree ONLY if ndist > 0. Otherwise, only
@@ -237,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode,
u32 id = le32_to_cpu(dirinfo->frag);
int mds = le32_to_cpu(dirinfo->auth);
int ndist = le32_to_cpu(dirinfo->ndist);
+ int diri_auth = -1;
int i;
int err = 0;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ diri_auth = ci->i_auth_cap->mds;
+ spin_unlock(&ci->i_ceph_lock);
+
mutex_lock(&ci->i_fragtree_mutex);
- if (ndist == 0) {
+ if (ndist == 0 && mds == diri_auth) {
/* no delegation info needed. */
frag = __ceph_find_frag(ci, id);
if (!frag)
@@ -286,6 +300,75 @@ out:
return err;
}
+static int ceph_fill_fragtree(struct inode *inode,
+ struct ceph_frag_tree_head *fragtree,
+ struct ceph_mds_reply_dirfrag *dirinfo)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag;
+ struct rb_node *rb_node;
+ int i;
+ u32 id, nsplits;
+ bool update = false;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ nsplits = le32_to_cpu(fragtree->nsplits);
+ if (nsplits) {
+ i = prandom_u32() % nsplits;
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ if (!__ceph_find_frag(ci, id))
+ update = true;
+ } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
+ rb_node = rb_first(&ci->i_fragtree);
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
+ update = true;
+ }
+ if (!update && dirinfo) {
+ id = le32_to_cpu(dirinfo->frag);
+ if (id != __ceph_choose_frag(ci, id, NULL, NULL))
+ update = true;
+ }
+ if (!update)
+ goto out_unlock;
+
+ dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
+ rb_node = rb_first(&ci->i_fragtree);
+ for (i = 0; i < nsplits; i++) {
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ frag = NULL;
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (ceph_frag_compare(frag->frag, id) >= 0) {
+ if (frag->frag != id)
+ frag = NULL;
+ else
+ rb_node = rb_next(rb_node);
+ break;
+ }
+ rb_node = rb_next(rb_node);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ frag = NULL;
+ }
+ if (!frag) {
+ frag = __get_or_create_frag(ci, id);
+ if (IS_ERR(frag))
+ continue;
+ }
+ frag->split_by = le32_to_cpu(fragtree->splits[i].by);
+ dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ }
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ rb_node = rb_next(rb_node);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ }
+out_unlock:
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return 0;
+}
/*
* initialize a newly allocated inode.
@@ -341,7 +424,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
- ci->i_cap_exporting_issued = 0;
for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
ci->i_nr_by_mode[i] = 0;
@@ -407,7 +489,7 @@ void ceph_destroy_inode(struct inode *inode)
/*
* we may still have a snap_realm reference if there are stray
- * caps in i_cap_exporting_issued or i_snap_caps.
+ * caps in i_snap_caps.
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
@@ -582,22 +664,26 @@ static int fill_inode(struct inode *inode,
unsigned long ttl_from, int cap_fmode,
struct ceph_cap_reservation *caps_reservation)
{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
- int i;
- int issued = 0, implemented;
+ int issued = 0, implemented, new_issued;
struct timespec mtime, atime, ctime;
- u32 nsplits;
- struct ceph_inode_frag *frag;
- struct rb_node *rb_node;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_cap *new_cap = NULL;
int err = 0;
- int queue_trunc = 0;
+ bool wake = false;
+ bool queue_trunc = false;
+ bool new_version = false;
dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
+ /* prealloc new cap struct */
+ if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP)
+ new_cap = ceph_get_cap(mdsc, caps_reservation);
+
/*
* prealloc xattr data, if it looks like we'll need it. only
* if len > 4 (meaning there are actually xattrs; the first 4
@@ -623,19 +709,23 @@ static int fill_inode(struct inode *inode,
* 3 2 skip
* 3 3 update
*/
- if (le64_to_cpu(info->version) > 0 &&
- (ci->i_version & ~1) >= le64_to_cpu(info->version))
- goto no_change;
-
+ if (ci->i_version == 0 ||
+ ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ le64_to_cpu(info->version) > (ci->i_version & ~1)))
+ new_version = true;
+
issued = __ceph_caps_issued(ci, &implemented);
issued |= implemented | __ceph_caps_dirty(ci);
+ new_issued = ~issued & le32_to_cpu(info->cap.caps);
/* update inode */
ci->i_version = le64_to_cpu(info->version);
inode->i_version++;
inode->i_rdev = le32_to_cpu(info->rdev);
+ inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
+ (issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(info->mode);
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
@@ -644,23 +734,35 @@ static int fill_inode(struct inode *inode,
from_kgid(&init_user_ns, inode->i_gid));
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
+ (issued & CEPH_CAP_LINK_EXCL) == 0)
set_nlink(inode, le32_to_cpu(info->nlink));
- /* be careful with mtime, atime, size */
- ceph_decode_timespec(&atime, &info->atime);
- ceph_decode_timespec(&mtime, &info->mtime);
- ceph_decode_timespec(&ctime, &info->ctime);
- queue_trunc = ceph_fill_file_size(inode, issued,
- le32_to_cpu(info->truncate_seq),
- le64_to_cpu(info->truncate_size),
- le64_to_cpu(info->size));
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(info->time_warp_seq),
- &ctime, &mtime, &atime);
-
- ci->i_layout = info->layout;
- inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
+ /* be careful with mtime, atime, size */
+ ceph_decode_timespec(&atime, &info->atime);
+ ceph_decode_timespec(&mtime, &info->mtime);
+ ceph_decode_timespec(&ctime, &info->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(info->time_warp_seq),
+ &ctime, &mtime, &atime);
+ }
+
+ if (new_version ||
+ (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ ci->i_layout = info->layout;
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(info->truncate_seq),
+ le64_to_cpu(info->truncate_size),
+ le64_to_cpu(info->size));
+ /* only update max_size on auth cap */
+ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ ci->i_max_size != le64_to_cpu(info->max_size)) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size,
+ le64_to_cpu(info->max_size));
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ }
+ }
/* xattrs */
/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
@@ -745,58 +847,6 @@ static int fill_inode(struct inode *inode,
dout(" marking %p complete (empty)\n", inode);
__ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
}
-no_change:
- /* only update max_size on auth cap */
- if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
- ci->i_max_size != le64_to_cpu(info->max_size)) {
- dout("max_size %lld -> %llu\n", ci->i_max_size,
- le64_to_cpu(info->max_size));
- ci->i_max_size = le64_to_cpu(info->max_size);
- }
-
- spin_unlock(&ci->i_ceph_lock);
-
- /* queue truncate if we saw i_size decrease */
- if (queue_trunc)
- ceph_queue_vmtruncate(inode);
-
- /* populate frag tree */
- /* FIXME: move me up, if/when version reflects fragtree changes */
- nsplits = le32_to_cpu(info->fragtree.nsplits);
- mutex_lock(&ci->i_fragtree_mutex);
- rb_node = rb_first(&ci->i_fragtree);
- for (i = 0; i < nsplits; i++) {
- u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
- frag = NULL;
- while (rb_node) {
- frag = rb_entry(rb_node, struct ceph_inode_frag, node);
- if (ceph_frag_compare(frag->frag, id) >= 0) {
- if (frag->frag != id)
- frag = NULL;
- else
- rb_node = rb_next(rb_node);
- break;
- }
- rb_node = rb_next(rb_node);
- rb_erase(&frag->node, &ci->i_fragtree);
- kfree(frag);
- frag = NULL;
- }
- if (!frag) {
- frag = __get_or_create_frag(ci, id);
- if (IS_ERR(frag))
- continue;
- }
- frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
- dout(" frag %x split by %d\n", frag->frag, frag->split_by);
- }
- while (rb_node) {
- frag = rb_entry(rb_node, struct ceph_inode_frag, node);
- rb_node = rb_next(rb_node);
- rb_erase(&frag->node, &ci->i_fragtree);
- kfree(frag);
- }
- mutex_unlock(&ci->i_fragtree_mutex);
/* were we issued a capability? */
if (info->cap.caps) {
@@ -809,30 +859,41 @@ no_change:
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
le64_to_cpu(info->cap.realm),
- info->cap.flags,
- caps_reservation);
+ info->cap.flags, &new_cap);
+ wake = true;
} else {
- spin_lock(&ci->i_ceph_lock);
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(le32_to_cpu(info->cap.caps)));
ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
if (cap_fmode >= 0)
__ceph_get_fmode(ci, cap_fmode);
- spin_unlock(&ci->i_ceph_lock);
}
} else if (cap_fmode >= 0) {
pr_warn("mds issued no caps on %llx.%llx\n",
ceph_vinop(inode));
__ceph_get_fmode(ci, cap_fmode);
}
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (wake)
+ wake_up_all(&ci->i_cap_wq);
+
+ /* queue truncate if we saw i_size decrease */
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+
+ /* populate frag tree */
+ if (S_ISDIR(inode->i_mode))
+ ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
/* update delegation info? */
if (dirinfo)
ceph_fill_dirfrag(inode, dirinfo);
err = 0;
-
out:
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
if (xattr_blob)
ceph_buffer_put(xattr_blob);
return err;
@@ -1485,7 +1546,7 @@ static void ceph_invalidate_work(struct work_struct *work)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- truncate_inode_pages(inode->i_mapping, 0);
+ truncate_pagecache(inode, 0);
spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
@@ -1588,7 +1649,7 @@ retry:
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
- truncate_inode_pages(inode->i_mapping, to);
+ truncate_pagecache(inode, to);
spin_lock(&ci->i_ceph_lock);
if (to == ci->i_truncate_size) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9a33b98cb000..92a2548278fc 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1558,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
+ req->r_stamp = CURRENT_TIME;
+
req->r_op = op;
req->r_direct_mode = mode;
return req;
@@ -1783,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
}
len = sizeof(*head) +
- pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
+ pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
+ sizeof(struct timespec);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -1800,6 +1803,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
goto out_free2;
}
+ msg->hdr.version = 2;
msg->hdr.tid = cpu_to_le64(req->r_tid);
head = msg->front.iov_base;
@@ -1836,6 +1840,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
head->num_releases = cpu_to_le16(releases);
+ /* time stamp */
+ ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+
BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e90cfccf93bd..e00737cf523c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -194,6 +194,7 @@ struct ceph_mds_request {
int r_fmode; /* file mode, if expecting cap */
kuid_t r_uid;
kgid_t r_gid;
+ struct timespec r_stamp;
/* for choosing which mds to send this request to */
int r_direct_mode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ead05cc1f447..12b20744e386 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -292,7 +292,6 @@ struct ceph_inode_info {
struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
- unsigned i_cap_exporting_issued;
int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
@@ -775,11 +774,13 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
extern const char *ceph_cap_string(int c);
extern void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg);
-extern int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned cap, unsigned seq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation);
+extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx);
+extern void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned cap, unsigned seq, u64 realmino, int flags,
+ struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 6aaa8112c538..2c90d07c0b3a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -725,8 +725,7 @@ out_nls:
goto out;
}
-static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct cifsInodeInfo *cinode = CIFS_I(inode);
@@ -737,14 +736,14 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (written)
return written;
- written = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ written = generic_file_write_iter(iocb, from);
if (CIFS_CACHE_WRITE(CIFS_I(inode)))
goto out;
rc = filemap_fdatawrite(inode->i_mapping);
if (rc)
- cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n",
+ cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
rc, inode);
out:
@@ -880,10 +879,10 @@ const struct inode_operations cifs_symlink_inode_ops = {
};
const struct file_operations cifs_file_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = cifs_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = cifs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -899,10 +898,10 @@ const struct file_operations cifs_file_ops = {
};
const struct file_operations cifs_file_strict_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = cifs_strict_readv,
- .aio_write = cifs_strict_writev,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_strict_readv,
+ .write_iter = cifs_strict_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -919,10 +918,10 @@ const struct file_operations cifs_file_strict_ops = {
const struct file_operations cifs_file_direct_ops = {
/* BB reevaluate whether they can be done with directio, no cache */
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = cifs_user_readv,
- .aio_write = cifs_user_writev,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_user_readv,
+ .write_iter = cifs_user_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -938,10 +937,10 @@ const struct file_operations cifs_file_direct_ops = {
};
const struct file_operations cifs_file_nobrl_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = cifs_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = cifs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
@@ -956,10 +955,10 @@ const struct file_operations cifs_file_nobrl_ops = {
};
const struct file_operations cifs_file_strict_nobrl_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = cifs_strict_readv,
- .aio_write = cifs_strict_writev,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_strict_readv,
+ .write_iter = cifs_strict_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_strict_fsync,
@@ -975,10 +974,10 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
const struct file_operations cifs_file_direct_nobrl_ops = {
/* BB reevaluate whether they can be done with directio, no cache */
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = cifs_user_readv,
- .aio_write = cifs_user_writev,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_user_readv,
+ .write_iter = cifs_user_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 8fe51166d6e3..70f178a7c759 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -95,14 +95,10 @@ extern const struct file_operations cifs_file_strict_nobrl_ops;
extern int cifs_open(struct inode *inode, struct file *file);
extern int cifs_close(struct inode *inode, struct file *file);
extern int cifs_closedir(struct inode *inode, struct file *file);
-extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
+extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 208f56eca4bf..e90a1e9aa627 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2385,14 +2385,12 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata)
}
static ssize_t
-cifs_iovec_write(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *poffset)
+cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
{
unsigned long nr_pages, i;
size_t bytes, copied, len, cur_len;
ssize_t total_written = 0;
loff_t offset;
- struct iov_iter it;
struct cifsFileInfo *open_file;
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
@@ -2401,14 +2399,16 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
int rc;
pid_t pid;
- len = iov_length(iov, nr_segs);
- if (!len)
- return 0;
-
+ len = iov_iter_count(from);
rc = generic_write_checks(file, poffset, &len, 0);
if (rc)
return rc;
+ if (!len)
+ return 0;
+
+ iov_iter_truncate(from, len);
+
INIT_LIST_HEAD(&wdata_list);
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
open_file = file->private_data;
@@ -2424,7 +2424,6 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
else
pid = current->tgid;
- iov_iter_init(&it, iov, nr_segs, len, 0);
do {
size_t save_len;
@@ -2444,11 +2443,10 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
save_len = cur_len;
for (i = 0; i < nr_pages; i++) {
- bytes = min_t(const size_t, cur_len, PAGE_SIZE);
- copied = iov_iter_copy_from_user(wdata->pages[i], &it,
- 0, bytes);
+ bytes = min_t(size_t, cur_len, PAGE_SIZE);
+ copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
+ from);
cur_len -= copied;
- iov_iter_advance(&it, copied);
/*
* If we didn't copy as much as we expected, then that
* may mean we trod into an unmapped area. Stop copying
@@ -2546,11 +2544,11 @@ restart_loop:
return total_written ? total_written : (ssize_t)rc;
}
-ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t written;
struct inode *inode;
+ loff_t pos = iocb->ki_pos;
inode = file_inode(iocb->ki_filp);
@@ -2560,7 +2558,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
* write request.
*/
- written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
+ written = cifs_iovec_write(iocb->ki_filp, from, &pos);
if (written > 0) {
set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags);
iocb->ki_pos = pos;
@@ -2570,8 +2568,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
}
static ssize_t
-cifs_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+cifs_writev(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
@@ -2589,10 +2586,10 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
mutex_lock(&inode->i_mutex);
if (file->f_flags & O_APPEND)
lock_pos = i_size_read(inode);
- if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs),
+ if (!cifs_find_lock_conflict(cfile, lock_pos, iov_iter_count(from),
server->vals->exclusive_lock_type, NULL,
CIFS_WRITE_OP)) {
- rc = __generic_file_aio_write(iocb, iov, nr_segs);
+ rc = __generic_file_write_iter(iocb, from);
mutex_unlock(&inode->i_mutex);
if (rc > 0) {
@@ -2610,8 +2607,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
}
ssize_t
-cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct cifsInodeInfo *cinode = CIFS_I(inode);
@@ -2629,11 +2625,10 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
&& ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
- written = generic_file_aio_write(
- iocb, iov, nr_segs, pos);
+ written = generic_file_write_iter(iocb, from);
goto out;
}
- written = cifs_writev(iocb, iov, nr_segs, pos);
+ written = cifs_writev(iocb, from);
goto out;
}
/*
@@ -2642,7 +2637,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
* affected pages because it may cause a error with mandatory locks on
* these pages but not on the region from pos to ppos+len-1.
*/
- written = cifs_user_writev(iocb, iov, nr_segs, pos);
+ written = cifs_user_writev(iocb, from);
if (written > 0 && CIFS_CACHE_READ(cinode)) {
/*
* Windows 7 server can delay breaking level2 oplock if a write
@@ -2831,32 +2826,25 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
return total_read > 0 ? total_read : result;
}
-ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
ssize_t rc;
size_t len, cur_len;
ssize_t total_read = 0;
- loff_t offset = pos;
+ loff_t offset = iocb->ki_pos;
unsigned int npages;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
struct cifsFileInfo *open_file;
struct cifs_readdata *rdata, *tmp;
struct list_head rdata_list;
- struct iov_iter to;
pid_t pid;
- if (!nr_segs)
- return 0;
-
- len = iov_length(iov, nr_segs);
+ len = iov_iter_count(to);
if (!len)
return 0;
- iov_iter_init(&to, iov, nr_segs, len, 0);
-
INIT_LIST_HEAD(&rdata_list);
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
open_file = file->private_data;
@@ -2914,7 +2902,7 @@ error:
if (!list_empty(&rdata_list))
rc = 0;
- len = iov_iter_count(&to);
+ len = iov_iter_count(to);
/* the loop below should proceed in the order of increasing offsets */
list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
again:
@@ -2931,7 +2919,7 @@ error:
goto again;
}
} else {
- rc = cifs_readdata_to_iov(rdata, &to);
+ rc = cifs_readdata_to_iov(rdata, to);
}
}
@@ -2939,7 +2927,7 @@ error:
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
- total_read = len - iov_iter_count(&to);
+ total_read = len - iov_iter_count(to);
cifs_stats_bytes_read(tcon, total_read);
@@ -2948,15 +2936,14 @@ error:
rc = 0;
if (total_read) {
- iocb->ki_pos = pos + total_read;
+ iocb->ki_pos += total_read;
return total_read;
}
return rc;
}
ssize_t
-cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct cifsInodeInfo *cinode = CIFS_I(inode);
@@ -2975,22 +2962,22 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
* pos+len-1.
*/
if (!CIFS_CACHE_READ(cinode))
- return cifs_user_readv(iocb, iov, nr_segs, pos);
+ return cifs_user_readv(iocb, to);
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- return generic_file_aio_read(iocb, iov, nr_segs, pos);
+ return generic_file_read_iter(iocb, to);
/*
* We need to hold the sem to be sure nobody modifies lock list
* with a brlock that prevents reading.
*/
down_read(&cinode->lock_sem);
- if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
+ if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
tcon->ses->server->vals->shared_lock_type,
NULL, CIFS_READ_OP))
- rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ rc = generic_file_read_iter(iocb, to);
up_read(&cinode->lock_sem);
return rc;
}
@@ -3703,8 +3690,8 @@ void cifs_oplock_break(struct work_struct *work)
* Direct IO is not yet supported in the cached mode.
*/
static ssize_t
-cifs_direct_io(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t pos, unsigned long nr_segs)
+cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos)
{
/*
* FIXME
diff --git a/fs/dcache.c b/fs/dcache.c
index 1792d6075b4f..06f65857a855 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -532,10 +532,12 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
struct dentry *parent = dentry->d_parent;
if (IS_ROOT(dentry))
return NULL;
+ if (unlikely((int)dentry->d_lockref.count < 0))
+ return NULL;
if (likely(spin_trylock(&parent->d_lock)))
return parent;
- spin_unlock(&dentry->d_lock);
rcu_read_lock();
+ spin_unlock(&dentry->d_lock);
again:
parent = ACCESS_ONCE(dentry->d_parent);
spin_lock(&parent->d_lock);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 31ba0935e32e..98040ba388ac 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -77,7 +77,6 @@ struct dio_submit {
unsigned blocks_available; /* At block_in_file. changes */
int reap_counter; /* rate limit reaping */
sector_t final_block_in_request;/* doesn't change */
- unsigned first_block_in_page; /* doesn't change, Used only once */
int boundary; /* prev block is at a boundary */
get_block_t *get_block; /* block mapping function */
dio_submit_t *submit_io; /* IO submition function */
@@ -98,19 +97,14 @@ struct dio_submit {
sector_t cur_page_block; /* Where it starts */
loff_t cur_page_fs_offset; /* Offset in file */
- /*
- * Page fetching state. These variables belong to dio_refill_pages().
- */
- int curr_page; /* changes */
- int total_pages; /* doesn't change */
- unsigned long curr_user_address;/* changes */
-
+ struct iov_iter *iter;
/*
* Page queue. These variables belong to dio_refill_pages() and
* dio_get_page().
*/
unsigned head; /* next page to process */
unsigned tail; /* last valid page + 1 */
+ size_t from, to;
};
/* dio_state communicated between submission path and end_io */
@@ -163,15 +157,10 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio)
*/
static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
- int ret;
- int nr_pages;
+ ssize_t ret;
- nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
- ret = get_user_pages_fast(
- sdio->curr_user_address, /* Where from? */
- nr_pages, /* How many pages? */
- dio->rw == READ, /* Write to memory? */
- &dio->pages[0]); /* Put results here */
+ ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE,
+ &sdio->from);
if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
struct page *page = ZERO_PAGE(0);
@@ -186,18 +175,19 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
dio->pages[0] = page;
sdio->head = 0;
sdio->tail = 1;
- ret = 0;
- goto out;
+ sdio->from = 0;
+ sdio->to = PAGE_SIZE;
+ return 0;
}
if (ret >= 0) {
- sdio->curr_user_address += ret * PAGE_SIZE;
- sdio->curr_page += ret;
+ iov_iter_advance(sdio->iter, ret);
+ ret += sdio->from;
sdio->head = 0;
- sdio->tail = ret;
- ret = 0;
+ sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
+ sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
+ return 0;
}
-out:
return ret;
}
@@ -208,8 +198,9 @@ out:
* L1 cache.
*/
static inline struct page *dio_get_page(struct dio *dio,
- struct dio_submit *sdio)
+ struct dio_submit *sdio, size_t *from, size_t *to)
{
+ int n;
if (dio_pages_present(sdio) == 0) {
int ret;
@@ -218,7 +209,10 @@ static inline struct page *dio_get_page(struct dio *dio,
return ERR_PTR(ret);
BUG_ON(dio_pages_present(sdio) == 0);
}
- return dio->pages[sdio->head++];
+ n = sdio->head++;
+ *from = n ? 0 : sdio->from;
+ *to = (n == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
+ return dio->pages[n];
}
/**
@@ -422,8 +416,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
*/
static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
{
- while (dio_pages_present(sdio))
- page_cache_release(dio_get_page(dio, sdio));
+ while (sdio->head < sdio->tail)
+ page_cache_release(dio->pages[sdio->head++]);
}
/*
@@ -912,23 +906,18 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
struct buffer_head *map_bh)
{
const unsigned blkbits = sdio->blkbits;
- const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
- struct page *page;
- unsigned block_in_page;
int ret = 0;
- /* The I/O can start at any block offset within the first page */
- block_in_page = sdio->first_block_in_page;
-
while (sdio->block_in_file < sdio->final_block_in_request) {
- page = dio_get_page(dio, sdio);
+ struct page *page;
+ size_t from, to;
+ page = dio_get_page(dio, sdio, &from, &to);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
}
- while (block_in_page < blocks_per_page) {
- unsigned offset_in_page = block_in_page << blkbits;
+ while (from < to) {
unsigned this_chunk_bytes; /* # of bytes mapped */
unsigned this_chunk_blocks; /* # of blocks */
unsigned u;
@@ -999,10 +988,10 @@ do_holes:
page_cache_release(page);
goto out;
}
- zero_user(page, block_in_page << blkbits,
- 1 << blkbits);
+ zero_user(page, from, 1 << blkbits);
sdio->block_in_file++;
- block_in_page++;
+ from += 1 << blkbits;
+ dio->result += 1 << blkbits;
goto next_block;
}
@@ -1019,7 +1008,7 @@ do_holes:
* can add to this page
*/
this_chunk_blocks = sdio->blocks_available;
- u = (PAGE_SIZE - offset_in_page) >> blkbits;
+ u = (to - from) >> blkbits;
if (this_chunk_blocks > u)
this_chunk_blocks = u;
u = sdio->final_block_in_request - sdio->block_in_file;
@@ -1031,7 +1020,7 @@ do_holes:
if (this_chunk_blocks == sdio->blocks_available)
sdio->boundary = buffer_boundary(map_bh);
ret = submit_page_section(dio, sdio, page,
- offset_in_page,
+ from,
this_chunk_bytes,
sdio->next_block_for_io,
map_bh);
@@ -1042,7 +1031,8 @@ do_holes:
sdio->next_block_for_io += this_chunk_blocks;
sdio->block_in_file += this_chunk_blocks;
- block_in_page += this_chunk_blocks;
+ from += this_chunk_bytes;
+ dio->result += this_chunk_bytes;
sdio->blocks_available -= this_chunk_blocks;
next_block:
BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
@@ -1052,7 +1042,6 @@ next_block:
/* Drop the ref which was taken in get_user_pages() */
page_cache_release(page);
- block_in_page = 0;
}
out:
return ret;
@@ -1107,24 +1096,20 @@ static inline int drop_refcount(struct dio *dio)
*/
static inline ssize_t
do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, const struct iovec *iov, loff_t offset,
- unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+ struct block_device *bdev, struct iov_iter *iter, loff_t offset,
+ get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
- int seg;
- size_t size;
- unsigned long addr;
unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
- loff_t end = offset;
+ loff_t end = offset + iov_iter_count(iter);
struct dio *dio;
struct dio_submit sdio = { 0, };
- unsigned long user_addr;
- size_t bytes;
struct buffer_head map_bh = { 0, };
struct blk_plug plug;
+ unsigned long align = offset | iov_iter_alignment(iter);
if (rw & WRITE)
rw = WRITE_ODIRECT;
@@ -1134,32 +1119,16 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
* the early prefetch in the caller enough time.
*/
- if (offset & blocksize_mask) {
+ if (align & blocksize_mask) {
if (bdev)
blkbits = blksize_bits(bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
- if (offset & blocksize_mask)
+ if (align & blocksize_mask)
goto out;
}
- /* Check the memory alignment. Blocks cannot straddle pages */
- for (seg = 0; seg < nr_segs; seg++) {
- addr = (unsigned long)iov[seg].iov_base;
- size = iov[seg].iov_len;
- end += size;
- if (unlikely((addr & blocksize_mask) ||
- (size & blocksize_mask))) {
- if (bdev)
- blkbits = blksize_bits(
- bdev_logical_block_size(bdev));
- blocksize_mask = (1 << blkbits) - 1;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
- goto out;
- }
- }
-
/* watch out for a 0 len io from a tricksy fs */
- if (rw == READ && end == offset)
+ if (rw == READ && !iov_iter_count(iter))
return 0;
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
@@ -1249,6 +1218,10 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
+ sdio.iter = iter;
+ sdio.final_block_in_request =
+ (offset + iov_iter_count(iter)) >> blkbits;
+
/*
* In case of non-aligned buffers, we may need 2 more
* pages since we need to zero out first and last block.
@@ -1256,47 +1229,13 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (unlikely(sdio.blkfactor))
sdio.pages_in_io = 2;
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- sdio.pages_in_io +=
- ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
- PAGE_SIZE - user_addr / PAGE_SIZE);
- }
+ sdio.pages_in_io += iov_iter_npages(iter, INT_MAX);
blk_start_plug(&plug);
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- sdio.size += bytes = iov[seg].iov_len;
-
- /* Index into the first page of the first block */
- sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
- sdio.final_block_in_request = sdio.block_in_file +
- (bytes >> blkbits);
- /* Page fetching state */
- sdio.head = 0;
- sdio.tail = 0;
- sdio.curr_page = 0;
-
- sdio.total_pages = 0;
- if (user_addr & (PAGE_SIZE-1)) {
- sdio.total_pages++;
- bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
- }
- sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
- sdio.curr_user_address = user_addr;
-
- retval = do_direct_IO(dio, &sdio, &map_bh);
-
- dio->result += iov[seg].iov_len -
- ((sdio.final_block_in_request - sdio.block_in_file) <<
- blkbits);
-
- if (retval) {
- dio_cleanup(dio, &sdio);
- break;
- }
- } /* end iovec loop */
+ retval = do_direct_IO(dio, &sdio, &map_bh);
+ if (retval)
+ dio_cleanup(dio, &sdio);
if (retval == -ENOTBLK) {
/*
@@ -1365,8 +1304,8 @@ out:
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, const struct iovec *iov, loff_t offset,
- unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+ struct block_device *bdev, struct iov_iter *iter, loff_t offset,
+ get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
/*
@@ -1381,9 +1320,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
prefetch(bdev->bd_queue);
prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
- return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io,
- submit_io, flags);
+ return do_blockdev_direct_IO(rw, iocb, inode, bdev, iter, offset,
+ get_block, end_io, submit_io, flags);
}
EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 1e5b45359509..d08e079ea5d3 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -617,6 +617,11 @@ static void retry_failed_sctp_send(struct connection *recv_con,
int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
log_print("Retry sending %d bytes to node id %d", len, nodeid);
+
+ if (!nodeid) {
+ log_print("Shouldn't resend data via listening connection.");
+ return;
+ }
con = nodeid2con(nodeid, 0);
if (!con) {
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index b1eaa7a1f82c..db0fad3269c0 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -45,14 +45,13 @@
* The function to be used for directory reads is ecryptfs_read.
*/
static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ struct iov_iter *to)
{
ssize_t rc;
struct path *path;
struct file *file = iocb->ki_filp;
- rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ rc = generic_file_read_iter(iocb, to);
/*
* Even though this is a async interface, we need to wait
* for IO to finish to update atime
@@ -352,10 +351,10 @@ const struct file_operations ecryptfs_dir_fops = {
const struct file_operations ecryptfs_main_fops = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = ecryptfs_read_update_atime,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = ecryptfs_read_update_atime,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.iterate = ecryptfs_readdir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b73e0621ce9e..b10b48c2a7af 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -910,7 +910,7 @@ static const struct file_operations eventpoll_fops = {
void eventpoll_release_file(struct file *file)
{
struct eventpoll *ep;
- struct epitem *epi;
+ struct epitem *epi, *next;
/*
* We don't want to get "file->f_lock" because it is not
@@ -926,7 +926,7 @@ void eventpoll_release_file(struct file *file)
* Besides, ep_remove() acquires the lock, so we can't hold it here.
*/
mutex_lock(&epmutex);
- list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
+ list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
ep = epi->ep;
mutex_lock_nested(&ep->mtx, 0);
ep_remove(ep, epi);
diff --git a/fs/exec.c b/fs/exec.c
index 238b7aa26f68..a3d33fe592d6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1046,13 +1046,13 @@ EXPORT_SYMBOL_GPL(get_task_comm);
* so that a new one can be started
*/
-void set_task_comm(struct task_struct *tsk, const char *buf)
+void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
task_lock(tsk);
trace_task_rename(tsk, buf);
strlcpy(tsk->comm, buf, sizeof(tsk->comm));
task_unlock(tsk);
- perf_event_comm(tsk);
+ perf_event_comm(tsk, exec);
}
int flush_old_exec(struct linux_binprm * bprm)
@@ -1110,7 +1110,8 @@ void setup_new_exec(struct linux_binprm * bprm)
else
set_dumpable(current->mm, suid_dumpable);
- set_task_comm(current, kbasename(bprm->filename));
+ perf_event_exec();
+ __set_task_comm(current, kbasename(bprm->filename), true);
/* Set the new mm task size. We have to do that late because it may
* depend on TIF_32BIT which is only updated in flush_thread() on
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 491c6c078e7f..71bf8e4fb5d4 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -67,17 +67,17 @@ static int exofs_flush(struct file *file, fl_owner_t id)
const struct file_operations exofs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = generic_file_open,
.release = exofs_release_file,
.fsync = exofs_file_fsync,
.flush = exofs_flush,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
const struct inode_operations exofs_file_inode_operations = {
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1c244d67667..3f9cafd73931 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -964,7 +964,7 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset,
/* TODO: Should be easy enough to do proprly */
static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
return 0;
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 44c36e590765..7c87b22a7228 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -62,10 +62,10 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
const struct file_operations ext2_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
@@ -75,7 +75,7 @@ const struct file_operations ext2_file_operations = {
.release = ext2_release_file,
.fsync = ext2_fsync,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
#ifdef CONFIG_EXT2_FS_XIP
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b1d2a4675d42..36d35c36311d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -850,18 +850,18 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
}
static ssize_t
-ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- ext2_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
if (ret < 0 && (rw & WRITE))
- ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
+ ext2_write_failed(mapping, offset + count);
return ret;
}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index aad05311392a..a062fa1e1b11 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -50,10 +50,10 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
@@ -63,7 +63,7 @@ const struct file_operations ext3_file_operations = {
.release = ext3_release_file,
.fsync = ext3_sync_file,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
const struct inode_operations ext3_file_inode_operations = {
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f5157d0d1b43..2c6ccc49ba27 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1716,17 +1716,17 @@ static int ext3_journalled_writepage(struct page *page,
WARN_ON_ONCE(IS_RDONLY(inode) &&
!(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
- if (ext3_journal_current_handle())
- goto no_write;
-
trace_ext3_journalled_writepage(page);
- handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto no_write;
- }
-
if (!page_has_buffers(page) || PageChecked(page)) {
+ if (ext3_journal_current_handle())
+ goto no_write;
+
+ handle = ext3_journal_start(inode,
+ ext3_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto no_write;
+ }
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
@@ -1749,17 +1749,18 @@ static int ext3_journalled_writepage(struct page *page,
atomic_set(&EXT3_I(inode)->i_datasync_tid,
handle->h_transaction->t_tid);
unlock_page(page);
+ err = ext3_journal_stop(handle);
+ if (!ret)
+ ret = err;
} else {
/*
- * It may be a page full of checkpoint-mode buffers. We don't
- * really know unless we go poke around in the buffer_heads.
- * But block_write_full_page will do the right thing.
+ * It is a page full of checkpoint-mode buffers. Go and write
+ * them. They should have been already mapped when they went
+ * to the journal so provide NULL get_block function to catch
+ * errors.
*/
- ret = block_write_full_page(page, ext3_get_block, wbc);
+ ret = block_write_full_page(page, NULL, wbc);
}
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
out:
return ret;
@@ -1820,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
* VFS code falls back into buffered path in that case so we are safe.
*/
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -1829,10 +1829,10 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
handle_t *handle;
ssize_t ret;
int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
int retries = 0;
- trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+ trace_ext3_direct_IO_enter(inode, offset, count, rw);
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -1856,15 +1856,14 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
}
retry:
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- ext3_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again.
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
ext3_truncate_failed_direct_write(inode);
@@ -1909,8 +1908,7 @@ retry:
ret = err;
}
out:
- trace_ext3_direct_IO_exit(inode, offset,
- iov_length(iov, nr_segs), rw, ret);
+ trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);
return ret;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1479e2ae00d2..7cc5a0e23688 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2140,8 +2140,7 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs);
+ struct iov_iter *iter, loff_t offset);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4e8bc284ec0e..8695f70af1ef 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -74,26 +74,22 @@ static void ext4_unwritten_wait(struct inode *inode)
* or one thread will zero the other's data, causing corruption.
*/
static int
-ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
struct super_block *sb = inode->i_sb;
int blockmask = sb->s_blocksize - 1;
- size_t count = iov_length(iov, nr_segs);
- loff_t final_size = pos + count;
if (pos >= i_size_read(inode))
return 0;
- if ((pos & blockmask) || (final_size & blockmask))
+ if ((pos | iov_iter_alignment(from)) & blockmask)
return 1;
return 0;
}
static ssize_t
-ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(iocb->ki_filp);
@@ -101,10 +97,9 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
struct blk_plug plug;
int o_direct = file->f_flags & O_DIRECT;
int overwrite = 0;
- size_t length = iov_length(iov, nr_segs);
+ size_t length = iov_iter_count(from);
ssize_t ret;
-
- BUG_ON(iocb->ki_pos != pos);
+ loff_t pos = iocb->ki_pos;
/*
* Unaligned direct AIO must be serialized; see comment above
@@ -114,7 +109,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
!is_sync_kiocb(iocb) &&
(file->f_flags & O_APPEND ||
- ext4_unaligned_aio(inode, iov, nr_segs, pos))) {
+ ext4_unaligned_aio(inode, from, pos))) {
aio_mutex = ext4_aio_mutex(inode);
mutex_lock(aio_mutex);
ext4_unwritten_wait(inode);
@@ -138,10 +133,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
goto errout;
}
- if (pos + length > sbi->s_bitmap_maxbytes) {
- nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
- sbi->s_bitmap_maxbytes - pos);
- }
+ if (pos + length > sbi->s_bitmap_maxbytes)
+ iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
}
if (o_direct) {
@@ -179,7 +172,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
}
}
- ret = __generic_file_aio_write(iocb, iov, nr_segs);
+ ret = __generic_file_write_iter(iocb, from);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {
@@ -594,10 +587,10 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ext4_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
@@ -607,7 +600,7 @@ const struct file_operations ext4_file_operations = {
.release = ext4_release_file,
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 594009f5f523..8a57e9fcd1b9 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -639,8 +639,7 @@ out:
* VFS code falls back into buffered path in that case so we are safe.
*/
ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -648,7 +647,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
handle_t *handle;
ssize_t ret;
int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
int retries = 0;
if (rw == WRITE) {
@@ -687,18 +686,17 @@ retry:
goto locked;
}
ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
+ inode->i_sb->s_bdev, iter, offset,
ext4_get_block, NULL, NULL, 0);
inode_dio_done(inode);
} else {
locked:
- ret = blockdev_direct_IO(rw, iocb, inode, iov,
- offset, nr_segs, ext4_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter,
+ offset, ext4_get_block);
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
ext4_truncate_failed_write(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7fcd68ee9155..8a064734e6eb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3093,13 +3093,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
*
*/
static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
int overwrite = 0;
get_block_t *get_block_func = NULL;
int dio_flags = 0;
@@ -3108,7 +3107,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
/* Use the old path for reads and writes beyond i_size. */
if (rw != WRITE || final_size > inode->i_size)
- return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+ return ext4_ind_direct_IO(rw, iocb, iter, offset);
BUG_ON(iocb->private == NULL);
@@ -3175,8 +3174,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
dio_flags = DIO_LOCKING;
}
ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
+ inode->i_sb->s_bdev, iter,
+ offset,
get_block_func,
ext4_end_io_dio,
NULL,
@@ -3230,11 +3229,11 @@ retake_lock:
}
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
/*
@@ -3247,13 +3246,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
if (ext4_has_inline_data(inode))
return 0;
- trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+ trace_ext4_direct_IO_enter(inode, offset, count, rw);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+ ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
else
- ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
- trace_ext4_direct_IO_exit(inode, offset,
- iov_length(iov, nr_segs), rw, ret);
+ ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
+ trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
return ret;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index c1fb6dd10911..0924521306b4 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1017,10 +1017,9 @@ static int f2fs_write_end(struct file *file,
}
static int check_direct_IO(struct inode *inode, int rw,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
- int i;
if (rw == READ)
return 0;
@@ -1028,14 +1027,14 @@ static int check_direct_IO(struct inode *inode, int rw,
if (offset & blocksize_mask)
return -EINVAL;
- for (i = 0; i < nr_segs; i++)
- if (iov[i].iov_len & blocksize_mask)
- return -EINVAL;
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ return -EINVAL;
+
return 0;
}
static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -1044,14 +1043,14 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
if (f2fs_has_inline_data(inode))
return 0;
- if (check_direct_IO(inode, rw, iov, offset, nr_segs))
+ if (check_direct_IO(inode, rw, iter, offset))
return 0;
/* clear fsync mark to recover these blocks */
fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
- return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- get_data_block);
+ return blockdev_direct_IO(rw, iocb, inode, iter, offset,
+ get_data_block);
}
static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 9c49c593d8eb..c58e33075719 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -808,10 +808,10 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
const struct file_operations f2fs_file_operations = {
.llseek = f2fs_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.open = generic_file_open,
.mmap = f2fs_file_mmap,
.fsync = f2fs_sync_file,
@@ -821,5 +821,5 @@ const struct file_operations f2fs_file_operations = {
.compat_ioctl = f2fs_compat_ioctl,
#endif
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 9b104f543056..85f79a89e747 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -170,10 +170,10 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
const struct file_operations fat_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.release = fat_file_release,
.unlocked_ioctl = fat_generic_ioctl,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9c83594d7fb5..756aead10d96 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -247,12 +247,13 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
}
static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
if (rw == WRITE) {
@@ -265,7 +266,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
*
* Return 0, and fallback to normal buffered write.
*/
- loff_t size = offset + iov_length(iov, nr_segs);
+ loff_t size = offset + count;
if (MSDOS_I(inode)->mmu_private < size)
return 0;
}
@@ -274,10 +275,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
* FAT need to use the DIO_LOCKING for avoiding the race
* condition of fat_get_block() and ->truncate().
*/
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- fat_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, fat_get_block);
if (ret < 0 && (rw & WRITE))
- fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
+ fat_write_failed(mapping, offset + count);
return ret;
}
diff --git a/fs/file.c b/fs/file.c
index 8f294cfac697..66923fe3176e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -44,15 +44,10 @@ static void *alloc_fdmem(size_t size)
return vmalloc(size);
}
-static void free_fdmem(void *ptr)
-{
- is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
-}
-
static void __free_fdtable(struct fdtable *fdt)
{
- free_fdmem(fdt->fd);
- free_fdmem(fdt->open_fds);
+ kvfree(fdt->fd);
+ kvfree(fdt->open_fds);
kfree(fdt);
}
@@ -130,7 +125,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
return fdt;
out_arr:
- free_fdmem(fdt->fd);
+ kvfree(fdt->fd);
out_fdt:
kfree(fdt);
out:
diff --git a/fs/file_table.c b/fs/file_table.c
index 40bf4660f0a3..385bfd31512a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -175,6 +175,12 @@ struct file *alloc_file(struct path *path, fmode_t mode,
file->f_path = *path;
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
+ if ((mode & FMODE_READ) &&
+ likely(fop->read || fop->aio_read || fop->read_iter))
+ mode |= FMODE_CAN_READ;
+ if ((mode & FMODE_WRITE) &&
+ likely(fop->write || fop->aio_write || fop->write_iter))
+ mode |= FMODE_CAN_WRITE;
file->f_mode = mode;
file->f_op = fop;
if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 13b691a8a7d2..966ace8b243f 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -94,8 +94,10 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
loff_t pos = 0;
struct iovec iov = { .iov_base = buf, .iov_len = count };
struct fuse_io_priv io = { .async = 0, .file = file };
+ struct iov_iter ii;
+ iov_iter_init(&ii, READ, &iov, 1, count);
- return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE);
+ return fuse_direct_io(&io, &ii, &pos, FUSE_DIO_CUSE);
}
static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -104,12 +106,14 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
loff_t pos = 0;
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
struct fuse_io_priv io = { .async = 0, .file = file };
+ struct iov_iter ii;
+ iov_iter_init(&ii, WRITE, &iov, 1, count);
/*
* No locking or generic_write_checks(), the server is
* responsible for locking and sanity checks.
*/
- return fuse_direct_io(&io, &iov, 1, count, &pos,
+ return fuse_direct_io(&io, &ii, &pos,
FUSE_DIO_WRITE | FUSE_DIO_CUSE);
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 903cbc9cd6bd..6e16dad13e9b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -933,8 +933,7 @@ out:
return err;
}
-static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -945,14 +944,14 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
* i_size is up to date).
*/
if (fc->auto_inval_data ||
- (pos + iov_length(iov, nr_segs) > i_size_read(inode))) {
+ (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
int err;
err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
if (err)
return err;
}
- return generic_file_aio_read(iocb, iov, nr_segs, pos);
+ return generic_file_read_iter(iocb, to);
}
static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
@@ -1181,19 +1180,17 @@ static ssize_t fuse_perform_write(struct file *file,
return res > 0 ? res : err;
}
-static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- size_t count = 0;
- size_t ocount = 0;
+ size_t count = iov_iter_count(from);
ssize_t written = 0;
ssize_t written_buffered = 0;
struct inode *inode = mapping->host;
ssize_t err;
- struct iov_iter i;
loff_t endbyte = 0;
+ loff_t pos = iocb->ki_pos;
if (get_fuse_conn(inode)->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
@@ -1201,17 +1198,9 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err)
return err;
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
+ return generic_file_write_iter(iocb, from);
}
- WARN_ON(iocb->ki_pos != pos);
-
- ocount = 0;
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err)
- return err;
-
- count = ocount;
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
@@ -1224,6 +1213,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
+ iov_iter_truncate(from, count);
err = file_remove_suid(file);
if (err)
goto out;
@@ -1233,16 +1223,13 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
goto out;
if (file->f_flags & O_DIRECT) {
- written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
- count, ocount);
- if (written < 0 || written == count)
+ written = generic_file_direct_write(iocb, from, pos);
+ if (written < 0 || !iov_iter_count(from))
goto out;
pos += written;
- count -= written;
- iov_iter_init(&i, iov, nr_segs, count, written);
- written_buffered = fuse_perform_write(file, mapping, &i, pos);
+ written_buffered = fuse_perform_write(file, mapping, from, pos);
if (written_buffered < 0) {
err = written_buffered;
goto out;
@@ -1261,8 +1248,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
} else {
- iov_iter_init(&i, iov, nr_segs, count, 0);
- written = fuse_perform_write(file, mapping, &i, pos);
+ written = fuse_perform_write(file, mapping, from, pos);
if (written >= 0)
iocb->ki_pos = pos + written;
}
@@ -1300,7 +1286,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
size_t nbytes = 0; /* # bytes already packed in req */
/* Special case for kernel I/O: can copy directly into the buffer */
- if (segment_eq(get_fs(), KERNEL_DS)) {
+ if (ii->type & ITER_KVEC) {
unsigned long user_addr = fuse_get_user_addr(ii);
size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
@@ -1316,35 +1302,26 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
unsigned npages;
- unsigned long user_addr = fuse_get_user_addr(ii);
- unsigned offset = user_addr & ~PAGE_MASK;
- size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes);
- int ret;
-
+ size_t start;
unsigned n = req->max_pages - req->num_pages;
- frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT);
-
- npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
- npages = clamp(npages, 1U, n);
-
- ret = get_user_pages_fast(user_addr, npages, !write,
- &req->pages[req->num_pages]);
+ ssize_t ret = iov_iter_get_pages(ii,
+ &req->pages[req->num_pages],
+ n * PAGE_SIZE, &start);
if (ret < 0)
return ret;
- npages = ret;
- frag_size = min_t(size_t, frag_size,
- (npages << PAGE_SHIFT) - offset);
- iov_iter_advance(ii, frag_size);
+ iov_iter_advance(ii, ret);
+ nbytes += ret;
+
+ ret += start;
+ npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
- req->page_descs[req->num_pages].offset = offset;
+ req->page_descs[req->num_pages].offset = start;
fuse_page_descs_length_init(req, req->num_pages, npages);
req->num_pages += npages;
req->page_descs[req->num_pages - 1].length -=
- (npages << PAGE_SHIFT) - offset - frag_size;
-
- nbytes += frag_size;
+ (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
if (write)
@@ -1359,24 +1336,11 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
static inline int fuse_iter_npages(const struct iov_iter *ii_p)
{
- struct iov_iter ii = *ii_p;
- int npages = 0;
-
- while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) {
- unsigned long user_addr = fuse_get_user_addr(&ii);
- unsigned offset = user_addr & ~PAGE_MASK;
- size_t frag_size = iov_iter_single_seg_count(&ii);
-
- npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
- iov_iter_advance(&ii, frag_size);
- }
-
- return min(npages, FUSE_MAX_PAGES_PER_REQ);
+ return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
}
-ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
- unsigned long nr_segs, size_t count, loff_t *ppos,
- int flags)
+ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+ loff_t *ppos, int flags)
{
int write = flags & FUSE_DIO_WRITE;
int cuse = flags & FUSE_DIO_CUSE;
@@ -1386,18 +1350,16 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
struct fuse_conn *fc = ff->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
+ size_t count = iov_iter_count(iter);
pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
ssize_t res = 0;
struct fuse_req *req;
- struct iov_iter ii;
-
- iov_iter_init(&ii, iov, nr_segs, count, 0);
if (io->async)
- req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii));
+ req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
else
- req = fuse_get_req(fc, fuse_iter_npages(&ii));
+ req = fuse_get_req(fc, fuse_iter_npages(iter));
if (IS_ERR(req))
return PTR_ERR(req);
@@ -1413,7 +1375,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
size_t nres;
fl_owner_t owner = current->files;
size_t nbytes = min(count, nmax);
- int err = fuse_get_user_pages(req, &ii, &nbytes, write);
+ int err = fuse_get_user_pages(req, iter, &nbytes, write);
if (err) {
res = err;
break;
@@ -1443,9 +1405,9 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
fuse_put_request(fc, req);
if (io->async)
req = fuse_get_req_for_background(fc,
- fuse_iter_npages(&ii));
+ fuse_iter_npages(iter));
else
- req = fuse_get_req(fc, fuse_iter_npages(&ii));
+ req = fuse_get_req(fc, fuse_iter_npages(iter));
if (IS_ERR(req))
break;
}
@@ -1460,9 +1422,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
EXPORT_SYMBOL_GPL(fuse_direct_io);
static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos,
- size_t count)
+ struct iov_iter *iter,
+ loff_t *ppos)
{
ssize_t res;
struct file *file = io->file;
@@ -1471,7 +1432,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
if (is_bad_inode(inode))
return -EIO;
- res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0);
+ res = fuse_direct_io(io, iter, ppos, 0);
fuse_invalidate_attr(inode);
@@ -1483,22 +1444,26 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf,
{
struct fuse_io_priv io = { .async = 0, .file = file };
struct iovec iov = { .iov_base = buf, .iov_len = count };
- return __fuse_direct_read(&io, &iov, 1, ppos, count);
+ struct iov_iter ii;
+ iov_iter_init(&ii, READ, &iov, 1, count);
+ return __fuse_direct_read(&io, &ii, ppos);
}
static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+ struct iov_iter *iter,
+ loff_t *ppos)
{
struct file *file = io->file;
struct inode *inode = file_inode(file);
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
ssize_t res;
+
res = generic_write_checks(file, ppos, &count, 0);
- if (!res)
- res = fuse_direct_io(io, iov, nr_segs, count, ppos,
- FUSE_DIO_WRITE);
+ if (!res) {
+ iov_iter_truncate(iter, count);
+ res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE);
+ }
fuse_invalidate_attr(inode);
@@ -1512,13 +1477,15 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
struct inode *inode = file_inode(file);
ssize_t res;
struct fuse_io_priv io = { .async = 0, .file = file };
+ struct iov_iter ii;
+ iov_iter_init(&ii, WRITE, &iov, 1, count);
if (is_bad_inode(inode))
return -EIO;
/* Don't allow parallel writes to the same file */
mutex_lock(&inode->i_mutex);
- res = __fuse_direct_write(&io, &iov, 1, ppos);
+ res = __fuse_direct_write(&io, &ii, ppos);
if (res > 0)
fuse_write_update_size(inode, *ppos);
mutex_unlock(&inode->i_mutex);
@@ -2372,7 +2339,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
if (!bytes)
return 0;
- iov_iter_init(&ii, iov, nr_segs, bytes, 0);
+ iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);
while (iov_iter_count(&ii)) {
struct page *page = pages[page_idx++];
@@ -2894,8 +2861,8 @@ static inline loff_t fuse_round_up(loff_t off)
}
static ssize_t
-fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
ssize_t ret = 0;
struct file *file = iocb->ki_filp;
@@ -2904,7 +2871,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t pos = 0;
struct inode *inode;
loff_t i_size;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
struct fuse_io_priv *io;
pos = offset;
@@ -2919,6 +2886,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
if (offset >= i_size)
return 0;
count = min_t(loff_t, count, fuse_round_up(i_size - offset));
+ iov_iter_truncate(iter, count);
}
io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
@@ -2948,9 +2916,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
io->async = false;
if (rw == WRITE)
- ret = __fuse_direct_write(io, iov, nr_segs, &pos);
+ ret = __fuse_direct_write(io, iter, &pos);
else
- ret = __fuse_direct_read(io, iov, nr_segs, &pos, count);
+ ret = __fuse_direct_read(io, iter, &pos);
if (io->async) {
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
@@ -3061,10 +3029,10 @@ out:
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
- .read = do_sync_read,
- .aio_read = fuse_file_aio_read,
- .write = do_sync_write,
- .aio_write = fuse_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = fuse_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = fuse_file_write_iter,
.mmap = fuse_file_mmap,
.open = fuse_open,
.flush = fuse_flush,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7aa5c75e0de1..e8e47a6ab518 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -880,9 +880,8 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */
#define FUSE_DIO_CUSE (1 << 1)
-ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
- unsigned long nr_segs, size_t count, loff_t *ppos,
- int flags);
+ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+ loff_t *ppos, int flags);
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags);
long fuse_ioctl_common(struct file *file, unsigned int cmd,
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 492123cda64a..805b37fed638 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1040,8 +1040,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -1081,7 +1080,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
*/
if (mapping->nrpages) {
loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
- loff_t len = iov_length(iov, nr_segs);
+ loff_t len = iov_iter_count(iter);
loff_t end = PAGE_ALIGN(offset + len) - 1;
rv = 0;
@@ -1096,9 +1095,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
truncate_inode_pages_range(mapping, lstart, end);
}
- rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, gfs2_get_block_direct,
- NULL, NULL, 0);
+ rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+ iter, offset,
+ gfs2_get_block_direct, NULL, NULL, 0);
out:
gfs2_glock_dq(&gh);
gfs2_holder_uninit(&gh);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6ab0cfb2e891..4fc3a3046174 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -684,7 +684,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
}
/**
- * gfs2_file_aio_write - Perform a write to a file
+ * gfs2_file_write_iter - Perform a write to a file
* @iocb: The io context
* @iov: The data to write
* @nr_segs: Number of @iov segments
@@ -697,11 +697,9 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
*
*/
-static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- size_t writesize = iov_length(iov, nr_segs);
struct gfs2_inode *ip = GFS2_I(file_inode(file));
int ret;
@@ -709,7 +707,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (ret)
return ret;
- gfs2_size_hint(file, pos, writesize);
+ gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
if (file->f_flags & O_APPEND) {
struct gfs2_holder gh;
@@ -720,7 +718,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
gfs2_glock_dq_uninit(&gh);
}
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
+ return generic_file_write_iter(iocb, from);
}
static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
@@ -1058,10 +1056,10 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = gfs2_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = gfs2_file_write_iter,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
@@ -1070,7 +1068,7 @@ const struct file_operations gfs2_file_fops = {
.lock = gfs2_lock,
.flock = gfs2_flock,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.setlease = gfs2_setlease,
.fallocate = gfs2_fallocate,
};
@@ -1090,17 +1088,17 @@ const struct file_operations gfs2_dir_fops = {
const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = gfs2_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = gfs2_file_write_iter,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.setlease = generic_setlease,
.fallocate = gfs2_fallocate,
};
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9e2fecd62f62..d0929bc81782 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -125,15 +125,15 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
}
static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file)->i_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- hfs_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfs_get_block);
/*
* In case of error extending write may have instantiated a few
@@ -141,7 +141,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
hfs_write_failed(mapping, end);
@@ -674,10 +674,10 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
static const struct file_operations hfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
.fsync = hfs_file_fsync,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index a4f45bd88a63..0cf786f2d046 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -123,14 +123,15 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
}
static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file)->i_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
hfsplus_get_block);
/*
@@ -139,7 +140,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
hfsplus_write_failed(mapping, end);
@@ -340,10 +341,10 @@ static const struct inode_operations hfsplus_file_inode_operations = {
static const struct file_operations hfsplus_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
.fsync = hfsplus_file_fsync,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 9c470fde9878..bb529f3b7f2b 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -378,11 +378,11 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
static const struct file_operations hostfs_file_fops = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
+ .read = new_sync_read,
.splice_read = generic_file_splice_read,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
- .write = do_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
+ .write = new_sync_write,
.mmap = generic_file_mmap,
.open = hostfs_file_open,
.release = hostfs_file_release,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 67c1a61e0955..7f54e5f76cec 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -197,10 +197,10 @@ const struct address_space_operations hpfs_aops = {
const struct file_operations hpfs_file_ops =
{
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.release = hpfs_file_release,
.fsync = hpfs_file_fsync,
diff --git a/fs/inode.c b/fs/inode.c
index 2feb9b69f1be..6eecb7ff0b9a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1839,14 +1839,18 @@ EXPORT_SYMBOL(inode_init_owner);
* inode_owner_or_capable - check current task permissions to inode
* @inode: inode being checked
*
- * Return true if current either has CAP_FOWNER to the inode, or
- * owns the file.
+ * Return true if current either has CAP_FOWNER in a namespace with the
+ * inode owner uid mapped, or owns the file.
*/
bool inode_owner_or_capable(const struct inode *inode)
{
+ struct user_namespace *ns;
+
if (uid_eq(current_fsuid(), inode->i_uid))
return true;
- if (inode_capable(inode, CAP_FOWNER))
+
+ ns = current_user_ns();
+ if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid))
return true;
return false;
}
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 25c713e7071c..8898bbd2b61e 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -231,19 +231,15 @@ record_cache_failure:
static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
{
- int shift = 0;
- int tmp = hash_size;
+ int i;
struct jbd_revoke_table_s *table;
table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
if (!table)
goto out;
- while((tmp >>= 1UL) != 0UL)
- shift++;
-
table->hash_size = hash_size;
- table->hash_shift = shift;
+ table->hash_shift = ilog2(hash_size);
table->hash_table =
kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
if (!table->hash_table) {
@@ -252,8 +248,8 @@ static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
goto out;
}
- for (tmp = 0; tmp < hash_size; tmp++)
- INIT_LIST_HEAD(&table->hash_table[tmp]);
+ for (i = 0; i < hash_size; i++)
+ INIT_LIST_HEAD(&table->hash_table[i]);
out:
return table;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 256cd19a3b78..64989ca9ba90 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -51,10 +51,10 @@ const struct file_operations jffs2_file_operations =
{
.llseek = generic_file_llseek,
.open = generic_file_open,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl=jffs2_ioctl,
.mmap = generic_file_readonly_mmap,
.fsync = jffs2_fsync,
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 794da944d5cd..33aa0cc1f8b8 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -151,13 +151,13 @@ const struct inode_operations jfs_file_inode_operations = {
const struct file_operations jfs_file_operations = {
.open = jfs_open,
.llseek = generic_file_llseek,
- .write = do_sync_write,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .write = new_sync_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fsync = jfs_fsync,
.release = jfs_release,
.unlocked_ioctl = jfs_ioctl,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 6f8fe72c2a7a..bd3df1ca3c9b 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -331,15 +331,15 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
}
static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- jfs_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, jfs_get_block);
/*
* In case of error extending write may have instantiated a few
@@ -347,7 +347,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
jfs_write_failed(mapping, end);
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 00ec0b9c94d1..d3e40db28930 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -14,6 +14,8 @@
#include <linux/sunrpc/stats.h>
#include <linux/lockd/lockd.h>
+#include <uapi/linux/nfs3.h>
+
#define NLMDBG_FACILITY NLMDBG_XDR
#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 9a55797a1cd4..3e9f7874b975 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -15,6 +15,8 @@
#include <linux/sunrpc/stats.h>
#include <linux/lockd/lockd.h>
+#include <uapi/linux/nfs2.h>
+
#define NLMDBG_FACILITY NLMDBG_XDR
#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index de051cb1f553..8f27c93f8d2e 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -622,8 +622,8 @@ static int __init init_nlm(void)
err_pernet:
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nlm_sysctl_table);
-#endif
err_sysctl:
+#endif
return err;
}
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index dc5c75930f0f..b6f3b84b6e99 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -14,12 +14,11 @@
#include <linux/mutex.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/addr.h>
-#include <linux/nfsd/nfsfh.h>
-#include <linux/nfsd/export.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/share.h>
#include <linux/module.h>
#include <linux/mount.h>
+#include <uapi/linux/nfs2.h>
#define NLMDBG_FACILITY NLMDBG_SVCSUBS
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 964666c68a86..9340e7e10ef6 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -16,6 +16,8 @@
#include <linux/sunrpc/stats.h>
#include <linux/lockd/lockd.h>
+#include <uapi/linux/nfs2.h>
+
#define NLMDBG_FACILITY NLMDBG_XDR
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 57914fc32b62..8538752df2f6 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -264,15 +264,15 @@ const struct inode_operations logfs_reg_iops = {
};
const struct file_operations logfs_reg_fops = {
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.fsync = logfs_fsync,
.unlocked_ioctl = logfs_ioctl,
.llseek = generic_file_llseek,
.mmap = generic_file_readonly_mmap,
.open = generic_file_open,
- .read = do_sync_read,
- .write = do_sync_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
};
const struct address_space_operations logfs_reg_aops = {
diff --git a/fs/minix/file.c b/fs/minix/file.c
index adc6f5494231..a967de085ac0 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -14,10 +14,10 @@
*/
const struct file_operations minix_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
diff --git a/fs/namei.c b/fs/namei.c
index 80168273396b..985c6f368485 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -332,10 +332,11 @@ int generic_permission(struct inode *inode, int mask)
if (S_ISDIR(inode->i_mode)) {
/* DACs are overridable for directories */
- if (inode_capable(inode, CAP_DAC_OVERRIDE))
+ if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
return 0;
if (!(mask & MAY_WRITE))
- if (inode_capable(inode, CAP_DAC_READ_SEARCH))
+ if (capable_wrt_inode_uidgid(inode,
+ CAP_DAC_READ_SEARCH))
return 0;
return -EACCES;
}
@@ -345,7 +346,7 @@ int generic_permission(struct inode *inode, int mask)
* at least one exec bit set.
*/
if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
- if (inode_capable(inode, CAP_DAC_OVERRIDE))
+ if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
return 0;
/*
@@ -353,7 +354,7 @@ int generic_permission(struct inode *inode, int mask)
*/
mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
if (mask == MAY_READ)
- if (inode_capable(inode, CAP_DAC_READ_SEARCH))
+ if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
return 0;
return -EACCES;
@@ -2379,7 +2380,7 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
return 0;
if (uid_eq(dir->i_uid, fsuid))
return 0;
- return !inode_capable(inode, CAP_FOWNER);
+ return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
}
/*
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 03192a66c143..4782e0840dcc 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -29,8 +29,6 @@ nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
-obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
-nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
-
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 65d849bdf77a..9b431f44fad9 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -210,7 +210,7 @@ static void bl_end_io_read(struct bio *bio, int err)
SetPageUptodate(bvec->bv_page);
if (err) {
- struct nfs_read_data *rdata = par->data;
+ struct nfs_pgio_data *rdata = par->data;
struct nfs_pgio_header *header = rdata->header;
if (!header->pnfs_error)
@@ -224,17 +224,17 @@ static void bl_end_io_read(struct bio *bio, int err)
static void bl_read_cleanup(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_read_data *rdata;
+ struct nfs_pgio_data *rdata;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- rdata = container_of(task, struct nfs_read_data, task);
+ rdata = container_of(task, struct nfs_pgio_data, task);
pnfs_ld_read_done(rdata);
}
static void
bl_end_par_io_read(void *data, int unused)
{
- struct nfs_read_data *rdata = data;
+ struct nfs_pgio_data *rdata = data;
rdata->task.tk_status = rdata->header->pnfs_error;
INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
@@ -242,7 +242,7 @@ bl_end_par_io_read(void *data, int unused)
}
static enum pnfs_try_status
-bl_read_pagelist(struct nfs_read_data *rdata)
+bl_read_pagelist(struct nfs_pgio_data *rdata)
{
struct nfs_pgio_header *header = rdata->header;
int i, hole;
@@ -390,7 +390,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
}
if (unlikely(err)) {
- struct nfs_write_data *data = par->data;
+ struct nfs_pgio_data *data = par->data;
struct nfs_pgio_header *header = data->header;
if (!header->pnfs_error)
@@ -405,7 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct nfs_write_data *data = par->data;
+ struct nfs_pgio_data *data = par->data;
struct nfs_pgio_header *header = data->header;
if (!uptodate) {
@@ -423,10 +423,10 @@ static void bl_end_io_write(struct bio *bio, int err)
static void bl_write_cleanup(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_write_data *wdata;
+ struct nfs_pgio_data *wdata;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- wdata = container_of(task, struct nfs_write_data, task);
+ wdata = container_of(task, struct nfs_pgio_data, task);
if (likely(!wdata->header->pnfs_error)) {
/* Marks for LAYOUTCOMMIT */
mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
@@ -438,7 +438,7 @@ static void bl_write_cleanup(struct work_struct *work)
/* Called when last of bios associated with a bl_write_pagelist call finishes */
static void bl_end_par_io_write(void *data, int num_se)
{
- struct nfs_write_data *wdata = data;
+ struct nfs_pgio_data *wdata = data;
if (unlikely(wdata->header->pnfs_error)) {
bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
@@ -673,7 +673,7 @@ check_page:
}
static enum pnfs_try_status
-bl_write_pagelist(struct nfs_write_data *wdata, int sync)
+bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
{
struct nfs_pgio_header *header = wdata->header;
int i, ret, npg_zero, pg_index, last = 0;
@@ -1189,13 +1189,17 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
pnfs_generic_pg_init_read(pgio, req);
}
-static bool
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
if (pgio->pg_dreq != NULL &&
!is_aligned_req(req, SECTOR_SIZE))
- return false;
+ return 0;
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -1241,13 +1245,17 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
}
}
-static bool
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
if (pgio->pg_dreq != NULL &&
!is_aligned_req(req, PAGE_CACHE_SIZE))
- return false;
+ return 0;
return pnfs_generic_pg_test(pgio, prev, req);
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b8797ae6831f..8f98138cbc43 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -108,6 +108,97 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
return atomic_dec_and_test(&dreq->io_count);
}
+/*
+ * nfs_direct_select_verf - select the right verifier
+ * @dreq - direct request possibly spanning multiple servers
+ * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
+ * @ds_idx - index of data server in data server list, only valid if ds_clp set
+ *
+ * returns the correct verifier to use given the role of the server
+ */
+static struct nfs_writeverf *
+nfs_direct_select_verf(struct nfs_direct_req *dreq,
+ struct nfs_client *ds_clp,
+ int ds_idx)
+{
+ struct nfs_writeverf *verfp = &dreq->verf;
+
+#ifdef CONFIG_NFS_V4_1
+ if (ds_clp) {
+ /* pNFS is in use, use the DS verf */
+ if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets)
+ verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf;
+ else
+ WARN_ON_ONCE(1);
+ }
+#endif
+ return verfp;
+}
+
+
+/*
+ * nfs_direct_set_hdr_verf - set the write/commit verifier
+ * @dreq - direct request possibly spanning multiple servers
+ * @hdr - pageio header to validate against previously seen verfs
+ *
+ * Set the server's (MDS or DS) "seen" verifier
+ */
+static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_writeverf *verfp;
+
+ verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
+ hdr->data->ds_idx);
+ WARN_ON_ONCE(verfp->committed >= 0);
+ memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+ WARN_ON_ONCE(verfp->committed < 0);
+}
+
+/*
+ * nfs_direct_set_or_cmp_hdr_verf - set or compare verifier for pgio header
+ * @dreq - direct request possibly spanning multiple servers
+ * @hdr - pageio header to validate against previously seen verf
+ *
+ * set the server's "seen" verf if not initialized.
+ * returns result of comparison between @hdr->verf and the "seen"
+ * verf of the server used by @hdr (DS or MDS)
+ */
+static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_writeverf *verfp;
+
+ verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
+ hdr->data->ds_idx);
+ if (verfp->committed < 0) {
+ nfs_direct_set_hdr_verf(dreq, hdr);
+ return 0;
+ }
+ return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+}
+
+#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
+ * @dreq - direct request possibly spanning multiple servers
+ * @data - commit data to validate against previously seen verf
+ *
+ * returns result of comparison between @data->verf and the verf of
+ * the server used by @data (DS or MDS)
+ */
+static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
+ struct nfs_commit_data *data)
+{
+ struct nfs_writeverf *verfp;
+
+ verfp = nfs_direct_select_verf(dreq, data->ds_clp,
+ data->ds_commit_index);
+ WARN_ON_ONCE(verfp->committed < 0);
+ return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
+}
+#endif
+
/**
* nfs_direct_IO - NFS address space operation for direct I/O
* @rw: direction (read or write)
@@ -121,20 +212,20 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
* shunt off direct read and write requests before the VFS gets them,
* so this method is only ever called for swap.
*/
-ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
+ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
#ifndef CONFIG_NFS_SWAP
dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
- iocb->ki_filp, (long long) pos, nr_segs);
+ iocb->ki_filp, (long long) pos, iter->nr_segs);
return -EINVAL;
#else
VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
if (rw == READ || rw == KERNEL_READ)
- return nfs_file_direct_read(iocb, iov, nr_segs, pos,
+ return nfs_file_direct_read(iocb, iter, pos,
rw == READ ? true : false);
- return nfs_file_direct_write(iocb, iov, nr_segs, pos,
+ return nfs_file_direct_write(iocb, iter, pos,
rw == WRITE ? true : false);
#endif /* CONFIG_NFS_SWAP */
}
@@ -168,6 +259,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
kref_get(&dreq->kref);
init_completion(&dreq->completion);
INIT_LIST_HEAD(&dreq->mds_cinfo.list);
+ dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
spin_lock_init(&dreq->lock);
@@ -322,66 +414,42 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
* handled automatically by nfs_direct_read_result(). Otherwise, if
* no requests have been sent, just return an error.
*/
-static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
- const struct iovec *iov,
- loff_t pos, bool uio)
-{
- struct nfs_direct_req *dreq = desc->pg_dreq;
- struct nfs_open_context *ctx = dreq->ctx;
- struct inode *inode = ctx->dentry->d_inode;
- unsigned long user_addr = (unsigned long)iov->iov_base;
- size_t count = iov->iov_len;
- size_t rsize = NFS_SERVER(inode)->rsize;
- unsigned int pgbase;
- int result;
- ssize_t started = 0;
- struct page **pagevec = NULL;
- unsigned int npages;
-
- do {
- size_t bytes;
- int i;
- pgbase = user_addr & ~PAGE_MASK;
- bytes = min(max_t(size_t, rsize, PAGE_SIZE), count);
+static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
+ struct iov_iter *iter,
+ loff_t pos)
+{
+ struct nfs_pageio_descriptor desc;
+ struct inode *inode = dreq->inode;
+ ssize_t result = -EINVAL;
+ size_t requested_bytes = 0;
+ size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
- result = -ENOMEM;
- npages = nfs_page_array_len(pgbase, bytes);
- if (!pagevec)
- pagevec = kmalloc(npages * sizeof(struct page *),
- GFP_KERNEL);
- if (!pagevec)
- break;
- if (uio) {
- down_read(&current->mm->mmap_sem);
- result = get_user_pages(current, current->mm, user_addr,
- npages, 1, 0, pagevec, NULL);
- up_read(&current->mm->mmap_sem);
- if (result < 0)
- break;
- } else {
- WARN_ON(npages != 1);
- result = get_kernel_page(user_addr, 1, pagevec);
- if (WARN_ON(result != 1))
- break;
- }
+ nfs_pageio_init_read(&desc, dreq->inode, false,
+ &nfs_direct_read_completion_ops);
+ get_dreq(dreq);
+ desc.pg_dreq = dreq;
+ atomic_inc(&inode->i_dio_count);
- if ((unsigned)result < npages) {
- bytes = result * PAGE_SIZE;
- if (bytes <= pgbase) {
- nfs_direct_release_pages(pagevec, result);
- break;
- }
- bytes -= pgbase;
- npages = result;
- }
+ while (iov_iter_count(iter)) {
+ struct page **pagevec;
+ size_t bytes;
+ size_t pgbase;
+ unsigned npages, i;
+ result = iov_iter_get_pages_alloc(iter, &pagevec,
+ rsize, &pgbase);
+ if (result < 0)
+ break;
+
+ bytes = result;
+ iov_iter_advance(iter, bytes);
+ npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
for (i = 0; i < npages; i++) {
struct nfs_page *req;
unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
/* XXX do we need to do the eof zeroing found in async_filler? */
- req = nfs_create_request(dreq->ctx, dreq->inode,
- pagevec[i],
+ req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
pgbase, req_len);
if (IS_ERR(req)) {
result = PTR_ERR(req);
@@ -389,56 +457,21 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
}
req->wb_index = pos >> PAGE_SHIFT;
req->wb_offset = pos & ~PAGE_MASK;
- if (!nfs_pageio_add_request(desc, req)) {
- result = desc->pg_error;
+ if (!nfs_pageio_add_request(&desc, req)) {
+ result = desc.pg_error;
nfs_release_request(req);
break;
}
pgbase = 0;
bytes -= req_len;
- started += req_len;
- user_addr += req_len;
+ requested_bytes += req_len;
pos += req_len;
- count -= req_len;
dreq->bytes_left -= req_len;
}
- /* The nfs_page now hold references to these pages */
nfs_direct_release_pages(pagevec, npages);
- } while (count != 0 && result >= 0);
-
- kfree(pagevec);
-
- if (started)
- return started;
- return result < 0 ? (ssize_t) result : -EFAULT;
-}
-
-static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos, bool uio)
-{
- struct nfs_pageio_descriptor desc;
- struct inode *inode = dreq->inode;
- ssize_t result = -EINVAL;
- size_t requested_bytes = 0;
- unsigned long seg;
-
- NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode,
- &nfs_direct_read_completion_ops);
- get_dreq(dreq);
- desc.pg_dreq = dreq;
- atomic_inc(&inode->i_dio_count);
-
- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *vec = &iov[seg];
- result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
+ kvfree(pagevec);
if (result < 0)
break;
- requested_bytes += result;
- if ((size_t)result < vec->iov_len)
- break;
- pos += vec->iov_len;
}
nfs_pageio_complete(&desc);
@@ -461,8 +494,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
/**
* nfs_file_direct_read - file direct read operation for NFS files
* @iocb: target I/O control block
- * @iov: vector of user buffers into which to read data
- * @nr_segs: size of iov vector
+ * @iter: vector of user buffers into which to read data
* @pos: byte offset in file where reading starts
*
* We use this function for direct reads instead of calling
@@ -479,8 +511,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* client must read the updated atime from the server back into its
* cache.
*/
-ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, bool uio)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos, bool uio)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -488,9 +520,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
struct nfs_direct_req *dreq;
struct nfs_lock_context *l_ctx;
ssize_t result = -EINVAL;
- size_t count;
-
- count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
@@ -513,7 +543,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
goto out_unlock;
dreq->inode = inode;
- dreq->bytes_left = iov_length(iov, nr_segs);
+ dreq->bytes_left = count;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
@@ -524,8 +554,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
- NFS_I(inode)->read_io += iov_length(iov, nr_segs);
- result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+ NFS_I(inode)->read_io += count;
+ result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
mutex_unlock(&inode->i_mutex);
@@ -564,7 +594,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
dreq->count = 0;
get_dreq(dreq);
- NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE,
+ nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
@@ -603,7 +633,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
dprintk("NFS: %5u commit failed with error %d.\n",
data->task.tk_pid, status);
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
+ } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {
dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
}
@@ -681,109 +711,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
}
#endif
-/*
- * NB: Return the value of the first error return code. Subsequent
- * errors after the first one are ignored.
- */
-/*
- * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
- * operation. If nfs_writedata_alloc() or get_user_pages() fails,
- * bail and stop sending more writes. Write length accounting is
- * handled automatically by nfs_direct_write_result(). Otherwise, if
- * no requests have been sent, just return an error.
- */
-static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
- const struct iovec *iov,
- loff_t pos, bool uio)
-{
- struct nfs_direct_req *dreq = desc->pg_dreq;
- struct nfs_open_context *ctx = dreq->ctx;
- struct inode *inode = ctx->dentry->d_inode;
- unsigned long user_addr = (unsigned long)iov->iov_base;
- size_t count = iov->iov_len;
- size_t wsize = NFS_SERVER(inode)->wsize;
- unsigned int pgbase;
- int result;
- ssize_t started = 0;
- struct page **pagevec = NULL;
- unsigned int npages;
-
- do {
- size_t bytes;
- int i;
-
- pgbase = user_addr & ~PAGE_MASK;
- bytes = min(max_t(size_t, wsize, PAGE_SIZE), count);
-
- result = -ENOMEM;
- npages = nfs_page_array_len(pgbase, bytes);
- if (!pagevec)
- pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
- if (!pagevec)
- break;
-
- if (uio) {
- down_read(&current->mm->mmap_sem);
- result = get_user_pages(current, current->mm, user_addr,
- npages, 0, 0, pagevec, NULL);
- up_read(&current->mm->mmap_sem);
- if (result < 0)
- break;
- } else {
- WARN_ON(npages != 1);
- result = get_kernel_page(user_addr, 0, pagevec);
- if (WARN_ON(result != 1))
- break;
- }
-
- if ((unsigned)result < npages) {
- bytes = result * PAGE_SIZE;
- if (bytes <= pgbase) {
- nfs_direct_release_pages(pagevec, result);
- break;
- }
- bytes -= pgbase;
- npages = result;
- }
-
- for (i = 0; i < npages; i++) {
- struct nfs_page *req;
- unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
-
- req = nfs_create_request(dreq->ctx, dreq->inode,
- pagevec[i],
- pgbase, req_len);
- if (IS_ERR(req)) {
- result = PTR_ERR(req);
- break;
- }
- nfs_lock_request(req);
- req->wb_index = pos >> PAGE_SHIFT;
- req->wb_offset = pos & ~PAGE_MASK;
- if (!nfs_pageio_add_request(desc, req)) {
- result = desc->pg_error;
- nfs_unlock_and_release_request(req);
- break;
- }
- pgbase = 0;
- bytes -= req_len;
- started += req_len;
- user_addr += req_len;
- pos += req_len;
- count -= req_len;
- dreq->bytes_left -= req_len;
- }
- /* The nfs_page now hold references to these pages */
- nfs_direct_release_pages(pagevec, npages);
- } while (count != 0 && result >= 0);
-
- kfree(pagevec);
-
- if (started)
- return started;
- return result < 0 ? (ssize_t) result : -EFAULT;
-}
-
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
@@ -813,13 +740,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
bit = NFS_IOHDR_NEED_RESCHED;
else if (dreq->flags == 0) {
- memcpy(&dreq->verf, hdr->verf,
- sizeof(dreq->verf));
+ nfs_direct_set_hdr_verf(dreq, hdr);
bit = NFS_IOHDR_NEED_COMMIT;
dreq->flags = NFS_ODIRECT_DO_COMMIT;
} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
- if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) {
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) {
+ dreq->flags =
+ NFS_ODIRECT_RESCHED_WRITES;
bit = NFS_IOHDR_NEED_RESCHED;
} else
bit = NFS_IOHDR_NEED_COMMIT;
@@ -829,6 +756,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
spin_unlock(&dreq->lock);
while (!list_empty(&hdr->pages)) {
+ bool do_destroy = true;
+
req = nfs_list_entry(hdr->pages.next);
nfs_list_remove_request(req);
switch (bit) {
@@ -836,6 +765,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
case NFS_IOHDR_NEED_COMMIT:
kref_get(&req->wb_kref);
nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+ do_destroy = false;
}
nfs_unlock_and_release_request(req);
}
@@ -863,33 +793,77 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
.completion = nfs_direct_write_completion,
};
+
+/*
+ * NB: Return the value of the first error return code. Subsequent
+ * errors after the first one are ignored.
+ */
+/*
+ * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+ * operation. If iov_iter_get_pages_alloc() or request setup fails,
+ * bail and stop sending more writes. Write length accounting is
+ * handled automatically by nfs_direct_write_result(). Otherwise, if
+ * no requests have been sent, just return an error.
+ */
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos, bool uio)
+ struct iov_iter *iter,
+ loff_t pos)
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
ssize_t result = 0;
size_t requested_bytes = 0;
- unsigned long seg;
+ size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
- NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE,
+ nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
get_dreq(dreq);
atomic_inc(&inode->i_dio_count);
- NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *vec = &iov[seg];
- result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
+ NFS_I(inode)->write_io += iov_iter_count(iter);
+ while (iov_iter_count(iter)) {
+ struct page **pagevec;
+ size_t bytes;
+ size_t pgbase;
+ unsigned npages, i;
+
+ result = iov_iter_get_pages_alloc(iter, &pagevec,
+ wsize, &pgbase);
if (result < 0)
break;
- requested_bytes += result;
- if ((size_t)result < vec->iov_len)
+
+ bytes = result;
+ iov_iter_advance(iter, bytes);
+ npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+ for (i = 0; i < npages; i++) {
+ struct nfs_page *req;
+ unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
+
+ req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
+ pgbase, req_len);
+ if (IS_ERR(req)) {
+ result = PTR_ERR(req);
+ break;
+ }
+ nfs_lock_request(req);
+ req->wb_index = pos >> PAGE_SHIFT;
+ req->wb_offset = pos & ~PAGE_MASK;
+ if (!nfs_pageio_add_request(&desc, req)) {
+ result = desc.pg_error;
+ nfs_unlock_and_release_request(req);
+ break;
+ }
+ pgbase = 0;
+ bytes -= req_len;
+ requested_bytes += req_len;
+ pos += req_len;
+ dreq->bytes_left -= req_len;
+ }
+ nfs_direct_release_pages(pagevec, npages);
+ kvfree(pagevec);
+ if (result < 0)
break;
- pos += vec->iov_len;
}
nfs_pageio_complete(&desc);
@@ -911,8 +885,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
/**
* nfs_file_direct_write - file direct write operation for NFS files
* @iocb: target I/O control block
- * @iov: vector of user buffers from which to write data
- * @nr_segs: size of iov vector
+ * @iter: vector of user buffers from which to write data
* @pos: byte offset in file where writing starts
*
* We use this function for direct writes instead of calling
@@ -930,8 +903,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* Note that O_APPEND is not supported for NFS direct writes, as there
* is no atomic O_APPEND write facility in the NFS protocol.
*/
-ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, bool uio)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos, bool uio)
{
ssize_t result = -EINVAL;
struct file *file = iocb->ki_filp;
@@ -940,9 +913,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
struct nfs_direct_req *dreq;
struct nfs_lock_context *l_ctx;
loff_t end;
- size_t count;
-
- count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
@@ -993,7 +964,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
- result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+ result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index c1edf7336315..4042ff58fe3f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -165,22 +165,21 @@ nfs_file_flush(struct file *file, fl_owner_t id)
EXPORT_SYMBOL_GPL(nfs_file_flush);
ssize_t
-nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t result;
if (iocb->ki_filp->f_flags & O_DIRECT)
- return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
+ return nfs_file_direct_read(iocb, to, iocb->ki_pos, true);
- dprintk("NFS: read(%pD2, %lu@%lu)\n",
+ dprintk("NFS: read(%pD2, %zu@%lu)\n",
iocb->ki_filp,
- (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
+ iov_iter_count(to), (unsigned long) iocb->ki_pos);
result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
if (!result) {
- result = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ result = generic_file_read_iter(iocb, to);
if (result > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
}
@@ -635,24 +634,24 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
return 0;
}
-ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
unsigned long written = 0;
ssize_t result;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
result = nfs_key_timeout_notify(file, inode);
if (result)
return result;
if (file->f_flags & O_DIRECT)
- return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
+ return nfs_file_direct_write(iocb, from, pos, true);
- dprintk("NFS: write(%pD2, %lu@%Ld)\n",
- file, (unsigned long) count, (long long) pos);
+ dprintk("NFS: write(%pD2, %zu@%Ld)\n",
+ file, count, (long long) pos);
result = -EBUSY;
if (IS_SWAPFILE(inode))
@@ -670,7 +669,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (!count)
goto out;
- result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ result = generic_file_write_iter(iocb, from);
if (result > 0)
written = result;
@@ -691,36 +690,6 @@ out_swapfile:
}
EXPORT_SYMBOL_GPL(nfs_file_write);
-ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
- struct file *filp, loff_t *ppos,
- size_t count, unsigned int flags)
-{
- struct inode *inode = file_inode(filp);
- unsigned long written = 0;
- ssize_t ret;
-
- dprintk("NFS splice_write(%pD2, %lu@%llu)\n",
- filp, (unsigned long) count, (unsigned long long) *ppos);
-
- /*
- * The combination of splice and an O_APPEND destination is disallowed.
- */
-
- ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
- if (ret > 0)
- written = ret;
-
- if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
- int err = vfs_fsync(filp, 0);
- if (err < 0)
- ret = err;
- }
- if (ret > 0)
- nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
- return ret;
-}
-EXPORT_SYMBOL_GPL(nfs_file_splice_write);
-
static int
do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{
@@ -935,10 +904,10 @@ EXPORT_SYMBOL_GPL(nfs_setlease);
const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = nfs_file_read,
- .aio_write = nfs_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = nfs_file_read,
+ .write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
.open = nfs_file_open,
.flush = nfs_file_flush,
@@ -947,7 +916,7 @@ const struct file_operations nfs_file_operations = {
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
- .splice_write = nfs_file_splice_write,
+ .splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = nfs_setlease,
};
diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile
new file mode 100644
index 000000000000..8516cdffb9e9
--- /dev/null
+++ b/fs/nfs/filelayout/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Files Layout Driver kernel module
+#
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/filelayout/filelayout.c
index b9a35c05b60f..d2eba1c13b7e 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -35,11 +35,11 @@
#include <linux/sunrpc/metrics.h>
-#include "nfs4session.h"
-#include "internal.h"
-#include "delegation.h"
-#include "nfs4filelayout.h"
-#include "nfs4trace.h"
+#include "../nfs4session.h"
+#include "../internal.h"
+#include "../delegation.h"
+#include "filelayout.h"
+#include "../nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -84,7 +84,7 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
BUG();
}
-static void filelayout_reset_write(struct nfs_write_data *data)
+static void filelayout_reset_write(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
struct rpc_task *task = &data->task;
@@ -105,7 +105,7 @@ static void filelayout_reset_write(struct nfs_write_data *data)
}
}
-static void filelayout_reset_read(struct nfs_read_data *data)
+static void filelayout_reset_read(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
struct rpc_task *task = &data->task;
@@ -243,7 +243,7 @@ wait_on_recovery:
/* NFS_PROTO call done callback routines */
static int filelayout_read_done_cb(struct rpc_task *task,
- struct nfs_read_data *data)
+ struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
int err;
@@ -270,7 +270,7 @@ static int filelayout_read_done_cb(struct rpc_task *task,
* rfc5661 is not clear about which credential should be used.
*/
static void
-filelayout_set_layoutcommit(struct nfs_write_data *wdata)
+filelayout_set_layoutcommit(struct nfs_pgio_data *wdata)
{
struct nfs_pgio_header *hdr = wdata->header;
@@ -279,7 +279,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
return;
pnfs_set_layoutcommit(wdata);
- dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+ dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
}
@@ -305,7 +305,7 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
*/
static void filelayout_read_prepare(struct rpc_task *task, void *data)
{
- struct nfs_read_data *rdata = data;
+ struct nfs_pgio_data *rdata = data;
if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
rpc_exit(task, -EIO);
@@ -317,7 +317,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
rpc_exit(task, 0);
return;
}
- rdata->read_done_cb = filelayout_read_done_cb;
+ rdata->pgio_done_cb = filelayout_read_done_cb;
if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
&rdata->args.seq_args,
@@ -331,7 +331,7 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
static void filelayout_read_call_done(struct rpc_task *task, void *data)
{
- struct nfs_read_data *rdata = data;
+ struct nfs_pgio_data *rdata = data;
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
@@ -347,14 +347,14 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
static void filelayout_read_count_stats(struct rpc_task *task, void *data)
{
- struct nfs_read_data *rdata = data;
+ struct nfs_pgio_data *rdata = data;
rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
}
static void filelayout_read_release(void *data)
{
- struct nfs_read_data *rdata = data;
+ struct nfs_pgio_data *rdata = data;
struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout;
filelayout_fenceme(lo->plh_inode, lo);
@@ -363,7 +363,7 @@ static void filelayout_read_release(void *data)
}
static int filelayout_write_done_cb(struct rpc_task *task,
- struct nfs_write_data *data)
+ struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
int err;
@@ -419,7 +419,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
static void filelayout_write_prepare(struct rpc_task *task, void *data)
{
- struct nfs_write_data *wdata = data;
+ struct nfs_pgio_data *wdata = data;
if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
rpc_exit(task, -EIO);
@@ -443,7 +443,7 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
static void filelayout_write_call_done(struct rpc_task *task, void *data)
{
- struct nfs_write_data *wdata = data;
+ struct nfs_pgio_data *wdata = data;
if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
task->tk_status == 0) {
@@ -457,14 +457,14 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
static void filelayout_write_count_stats(struct rpc_task *task, void *data)
{
- struct nfs_write_data *wdata = data;
+ struct nfs_pgio_data *wdata = data;
rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
}
static void filelayout_write_release(void *data)
{
- struct nfs_write_data *wdata = data;
+ struct nfs_pgio_data *wdata = data;
struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout;
filelayout_fenceme(lo->plh_inode, lo);
@@ -529,7 +529,7 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
};
static enum pnfs_try_status
-filelayout_read_pagelist(struct nfs_read_data *data)
+filelayout_read_pagelist(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
struct pnfs_layout_segment *lseg = hdr->lseg;
@@ -560,6 +560,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
/* No multipath support. Use first DS */
atomic_inc(&ds->ds_clp->cl_count);
data->ds_clp = ds->ds_clp;
+ data->ds_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
data->args.fh = fh;
@@ -568,14 +569,14 @@ filelayout_read_pagelist(struct nfs_read_data *data)
data->mds_offset = offset;
/* Perform an asynchronous read to ds */
- nfs_initiate_read(ds_clnt, data,
- &filelayout_read_call_ops, RPC_TASK_SOFTCONN);
+ nfs_initiate_pgio(ds_clnt, data,
+ &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
}
/* Perform async writes. */
static enum pnfs_try_status
-filelayout_write_pagelist(struct nfs_write_data *data, int sync)
+filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
{
struct nfs_pgio_header *hdr = data->header;
struct pnfs_layout_segment *lseg = hdr->lseg;
@@ -600,20 +601,18 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
__func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
- data->write_done_cb = filelayout_write_done_cb;
+ data->pgio_done_cb = filelayout_write_done_cb;
atomic_inc(&ds->ds_clp->cl_count);
data->ds_clp = ds->ds_clp;
+ data->ds_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
data->args.fh = fh;
- /*
- * Get the file offset on the dserver. Set the write offset to
- * this offset and save the original offset.
- */
+
data->args.offset = filelayout_get_dserver_offset(lseg, offset);
/* Perform an asynchronous write */
- nfs_initiate_write(ds_clnt, data,
+ nfs_initiate_pgio(ds_clnt, data,
&filelayout_write_call_ops, sync,
RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
@@ -637,7 +636,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
struct nfs4_deviceid_node *d;
struct nfs4_file_layout_dsaddr *dsaddr;
int status = -EINVAL;
- struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
dprintk("--> %s\n", __func__);
@@ -655,7 +653,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
goto out;
}
- if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
+ if (!fl->stripe_unit) {
dprintk("%s Invalid stripe unit (%u)\n",
__func__, fl->stripe_unit);
goto out;
@@ -692,12 +690,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
goto out_put;
}
- if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
- dprintk("%s Stripe unit (%u) not aligned with rsize %u "
- "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
- nfss->wsize);
- }
-
status = 0;
out:
dprintk("--> %s returns %d\n", __func__, status);
@@ -850,11 +842,15 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
{
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
struct pnfs_commit_bucket *buckets;
- int size;
+ int size, i;
if (fl->commit_through_mds)
return 0;
- if (cinfo->ds->nbuckets != 0) {
+
+ size = (fl->stripe_type == STRIPE_SPARSE) ?
+ fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+ if (cinfo->ds->nbuckets >= size) {
/* This assumes there is only one IOMODE_RW lseg. What
* we really want to do is have a layout_hdr level
* dictionary of <multipath_list4, fh> keys, each
@@ -864,30 +860,36 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
return 0;
}
- size = (fl->stripe_type == STRIPE_SPARSE) ?
- fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
-
buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
gfp_flags);
if (!buckets)
return -ENOMEM;
- else {
- int i;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&buckets[i].written);
+ INIT_LIST_HEAD(&buckets[i].committing);
+ /* mark direct verifier as unset */
+ buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+ }
- spin_lock(cinfo->lock);
- if (cinfo->ds->nbuckets != 0)
- kfree(buckets);
- else {
- cinfo->ds->buckets = buckets;
- cinfo->ds->nbuckets = size;
- for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&buckets[i].written);
- INIT_LIST_HEAD(&buckets[i].committing);
- }
- }
- spin_unlock(cinfo->lock);
- return 0;
+ spin_lock(cinfo->lock);
+ if (cinfo->ds->nbuckets >= size)
+ goto out;
+ for (i = 0; i < cinfo->ds->nbuckets; i++) {
+ list_splice(&cinfo->ds->buckets[i].written,
+ &buckets[i].written);
+ list_splice(&cinfo->ds->buckets[i].committing,
+ &buckets[i].committing);
+ buckets[i].direct_verf.committed =
+ cinfo->ds->buckets[i].direct_verf.committed;
+ buckets[i].wlseg = cinfo->ds->buckets[i].wlseg;
+ buckets[i].clseg = cinfo->ds->buckets[i].clseg;
}
+ swap(cinfo->ds->buckets, buckets);
+ cinfo->ds->nbuckets = size;
+out:
+ spin_unlock(cinfo->lock);
+ kfree(buckets);
+ return 0;
}
static struct pnfs_layout_segment *
@@ -915,47 +917,51 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
/*
* filelayout_pg_test(). Called by nfs_can_coalesce_requests()
*
- * return true : coalesce page
- * return false : don't coalesce page
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
*/
-static bool
+static size_t
filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
+ unsigned int size;
u64 p_stripe, r_stripe;
- u32 stripe_unit;
+ u32 stripe_offset;
+ u64 segment_offset = pgio->pg_lseg->pls_range.offset;
+ u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
- if (!pnfs_generic_pg_test(pgio, prev, req) ||
- !nfs_generic_pg_test(pgio, prev, req))
- return false;
+ /* calls nfs_generic_pg_test */
+ size = pnfs_generic_pg_test(pgio, prev, req);
+ if (!size)
+ return 0;
- p_stripe = (u64)req_offset(prev);
- r_stripe = (u64)req_offset(req);
- stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+ /* see if req and prev are in the same stripe */
+ if (prev) {
+ p_stripe = (u64)req_offset(prev) - segment_offset;
+ r_stripe = (u64)req_offset(req) - segment_offset;
+ do_div(p_stripe, stripe_unit);
+ do_div(r_stripe, stripe_unit);
- do_div(p_stripe, stripe_unit);
- do_div(r_stripe, stripe_unit);
+ if (p_stripe != r_stripe)
+ return 0;
+ }
- return (p_stripe == r_stripe);
+ /* calculate remaining bytes in the current stripe */
+ div_u64_rem((u64)req_offset(req) - segment_offset,
+ stripe_unit,
+ &stripe_offset);
+ WARN_ON_ONCE(stripe_offset > stripe_unit);
+ if (stripe_offset >= stripe_unit)
+ return 0;
+ return min(stripe_unit - (unsigned int)stripe_offset, size);
}
static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- WARN_ON_ONCE(pgio->pg_lseg != NULL);
-
- if (req->wb_offset != req->wb_pgbase) {
- /*
- * Handling unaligned pages is difficult, because have to
- * somehow split a req in two in certain cases in the
- * pg.test code. Avoid this by just not using pnfs
- * in this case.
- */
- nfs_pageio_reset_read_mds(pgio);
- return;
- }
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
@@ -973,11 +979,8 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_commit_info cinfo;
int status;
- WARN_ON_ONCE(pgio->pg_lseg != NULL);
-
- if (req->wb_offset != req->wb_pgbase)
- goto out_mds;
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
@@ -1067,6 +1070,7 @@ filelayout_choose_commit_list(struct nfs_page *req,
*/
j = nfs4_fl_calc_j_index(lseg, req_offset(req));
i = select_bucket_index(fl, j);
+ spin_lock(cinfo->lock);
buckets = cinfo->ds->buckets;
list = &buckets[i].written;
if (list_empty(list)) {
@@ -1080,6 +1084,7 @@ filelayout_choose_commit_list(struct nfs_page *req,
}
set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
cinfo->ds->nwritten++;
+ spin_unlock(cinfo->lock);
return list;
}
@@ -1176,6 +1181,7 @@ transfer_commit_list(struct list_head *src, struct list_head *dst,
return ret;
}
+/* Note called with cinfo->lock held. */
static int
filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
struct nfs_commit_info *cinfo,
@@ -1220,15 +1226,18 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,
struct nfs_commit_info *cinfo)
{
struct pnfs_commit_bucket *b;
+ struct pnfs_layout_segment *freeme;
int i;
+restart:
spin_lock(cinfo->lock);
for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
- spin_unlock(cinfo->lock);
- pnfs_put_lseg(b->wlseg);
+ freeme = b->wlseg;
b->wlseg = NULL;
- spin_lock(cinfo->lock);
+ spin_unlock(cinfo->lock);
+ pnfs_put_lseg(freeme);
+ goto restart;
}
}
cinfo->ds->nwritten = 0;
@@ -1243,6 +1252,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
struct nfs_commit_data *data;
int i, j;
unsigned int nreq = 0;
+ struct pnfs_layout_segment *freeme;
fl_cinfo = cinfo->ds;
bucket = fl_cinfo->buckets;
@@ -1253,8 +1263,10 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
if (!data)
break;
data->ds_commit_index = i;
+ spin_lock(cinfo->lock);
data->lseg = bucket->clseg;
bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
list_add(&data->pages, list);
nreq++;
}
@@ -1264,8 +1276,11 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
if (list_empty(&bucket->committing))
continue;
nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
- pnfs_put_lseg(bucket->clseg);
+ spin_lock(cinfo->lock);
+ freeme = bucket->clseg;
bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
+ pnfs_put_lseg(freeme);
}
/* Caller will clean up entries put on list */
return nreq;
@@ -1330,7 +1345,7 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
struct nfs4_filelayout *flo;
flo = kzalloc(sizeof(*flo), gfp_flags);
- return &flo->generic_hdr;
+ return flo != NULL ? &flo->generic_hdr : NULL;
}
static void
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/filelayout/filelayout.h
index cebd20e7e923..ffbddf2219ea 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -30,7 +30,7 @@
#ifndef FS_NFS_NFS4FILELAYOUT_H
#define FS_NFS_NFS4FILELAYOUT_H
-#include "pnfs.h"
+#include "../pnfs.h"
/*
* Default data server connection timeout and retrans vaules.
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index b9c61efe9660..44bf0140a4c7 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -33,9 +33,9 @@
#include <linux/module.h>
#include <linux/sunrpc/addr.h>
-#include "internal.h"
-#include "nfs4session.h"
-#include "nfs4filelayout.h"
+#include "../internal.h"
+#include "../nfs4session.h"
+#include "filelayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 66984a9aafaa..b94f80420a58 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -120,7 +120,8 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
security_d_instantiate(ret, inode);
spin_lock(&ret->d_lock);
- if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
+ if (IS_ROOT(ret) && !ret->d_fsdata &&
+ !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
ret->d_fsdata = name;
name = NULL;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e6f7398d2b3c..c496f8a74639 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1575,18 +1575,20 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_version = fattr->change_attr;
}
} else if (server->caps & NFS_CAP_CHANGE_ATTR)
- invalid |= save_cache_validity;
+ nfsi->cache_validity |= save_cache_validity;
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
} else if (server->caps & NFS_CAP_MTIME)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
} else if (server->caps & NFS_CAP_CTIME)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
/* Check if our cached file size is stale */
@@ -1608,7 +1610,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
(long long)new_isize);
}
} else
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_PAGECACHE
| NFS_INO_REVAL_FORCED);
@@ -1616,7 +1619,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
else if (server->caps & NFS_CAP_ATIME)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATIME
| NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
@@ -1627,7 +1631,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
}
} else if (server->caps & NFS_CAP_MODE)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
@@ -1638,7 +1643,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_uid = fattr->uid;
}
} else if (server->caps & NFS_CAP_OWNER)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
@@ -1649,7 +1655,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_gid = fattr->gid;
}
} else if (server->caps & NFS_CAP_OWNER_GROUP)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
@@ -1662,7 +1669,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
set_nlink(inode, fattr->nlink);
}
} else if (server->caps & NFS_CAP_NLINK)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index dd8bfc2e2464..82ddbf46660e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -231,13 +231,20 @@ extern void nfs_destroy_writepagecache(void);
extern int __init nfs_init_directcache(void);
extern void nfs_destroy_directcache(void);
-extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount);
extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr,
void (*release)(struct nfs_pgio_header *hdr));
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
int nfs_iocounter_wait(struct nfs_io_counter *c);
+extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
+struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *);
+void nfs_rw_header_free(struct nfs_pgio_header *);
+void nfs_pgio_data_release(struct nfs_pgio_data *);
+int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
+int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *,
+ const struct rpc_call_ops *, int, int);
+
static inline void nfs_iocounter_init(struct nfs_io_counter *c)
{
c->flags = 0;
@@ -320,16 +327,14 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)
int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
loff_t nfs_file_llseek(struct file *, loff_t, int);
int nfs_file_flush(struct file *, fl_owner_t);
-ssize_t nfs_file_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
size_t, unsigned int);
int nfs_file_mmap(struct file *, struct vm_area_struct *);
-ssize_t nfs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
-ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *,
- size_t, unsigned int);
int nfs_check_flags(int);
int nfs_setlease(struct file *, long, struct file_lock **);
@@ -395,19 +400,11 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool
struct nfs_pgio_completion_ops;
/* read.c */
-extern struct nfs_read_header *nfs_readhdr_alloc(void);
-extern void nfs_readhdr_free(struct nfs_pgio_header *hdr);
extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
- struct inode *inode,
+ struct inode *inode, bool force_mds,
const struct nfs_pgio_completion_ops *compl_ops);
-extern int nfs_initiate_read(struct rpc_clnt *clnt,
- struct nfs_read_data *data,
- const struct rpc_call_ops *call_ops, int flags);
extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
-extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
-extern void nfs_readdata_release(struct nfs_read_data *rdata);
/* super.c */
void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
@@ -422,19 +419,10 @@ int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
/* write.c */
extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
- struct inode *inode, int ioflags,
+ struct inode *inode, int ioflags, bool force_mds,
const struct nfs_pgio_completion_ops *compl_ops);
-extern struct nfs_write_header *nfs_writehdr_alloc(void);
-extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
-extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr);
extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
-extern void nfs_writedata_release(struct nfs_write_data *wdata);
extern void nfs_commit_free(struct nfs_commit_data *p);
-extern int nfs_initiate_write(struct rpc_clnt *clnt,
- struct nfs_write_data *data,
- const struct rpc_call_ops *call_ops,
- int how, int flags);
extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
extern int nfs_initiate_commit(struct rpc_clnt *clnt,
@@ -447,6 +435,7 @@ extern void nfs_init_commit(struct nfs_commit_data *data,
struct nfs_commit_info *cinfo);
int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
struct nfs_commit_info *cinfo, int max);
+unsigned long nfs_reqs_to_commit(struct nfs_commit_info *);
int nfs_scan_commit(struct inode *inode, struct list_head *dst,
struct nfs_commit_info *cinfo);
void nfs_mark_request_commit(struct nfs_page *req,
@@ -492,7 +481,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
/* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_read_data *);
+extern void __nfs4_read_done_cb(struct nfs_pgio_data *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
const char *ip_addr);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 62db136339ea..5f61b83f4a1c 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -103,7 +103,7 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
/*
* typedef opaque nfsdata<>;
*/
-static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)
{
u32 recvd, count;
__be32 *p;
@@ -613,7 +613,7 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
* };
*/
static void encode_readargs(struct xdr_stream *xdr,
- const struct nfs_readargs *args)
+ const struct nfs_pgio_args *args)
{
u32 offset = args->offset;
u32 count = args->count;
@@ -629,7 +629,7 @@ static void encode_readargs(struct xdr_stream *xdr,
static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
struct xdr_stream *xdr,
- const struct nfs_readargs *args)
+ const struct nfs_pgio_args *args)
{
encode_readargs(xdr, args);
prepare_reply_buffer(req, args->pages, args->pgbase,
@@ -649,7 +649,7 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
* };
*/
static void encode_writeargs(struct xdr_stream *xdr,
- const struct nfs_writeargs *args)
+ const struct nfs_pgio_args *args)
{
u32 offset = args->offset;
u32 count = args->count;
@@ -669,7 +669,7 @@ static void encode_writeargs(struct xdr_stream *xdr,
static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
struct xdr_stream *xdr,
- const struct nfs_writeargs *args)
+ const struct nfs_pgio_args *args)
{
encode_writeargs(xdr, args);
xdr->buf->flags |= XDRBUF_WRITE;
@@ -857,7 +857,7 @@ out_default:
* };
*/
static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_readres *result)
+ struct nfs_pgio_res *result)
{
enum nfs_stat status;
int error;
@@ -878,7 +878,7 @@ out_default:
}
static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_writeres *result)
+ struct nfs_pgio_res *result)
{
/* All NFSv2 writes are "file sync" writes */
result->verf->committed = NFS_FILE_SYNC;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index db60149c4579..e7daa42bbc86 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -795,7 +795,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return status;
}
-static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct inode *inode = data->header->inode;
@@ -807,18 +807,18 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
return 0;
}
-static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
+static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
}
-static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
{
rpc_call_start(task);
return 0;
}
-static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct inode *inode = data->header->inode;
@@ -829,17 +829,11 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
return 0;
}
-static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
+static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
}
-static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
-{
- rpc_call_start(task);
- return 0;
-}
-
static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
{
rpc_call_start(task);
@@ -946,13 +940,10 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.fsinfo = nfs3_proc_fsinfo,
.pathconf = nfs3_proc_pathconf,
.decode_dirent = nfs3_decode_dirent,
+ .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare,
.read_setup = nfs3_proc_read_setup,
- .read_pageio_init = nfs_pageio_init_read,
- .read_rpc_prepare = nfs3_proc_read_rpc_prepare,
.read_done = nfs3_read_done,
.write_setup = nfs3_proc_write_setup,
- .write_pageio_init = nfs_pageio_init_write,
- .write_rpc_prepare = nfs3_proc_write_rpc_prepare,
.write_done = nfs3_write_done,
.commit_setup = nfs3_proc_commit_setup,
.commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index fa6d72131c19..8f4cbe7f4aa8 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -953,7 +953,7 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
* };
*/
static void encode_read3args(struct xdr_stream *xdr,
- const struct nfs_readargs *args)
+ const struct nfs_pgio_args *args)
{
__be32 *p;
@@ -966,7 +966,7 @@ static void encode_read3args(struct xdr_stream *xdr,
static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
struct xdr_stream *xdr,
- const struct nfs_readargs *args)
+ const struct nfs_pgio_args *args)
{
encode_read3args(xdr, args);
prepare_reply_buffer(req, args->pages, args->pgbase,
@@ -992,7 +992,7 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
* };
*/
static void encode_write3args(struct xdr_stream *xdr,
- const struct nfs_writeargs *args)
+ const struct nfs_pgio_args *args)
{
__be32 *p;
@@ -1008,7 +1008,7 @@ static void encode_write3args(struct xdr_stream *xdr,
static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
struct xdr_stream *xdr,
- const struct nfs_writeargs *args)
+ const struct nfs_pgio_args *args)
{
encode_write3args(xdr, args);
xdr->buf->flags |= XDRBUF_WRITE;
@@ -1589,7 +1589,7 @@ out_default:
* };
*/
static int decode_read3resok(struct xdr_stream *xdr,
- struct nfs_readres *result)
+ struct nfs_pgio_res *result)
{
u32 eof, count, ocount, recvd;
__be32 *p;
@@ -1625,7 +1625,7 @@ out_overflow:
}
static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_readres *result)
+ struct nfs_pgio_res *result)
{
enum nfs_stat status;
int error;
@@ -1673,7 +1673,7 @@ out_status:
* };
*/
static int decode_write3resok(struct xdr_stream *xdr,
- struct nfs_writeres *result)
+ struct nfs_pgio_res *result)
{
__be32 *p;
@@ -1697,7 +1697,7 @@ out_eio:
}
static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_writeres *result)
+ struct nfs_pgio_res *result)
{
enum nfs_stat status;
int error;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index e1d1badbe53c..f63cb87cd730 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -337,7 +337,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
*/
static inline void
nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
- struct rpc_message *msg, struct nfs_write_data *wdata)
+ struct rpc_message *msg, struct nfs_pgio_data *wdata)
{
if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
!test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
@@ -369,7 +369,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
static inline void
nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
- struct rpc_message *msg, struct nfs_write_data *wdata)
+ struct rpc_message *msg, struct nfs_pgio_data *wdata)
{
}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 8de3407e0360..a816f0627a6c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -100,8 +100,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
break;
mutex_lock(&inode->i_mutex);
ret = nfs_file_fsync_commit(file, start, end, datasync);
- if (!ret && !datasync)
- /* application has asked for meta-data sync */
+ if (!ret)
ret = pnfs_layoutcommit_inode(inode, true);
mutex_unlock(&inode->i_mutex);
/*
@@ -118,10 +117,10 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
const struct file_operations nfs4_file_operations = {
.llseek = nfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = nfs_file_read,
- .aio_write = nfs_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = nfs_file_read,
+ .write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
.open = nfs4_file_open,
.flush = nfs_file_flush,
@@ -130,7 +129,7 @@ const struct file_operations nfs4_file_operations = {
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
- .splice_write = nfs_file_splice_write,
+ .splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = nfs_setlease,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 397be39c6dc8..285ad5334018 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2027,7 +2027,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
return status;
}
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
- _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
+ nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
return 0;
}
@@ -2750,7 +2750,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
-#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_CHANGE_SECURITY_LABEL - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL)
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
@@ -4033,12 +4033,12 @@ static bool nfs4_error_stateid_expired(int err)
return false;
}
-void __nfs4_read_done_cb(struct nfs_read_data *data)
+void __nfs4_read_done_cb(struct nfs_pgio_data *data)
{
nfs_invalidate_atime(data->header->inode);
}
-static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct nfs_server *server = NFS_SERVER(data->header->inode);
@@ -4055,7 +4055,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
}
static bool nfs4_read_stateid_changed(struct rpc_task *task,
- struct nfs_readargs *args)
+ struct nfs_pgio_args *args)
{
if (!nfs4_error_stateid_expired(task->tk_status) ||
@@ -4068,7 +4068,7 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
return true;
}
-static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
dprintk("--> %s\n", __func__);
@@ -4077,19 +4077,19 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
return -EAGAIN;
if (nfs4_read_stateid_changed(task, &data->args))
return -EAGAIN;
- return data->read_done_cb ? data->read_done_cb(task, data) :
+ return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
nfs4_read_done_cb(task, data);
}
-static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
+static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
data->timestamp = jiffies;
- data->read_done_cb = nfs4_read_done_cb;
+ data->pgio_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
}
-static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
{
if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
&data->args.seq_args,
@@ -4097,14 +4097,14 @@ static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_dat
task))
return 0;
if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
- data->args.lock_context, FMODE_READ) == -EIO)
+ data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO)
return -EIO;
if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
return -EIO;
return 0;
}
-static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct inode *inode = data->header->inode;
@@ -4121,7 +4121,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
}
static bool nfs4_write_stateid_changed(struct rpc_task *task,
- struct nfs_writeargs *args)
+ struct nfs_pgio_args *args)
{
if (!nfs4_error_stateid_expired(task->tk_status) ||
@@ -4134,18 +4134,18 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,
return true;
}
-static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
if (nfs4_write_stateid_changed(task, &data->args))
return -EAGAIN;
- return data->write_done_cb ? data->write_done_cb(task, data) :
+ return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
nfs4_write_done_cb(task, data);
}
static
-bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data)
+bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
{
const struct nfs_pgio_header *hdr = data->header;
@@ -4158,7 +4158,7 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data)
return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
}
-static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
+static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
struct nfs_server *server = NFS_SERVER(data->header->inode);
@@ -4168,8 +4168,8 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
} else
data->args.bitmask = server->cache_consistency_bitmask;
- if (!data->write_done_cb)
- data->write_done_cb = nfs4_write_done_cb;
+ if (!data->pgio_done_cb)
+ data->pgio_done_cb = nfs4_write_done_cb;
data->res.server = server;
data->timestamp = jiffies;
@@ -4177,21 +4177,6 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
}
-static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
-{
- if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
- &data->args.seq_args,
- &data->res.seq_res,
- task))
- return 0;
- if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
- data->args.lock_context, FMODE_WRITE) == -EIO)
- return -EIO;
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
- return -EIO;
- return 0;
-}
-
static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
{
nfs4_setup_sequence(NFS_SERVER(data->inode),
@@ -8432,13 +8417,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.pathconf = nfs4_proc_pathconf,
.set_capabilities = nfs4_server_capabilities,
.decode_dirent = nfs4_decode_dirent,
+ .pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare,
.read_setup = nfs4_proc_read_setup,
- .read_pageio_init = pnfs_pageio_init_read,
- .read_rpc_prepare = nfs4_proc_read_rpc_prepare,
.read_done = nfs4_read_done,
.write_setup = nfs4_proc_write_setup,
- .write_pageio_init = pnfs_pageio_init_write,
- .write_rpc_prepare = nfs4_proc_write_rpc_prepare,
.write_done = nfs4_write_done,
.commit_setup = nfs4_proc_commit_setup,
.commit_rpc_prepare = nfs4_proc_commit_rpc_prepare,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c0583b9bef71..848f6853c59e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1456,7 +1456,7 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
* server that doesn't support a grace period.
*/
spin_lock(&sp->so_lock);
- write_seqcount_begin(&sp->so_reclaim_seqcount);
+ raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
restart:
list_for_each_entry(state, &sp->so_states, open_states) {
if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
@@ -1519,13 +1519,13 @@ restart:
spin_lock(&sp->so_lock);
goto restart;
}
- write_seqcount_end(&sp->so_reclaim_seqcount);
+ raw_write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
return 0;
out_err:
nfs4_put_open_state(state);
spin_lock(&sp->so_lock);
- write_seqcount_end(&sp->so_reclaim_seqcount);
+ raw_write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
return status;
}
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 849cf146db30..0a744f3a86f6 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -932,7 +932,7 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
DECLARE_EVENT_CLASS(nfs4_read_event,
TP_PROTO(
- const struct nfs_read_data *data,
+ const struct nfs_pgio_data *data,
int error
),
@@ -972,7 +972,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
#define DEFINE_NFS4_READ_EVENT(name) \
DEFINE_EVENT(nfs4_read_event, name, \
TP_PROTO( \
- const struct nfs_read_data *data, \
+ const struct nfs_pgio_data *data, \
int error \
), \
TP_ARGS(data, error))
@@ -983,7 +983,7 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
DECLARE_EVENT_CLASS(nfs4_write_event,
TP_PROTO(
- const struct nfs_write_data *data,
+ const struct nfs_pgio_data *data,
int error
),
@@ -1024,7 +1024,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
#define DEFINE_NFS4_WRITE_EVENT(name) \
DEFINE_EVENT(nfs4_write_event, name, \
TP_PROTO( \
- const struct nfs_write_data *data, \
+ const struct nfs_pgio_data *data, \
int error \
), \
TP_ARGS(data, error))
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 73ce8d4fe2c8..939ae606cfa4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1556,7 +1556,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);
}
-static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
+static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args,
+ struct compound_hdr *hdr)
{
__be32 *p;
@@ -1701,7 +1702,8 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4
encode_nfs4_verifier(xdr, &arg->confirm);
}
-static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
+static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args,
+ struct compound_hdr *hdr)
{
__be32 *p;
@@ -2451,7 +2453,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
* Encode a READ request
*/
static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_readargs *args)
+ struct nfs_pgio_args *args)
{
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
@@ -2513,7 +2515,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
* Encode a WRITE request
*/
static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_writeargs *args)
+ struct nfs_pgio_args *args)
{
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
@@ -5085,7 +5087,8 @@ static int decode_putrootfh(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_PUTROOTFH);
}
-static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res)
+static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
+ struct nfs_pgio_res *res)
{
__be32 *p;
uint32_t count, eof, recvd;
@@ -5339,7 +5342,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM);
}
-static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
+static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res)
{
__be32 *p;
int status;
@@ -6636,7 +6639,7 @@ out:
* Decode Read response
*/
static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
- struct nfs_readres *res)
+ struct nfs_pgio_res *res)
{
struct compound_hdr hdr;
int status;
@@ -6661,7 +6664,7 @@ out:
* Decode WRITE response
*/
static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
- struct nfs_writeres *res)
+ struct nfs_pgio_res *res)
{
struct compound_hdr hdr;
int status;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5457745dd4f1..611320753db2 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -439,7 +439,7 @@ static void _read_done(struct ore_io_state *ios, void *private)
objlayout_read_done(&objios->oir, status, objios->sync);
}
-int objio_read_pagelist(struct nfs_read_data *rdata)
+int objio_read_pagelist(struct nfs_pgio_data *rdata)
{
struct nfs_pgio_header *hdr = rdata->header;
struct objio_state *objios;
@@ -487,7 +487,7 @@ static void _write_done(struct ore_io_state *ios, void *private)
static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
{
struct objio_state *objios = priv;
- struct nfs_write_data *wdata = objios->oir.rpcdata;
+ struct nfs_pgio_data *wdata = objios->oir.rpcdata;
struct address_space *mapping = wdata->header->inode->i_mapping;
pgoff_t index = offset / PAGE_SIZE;
struct page *page;
@@ -531,7 +531,7 @@ static const struct _ore_r4w_op _r4w_op = {
.put_page = &__r4w_put_page,
};
-int objio_write_pagelist(struct nfs_write_data *wdata, int how)
+int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
{
struct nfs_pgio_header *hdr = wdata->header;
struct objio_state *objios;
@@ -564,14 +564,22 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how)
return 0;
}
-static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
struct nfs_page *prev, struct nfs_page *req)
{
- if (!pnfs_generic_pg_test(pgio, prev, req))
- return false;
+ unsigned int size;
+
+ size = pnfs_generic_pg_test(pgio, prev, req);
+
+ if (!size || pgio->pg_count + req->wb_bytes >
+ (unsigned long)pgio->pg_layout_private)
+ return 0;
- return pgio->pg_count + req->wb_bytes <=
- (unsigned long)pgio->pg_layout_private;
+ return min(size, req->wb_bytes);
}
static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index e4f9cbfec67b..765d3f54e986 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -53,10 +53,10 @@ objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
struct objlayout *objlay;
objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
- if (objlay) {
- spin_lock_init(&objlay->lock);
- INIT_LIST_HEAD(&objlay->err_list);
- }
+ if (!objlay)
+ return NULL;
+ spin_lock_init(&objlay->lock);
+ INIT_LIST_HEAD(&objlay->err_list);
dprintk("%s: Return %p\n", __func__, objlay);
return &objlay->pnfs_layout;
}
@@ -229,11 +229,11 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
static void _rpc_read_complete(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_read_data *rdata;
+ struct nfs_pgio_data *rdata;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- rdata = container_of(task, struct nfs_read_data, task);
+ rdata = container_of(task, struct nfs_pgio_data, task);
pnfs_ld_read_done(rdata);
}
@@ -241,7 +241,7 @@ static void _rpc_read_complete(struct work_struct *work)
void
objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_read_data *rdata = oir->rpcdata;
+ struct nfs_pgio_data *rdata = oir->rpcdata;
oir->status = rdata->task.tk_status = status;
if (status >= 0)
@@ -266,7 +266,7 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
* Perform sync or async reads.
*/
enum pnfs_try_status
-objlayout_read_pagelist(struct nfs_read_data *rdata)
+objlayout_read_pagelist(struct nfs_pgio_data *rdata)
{
struct nfs_pgio_header *hdr = rdata->header;
struct inode *inode = hdr->inode;
@@ -312,11 +312,11 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
static void _rpc_write_complete(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_write_data *wdata;
+ struct nfs_pgio_data *wdata;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- wdata = container_of(task, struct nfs_write_data, task);
+ wdata = container_of(task, struct nfs_pgio_data, task);
pnfs_ld_write_done(wdata);
}
@@ -324,7 +324,7 @@ static void _rpc_write_complete(struct work_struct *work)
void
objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_write_data *wdata = oir->rpcdata;
+ struct nfs_pgio_data *wdata = oir->rpcdata;
oir->status = wdata->task.tk_status = status;
if (status >= 0) {
@@ -351,7 +351,7 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
* Perform sync or async writes.
*/
enum pnfs_try_status
-objlayout_write_pagelist(struct nfs_write_data *wdata,
+objlayout_write_pagelist(struct nfs_pgio_data *wdata,
int how)
{
struct nfs_pgio_header *hdr = wdata->header;
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 87aa1dec6120..01e041029a6c 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
*/
extern void objio_free_result(struct objlayout_io_res *oir);
-extern int objio_read_pagelist(struct nfs_read_data *rdata);
-extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
+extern int objio_read_pagelist(struct nfs_pgio_data *rdata);
+extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how);
/*
* callback API
@@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
extern void objlayout_free_lseg(struct pnfs_layout_segment *);
extern enum pnfs_try_status objlayout_read_pagelist(
- struct nfs_read_data *);
+ struct nfs_pgio_data *);
extern enum pnfs_try_status objlayout_write_pagelist(
- struct nfs_write_data *,
+ struct nfs_pgio_data *,
int how);
extern void objlayout_encode_layoutcommit(
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 03ed984ab4d8..b6ee3a6ee96d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -24,9 +24,14 @@
#include "internal.h"
#include "pnfs.h"
+#define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
static struct kmem_cache *nfs_page_cachep;
+static const struct rpc_call_ops nfs_pgio_common_ops;
+
+static void nfs_free_request(struct nfs_page *);
-bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
+static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
{
p->npages = pagecount;
if (pagecount <= ARRAY_SIZE(p->page_array))
@@ -133,11 +138,156 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
return __nfs_iocounter_wait(c);
}
+static int nfs_wait_bit_uninterruptible(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req - request in group that is to be locked
+ *
+ * this lock must be held if modifying the page group list
+ */
+void
+nfs_page_group_lock(struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ WARN_ON_ONCE(head != head->wb_head);
+
+ wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+ nfs_wait_bit_uninterruptible,
+ TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req - request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ WARN_ON_ONCE(head != head->wb_head);
+
+ smp_mb__before_atomic();
+ clear_bit(PG_HEADLOCK, &head->wb_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_sync_on_bit_locked
+ *
+ * must be called with page group lock held
+ */
+static bool
+nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+{
+ struct nfs_page *head = req->wb_head;
+ struct nfs_page *tmp;
+
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
+ WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
+
+ tmp = req->wb_this_page;
+ while (tmp != req) {
+ if (!test_bit(bit, &tmp->wb_flags))
+ return false;
+ tmp = tmp->wb_this_page;
+ }
+
+ /* true! reset all bits */
+ tmp = req;
+ do {
+ clear_bit(bit, &tmp->wb_flags);
+ tmp = tmp->wb_this_page;
+ } while (tmp != req);
+
+ return true;
+}
+
+/*
+ * nfs_page_group_sync_on_bit - set bit on current request, but only
+ * return true if the bit is set for all requests in page group
+ * @req - request in page group
+ * @bit - PG_* bit that is used to sync page group
+ */
+bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
+{
+ bool ret;
+
+ nfs_page_group_lock(req);
+ ret = nfs_page_group_sync_on_bit_locked(req, bit);
+ nfs_page_group_unlock(req);
+
+ return ret;
+}
+
+/*
+ * nfs_page_group_init - Initialize the page group linkage for @req
+ * @req - a new nfs request
+ * @prev - the previous request in page group, or NULL if @req is the first
+ * or only request in the group (the head).
+ */
+static inline void
+nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
+{
+ WARN_ON_ONCE(prev == req);
+
+ if (!prev) {
+ req->wb_head = req;
+ req->wb_this_page = req;
+ } else {
+ WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
+ req->wb_head = prev->wb_head;
+ req->wb_this_page = prev->wb_this_page;
+ prev->wb_this_page = req;
+
+ /* grab extra ref if head request has extra ref from
+ * the write/commit path to handle handoff between write
+ * and commit lists */
+ if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags))
+ kref_get(&req->wb_kref);
+ }
+}
+
+/*
+ * nfs_page_group_destroy - sync the destruction of page groups
+ * @kref - wb_kref of the request that no longer needs the page group
+ *
+ * releases the page group reference from each member once all
+ * members have called this function.
+ */
+static void
+nfs_page_group_destroy(struct kref *kref)
+{
+ struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+ struct nfs_page *tmp, *next;
+
+ if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
+ return;
+
+ tmp = req;
+ do {
+ next = tmp->wb_this_page;
+ /* unlink and free */
+ tmp->wb_this_page = tmp;
+ tmp->wb_head = tmp;
+ nfs_free_request(tmp);
+ tmp = next;
+ } while (tmp != req);
+}
+
/**
* nfs_create_request - Create an NFS read/write request.
* @ctx: open context to use
- * @inode: inode to which the request is attached
* @page: page to write
+ * @last: last nfs request created for this page group or NULL if head
* @offset: starting offset within the page for the write
* @count: number of bytes to read/write
*
@@ -146,9 +296,9 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
* User should ensure it is safe to sleep in this function.
*/
struct nfs_page *
-nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
- struct page *page,
- unsigned int offset, unsigned int count)
+nfs_create_request(struct nfs_open_context *ctx, struct page *page,
+ struct nfs_page *last, unsigned int offset,
+ unsigned int count)
{
struct nfs_page *req;
struct nfs_lock_context *l_ctx;
@@ -180,6 +330,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
req->wb_bytes = count;
req->wb_context = get_nfs_open_context(ctx);
kref_init(&req->wb_kref);
+ nfs_page_group_init(req, last);
return req;
}
@@ -237,16 +388,22 @@ static void nfs_clear_request(struct nfs_page *req)
}
}
-
/**
* nfs_release_request - Release the count on an NFS read/write request
* @req: request to release
*
* Note: Should never be called with the spinlock held!
*/
-static void nfs_free_request(struct kref *kref)
+static void nfs_free_request(struct nfs_page *req)
{
- struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+ WARN_ON_ONCE(req->wb_this_page != req);
+
+ /* extra debug: make sure no sync bits are still set */
+ WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
/* Release struct file and open context */
nfs_clear_request(req);
@@ -255,13 +412,7 @@ static void nfs_free_request(struct kref *kref)
void nfs_release_request(struct nfs_page *req)
{
- kref_put(&req->wb_kref, nfs_free_request);
-}
-
-static int nfs_wait_bit_uninterruptible(void *word)
-{
- io_schedule();
- return 0;
+ kref_put(&req->wb_kref, nfs_page_group_destroy);
}
/**
@@ -279,22 +430,249 @@ nfs_wait_on_request(struct nfs_page *req)
TASK_UNINTERRUPTIBLE);
}
-bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
+/*
+ * nfs_generic_pg_test - determine if requests can be coalesced
+ * @desc: pointer to descriptor
+ * @prev: previous request in desc, or NULL
+ * @req: this request
+ *
+ * Returns zero if @req cannot be coalesced into @desc, otherwise it returns
+ * the number of bytes of @req (at most @req->wb_bytes) that can be coalesced.
+ */
+size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *prev, struct nfs_page *req)
{
- /*
- * FIXME: ideally we should be able to coalesce all requests
- * that are not block boundary aligned, but currently this
- * is problematic for the case of bsize < PAGE_CACHE_SIZE,
- * since nfs_flush_multi and nfs_pagein_multi assume you
- * can have only one struct nfs_page.
- */
- if (desc->pg_bsize < PAGE_SIZE)
+ if (desc->pg_count > desc->pg_bsize) {
+ /* should never happen */
+ WARN_ON_ONCE(1);
return 0;
+ }
- return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
+ return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
}
EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
+static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr)
+{
+ return container_of(hdr, struct nfs_rw_header, header);
+}
+
+/**
+ * nfs_rw_header_alloc - Allocate a header for a read or write
+ * @ops: Read or write function vector
+ */
+struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops)
+{
+ struct nfs_rw_header *header = ops->rw_alloc_header();
+
+ if (header) {
+ struct nfs_pgio_header *hdr = &header->header;
+
+ INIT_LIST_HEAD(&hdr->pages);
+ spin_lock_init(&hdr->lock);
+ atomic_set(&hdr->refcnt, 0);
+ hdr->rw_ops = ops;
+ }
+ return header;
+}
+EXPORT_SYMBOL_GPL(nfs_rw_header_alloc);
+
+/*
+ * nfs_rw_header_free - Free a read or write header
+ * @hdr: The header to free
+ */
+void nfs_rw_header_free(struct nfs_pgio_header *hdr)
+{
+ hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr));
+}
+EXPORT_SYMBOL_GPL(nfs_rw_header_free);
+
+/**
+ * nfs_pgio_data_alloc - Allocate pageio data
+ * @hdr: The header making a request
+ * @pagecount: Number of pages to create
+ */
+static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr,
+ unsigned int pagecount)
+{
+ struct nfs_pgio_data *data, *prealloc;
+
+ prealloc = &NFS_RW_HEADER(hdr)->rpc_data;
+ if (prealloc->header == NULL)
+ data = prealloc;
+ else
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ if (nfs_pgarray_set(&data->pages, pagecount)) {
+ data->header = hdr;
+ atomic_inc(&hdr->refcnt);
+ } else {
+ if (data != prealloc)
+ kfree(data);
+ data = NULL;
+ }
+out:
+ return data;
+}
+
+/**
+ * nfs_pgio_data_release - Properly free pageio data
+ * @data: The data to release
+ */
+void nfs_pgio_data_release(struct nfs_pgio_data *data)
+{
+ struct nfs_pgio_header *hdr = data->header;
+ struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr);
+
+ put_nfs_open_context(data->args.context);
+ if (data->pages.pagevec != data->pages.page_array)
+ kfree(data->pages.pagevec);
+ if (data == &pageio_header->rpc_data) {
+ data->header = NULL;
+ data = NULL;
+ }
+ if (atomic_dec_and_test(&hdr->refcnt))
+ hdr->completion_ops->completion(hdr);
+ /* Note: we only free the rpc_task after callbacks are done.
+ * See the comment in rpc_free_task() for why
+ */
+ kfree(data);
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_data_release);
+
+/**
+ * nfs_pgio_rpcsetup - Set up arguments for a pageio call
+ * @data: The pageio data
+ * @count: Number of bytes to read or write
+ * @offset: Initial offset
+ * @how: How to commit data (writes only)
+ * @cinfo: Commit information for the call (writes only)
+ */
+static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
+ unsigned int count, unsigned int offset,
+ int how, struct nfs_commit_info *cinfo)
+{
+ struct nfs_page *req = data->header->req;
+
+ /* Set up the RPC argument and reply structs
+ * NB: take care not to mess about with data->commit et al. */
+
+ data->args.fh = NFS_FH(data->header->inode);
+ data->args.offset = req_offset(req) + offset;
+ /* pnfs_set_layoutcommit needs this */
+ data->mds_offset = data->args.offset;
+ data->args.pgbase = req->wb_pgbase + offset;
+ data->args.pages = data->pages.pagevec;
+ data->args.count = count;
+ data->args.context = get_nfs_open_context(req->wb_context);
+ data->args.lock_context = req->wb_lock_context;
+ data->args.stable = NFS_UNSTABLE;
+ switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
+ case 0:
+ break;
+ case FLUSH_COND_STABLE:
+ if (nfs_reqs_to_commit(cinfo))
+ break;
+ default:
+ data->args.stable = NFS_FILE_SYNC;
+ }
+
+ data->res.fattr = &data->fattr;
+ data->res.count = count;
+ data->res.eof = 0;
+ data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
+}
+
+/**
+ * nfs_pgio_prepare - Prepare pageio data to go over the wire
+ * @task: The current task
+ * @calldata: pageio data to prepare
+ */
+static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs_pgio_data *data = calldata;
+ int err;
+ err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data);
+ if (err)
+ rpc_exit(task, err);
+}
+
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data,
+ const struct rpc_call_ops *call_ops, int how, int flags)
+{
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ .rpc_cred = data->header->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .task = &data->task,
+ .rpc_message = &msg,
+ .callback_ops = call_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | flags,
+ };
+ int ret = 0;
+
+ data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how);
+
+ dprintk("NFS: %5u initiated pgio call "
+ "(req %s/%llu, %u bytes @ offset %llu)\n",
+ data->task.tk_pid,
+ data->header->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(data->header->inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task)) {
+ ret = PTR_ERR(task);
+ goto out;
+ }
+ if (how & FLUSH_SYNC) {
+ ret = rpc_wait_for_completion_task(task);
+ if (ret == 0)
+ ret = task->tk_status;
+ }
+ rpc_put_task(task);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
+
+/**
+ * nfs_pgio_error - Clean up from a pageio error
+ * @desc: IO descriptor
+ * @hdr: pageio header
+ */
+static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ set_bit(NFS_IOHDR_REDO, &hdr->flags);
+ nfs_pgio_data_release(hdr->data);
+ hdr->data = NULL;
+ desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+ return -ENOMEM;
+}
+
+/**
+ * nfs_pgio_release - Release pageio data
+ * @calldata: The pageio data to release
+ */
+static void nfs_pgio_release(void *calldata)
+{
+ struct nfs_pgio_data *data = calldata;
+ if (data->header->rw_ops->rw_release)
+ data->header->rw_ops->rw_release(data);
+ nfs_pgio_data_release(data);
+}
+
/**
* nfs_pageio_init - initialise a page io descriptor
* @desc: pointer to descriptor
@@ -307,6 +685,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
struct inode *inode,
const struct nfs_pageio_ops *pg_ops,
const struct nfs_pgio_completion_ops *compl_ops,
+ const struct nfs_rw_ops *rw_ops,
size_t bsize,
int io_flags)
{
@@ -320,6 +699,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
desc->pg_inode = inode;
desc->pg_ops = pg_ops;
desc->pg_completion_ops = compl_ops;
+ desc->pg_rw_ops = rw_ops;
desc->pg_ioflags = io_flags;
desc->pg_error = 0;
desc->pg_lseg = NULL;
@@ -328,6 +708,94 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
}
EXPORT_SYMBOL_GPL(nfs_pageio_init);
+/**
+ * nfs_pgio_result - Basic pageio error handling
+ * @task: The task that ran
+ * @calldata: Pageio data to check
+ */
+static void nfs_pgio_result(struct rpc_task *task, void *calldata)
+{
+ struct nfs_pgio_data *data = calldata;
+ struct inode *inode = data->header->inode;
+
+ dprintk("NFS: %s: %5u, (status %d)\n", __func__,
+ task->tk_pid, task->tk_status);
+
+ if (data->header->rw_ops->rw_done(task, data, inode) != 0)
+ return;
+ if (task->tk_status < 0)
+ nfs_set_pgio_error(data->header, task->tk_status, data->args.offset);
+ else
+ data->header->rw_ops->rw_result(task, data);
+}
+
+/*
+ * Create an RPC task for the given read or write request and kick it.
+ * The page must have been locked by the caller.
+ *
+ * It may happen that the page we're passed is not marked dirty.
+ * This is the case if nfs_updatepage detects a conflicting request
+ * that has been written but not committed.
+ */
+int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_page *req;
+ struct page **pages;
+ struct nfs_pgio_data *data;
+ struct list_head *head = &desc->pg_list;
+ struct nfs_commit_info cinfo;
+
+ data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base,
+ desc->pg_count));
+ if (!data)
+ return nfs_pgio_error(desc, hdr);
+
+ nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
+ pages = data->pages.pagevec;
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_list_add_request(req, &hdr->pages);
+ *pages++ = req->wb_page;
+ }
+
+ if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
+ (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
+ desc->pg_ioflags &= ~FLUSH_COND_STABLE;
+
+ /* Set up the argument struct */
+ nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
+ hdr->data = data;
+ desc->pg_rpc_callops = &nfs_pgio_common_ops;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_generic_pgio);
+
+static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
+{
+ struct nfs_rw_header *rw_hdr;
+ struct nfs_pgio_header *hdr;
+ int ret;
+
+ rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops);
+ if (!rw_hdr) {
+ desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+ return -ENOMEM;
+ }
+ hdr = &rw_hdr->header;
+ nfs_pgheader_init(desc, hdr, nfs_rw_header_free);
+ atomic_inc(&hdr->refcnt);
+ ret = nfs_generic_pgio(desc, hdr);
+ if (ret == 0)
+ ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
+ hdr->data, desc->pg_rpc_callops,
+ desc->pg_ioflags, 0);
+ if (atomic_dec_and_test(&hdr->refcnt))
+ hdr->completion_ops->completion(hdr);
+ return ret;
+}
+
static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
const struct nfs_open_context *ctx2)
{
@@ -356,18 +824,23 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
struct nfs_page *req,
struct nfs_pageio_descriptor *pgio)
{
- if (!nfs_match_open_context(req->wb_context, prev->wb_context))
- return false;
- if (req->wb_context->dentry->d_inode->i_flock != NULL &&
- !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context))
- return false;
- if (req->wb_pgbase != 0)
- return false;
- if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
- return false;
- if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
- return false;
- return pgio->pg_ops->pg_test(pgio, prev, req);
+ size_t size;
+
+ if (prev) {
+ if (!nfs_match_open_context(req->wb_context, prev->wb_context))
+ return false;
+ if (req->wb_context->dentry->d_inode->i_flock != NULL &&
+ !nfs_match_lock_context(req->wb_lock_context,
+ prev->wb_lock_context))
+ return false;
+ if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
+ return false;
+ }
+ size = pgio->pg_ops->pg_test(pgio, prev, req);
+ WARN_ON_ONCE(size > req->wb_bytes);
+ if (size && size < req->wb_bytes)
+ req->wb_bytes = size;
+ return size > 0;
}
/**
@@ -381,17 +854,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
+ struct nfs_page *prev = NULL;
if (desc->pg_count != 0) {
- struct nfs_page *prev;
-
prev = nfs_list_entry(desc->pg_list.prev);
- if (!nfs_can_coalesce_requests(prev, req, desc))
- return 0;
} else {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
desc->pg_base = req->wb_pgbase;
}
+ if (!nfs_can_coalesce_requests(prev, req, desc))
+ return 0;
nfs_list_remove_request(req);
nfs_list_add_request(req, &desc->pg_list);
desc->pg_count += req->wb_bytes;
@@ -421,22 +893,73 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
* @desc: destination io descriptor
* @req: request
*
+ * This may split a request into subrequests which are all part of the
+ * same page group.
+ *
* Returns true if the request 'req' was successfully coalesced into the
* existing list of pages 'desc'.
*/
static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
- while (!nfs_pageio_do_add_request(desc, req)) {
- desc->pg_moreio = 1;
- nfs_pageio_doio(desc);
- if (desc->pg_error < 0)
- return 0;
- desc->pg_moreio = 0;
- if (desc->pg_recoalesce)
- return 0;
- }
+ struct nfs_page *subreq;
+ unsigned int bytes_left = 0;
+ unsigned int offset, pgbase;
+
+ nfs_page_group_lock(req);
+
+ subreq = req;
+ bytes_left = subreq->wb_bytes;
+ offset = subreq->wb_offset;
+ pgbase = subreq->wb_pgbase;
+
+ do {
+ if (!nfs_pageio_do_add_request(desc, subreq)) {
+ /* make sure pg_test call(s) did nothing */
+ WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
+ WARN_ON_ONCE(subreq->wb_offset != offset);
+ WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
+
+ nfs_page_group_unlock(req);
+ desc->pg_moreio = 1;
+ nfs_pageio_doio(desc);
+ if (desc->pg_error < 0)
+ return 0;
+ desc->pg_moreio = 0;
+ if (desc->pg_recoalesce)
+ return 0;
+ /* retry add_request for this subreq */
+ nfs_page_group_lock(req);
+ continue;
+ }
+
+ /* check for buggy pg_test call(s) */
+ WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
+ WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
+ WARN_ON_ONCE(subreq->wb_bytes == 0);
+
+ bytes_left -= subreq->wb_bytes;
+ offset += subreq->wb_bytes;
+ pgbase += subreq->wb_bytes;
+
+ if (bytes_left) {
+ subreq = nfs_create_request(req->wb_context,
+ req->wb_page,
+ subreq, pgbase, bytes_left);
+ if (IS_ERR(subreq))
+ goto err_ptr;
+ nfs_lock_request(subreq);
+ subreq->wb_offset = offset;
+ subreq->wb_index = req->wb_index;
+ }
+ } while (bytes_left > 0);
+
+ nfs_page_group_unlock(req);
return 1;
+err_ptr:
+ desc->pg_error = PTR_ERR(subreq);
+ nfs_page_group_unlock(req);
+ return 0;
}
static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -535,3 +1058,13 @@ void nfs_destroy_nfspagecache(void)
kmem_cache_destroy(nfs_page_cachep);
}
+static const struct rpc_call_ops nfs_pgio_common_ops = {
+ .rpc_call_prepare = nfs_pgio_prepare,
+ .rpc_call_done = nfs_pgio_result,
+ .rpc_release = nfs_pgio_release,
+};
+
+const struct nfs_pageio_ops nfs_pgio_rw_ops = {
+ .pg_test = nfs_generic_pg_test,
+ .pg_doio = nfs_generic_pg_pgios,
+};
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index fd9536e494bc..6fdcd233d6f7 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1388,11 +1388,6 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
WARN_ON_ONCE(pgio->pg_lseg != NULL);
- if (req->wb_offset != req->wb_pgbase) {
- nfs_pageio_reset_read_mds(pgio);
- return;
- }
-
if (pgio->pg_dreq == NULL)
rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
else
@@ -1417,11 +1412,6 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
{
WARN_ON_ONCE(pgio->pg_lseg != NULL);
- if (req->wb_offset != req->wb_pgbase) {
- nfs_pageio_reset_write_mds(pgio);
- return;
- }
-
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
req_offset(req),
@@ -1434,56 +1424,49 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
-void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
- const struct nfs_pgio_completion_ops *compl_ops)
-{
- struct nfs_server *server = NFS_SERVER(inode);
- struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
-
- if (ld == NULL)
- nfs_pageio_init_read(pgio, inode, compl_ops);
- else
- nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0);
-}
-
-void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
- int ioflags,
- const struct nfs_pgio_completion_ops *compl_ops)
-{
- struct nfs_server *server = NFS_SERVER(inode);
- struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
-
- if (ld == NULL)
- nfs_pageio_init_write(pgio, inode, ioflags, compl_ops);
- else
- nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags);
-}
-
-bool
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+size_t
pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (pgio->pg_lseg == NULL)
- return nfs_generic_pg_test(pgio, prev, req);
+ unsigned int size;
+ u64 seg_end, req_start, seg_left;
+
+ size = nfs_generic_pg_test(pgio, prev, req);
+ if (!size)
+ return 0;
/*
- * Test if a nfs_page is fully contained in the pnfs_layout_range.
- * Note that this test makes several assumptions:
- * - that the previous nfs_page in the struct nfs_pageio_descriptor
- * is known to lie within the range.
- * - that the nfs_page being tested is known to be contiguous with the
- * previous nfs_page.
- * - Layout ranges are page aligned, so we only have to test the
- * start offset of the request.
+ * 'size' contains the number of bytes that still fit in @pgio (up
+ * to the original size asked for in @req->wb_bytes).
+ *
+ * Calculate how many bytes are left in the layout segment
+ * and if there are fewer bytes than 'size', return that instead.
*
* Please also note that 'end_offset' is actually the offset of the
* first byte that lies outside the pnfs_layout_range. FIXME?
*
*/
- return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
- pgio->pg_lseg->pls_range.length);
+ if (pgio->pg_lseg) {
+ seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
+ pgio->pg_lseg->pls_range.length);
+ req_start = req_offset(req);
+ WARN_ON_ONCE(req_start > seg_end);
+ /* start of request is past the last byte of this segment */
+ if (req_start >= seg_end)
+ return 0;
+
+ /* adjust 'size' iff there are fewer bytes left in the
+ * segment than what nfs_generic_pg_test returned */
+ seg_left = seg_end - req_start;
+ if (seg_left < size)
+ size = (unsigned int)seg_left;
+ }
+
+ return size;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
@@ -1496,7 +1479,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode,
LIST_HEAD(failed);
/* Resend all requests through the MDS */
- nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops);
+ nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops);
pgio.pg_dreq = dreq;
while (!list_empty(head)) {
struct nfs_page *req = nfs_list_entry(head->next);
@@ -1519,7 +1502,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode,
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
-static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
+static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1538,7 +1521,7 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
/*
* Called by non rpc-based layout drivers
*/
-void pnfs_ld_write_done(struct nfs_write_data *data)
+void pnfs_ld_write_done(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1554,7 +1537,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
static void
pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
- struct nfs_write_data *data)
+ struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1563,11 +1546,11 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
nfs_pageio_reset_write_mds(desc);
desc->pg_recoalesce = 1;
}
- nfs_writedata_release(data);
+ nfs_pgio_data_release(data);
}
static enum pnfs_try_status
-pnfs_try_to_write_data(struct nfs_write_data *wdata,
+pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
const struct rpc_call_ops *call_ops,
struct pnfs_layout_segment *lseg,
int how)
@@ -1589,41 +1572,36 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
}
static void
-pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
+pnfs_do_write(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr, int how)
{
- struct nfs_write_data *data;
+ struct nfs_pgio_data *data = hdr->data;
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
+ enum pnfs_try_status trypnfs;
desc->pg_lseg = NULL;
- while (!list_empty(head)) {
- enum pnfs_try_status trypnfs;
-
- data = list_first_entry(head, struct nfs_write_data, list);
- list_del_init(&data->list);
-
- trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
- if (trypnfs == PNFS_NOT_ATTEMPTED)
- pnfs_write_through_mds(desc, data);
- }
+ trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
+ if (trypnfs == PNFS_NOT_ATTEMPTED)
+ pnfs_write_through_mds(desc, data);
pnfs_put_lseg(lseg);
}
static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
{
pnfs_put_lseg(hdr->lseg);
- nfs_writehdr_free(hdr);
+ nfs_rw_header_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_write_header *whdr;
+ struct nfs_rw_header *whdr;
struct nfs_pgio_header *hdr;
int ret;
- whdr = nfs_writehdr_alloc();
+ whdr = nfs_rw_header_alloc(desc->pg_rw_ops);
if (!whdr) {
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
pnfs_put_lseg(desc->pg_lseg);
@@ -1634,12 +1612,12 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
atomic_inc(&hdr->refcnt);
- ret = nfs_generic_flush(desc, hdr);
+ ret = nfs_generic_pgio(desc, hdr);
if (ret != 0) {
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
} else
- pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
+ pnfs_do_write(desc, hdr, desc->pg_ioflags);
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
return ret;
@@ -1655,7 +1633,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode,
LIST_HEAD(failed);
/* Resend all requests through the MDS */
- nfs_pageio_init_read(&pgio, inode, compl_ops);
+ nfs_pageio_init_read(&pgio, inode, true, compl_ops);
pgio.pg_dreq = dreq;
while (!list_empty(head)) {
struct nfs_page *req = nfs_list_entry(head->next);
@@ -1674,7 +1652,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode,
}
EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
-static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
+static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1693,7 +1671,7 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
/*
* Called by non rpc-based layout drivers
*/
-void pnfs_ld_read_done(struct nfs_read_data *data)
+void pnfs_ld_read_done(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1709,7 +1687,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
static void
pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
- struct nfs_read_data *data)
+ struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1718,14 +1696,14 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
nfs_pageio_reset_read_mds(desc);
desc->pg_recoalesce = 1;
}
- nfs_readdata_release(data);
+ nfs_pgio_data_release(data);
}
/*
* Call the appropriate parallel I/O subsystem read function.
*/
static enum pnfs_try_status
-pnfs_try_to_read_data(struct nfs_read_data *rdata,
+pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
const struct rpc_call_ops *call_ops,
struct pnfs_layout_segment *lseg)
{
@@ -1747,41 +1725,35 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
}
static void
-pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
+pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
{
- struct nfs_read_data *data;
+ struct nfs_pgio_data *data = hdr->data;
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
+ enum pnfs_try_status trypnfs;
desc->pg_lseg = NULL;
- while (!list_empty(head)) {
- enum pnfs_try_status trypnfs;
-
- data = list_first_entry(head, struct nfs_read_data, list);
- list_del_init(&data->list);
-
- trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
- if (trypnfs == PNFS_NOT_ATTEMPTED)
- pnfs_read_through_mds(desc, data);
- }
+ trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
+ if (trypnfs == PNFS_NOT_ATTEMPTED)
+ pnfs_read_through_mds(desc, data);
pnfs_put_lseg(lseg);
}
static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
{
pnfs_put_lseg(hdr->lseg);
- nfs_readhdr_free(hdr);
+ nfs_rw_header_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_read_header *rhdr;
+ struct nfs_rw_header *rhdr;
struct nfs_pgio_header *hdr;
int ret;
- rhdr = nfs_readhdr_alloc();
+ rhdr = nfs_rw_header_alloc(desc->pg_rw_ops);
if (!rhdr) {
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
ret = -ENOMEM;
@@ -1793,12 +1765,12 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
atomic_inc(&hdr->refcnt);
- ret = nfs_generic_pagein(desc, hdr);
+ ret = nfs_generic_pgio(desc, hdr);
if (ret != 0) {
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
} else
- pnfs_do_multiple_reads(desc, &hdr->rpc_list);
+ pnfs_do_read(desc, hdr);
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
return ret;
@@ -1848,7 +1820,7 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
void
-pnfs_set_layoutcommit(struct nfs_write_data *wdata)
+pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
{
struct nfs_pgio_header *hdr = wdata->header;
struct inode *inode = hdr->inode;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index c3058a076596..4fb309a2b4c4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -113,8 +113,8 @@ struct pnfs_layoutdriver_type {
* Return PNFS_ATTEMPTED to indicate the layout code has attempted
* I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
*/
- enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
- enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
+ enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data);
+ enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how);
void (*free_deviceid_node) (struct nfs4_deviceid_node *);
@@ -180,11 +180,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
-void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
- const struct nfs_pgio_completion_ops *);
-void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *,
- int, const struct nfs_pgio_completion_ops *);
-
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
void unset_pnfs_layoutdriver(struct nfs_server *);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
@@ -192,7 +187,8 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size);
int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
-bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
+size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev, struct nfs_page *req);
void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
@@ -217,13 +213,13 @@ bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
-void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
+void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
int _pnfs_return_layout(struct inode *);
int pnfs_commit_and_return_layout(struct inode *);
-void pnfs_ld_write_done(struct nfs_write_data *);
-void pnfs_ld_read_done(struct nfs_read_data *);
+void pnfs_ld_write_done(struct nfs_pgio_data *);
+void pnfs_ld_read_done(struct nfs_pgio_data *);
struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
loff_t pos,
@@ -461,18 +457,6 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
{
}
-static inline void pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
- const struct nfs_pgio_completion_ops *compl_ops)
-{
- nfs_pageio_init_read(pgio, inode, compl_ops);
-}
-
-static inline void pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags,
- const struct nfs_pgio_completion_ops *compl_ops)
-{
- nfs_pageio_init_write(pgio, inode, ioflags, compl_ops);
-}
-
static inline int
pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
struct nfs_commit_info *cinfo)
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index e55ce9e8b034..c171ce1a8a30 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -578,7 +578,7 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return 0;
}
-static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct inode *inode = data->header->inode;
@@ -594,18 +594,18 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
return 0;
}
-static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
+static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
}
-static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
{
rpc_call_start(task);
return 0;
}
-static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct inode *inode = data->header->inode;
@@ -614,19 +614,13 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
return 0;
}
-static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
+static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
data->args.stable = NFS_FILE_SYNC;
msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
}
-static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
-{
- rpc_call_start(task);
- return 0;
-}
-
static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
{
BUG();
@@ -734,13 +728,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.fsinfo = nfs_proc_fsinfo,
.pathconf = nfs_proc_pathconf,
.decode_dirent = nfs2_decode_dirent,
+ .pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare,
.read_setup = nfs_proc_read_setup,
- .read_pageio_init = nfs_pageio_init_read,
- .read_rpc_prepare = nfs_proc_read_rpc_prepare,
.read_done = nfs_read_done,
.write_setup = nfs_proc_write_setup,
- .write_pageio_init = nfs_pageio_init_write,
- .write_rpc_prepare = nfs_proc_write_rpc_prepare,
.write_done = nfs_write_done,
.commit_setup = nfs_proc_commit_setup,
.commit_rpc_prepare = nfs_proc_commit_rpc_prepare,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 411aedda14bb..e818a475ca64 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -24,85 +24,24 @@
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
-static const struct nfs_pageio_ops nfs_pageio_read_ops;
-static const struct rpc_call_ops nfs_read_common_ops;
static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
+static const struct nfs_rw_ops nfs_rw_read_ops;
static struct kmem_cache *nfs_rdata_cachep;
-struct nfs_read_header *nfs_readhdr_alloc(void)
+static struct nfs_rw_header *nfs_readhdr_alloc(void)
{
- struct nfs_read_header *rhdr;
-
- rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
- if (rhdr) {
- struct nfs_pgio_header *hdr = &rhdr->header;
-
- INIT_LIST_HEAD(&hdr->pages);
- INIT_LIST_HEAD(&hdr->rpc_list);
- spin_lock_init(&hdr->lock);
- atomic_set(&hdr->refcnt, 0);
- }
- return rhdr;
+ return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
}
-EXPORT_SYMBOL_GPL(nfs_readhdr_alloc);
-static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr,
- unsigned int pagecount)
+static void nfs_readhdr_free(struct nfs_rw_header *rhdr)
{
- struct nfs_read_data *data, *prealloc;
-
- prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data;
- if (prealloc->header == NULL)
- data = prealloc;
- else
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out;
-
- if (nfs_pgarray_set(&data->pages, pagecount)) {
- data->header = hdr;
- atomic_inc(&hdr->refcnt);
- } else {
- if (data != prealloc)
- kfree(data);
- data = NULL;
- }
-out:
- return data;
-}
-
-void nfs_readhdr_free(struct nfs_pgio_header *hdr)
-{
- struct nfs_read_header *rhdr = container_of(hdr, struct nfs_read_header, header);
-
kmem_cache_free(nfs_rdata_cachep, rhdr);
}
-EXPORT_SYMBOL_GPL(nfs_readhdr_free);
-
-void nfs_readdata_release(struct nfs_read_data *rdata)
-{
- struct nfs_pgio_header *hdr = rdata->header;
- struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header);
-
- put_nfs_open_context(rdata->args.context);
- if (rdata->pages.pagevec != rdata->pages.page_array)
- kfree(rdata->pages.pagevec);
- if (rdata == &read_header->rpc_data) {
- rdata->header = NULL;
- rdata = NULL;
- }
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
- /* Note: we only free the rpc_task after callbacks are done.
- * See the comment in rpc_free_task() for why
- */
- kfree(rdata);
-}
-EXPORT_SYMBOL_GPL(nfs_readdata_release);
static
int nfs_return_empty_page(struct page *page)
@@ -114,17 +53,24 @@ int nfs_return_empty_page(struct page *page)
}
void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
- struct inode *inode,
+ struct inode *inode, bool force_mds,
const struct nfs_pgio_completion_ops *compl_ops)
{
- nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops,
- NFS_SERVER(inode)->rsize, 0);
+ struct nfs_server *server = NFS_SERVER(inode);
+ const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops;
+
+#ifdef CONFIG_NFS_V4_1
+ if (server->pnfs_curr_ld && !force_mds)
+ pg_ops = server->pnfs_curr_ld->pg_read_ops;
+#endif
+ nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops,
+ server->rsize, 0);
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
{
- pgio->pg_ops = &nfs_pageio_read_ops;
+ pgio->pg_ops = &nfs_pgio_rw_ops;
pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
@@ -139,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
- new = nfs_create_request(ctx, inode, page, 0, len);
+ new = nfs_create_request(ctx, page, NULL, 0, len);
if (IS_ERR(new)) {
unlock_page(page);
return PTR_ERR(new);
@@ -147,7 +93,8 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
if (len < PAGE_CACHE_SIZE)
zero_user_segment(page, len, PAGE_CACHE_SIZE);
- NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops);
+ nfs_pageio_init_read(&pgio, inode, false,
+ &nfs_async_read_completion_ops);
nfs_pageio_add_request(&pgio, new);
nfs_pageio_complete(&pgio);
NFS_I(inode)->read_io += pgio.pg_bytes_written;
@@ -158,10 +105,16 @@ static void nfs_readpage_release(struct nfs_page *req)
{
struct inode *d_inode = req->wb_context->dentry->d_inode;
- if (PageUptodate(req->wb_page))
- nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+ dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(d_inode), req->wb_bytes,
+ (long long)req_offset(req));
- unlock_page(req->wb_page);
+ if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+ if (PageUptodate(req->wb_page))
+ nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+
+ unlock_page(req->wb_page);
+ }
dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
req->wb_context->dentry->d_inode->i_sb->s_id,
@@ -171,7 +124,12 @@ static void nfs_readpage_release(struct nfs_page *req)
nfs_release_request(req);
}
-/* Note io was page aligned */
+static void nfs_page_group_set_uptodate(struct nfs_page *req)
+{
+ if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
+ SetPageUptodate(req->wb_page);
+}
+
static void nfs_read_completion(struct nfs_pgio_header *hdr)
{
unsigned long bytes = 0;
@@ -181,21 +139,32 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
while (!list_empty(&hdr->pages)) {
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
struct page *page = req->wb_page;
+ unsigned long start = req->wb_pgbase;
+ unsigned long end = req->wb_pgbase + req->wb_bytes;
if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
- if (bytes > hdr->good_bytes)
- zero_user(page, 0, PAGE_SIZE);
- else if (hdr->good_bytes - bytes < PAGE_SIZE)
- zero_user_segment(page,
- hdr->good_bytes & ~PAGE_MASK,
- PAGE_SIZE);
+ /* note: regions of the page not covered by a
+ * request are zeroed in nfs_readpage_async /
+ * readpage_async_filler */
+ if (bytes > hdr->good_bytes) {
+ /* nothing in this request was good, so zero
+ * the full extent of the request */
+ zero_user_segment(page, start, end);
+
+ } else if (hdr->good_bytes - bytes < req->wb_bytes) {
+ /* part of this request has good bytes, but
+ * not all. zero the bad bytes */
+ start += hdr->good_bytes - bytes;
+ WARN_ON(start < req->wb_pgbase);
+ zero_user_segment(page, start, end);
+ }
}
bytes += req->wb_bytes;
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
if (bytes <= hdr->good_bytes)
- SetPageUptodate(page);
+ nfs_page_group_set_uptodate(req);
} else
- SetPageUptodate(page);
+ nfs_page_group_set_uptodate(req);
nfs_list_remove_request(req);
nfs_readpage_release(req);
}
@@ -203,95 +172,14 @@ out:
hdr->release(hdr);
}
-int nfs_initiate_read(struct rpc_clnt *clnt,
- struct nfs_read_data *data,
- const struct rpc_call_ops *call_ops, int flags)
+static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg,
+ struct rpc_task_setup *task_setup_data, int how)
{
struct inode *inode = data->header->inode;
int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_argp = &data->args,
- .rpc_resp = &data->res,
- .rpc_cred = data->header->cred,
- };
- struct rpc_task_setup task_setup_data = {
- .task = &data->task,
- .rpc_client = clnt,
- .rpc_message = &msg,
- .callback_ops = call_ops,
- .callback_data = data,
- .workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | swap_flags | flags,
- };
- /* Set up the initial task struct. */
- NFS_PROTO(inode)->read_setup(data, &msg);
-
- dprintk("NFS: %5u initiated read call (req %s/%llu, %u bytes @ "
- "offset %llu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(inode),
- data->args.count,
- (unsigned long long)data->args.offset);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
- return PTR_ERR(task);
- rpc_put_task(task);
- return 0;
-}
-EXPORT_SYMBOL_GPL(nfs_initiate_read);
-
-/*
- * Set up the NFS read request struct
- */
-static void nfs_read_rpcsetup(struct nfs_read_data *data,
- unsigned int count, unsigned int offset)
-{
- struct nfs_page *req = data->header->req;
-
- data->args.fh = NFS_FH(data->header->inode);
- data->args.offset = req_offset(req) + offset;
- data->args.pgbase = req->wb_pgbase + offset;
- data->args.pages = data->pages.pagevec;
- data->args.count = count;
- data->args.context = get_nfs_open_context(req->wb_context);
- data->args.lock_context = req->wb_lock_context;
-
- data->res.fattr = &data->fattr;
- data->res.count = count;
- data->res.eof = 0;
- nfs_fattr_init(&data->fattr);
-}
-
-static int nfs_do_read(struct nfs_read_data *data,
- const struct rpc_call_ops *call_ops)
-{
- struct inode *inode = data->header->inode;
-
- return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0);
-}
-
-static int
-nfs_do_multiple_reads(struct list_head *head,
- const struct rpc_call_ops *call_ops)
-{
- struct nfs_read_data *data;
- int ret = 0;
-
- while (!list_empty(head)) {
- int ret2;
-
- data = list_first_entry(head, struct nfs_read_data, list);
- list_del_init(&data->list);
-
- ret2 = nfs_do_read(data, call_ops);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
+ task_setup_data->flags |= swap_flags;
+ NFS_PROTO(inode)->read_setup(data, msg);
}
static void
@@ -311,143 +199,14 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
.completion = nfs_read_completion,
};
-static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- set_bit(NFS_IOHDR_REDO, &hdr->flags);
- while (!list_empty(&hdr->rpc_list)) {
- struct nfs_read_data *data = list_first_entry(&hdr->rpc_list,
- struct nfs_read_data, list);
- list_del(&data->list);
- nfs_readdata_release(data);
- }
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
-}
-
-/*
- * Generate multiple requests to fill a single page.
- *
- * We optimize to reduce the number of read operations on the wire. If we
- * detect that we're reading a page, or an area of a page, that is past the
- * end of file, we do not generate NFS read operations but just clear the
- * parts of the page that would have come back zero from the server anyway.
- *
- * We rely on the cached value of i_size to make this determination; another
- * client can fill pages on the server past our cached end-of-file, but we
- * won't see the new data until our attribute cache is updated. This is more
- * or less conventional NFS client behavior.
- */
-static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_page *req = hdr->req;
- struct page *page = req->wb_page;
- struct nfs_read_data *data;
- size_t rsize = desc->pg_bsize, nbytes;
- unsigned int offset;
-
- offset = 0;
- nbytes = desc->pg_count;
- do {
- size_t len = min(nbytes,rsize);
-
- data = nfs_readdata_alloc(hdr, 1);
- if (!data) {
- nfs_pagein_error(desc, hdr);
- return -ENOMEM;
- }
- data->pages.pagevec[0] = page;
- nfs_read_rpcsetup(data, len, offset);
- list_add(&data->list, &hdr->rpc_list);
- nbytes -= len;
- offset += len;
- } while (nbytes != 0);
-
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
- desc->pg_rpc_callops = &nfs_read_common_ops;
- return 0;
-}
-
-static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_page *req;
- struct page **pages;
- struct nfs_read_data *data;
- struct list_head *head = &desc->pg_list;
-
- data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base,
- desc->pg_count));
- if (!data) {
- nfs_pagein_error(desc, hdr);
- return -ENOMEM;
- }
-
- pages = data->pages.pagevec;
- while (!list_empty(head)) {
- req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
- *pages++ = req->wb_page;
- }
-
- nfs_read_rpcsetup(data, desc->pg_count, 0);
- list_add(&data->list, &hdr->rpc_list);
- desc->pg_rpc_callops = &nfs_read_common_ops;
- return 0;
-}
-
-int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- if (desc->pg_bsize < PAGE_CACHE_SIZE)
- return nfs_pagein_multi(desc, hdr);
- return nfs_pagein_one(desc, hdr);
-}
-EXPORT_SYMBOL_GPL(nfs_generic_pagein);
-
-static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
-{
- struct nfs_read_header *rhdr;
- struct nfs_pgio_header *hdr;
- int ret;
-
- rhdr = nfs_readhdr_alloc();
- if (!rhdr) {
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
- return -ENOMEM;
- }
- hdr = &rhdr->header;
- nfs_pgheader_init(desc, hdr, nfs_readhdr_free);
- atomic_inc(&hdr->refcnt);
- ret = nfs_generic_pagein(desc, hdr);
- if (ret == 0)
- ret = nfs_do_multiple_reads(&hdr->rpc_list,
- desc->pg_rpc_callops);
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
- return ret;
-}
-
-static const struct nfs_pageio_ops nfs_pageio_read_ops = {
- .pg_test = nfs_generic_pg_test,
- .pg_doio = nfs_generic_pg_readpages,
-};
-
/*
* This is the callback from RPC telling us whether a reply was
* received or some error occurred (timeout or socket shutdown).
*/
-int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
+ struct inode *inode)
{
- struct inode *inode = data->header->inode;
- int status;
-
- dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid,
- task->tk_status);
-
- status = NFS_PROTO(inode)->read_done(task, data);
+ int status = NFS_PROTO(inode)->read_done(task, data);
if (status != 0)
return status;
@@ -460,10 +219,10 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
return 0;
}
-static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data)
+static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data)
{
- struct nfs_readargs *argp = &data->args;
- struct nfs_readres *resp = &data->res;
+ struct nfs_pgio_args *argp = &data->args;
+ struct nfs_pgio_res *resp = &data->res;
/* This is a short read! */
nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
@@ -480,17 +239,11 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
rpc_restart_call_prepare(task);
}
-static void nfs_readpage_result_common(struct rpc_task *task, void *calldata)
+static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data)
{
- struct nfs_read_data *data = calldata;
struct nfs_pgio_header *hdr = data->header;
- /* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */
- if (nfs_readpage_result(task, data) != 0)
- return;
- if (task->tk_status < 0)
- nfs_set_pgio_error(hdr, task->tk_status, data->args.offset);
- else if (data->res.eof) {
+ if (data->res.eof) {
loff_t bound;
bound = data->args.offset + data->res.count;
@@ -505,26 +258,6 @@ static void nfs_readpage_result_common(struct rpc_task *task, void *calldata)
nfs_readpage_retry(task, data);
}
-static void nfs_readpage_release_common(void *calldata)
-{
- nfs_readdata_release(calldata);
-}
-
-void nfs_read_prepare(struct rpc_task *task, void *calldata)
-{
- struct nfs_read_data *data = calldata;
- int err;
- err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
- if (err)
- rpc_exit(task, err);
-}
-
-static const struct rpc_call_ops nfs_read_common_ops = {
- .rpc_call_prepare = nfs_read_prepare,
- .rpc_call_done = nfs_readpage_result_common,
- .rpc_release = nfs_readpage_release_common,
-};
-
/*
* Read a page over NFS.
* We read the page synchronously in the following case:
@@ -592,7 +325,6 @@ static int
readpage_async_filler(void *data, struct page *page)
{
struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
- struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *new;
unsigned int len;
int error;
@@ -601,7 +333,7 @@ readpage_async_filler(void *data, struct page *page)
if (len == 0)
return nfs_return_empty_page(page);
- new = nfs_create_request(desc->ctx, inode, page, 0, len);
+ new = nfs_create_request(desc->ctx, page, NULL, 0, len);
if (IS_ERR(new))
goto out_error;
@@ -654,7 +386,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
if (ret == 0)
goto read_complete; /* all pages were read */
- NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops);
+ nfs_pageio_init_read(&pgio, inode, false,
+ &nfs_async_read_completion_ops);
ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
@@ -671,7 +404,7 @@ out:
int __init nfs_init_readpagecache(void)
{
nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
- sizeof(struct nfs_read_header),
+ sizeof(struct nfs_rw_header),
0, SLAB_HWCACHE_ALIGN,
NULL);
if (nfs_rdata_cachep == NULL)
@@ -684,3 +417,12 @@ void nfs_destroy_readpagecache(void)
{
kmem_cache_destroy(nfs_rdata_cachep);
}
+
+static const struct nfs_rw_ops nfs_rw_read_ops = {
+ .rw_mode = FMODE_READ,
+ .rw_alloc_header = nfs_readhdr_alloc,
+ .rw_free_header = nfs_readhdr_free,
+ .rw_done = nfs_readpage_done,
+ .rw_result = nfs_readpage_result,
+ .rw_initiate = nfs_initiate_read,
+};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2cb56943e232..084af1060d79 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2180,11 +2180,23 @@ out_no_address:
return -EINVAL;
}
+#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+ | NFS_MOUNT_SECURE \
+ | NFS_MOUNT_TCP \
+ | NFS_MOUNT_VER3 \
+ | NFS_MOUNT_KERBEROS \
+ | NFS_MOUNT_NONLM \
+ | NFS_MOUNT_BROKEN_SUID \
+ | NFS_MOUNT_STRICTLOCK \
+ | NFS_MOUNT_UNSHARED \
+ | NFS_MOUNT_NORESVPORT \
+ | NFS_MOUNT_LEGACY_INTERFACE)
+
static int
nfs_compare_remount_data(struct nfs_server *nfss,
struct nfs_parsed_mount_data *data)
{
- if (data->flags != nfss->flags ||
+ if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK ||
data->rsize != nfss->rsize ||
data->wsize != nfss->wsize ||
data->version != nfss->nfs_client->rpc_ops->version ||
@@ -2248,6 +2260,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
data->version = nfsvers;
data->minorversion = nfss->nfs_client->cl_minorversion;
+ data->net = current->nsproxy->net_ns;
memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
data->nfs_server.addrlen);
@@ -2347,18 +2360,6 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
nfs_initialise_sb(sb);
}
-#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
- | NFS_MOUNT_SECURE \
- | NFS_MOUNT_TCP \
- | NFS_MOUNT_VER3 \
- | NFS_MOUNT_KERBEROS \
- | NFS_MOUNT_NONLM \
- | NFS_MOUNT_BROKEN_SUID \
- | NFS_MOUNT_STRICTLOCK \
- | NFS_MOUNT_UNSHARED \
- | NFS_MOUNT_NORESVPORT \
- | NFS_MOUNT_LEGACY_INTERFACE)
-
static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
{
const struct nfs_server *a = s->s_fs_info;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ffb9459f180b..3ee5af4e738e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -42,10 +42,10 @@
* Local function declarations
*/
static void nfs_redirty_request(struct nfs_page *req);
-static const struct rpc_call_ops nfs_write_common_ops;
static const struct rpc_call_ops nfs_commit_ops;
static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
+static const struct nfs_rw_ops nfs_rw_write_ops;
static struct kmem_cache *nfs_wdata_cachep;
static mempool_t *nfs_wdata_mempool;
@@ -70,76 +70,19 @@ void nfs_commit_free(struct nfs_commit_data *p)
}
EXPORT_SYMBOL_GPL(nfs_commit_free);
-struct nfs_write_header *nfs_writehdr_alloc(void)
+static struct nfs_rw_header *nfs_writehdr_alloc(void)
{
- struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
-
- if (p) {
- struct nfs_pgio_header *hdr = &p->header;
+ struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
+ if (p)
memset(p, 0, sizeof(*p));
- INIT_LIST_HEAD(&hdr->pages);
- INIT_LIST_HEAD(&hdr->rpc_list);
- spin_lock_init(&hdr->lock);
- atomic_set(&hdr->refcnt, 0);
- hdr->verf = &p->verf;
- }
return p;
}
-EXPORT_SYMBOL_GPL(nfs_writehdr_alloc);
-
-static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
- unsigned int pagecount)
-{
- struct nfs_write_data *data, *prealloc;
-
- prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data;
- if (prealloc->header == NULL)
- data = prealloc;
- else
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out;
-
- if (nfs_pgarray_set(&data->pages, pagecount)) {
- data->header = hdr;
- atomic_inc(&hdr->refcnt);
- } else {
- if (data != prealloc)
- kfree(data);
- data = NULL;
- }
-out:
- return data;
-}
-void nfs_writehdr_free(struct nfs_pgio_header *hdr)
+static void nfs_writehdr_free(struct nfs_rw_header *whdr)
{
- struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
mempool_free(whdr, nfs_wdata_mempool);
}
-EXPORT_SYMBOL_GPL(nfs_writehdr_free);
-
-void nfs_writedata_release(struct nfs_write_data *wdata)
-{
- struct nfs_pgio_header *hdr = wdata->header;
- struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header);
-
- put_nfs_open_context(wdata->args.context);
- if (wdata->pages.pagevec != wdata->pages.page_array)
- kfree(wdata->pages.pagevec);
- if (wdata == &write_header->rpc_data) {
- wdata->header = NULL;
- wdata = NULL;
- }
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
- /* Note: we only free the rpc_task after callbacks are done.
- * See the comment in rpc_free_task() for why
- */
- kfree(wdata);
-}
-EXPORT_SYMBOL_GPL(nfs_writedata_release);
static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
{
@@ -211,18 +154,78 @@ static void nfs_set_pageerror(struct page *page)
nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
}
+/*
+ * nfs_page_group_search_locked
+ * @head - head request of page group
+ * @page_offset - offset into page
+ *
+ * Search page group with head @head to find a request that contains the
+ * page offset @page_offset.
+ *
+ * Returns a pointer to the first matching nfs request, or NULL if no
+ * match is found.
+ *
+ * Must be called with the page group lock held
+ */
+static struct nfs_page *
+nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
+{
+ struct nfs_page *req;
+
+ WARN_ON_ONCE(head != head->wb_head);
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags));
+
+ req = head;
+ do {
+ if (page_offset >= req->wb_pgbase &&
+ page_offset < (req->wb_pgbase + req->wb_bytes))
+ return req;
+
+ req = req->wb_this_page;
+ } while (req != head);
+
+ return NULL;
+}
+
+/*
+ * nfs_page_group_covers_page
+ * @req - a request belonging to the page group
+ *
+ * Return true if the page group that @req belongs to covers the whole page,
+ * false otherwise
+ */
+static bool nfs_page_group_covers_page(struct nfs_page *req)
+{
+ struct nfs_page *tmp;
+ unsigned int pos = 0;
+ unsigned int len = nfs_page_length(req->wb_page);
+
+ nfs_page_group_lock(req);
+
+ do {
+ tmp = nfs_page_group_search_locked(req->wb_head, pos);
+ if (tmp) {
+ /* no way this should happen */
+ WARN_ON_ONCE(tmp->wb_pgbase != pos);
+ pos += tmp->wb_bytes - (pos - tmp->wb_pgbase);
+ }
+ } while (tmp && pos < len);
+
+ nfs_page_group_unlock(req);
+ WARN_ON_ONCE(pos > len);
+ return pos == len;
+}
+
/* We can set the PG_uptodate flag if we see that a write request
* covers the full page.
*/
-static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count)
+static void nfs_mark_uptodate(struct nfs_page *req)
{
- if (PageUptodate(page))
- return;
- if (base != 0)
+ if (PageUptodate(req->wb_page))
return;
- if (count != nfs_page_length(page))
+ if (!nfs_page_group_covers_page(req))
return;
- SetPageUptodate(page);
+ SetPageUptodate(req->wb_page);
}
static int wb_priority(struct writeback_control *wbc)
@@ -258,12 +261,15 @@ static void nfs_set_page_writeback(struct page *page)
}
}
-static void nfs_end_page_writeback(struct page *page)
+static void nfs_end_page_writeback(struct nfs_page *req)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = page_file_mapping(req->wb_page)->host;
struct nfs_server *nfss = NFS_SERVER(inode);
- end_page_writeback(page);
+ if (!nfs_page_group_sync_on_bit(req, PG_WB_END))
+ return;
+
+ end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
}
@@ -354,10 +360,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
struct nfs_pageio_descriptor pgio;
int err;
- NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio,
- page->mapping->host,
- wb_priority(wbc),
- &nfs_async_write_completion_ops);
+ nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
+ false, &nfs_async_write_completion_ops);
err = nfs_do_writepage(page, wbc, &pgio);
nfs_pageio_complete(&pgio);
if (err < 0)
@@ -400,7 +404,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
- NFS_PROTO(inode)->write_pageio_init(&pgio, inode, wb_priority(wbc), &nfs_async_write_completion_ops);
+ nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
+ &nfs_async_write_completion_ops);
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
nfs_pageio_complete(&pgio);
@@ -425,6 +430,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ WARN_ON_ONCE(req->wb_this_page != req);
+
/* Lock the request! */
nfs_lock_request(req);
@@ -441,6 +448,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
set_page_private(req->wb_page, (unsigned long)req);
}
nfsi->npages++;
+ set_bit(PG_INODE_REF, &req->wb_flags);
kref_get(&req->wb_kref);
spin_unlock(&inode->i_lock);
}
@@ -452,15 +460,20 @@ static void nfs_inode_remove_request(struct nfs_page *req)
{
struct inode *inode = req->wb_context->dentry->d_inode;
struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_page *head;
- spin_lock(&inode->i_lock);
- if (likely(!PageSwapCache(req->wb_page))) {
- set_page_private(req->wb_page, 0);
- ClearPagePrivate(req->wb_page);
- clear_bit(PG_MAPPED, &req->wb_flags);
+ if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
+ head = req->wb_head;
+
+ spin_lock(&inode->i_lock);
+ if (likely(!PageSwapCache(head->wb_page))) {
+ set_page_private(head->wb_page, 0);
+ ClearPagePrivate(head->wb_page);
+ clear_bit(PG_MAPPED, &head->wb_flags);
+ }
+ nfsi->npages--;
+ spin_unlock(&inode->i_lock);
}
- nfsi->npages--;
- spin_unlock(&inode->i_lock);
nfs_release_request(req);
}
@@ -583,7 +596,7 @@ nfs_clear_request_commit(struct nfs_page *req)
}
static inline
-int nfs_write_need_commit(struct nfs_write_data *data)
+int nfs_write_need_commit(struct nfs_pgio_data *data)
{
if (data->verf.committed == NFS_DATA_SYNC)
return data->header->lseg == NULL;
@@ -614,7 +627,7 @@ nfs_clear_request_commit(struct nfs_page *req)
}
static inline
-int nfs_write_need_commit(struct nfs_write_data *data)
+int nfs_write_need_commit(struct nfs_pgio_data *data)
{
return 0;
}
@@ -625,6 +638,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_commit_info cinfo;
unsigned long bytes = 0;
+ bool do_destroy;
if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
goto out;
@@ -645,7 +659,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
goto next;
}
if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
- memcpy(&req->wb_verf, &hdr->verf->verifier, sizeof(req->wb_verf));
+ memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
nfs_mark_request_commit(req, hdr->lseg, &cinfo);
goto next;
}
@@ -653,7 +667,8 @@ remove_req:
nfs_inode_remove_request(req);
next:
nfs_unlock_request(req);
- nfs_end_page_writeback(req->wb_page);
+ nfs_end_page_writeback(req);
+ do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
nfs_release_request(req);
}
out:
@@ -661,7 +676,7 @@ out:
}
#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
-static unsigned long
+unsigned long
nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
{
return cinfo->mds->ncommit;
@@ -718,7 +733,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
}
#else
-static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
+unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
{
return 0;
}
@@ -758,6 +773,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
if (req == NULL)
goto out_unlock;
+ /* should be handled by nfs_flush_incompatible */
+ WARN_ON_ONCE(req->wb_head != req);
+ WARN_ON_ONCE(req->wb_this_page != req);
+
rqend = req->wb_offset + req->wb_bytes;
/*
* Tell the caller to flush out the request if
@@ -819,7 +838,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
req = nfs_try_to_update_request(inode, page, offset, bytes);
if (req != NULL)
goto out;
- req = nfs_create_request(ctx, inode, page, offset, bytes);
+ req = nfs_create_request(ctx, page, NULL, offset, bytes);
if (IS_ERR(req))
goto out;
nfs_inode_add_request(inode, req);
@@ -837,7 +856,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
return PTR_ERR(req);
/* Update file length */
nfs_grow_file(page, offset, count);
- nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+ nfs_mark_uptodate(req);
nfs_mark_request_dirty(req);
nfs_unlock_and_release_request(req);
return 0;
@@ -863,6 +882,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
return 0;
l_ctx = req->wb_lock_context;
do_flush = req->wb_page != page || req->wb_context != ctx;
+ /* for now, flush if more than 1 request in page_group */
+ do_flush |= req->wb_this_page != req;
if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
do_flush |= l_ctx->lockowner.l_owner != current->files
|| l_ctx->lockowner.l_pid != current->tgid;
@@ -990,126 +1011,17 @@ static int flush_task_priority(int how)
return RPC_PRIORITY_NORMAL;
}
-int nfs_initiate_write(struct rpc_clnt *clnt,
- struct nfs_write_data *data,
- const struct rpc_call_ops *call_ops,
- int how, int flags)
+static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg,
+ struct rpc_task_setup *task_setup_data, int how)
{
struct inode *inode = data->header->inode;
int priority = flush_task_priority(how);
- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_argp = &data->args,
- .rpc_resp = &data->res,
- .rpc_cred = data->header->cred,
- };
- struct rpc_task_setup task_setup_data = {
- .rpc_client = clnt,
- .task = &data->task,
- .rpc_message = &msg,
- .callback_ops = call_ops,
- .callback_data = data,
- .workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | flags,
- .priority = priority,
- };
- int ret = 0;
-
- /* Set up the initial task struct. */
- NFS_PROTO(inode)->write_setup(data, &msg);
- dprintk("NFS: %5u initiated write call "
- "(req %s/%llu, %u bytes @ offset %llu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(inode),
- data->args.count,
- (unsigned long long)data->args.offset);
+ task_setup_data->priority = priority;
+ NFS_PROTO(inode)->write_setup(data, msg);
nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
- &task_setup_data.rpc_client, &msg, data);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task)) {
- ret = PTR_ERR(task);
- goto out;
- }
- if (how & FLUSH_SYNC) {
- ret = rpc_wait_for_completion_task(task);
- if (ret == 0)
- ret = task->tk_status;
- }
- rpc_put_task(task);
-out:
- return ret;
-}
-EXPORT_SYMBOL_GPL(nfs_initiate_write);
-
-/*
- * Set up the argument/result storage required for the RPC call.
- */
-static void nfs_write_rpcsetup(struct nfs_write_data *data,
- unsigned int count, unsigned int offset,
- int how, struct nfs_commit_info *cinfo)
-{
- struct nfs_page *req = data->header->req;
-
- /* Set up the RPC argument and reply structs
- * NB: take care not to mess about with data->commit et al. */
-
- data->args.fh = NFS_FH(data->header->inode);
- data->args.offset = req_offset(req) + offset;
- /* pnfs_set_layoutcommit needs this */
- data->mds_offset = data->args.offset;
- data->args.pgbase = req->wb_pgbase + offset;
- data->args.pages = data->pages.pagevec;
- data->args.count = count;
- data->args.context = get_nfs_open_context(req->wb_context);
- data->args.lock_context = req->wb_lock_context;
- data->args.stable = NFS_UNSTABLE;
- switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
- case 0:
- break;
- case FLUSH_COND_STABLE:
- if (nfs_reqs_to_commit(cinfo))
- break;
- default:
- data->args.stable = NFS_FILE_SYNC;
- }
-
- data->res.fattr = &data->fattr;
- data->res.count = count;
- data->res.verf = &data->verf;
- nfs_fattr_init(&data->fattr);
-}
-
-static int nfs_do_write(struct nfs_write_data *data,
- const struct rpc_call_ops *call_ops,
- int how)
-{
- struct inode *inode = data->header->inode;
-
- return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
-}
-
-static int nfs_do_multiple_writes(struct list_head *head,
- const struct rpc_call_ops *call_ops,
- int how)
-{
- struct nfs_write_data *data;
- int ret = 0;
-
- while (!list_empty(head)) {
- int ret2;
-
- data = list_first_entry(head, struct nfs_write_data, list);
- list_del_init(&data->list);
-
- ret2 = nfs_do_write(data, call_ops, how);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
+ &task_setup_data->rpc_client, msg, data);
}
/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1120,7 +1032,7 @@ static void nfs_redirty_request(struct nfs_page *req)
{
nfs_mark_request_dirty(req);
nfs_unlock_request(req);
- nfs_end_page_writeback(req->wb_page);
+ nfs_end_page_writeback(req);
nfs_release_request(req);
}
@@ -1140,173 +1052,30 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
.completion = nfs_write_completion,
};
-static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- set_bit(NFS_IOHDR_REDO, &hdr->flags);
- while (!list_empty(&hdr->rpc_list)) {
- struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
- struct nfs_write_data, list);
- list_del(&data->list);
- nfs_writedata_release(data);
- }
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
-}
-
-/*
- * Generate multiple small requests to write out a single
- * contiguous dirty area on one page.
- */
-static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_page *req = hdr->req;
- struct page *page = req->wb_page;
- struct nfs_write_data *data;
- size_t wsize = desc->pg_bsize, nbytes;
- unsigned int offset;
- int requests = 0;
- struct nfs_commit_info cinfo;
-
- nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
-
- if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
- (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) ||
- desc->pg_count > wsize))
- desc->pg_ioflags &= ~FLUSH_COND_STABLE;
-
-
- offset = 0;
- nbytes = desc->pg_count;
- do {
- size_t len = min(nbytes, wsize);
-
- data = nfs_writedata_alloc(hdr, 1);
- if (!data) {
- nfs_flush_error(desc, hdr);
- return -ENOMEM;
- }
- data->pages.pagevec[0] = page;
- nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo);
- list_add(&data->list, &hdr->rpc_list);
- requests++;
- nbytes -= len;
- offset += len;
- } while (nbytes != 0);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
- desc->pg_rpc_callops = &nfs_write_common_ops;
- return 0;
-}
-
-/*
- * Create an RPC task for the given write request and kick it.
- * The page must have been locked by the caller.
- *
- * It may happen that the page we're passed is not marked dirty.
- * This is the case if nfs_updatepage detects a conflicting request
- * that has been written but not committed.
- */
-static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_page *req;
- struct page **pages;
- struct nfs_write_data *data;
- struct list_head *head = &desc->pg_list;
- struct nfs_commit_info cinfo;
-
- data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base,
- desc->pg_count));
- if (!data) {
- nfs_flush_error(desc, hdr);
- return -ENOMEM;
- }
-
- nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
- pages = data->pages.pagevec;
- while (!list_empty(head)) {
- req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
- *pages++ = req->wb_page;
- }
-
- if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
- (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
- desc->pg_ioflags &= ~FLUSH_COND_STABLE;
-
- /* Set up the argument struct */
- nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
- list_add(&data->list, &hdr->rpc_list);
- desc->pg_rpc_callops = &nfs_write_common_ops;
- return 0;
-}
-
-int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- if (desc->pg_bsize < PAGE_CACHE_SIZE)
- return nfs_flush_multi(desc, hdr);
- return nfs_flush_one(desc, hdr);
-}
-EXPORT_SYMBOL_GPL(nfs_generic_flush);
-
-static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
-{
- struct nfs_write_header *whdr;
- struct nfs_pgio_header *hdr;
- int ret;
-
- whdr = nfs_writehdr_alloc();
- if (!whdr) {
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
- return -ENOMEM;
- }
- hdr = &whdr->header;
- nfs_pgheader_init(desc, hdr, nfs_writehdr_free);
- atomic_inc(&hdr->refcnt);
- ret = nfs_generic_flush(desc, hdr);
- if (ret == 0)
- ret = nfs_do_multiple_writes(&hdr->rpc_list,
- desc->pg_rpc_callops,
- desc->pg_ioflags);
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
- return ret;
-}
-
-static const struct nfs_pageio_ops nfs_pageio_write_ops = {
- .pg_test = nfs_generic_pg_test,
- .pg_doio = nfs_generic_pg_writepages,
-};
-
void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
- struct inode *inode, int ioflags,
+ struct inode *inode, int ioflags, bool force_mds,
const struct nfs_pgio_completion_ops *compl_ops)
{
- nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops,
- NFS_SERVER(inode)->wsize, ioflags);
+ struct nfs_server *server = NFS_SERVER(inode);
+ const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops;
+
+#ifdef CONFIG_NFS_V4_1
+ if (server->pnfs_curr_ld && !force_mds)
+ pg_ops = server->pnfs_curr_ld->pg_write_ops;
+#endif
+ nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops,
+ server->wsize, ioflags);
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
{
- pgio->pg_ops = &nfs_pageio_write_ops;
+ pgio->pg_ops = &nfs_pgio_rw_ops;
pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
-void nfs_write_prepare(struct rpc_task *task, void *calldata)
-{
- struct nfs_write_data *data = calldata;
- int err;
- err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
- if (err)
- rpc_exit(task, err);
-}
-
void nfs_commit_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_commit_data *data = calldata;
@@ -1314,23 +1083,8 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
}
-/*
- * Handle a write reply that flushes a whole page.
- *
- * FIXME: There is an inherent race with invalidate_inode_pages and
- * writebacks since the page->count is kept > 1 for as long
- * as the page has a write request pending.
- */
-static void nfs_writeback_done_common(struct rpc_task *task, void *calldata)
-{
- struct nfs_write_data *data = calldata;
-
- nfs_writeback_done(task, data);
-}
-
-static void nfs_writeback_release_common(void *calldata)
+static void nfs_writeback_release_common(struct nfs_pgio_data *data)
{
- struct nfs_write_data *data = calldata;
struct nfs_pgio_header *hdr = data->header;
int status = data->task.tk_status;
@@ -1339,34 +1093,46 @@ static void nfs_writeback_release_common(void *calldata)
if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
; /* Do nothing */
else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
- memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf));
- else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf)))
+ memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
+ else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
spin_unlock(&hdr->lock);
}
- nfs_writedata_release(data);
}
-static const struct rpc_call_ops nfs_write_common_ops = {
- .rpc_call_prepare = nfs_write_prepare,
- .rpc_call_done = nfs_writeback_done_common,
- .rpc_release = nfs_writeback_release_common,
-};
+/*
+ * Special version of should_remove_suid() that ignores capabilities.
+ */
+static int nfs_should_remove_suid(const struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+ int kill = 0;
+
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+ /*
+ * sgid without any exec bits is just a mandatory locking mark; leave
+ * it alone. If some exec bits are set, it's a real sgid; kill it.
+ */
+ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+ kill |= ATTR_KILL_SGID;
+
+ if (unlikely(kill && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+}
/*
* This function is called when the WRITE call is complete.
*/
-void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
+ struct inode *inode)
{
- struct nfs_writeargs *argp = &data->args;
- struct nfs_writeres *resp = &data->res;
- struct inode *inode = data->header->inode;
int status;
- dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
- task->tk_pid, task->tk_status);
-
/*
* ->write_done will attempt to use post-op attributes to detect
* conflicting writes by other clients. A strict interpretation
@@ -1376,11 +1142,11 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
*/
status = NFS_PROTO(inode)->write_done(task, data);
if (status != 0)
- return;
- nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
+ return status;
+ nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count);
#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
- if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
+ if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {
/* We tried a write call, but the server did not
* commit data to stable storage even though we
* requested it.
@@ -1396,18 +1162,31 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
NFS_SERVER(inode)->nfs_client->cl_hostname,
- resp->verf->committed, argp->stable);
+ data->res.verf->committed, data->args.stable);
complain = jiffies + 300 * HZ;
}
}
#endif
- if (task->tk_status < 0)
- nfs_set_pgio_error(data->header, task->tk_status, argp->offset);
- else if (resp->count < argp->count) {
+
+ /* Deal with the suid/sgid bit corner case */
+ if (nfs_should_remove_suid(inode))
+ nfs_mark_for_revalidate(inode);
+ return 0;
+}
+
+/*
+ * This function is called when the WRITE call is complete.
+ */
+static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data)
+{
+ struct nfs_pgio_args *argp = &data->args;
+ struct nfs_pgio_res *resp = &data->res;
+
+ if (resp->count < argp->count) {
static unsigned long complain;
/* This a short write! */
- nfs_inc_stats(inode, NFSIOS_SHORTWRITE);
+ nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE);
/* Has the server at least made some progress? */
if (resp->count == 0) {
@@ -1874,7 +1653,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
int __init nfs_init_writepagecache(void)
{
nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
- sizeof(struct nfs_write_header),
+ sizeof(struct nfs_rw_header),
0, SLAB_HWCACHE_ALIGN,
NULL);
if (nfs_wdata_cachep == NULL)
@@ -1936,3 +1715,12 @@ void nfs_destroy_writepagecache(void)
kmem_cache_destroy(nfs_wdata_cachep);
}
+static const struct nfs_rw_ops nfs_rw_write_ops = {
+ .rw_mode = FMODE_WRITE,
+ .rw_alloc_header = nfs_writehdr_alloc,
+ .rw_free_header = nfs_writehdr_free,
+ .rw_release = nfs_writeback_release_common,
+ .rw_done = nfs_writeback_done,
+ .rw_result = nfs_writeback_result,
+ .rw_initiate = nfs_initiate_write,
+};
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index b481e1f5eecc..a986ceb6fd0d 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -49,7 +49,7 @@ struct svc_rqst;
struct nfs4_acl *nfs4_acl_new(int);
int nfs4_acl_get_whotype(char *, u32);
-__be32 nfs4_acl_write_who(int who, __be32 **p, int *len);
+__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
struct nfs4_acl **acl);
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 2645be435e75..72f44823adbb 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -1,7 +1,6 @@
/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */
#include <linux/sched.h>
-#include <linux/user_namespace.h>
#include "nfsd.h"
#include "auth.h"
@@ -25,7 +24,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
struct cred *new;
int i;
int flags = nfsexp_flags(rqstp, exp);
- int ret;
validate_process_creds();
@@ -86,8 +84,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
return 0;
oom:
- ret = -ENOMEM;
abort_creds(new);
- return ret;
+ return -ENOMEM;
}
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 8513c598fabf..13b85f94d9e2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -17,17 +17,12 @@
#include <linux/exportfs.h>
#include <linux/sunrpc/svc_xprt.h>
-#include <net/ipv6.h>
-
#include "nfsd.h"
#include "nfsfh.h"
#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_EXPORT
-typedef struct auth_domain svc_client;
-typedef struct svc_export svc_export;
-
/*
* We have two caches.
* One maps client+vfsmnt+dentry to export options - the export map
@@ -73,7 +68,7 @@ static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_
static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
{
- /* client fsidtype fsid [path] */
+ /* client fsidtype fsid expiry [path] */
char *buf;
int len;
struct auth_domain *dom = NULL;
@@ -295,13 +290,19 @@ svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new,
static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
{
+ struct nfsd4_fs_location *locations = fsloc->locations;
int i;
+ if (!locations)
+ return;
+
for (i = 0; i < fsloc->locations_count; i++) {
- kfree(fsloc->locations[i].path);
- kfree(fsloc->locations[i].hosts);
+ kfree(locations[i].path);
+ kfree(locations[i].hosts);
}
- kfree(fsloc->locations);
+
+ kfree(locations);
+ fsloc->locations = NULL;
}
static void svc_export_put(struct kref *ref)
@@ -388,6 +389,10 @@ fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc)
int len;
int migrated, i, err;
+ /* more than one fsloc */
+ if (fsloc->locations)
+ return -EINVAL;
+
/* listsize */
err = get_uint(mesg, &fsloc->locations_count);
if (err)
@@ -437,13 +442,18 @@ out_free_all:
static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
{
- int listsize, err;
struct exp_flavor_info *f;
+ u32 listsize;
+ int err;
+
+ /* more than one secinfo */
+ if (exp->ex_nflavors)
+ return -EINVAL;
- err = get_int(mesg, &listsize);
+ err = get_uint(mesg, &listsize);
if (err)
return err;
- if (listsize < 0 || listsize > MAX_SECINFO_LIST)
+ if (listsize > MAX_SECINFO_LIST)
return -EINVAL;
for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) {
@@ -474,6 +484,27 @@ static inline int
secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
#endif
+static inline int
+uuid_parse(char **mesg, char *buf, unsigned char **puuid)
+{
+ int len;
+
+ /* more than one uuid */
+ if (*puuid)
+ return -EINVAL;
+
+ /* expect a 16 byte uuid encoded as \xXXXX... */
+ len = qword_get(mesg, buf, PAGE_SIZE);
+ if (len != EX_UUID_LEN)
+ return -EINVAL;
+
+ *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL);
+ if (*puuid == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
{
/* client path expiry [flags anonuid anongid fsid] */
@@ -552,18 +583,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
if (strcmp(buf, "fsloc") == 0)
err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
- else if (strcmp(buf, "uuid") == 0) {
- /* expect a 16 byte uuid encoded as \xXXXX... */
- len = qword_get(&mesg, buf, PAGE_SIZE);
- if (len != 16)
- err = -EINVAL;
- else {
- exp.ex_uuid =
- kmemdup(buf, 16, GFP_KERNEL);
- if (exp.ex_uuid == NULL)
- err = -ENOMEM;
- }
- } else if (strcmp(buf, "secinfo") == 0)
+ else if (strcmp(buf, "uuid") == 0)
+ err = uuid_parse(&mesg, buf, &exp.ex_uuid);
+ else if (strcmp(buf, "secinfo") == 0)
err = secinfo_parse(&mesg, buf, &exp);
else
/* quietly ignore unknown words and anything
@@ -649,7 +671,7 @@ static int svc_export_show(struct seq_file *m,
if (exp->ex_uuid) {
int i;
seq_puts(m, ",uuid=");
- for (i=0; i<16; i++) {
+ for (i = 0; i < EX_UUID_LEN; i++) {
if ((i&3) == 0 && i)
seq_putc(m, ':');
seq_printf(m, "%02x", exp->ex_uuid[i]);
@@ -771,7 +793,7 @@ svc_export_update(struct svc_export *new, struct svc_export *old)
static struct svc_expkey *
-exp_find_key(struct cache_detail *cd, svc_client *clp, int fsid_type,
+exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type,
u32 *fsidv, struct cache_req *reqp)
{
struct svc_expkey key, *ek;
@@ -793,9 +815,9 @@ exp_find_key(struct cache_detail *cd, svc_client *clp, int fsid_type,
return ek;
}
-
-static svc_export *exp_get_by_name(struct cache_detail *cd, svc_client *clp,
- const struct path *path, struct cache_req *reqp)
+static struct svc_export *
+exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp,
+ const struct path *path, struct cache_req *reqp)
{
struct svc_export *exp, key;
int err;
@@ -819,11 +841,11 @@ static svc_export *exp_get_by_name(struct cache_detail *cd, svc_client *clp,
/*
* Find the export entry for a given dentry.
*/
-static struct svc_export *exp_parent(struct cache_detail *cd, svc_client *clp,
- struct path *path)
+static struct svc_export *
+exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path)
{
struct dentry *saved = dget(path->dentry);
- svc_export *exp = exp_get_by_name(cd, clp, path, NULL);
+ struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL);
while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
struct dentry *parent = dget_parent(path->dentry);
@@ -844,7 +866,7 @@ static struct svc_export *exp_parent(struct cache_detail *cd, svc_client *clp,
* since its harder to fool a kernel module than a user space program.
*/
int
-exp_rootfh(struct net *net, svc_client *clp, char *name,
+exp_rootfh(struct net *net, struct auth_domain *clp, char *name,
struct knfsd_fh *f, int maxsize)
{
struct svc_export *exp;
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
new file mode 100644
index 000000000000..cfeea85c5bed
--- /dev/null
+++ b/fs/nfsd/export.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef NFSD_EXPORT_H
+#define NFSD_EXPORT_H
+
+#include <linux/sunrpc/cache.h>
+#include <uapi/linux/nfsd/export.h>
+
+struct knfsd_fh;
+struct svc_fh;
+struct svc_rqst;
+
+/*
+ * FS Locations
+ */
+
+#define MAX_FS_LOCATIONS 128
+
+struct nfsd4_fs_location {
+ char *hosts; /* colon separated list of hosts */
+ char *path; /* slash separated list of path components */
+};
+
+struct nfsd4_fs_locations {
+ uint32_t locations_count;
+ struct nfsd4_fs_location *locations;
+/* If we're not actually serving this data ourselves (only providing a
+ * list of replicas that do serve it) then we set "migrated": */
+ int migrated;
+};
+
+/*
+ * We keep an array of pseudoflavors with the export, in order from most
+ * to least preferred. For the foreseeable future, we don't expect more
+ * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, spkm3,
+ * spkm3i, and spkm3p (and using all 8 at once should be rare).
+ */
+#define MAX_SECINFO_LIST 8
+#define EX_UUID_LEN 16
+
+struct exp_flavor_info {
+ u32 pseudoflavor;
+ u32 flags;
+};
+
+struct svc_export {
+ struct cache_head h; /* passed to cache_get()/cache_put() by exp_get()/exp_put() */
+ struct auth_domain * ex_client;
+ int ex_flags; /* NFSEXP_* bits; tested by EX_ISSYNC(), EX_NOHIDE(), EX_WGATHER() */
+ struct path ex_path;
+ kuid_t ex_anon_uid;
+ kgid_t ex_anon_gid;
+ int ex_fsid;
+ unsigned char * ex_uuid; /* 16 byte fsid */
+ struct nfsd4_fs_locations ex_fslocs;
+ uint32_t ex_nflavors; /* presumably the number of valid ex_flavors[] entries -- TODO confirm */
+ struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; /* at most MAX_SECINFO_LIST entries */
+ struct cache_detail *cd; /* handed to cache_put() by exp_put() */
+};
+
+/* an "export key" (expkey) maps a filehandle fragment to an
+ * svc_export for a given client. There can be several per export,
+ * for the different fsid types.
+ */
+struct svc_expkey {
+ struct cache_head h;
+
+ struct auth_domain * ek_client;
+ int ek_fsidtype;
+ u32 ek_fsid[6];
+
+ struct path ek_path;
+};
+
+#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC))
+#define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE)
+#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES)
+
+int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp);
+__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
+
+/*
+ * Function declarations
+ */
+int nfsd_export_init(struct net *);
+void nfsd_export_shutdown(struct net *);
+void nfsd_export_flush(struct net *);
+struct svc_export * rqst_exp_get_by_name(struct svc_rqst *,
+ struct path *);
+struct svc_export * rqst_exp_parent(struct svc_rqst *,
+ struct path *);
+struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *);
+int exp_rootfh(struct net *, struct auth_domain *,
+ char *path, struct knfsd_fh *, int maxsize);
+__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
+__be32 nfserrno(int errno);
+
+/*
+ * Hand @exp's embedded cache_head to cache_put(), together with the
+ * cache_detail pointer stored in the export itself.
+ */
+static inline void exp_put(struct svc_export *exp)
+{
+	struct cache_detail *detail = exp->cd;
+
+	cache_put(&exp->h, detail);
+}
+
+/* Call cache_get() on the cache_head embedded in @exp. */
+static inline void exp_get(struct svc_export *exp)
+{
+	struct cache_head *head = &exp->h;
+
+	cache_get(head);
+}
+struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *);
+
+#endif /* NFSD_EXPORT_H */
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index d620e7f81429..2ed05c3cd43d 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -97,25 +97,14 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf,
{
static u64 val;
char read_buf[25];
- size_t size, ret;
+ size_t size;
loff_t pos = *ppos;
if (!pos)
nfsd_inject_get(file_inode(file)->i_private, &val);
size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
- if (pos < 0)
- return -EINVAL;
- if (pos >= size || !len)
- return 0;
- if (len > size - pos)
- len = size - pos;
- ret = copy_to_user(buf, read_buf + pos, len);
- if (ret == len)
- return -EFAULT;
- len -= ret;
- *ppos = pos + len;
- return len;
+ return simple_read_from_buffer(buf, len, ppos, read_buf, size);
}
static ssize_t fault_inject_write(struct file *file, const char __user *buf,
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
index 66e58db01936..a3f34900091f 100644
--- a/fs/nfsd/idmap.h
+++ b/fs/nfsd/idmap.h
@@ -56,7 +56,7 @@ static inline void nfsd_idmap_shutdown(struct net *net)
__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *);
__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *);
-__be32 nfsd4_encode_user(struct svc_rqst *, kuid_t, __be32 **, int *);
-__be32 nfsd4_encode_group(struct svc_rqst *, kgid_t, __be32 **, int *);
+__be32 nfsd4_encode_user(struct xdr_stream *, struct svc_rqst *, kuid_t);
+__be32 nfsd4_encode_group(struct xdr_stream *, struct svc_rqst *, kgid_t);
#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 11c1fba29312..12b023a7ab7d 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -182,7 +182,8 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessarg
static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclargs *argp)
{
- if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+ p = nfs2svc_decode_fh(p, &argp->fh);
+ if (!p)
return 0;
argp->mask = ntohl(*p); p++;
@@ -197,7 +198,8 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
unsigned int base;
int n;
- if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+ p = nfs2svc_decode_fh(p, &argp->fh);
+ if (!p)
return 0;
argp->mask = ntohl(*p++);
if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
@@ -218,7 +220,8 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_fhandle *argp)
{
- if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+ p = nfs2svc_decode_fh(p, &argp->fh);
+ if (!p)
return 0;
return xdr_argsize_check(rqstp, p);
}
@@ -226,7 +229,8 @@ static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p,
static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_accessargs *argp)
{
- if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+ p = nfs2svc_decode_fh(p, &argp->fh);
+ if (!p)
return 0;
argp->access = ntohl(*p++);
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index adc5f1b1dc26..2a514e21dc74 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -128,7 +128,8 @@ out:
static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclargs *args)
{
- if (!(p = nfs3svc_decode_fh(p, &args->fh)))
+ p = nfs3svc_decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->mask = ntohl(*p); p++;
@@ -143,7 +144,8 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
unsigned int base;
int n;
- if (!(p = nfs3svc_decode_fh(p, &args->fh)))
+ p = nfs3svc_decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->mask = ntohl(*p++);
if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index de6e39e12cb3..e6c01e80325e 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -278,7 +278,8 @@ void fill_post_wcc(struct svc_fh *fhp)
int
nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
return xdr_argsize_check(rqstp, p);
}
@@ -287,7 +288,8 @@ int
nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_sattrargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = decode_sattr3(p, &args->attrs);
@@ -315,7 +317,8 @@ int
nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_accessargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->access = ntohl(*p++);
@@ -330,7 +333,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
int v;
u32 max_blocksize = svc_max_payload(rqstp);
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = xdr_decode_hyper(p, &args->offset);
@@ -360,7 +364,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
unsigned int len, v, hdr, dlen;
u32 max_blocksize = svc_max_payload(rqstp);
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = xdr_decode_hyper(p, &args->offset);
@@ -535,7 +540,8 @@ int
nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readlinkargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->buffer = page_address(*(rqstp->rq_next_page++));
@@ -558,7 +564,8 @@ int
nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readdirargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = xdr_decode_hyper(p, &args->cookie);
args->verf = p; p += 2;
@@ -580,7 +587,8 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
int len;
u32 max_blocksize = svc_max_payload(rqstp);
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = xdr_decode_hyper(p, &args->cookie);
args->verf = p; p += 2;
@@ -605,7 +613,8 @@ int
nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_commitargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = xdr_decode_hyper(p, &args->offset);
args->count = ntohl(*p++);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index f66c66b9f182..d714156a19fd 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,7 +36,6 @@
#include <linux/slab.h>
#include <linux/nfs_fs.h>
-#include <linux/export.h>
#include "nfsfh.h"
#include "nfsd.h"
#include "acl.h"
@@ -920,20 +919,19 @@ nfs4_acl_get_whotype(char *p, u32 len)
return NFS4_ACL_WHO_NAMED;
}
-__be32 nfs4_acl_write_who(int who, __be32 **p, int *len)
+__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who)
{
+ __be32 *p;
int i;
- int bytes;
for (i = 0; i < ARRAY_SIZE(s2t_map); i++) {
if (s2t_map[i].type != who)
continue;
- bytes = 4 + (XDR_QUADLEN(s2t_map[i].stringlen) << 2);
- if (bytes > *len)
+ p = xdr_reserve_space(xdr, s2t_map[i].stringlen + 4);
+ if (!p)
return nfserr_resource;
- *p = xdr_encode_opaque(*p, s2t_map[i].string,
+ p = xdr_encode_opaque(p, s2t_map[i].string,
s2t_map[i].stringlen);
- *len -= bytes;
return 0;
}
WARN_ON_ONCE(1);
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index c0dfde68742e..a0ab0a847d69 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -551,44 +551,43 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
return 0;
}
-static __be32 encode_ascii_id(u32 id, __be32 **p, int *buflen)
+static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id)
{
char buf[11];
int len;
- int bytes;
+ __be32 *p;
len = sprintf(buf, "%u", id);
- bytes = 4 + (XDR_QUADLEN(len) << 2);
- if (bytes > *buflen)
+ p = xdr_reserve_space(xdr, len + 4);
+ if (!p)
return nfserr_resource;
- *p = xdr_encode_opaque(*p, buf, len);
- *buflen -= bytes;
+ p = xdr_encode_opaque(p, buf, len);
return 0;
}
-static __be32 idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen)
+static __be32 idmap_id_to_name(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp, int type, u32 id)
{
struct ent *item, key = {
.id = id,
.type = type,
};
+ __be32 *p;
int ret;
- int bytes;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item);
if (ret == -ENOENT)
- return encode_ascii_id(id, p, buflen);
+ return encode_ascii_id(xdr, id);
if (ret)
return nfserrno(ret);
ret = strlen(item->name);
WARN_ON_ONCE(ret > IDMAP_NAMESZ);
- bytes = 4 + (XDR_QUADLEN(ret) << 2);
- if (bytes > *buflen)
+ p = xdr_reserve_space(xdr, ret + 4);
+ if (!p)
return nfserr_resource;
- *p = xdr_encode_opaque(*p, item->name, ret);
- *buflen -= bytes;
+ p = xdr_encode_opaque(p, item->name, ret);
cache_put(&item->h, nn->idtoname_cache);
return 0;
}
@@ -622,11 +621,12 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u
return idmap_name_to_id(rqstp, type, name, namelen, id);
}
-static __be32 encode_name_from_id(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen)
+static __be32 encode_name_from_id(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp, int type, u32 id)
{
if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
- return encode_ascii_id(id, p, buflen);
- return idmap_id_to_name(rqstp, type, id, p, buflen);
+ return encode_ascii_id(xdr, id);
+ return idmap_id_to_name(xdr, rqstp, type, id);
}
__be32
@@ -655,14 +655,16 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
return status;
}
-__be32 nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t uid, __be32 **p, int *buflen)
+__be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ kuid_t uid)
{
u32 id = from_kuid(&init_user_ns, uid);
- return encode_name_from_id(rqstp, IDMAP_TYPE_USER, id, p, buflen);
+ return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id);
}
-__be32 nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t gid, __be32 **p, int *buflen)
+__be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ kgid_t gid)
{
u32 id = from_kgid(&init_user_ns, gid);
- return encode_name_from_id(rqstp, IDMAP_TYPE_GROUP, id, p, buflen);
+ return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index d543222babf3..6851b003f2a4 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -430,12 +430,12 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
status = nfs4_check_open_reclaim(&open->op_clientid,
cstate->minorversion,
nn);
if (status)
goto out;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
case NFS4_OPEN_CLAIM_FH:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
status = do_open_fhandle(rqstp, cstate, open);
@@ -445,7 +445,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
case NFS4_OPEN_CLAIM_DELEGATE_PREV:
- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
dprintk("NFSD: unsupported OPEN claim type %d\n",
open->op_claim_type);
status = nfserr_notsupp;
@@ -786,7 +785,6 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!nfsd4_last_compound_op(rqstp))
rqstp->rq_splice_ok = false;
- nfs4_lock_state();
/* check stateid */
if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
cstate, &read->rd_stateid,
@@ -794,11 +792,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
goto out;
}
- if (read->rd_filp)
- get_file(read->rd_filp);
status = nfs_ok;
out:
- nfs4_unlock_state();
read->rd_rqstp = rqstp;
read->rd_fhp = &cstate->current_fh;
return status;
@@ -937,10 +932,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
int err;
if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
- nfs4_lock_state();
status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
&setattr->sa_stateid, WR_STATE, NULL);
- nfs4_unlock_state();
if (status) {
dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
return status;
@@ -1006,17 +999,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (write->wr_offset >= OFFSET_MAX)
return nfserr_inval;
- nfs4_lock_state();
status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
cstate, stateid, WR_STATE, &filp);
if (status) {
- nfs4_unlock_state();
dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
return status;
}
- if (filp)
- get_file(filp);
- nfs4_unlock_state();
cnt = write->wr_buflen;
write->wr_how_written = write->wr_stable_how;
@@ -1072,10 +1060,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_jukebox;
p = buf;
- status = nfsd4_encode_fattr(&cstate->current_fh,
+ status = nfsd4_encode_fattr_to_buf(&p, count, &cstate->current_fh,
cstate->current_fh.fh_export,
- cstate->current_fh.fh_dentry, &p,
- count, verify->ve_bmval,
+ cstate->current_fh.fh_dentry,
+ verify->ve_bmval,
rqstp, 0);
/*
* If nfsd4_encode_fattr() ran out of space, assume that's because
@@ -1182,9 +1170,7 @@ struct nfsd4_operation {
static struct nfsd4_operation nfsd4_ops[];
-#ifdef NFSD_DEBUG
static const char *nfsd4_op_name(unsigned opnum);
-#endif
/*
* Enforce NFSv4.1 COMPOUND ordering rules:
@@ -1226,6 +1212,8 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
bool nfsd4_cache_this_op(struct nfsd4_op *op)
{
+ if (op->opnum == OP_ILLEGAL)
+ return false;
return OPDESC(op)->op_flags & OP_CACHEME;
}
@@ -1262,6 +1250,25 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
}
+/*
+ * Set up resp->xdr so that encoding continues directly after whatever is
+ * already present in the head kvec of rqstp->rq_res, and record the
+ * resulting length and total capacity in that xdr_buf.
+ */
+static void svcxdr_init_encode(struct svc_rqst *rqstp,
+			       struct nfsd4_compoundres *resp)
+{
+	struct xdr_buf *res = &rqstp->rq_res;
+	struct kvec *hd = res->head;
+	struct xdr_stream *stream = &resp->xdr;
+
+	stream->buf = res;
+	stream->iov = hd;
+	stream->p = hd->iov_base + hd->iov_len;
+	/* The head is limited to one page minus rq_auth_slack. */
+	stream->end = hd->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
+	stream->scratch.iov_len = 0;
+	stream->page_ptr = res->pages - 1;
+
+	/* Tail and page_len should be zero at this point: */
+	res->len = hd->iov_len;
+	/* One page for the head plus every page up to rq_page_end. */
+	res->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - res->pages)
+			- rqstp->rq_auth_slack;
+}
+
/*
* COMPOUND call.
*/
@@ -1275,24 +1282,16 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate = &resp->cstate;
struct svc_fh *current_fh = &cstate->current_fh;
struct svc_fh *save_fh = &cstate->save_fh;
- int slack_bytes;
- u32 plen = 0;
__be32 status;
- resp->xbuf = &rqstp->rq_res;
- resp->p = rqstp->rq_res.head[0].iov_base +
- rqstp->rq_res.head[0].iov_len;
- resp->tagp = resp->p;
+ svcxdr_init_encode(rqstp, resp);
+ resp->tagp = resp->xdr.p;
/* reserve space for: taglen, tag, and opcnt */
- resp->p += 2 + XDR_QUADLEN(args->taglen);
- resp->end = rqstp->rq_res.head[0].iov_base + PAGE_SIZE;
+ xdr_reserve_space(&resp->xdr, 8 + args->taglen);
resp->taglen = args->taglen;
resp->tag = args->tag;
- resp->opcnt = 0;
resp->rqstp = rqstp;
cstate->minorversion = args->minorversion;
- cstate->replay_owner = NULL;
- cstate->session = NULL;
fh_init(current_fh, NFS4_FHSIZE);
fh_init(save_fh, NFS4_FHSIZE);
/*
@@ -1332,19 +1331,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
goto encode_op;
}
- /* We must be able to encode a successful response to
- * this operation, with enough room left over to encode a
- * failed response to the next operation. If we don't
- * have enough room, fail with ERR_RESOURCE.
- */
- slack_bytes = (char *)resp->end - (char *)resp->p;
- if (slack_bytes < COMPOUND_SLACK_SPACE
- + COMPOUND_ERR_SLACK_SPACE) {
- BUG_ON(slack_bytes < COMPOUND_ERR_SLACK_SPACE);
- op->status = nfserr_resource;
- goto encode_op;
- }
-
opdesc = OPDESC(op);
if (!current_fh->fh_dentry) {
@@ -1362,9 +1348,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
/* If op is non-idempotent */
if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
- plen = opdesc->op_rsize_bop(rqstp, op);
/*
- * If there's still another operation, make sure
+ * Don't execute this op if we couldn't encode a
+ * successful reply:
+ */
+ u32 plen = opdesc->op_rsize_bop(rqstp, op);
+ /*
+ * Plus if there's another operation, make sure
* we'll have space to at least encode an error:
*/
if (resp->opcnt < args->opcnt)
@@ -1399,7 +1389,7 @@ encode_op:
}
if (op->status == nfserr_replay_me) {
op->replay = &cstate->replay_owner->so_replay;
- nfsd4_encode_replay(resp, op);
+ nfsd4_encode_replay(&resp->xdr, op);
status = op->status = op->replay->rp_status;
} else {
nfsd4_encode_operation(resp, op);
@@ -1438,7 +1428,8 @@ out:
#define op_encode_change_info_maxsz (5)
#define nfs4_fattr_bitmap_maxsz (4)
-#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+/* We'll fall back on returning no lockowner if we run out of space: */
+#define op_encode_lockowner_maxsz (0)
#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz)
#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
@@ -1470,6 +1461,49 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
+ nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
}
+/*
+ * Estimate the size of a GETATTR reply from the requested attribute bitmap.
+ *
+ * GETATTR is idempotent, so an estimate that is too large does not make us
+ * fail the op prematurely; the only cost is that splice reads may be
+ * turned off unnecessarily.
+ */
+static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
+				      struct nfsd4_op *op)
+{
+	u32 *bmval = op->u.getattr.ga_bmval;
+	u32 word0 = bmval[0];
+	u32 word1 = bmval[1];
+	u32 word2 = bmval[2];
+	u32 nsmall;
+	u32 size = 20;	/* bitmask, length */
+
+	/* These two are answered with the maximum payload size. */
+	if (word0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_FS_LOCATIONS))
+		return svc_max_payload(rqstp);
+
+	/*
+	 * Attributes with their own, larger bound are charged one by one and
+	 * cleared from the local copy so the per-bit count below does not
+	 * charge them a second time.
+	 */
+	if (word1 & FATTR4_WORD1_OWNER) {
+		word1 &= ~FATTR4_WORD1_OWNER;
+		size += IDMAP_NAMESZ + 4;
+	}
+	if (word1 & FATTR4_WORD1_OWNER_GROUP) {
+		word1 &= ~FATTR4_WORD1_OWNER_GROUP;
+		size += IDMAP_NAMESZ + 4;
+	}
+	if (word0 & FATTR4_WORD0_FILEHANDLE) {
+		word0 &= ~FATTR4_WORD0_FILEHANDLE;
+		size += NFS4_FHSIZE + 4;
+	}
+	if (word2 & FATTR4_WORD2_SECURITY_LABEL) {
+		word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+		size += NFSD4_MAX_SEC_LABEL_LEN + 12;
+	}
+
+	/*
+	 * Largest of remaining attributes are 16 bytes (e.g.,
+	 * supported_attributes)
+	 */
+	nsmall = hweight32(word0) + hweight32(word1) + hweight32(word2);
+	size += 16 * nsmall;
+	return size;
+}
+
static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
@@ -1500,18 +1534,19 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
if (rlen > maxcount)
rlen = maxcount;
- return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen;
+ return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
+ u32 maxcount = svc_max_payload(rqstp);
u32 rlen = op->u.readdir.rd_maxcount;
- if (rlen > PAGE_SIZE)
- rlen = PAGE_SIZE;
+ if (rlen > maxcount)
+ rlen = maxcount;
- return (op_encode_hdr_size + op_encode_verifier_maxsz)
- * sizeof(__be32) + rlen;
+ return (op_encode_hdr_size + op_encode_verifier_maxsz +
+ XDR_QUADLEN(rlen)) * sizeof(__be32);
}
static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
@@ -1526,6 +1561,12 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
+ op_encode_change_info_maxsz) * sizeof(__be32);
}
+/*
+ * Estimate the size of a SEQUENCE reply: the session id plus five 4-byte
+ * words, and the per-op header (op_encode_hdr_size words).  The header is
+ * counted here because the other estimators in this file, and the
+ * OP_ILLEGAL case of nfsd4_max_reply(), all include it.
+ */
+static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
+				       struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size
+		+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
+}
+
static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
@@ -1539,7 +1580,7 @@ static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_o
static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
- return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+ return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32);
}
static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
@@ -1607,6 +1648,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
[OP_GETATTR] = {
.op_func = (nfsd4op_func)nfsd4_getattr,
.op_flags = ALLOWED_ON_ABSENT_FS,
+ .op_rsize_bop = nfsd4_getattr_rsize,
.op_name = "OP_GETATTR",
},
[OP_GETFH] = {
@@ -1676,37 +1718,32 @@ static struct nfsd4_operation nfsd4_ops[] = {
[OP_PUTFH] = {
.op_func = (nfsd4op_func)nfsd4_putfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING
- | OP_CLEAR_STATEID,
+ | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
.op_name = "OP_PUTFH",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTPUBFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING
- | OP_CLEAR_STATEID,
+ | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
.op_name = "OP_PUTPUBFH",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTROOTFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING
- | OP_CLEAR_STATEID,
+ | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
.op_name = "OP_PUTROOTFH",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_READ] = {
.op_func = (nfsd4op_func)nfsd4_read,
- .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READ",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize,
.op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid,
},
[OP_READDIR] = {
.op_func = (nfsd4op_func)nfsd4_readdir,
- .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READDIR",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize,
},
@@ -1864,14 +1901,33 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
};
-#ifdef NFSD_DEBUG
+/*
+ * Return an estimate of the largest reply @op can produce: the value of the
+ * operation's op_rsize_bop callback if it has one, otherwise PAGE_SIZE.
+ */
+int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	nfsd4op_rsize rsize_fn;
+
+	/* OP_ILLEGAL is handled before any OPDESC() lookup; only the header is counted. */
+	if (op->opnum == OP_ILLEGAL)
+		return op_encode_hdr_size * sizeof(__be32);
+
+	rsize_fn = OPDESC(op)->op_rsize_bop;
+	if (!rsize_fn)
+		return PAGE_SIZE;
+	return rsize_fn(rqstp, op);
+}
+
+/*
+ * Log an error and emit a one-time warning if @op is flagged
+ * OP_MODIFIES_SOMETHING; do nothing for any other op.
+ */
+void warn_on_nonidempotent_op(struct nfsd4_op *op)
+{
+	if (!(OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING))
+		return;
+
+	pr_err("unable to encode reply to nonidempotent op %d (%s)\n",
+	       op->opnum, nfsd4_op_name(op->opnum));
+	WARN_ON_ONCE(1);
+}
+
static const char *nfsd4_op_name(unsigned opnum)
{
if (opnum < ARRAY_SIZE(nfsd4_ops))
return nfsd4_ops[opnum].op_name;
return "unknown_operation";
}
-#endif
#define nfsd4_voidres nfsd4_voidargs
struct nfsd4_voidargs { int dummy; };
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9a77a5a21557..2204e1fe5725 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,6 +41,7 @@
#include <linux/ratelimit.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/addr.h>
+#include <linux/hash.h>
#include "xdr4.h"
#include "xdr4cb.h"
#include "vfs.h"
@@ -81,13 +82,13 @@ static DEFINE_MUTEX(client_mutex);
* effort to decrease the scope of the client_mutex, this spinlock may
* eventually cover more:
*/
-static DEFINE_SPINLOCK(recall_lock);
+static DEFINE_SPINLOCK(state_lock);
-static struct kmem_cache *openowner_slab = NULL;
-static struct kmem_cache *lockowner_slab = NULL;
-static struct kmem_cache *file_slab = NULL;
-static struct kmem_cache *stateid_slab = NULL;
-static struct kmem_cache *deleg_slab = NULL;
+static struct kmem_cache *openowner_slab;
+static struct kmem_cache *lockowner_slab;
+static struct kmem_cache *file_slab;
+static struct kmem_cache *stateid_slab;
+static struct kmem_cache *deleg_slab;
void
nfs4_lock_state(void)
@@ -235,9 +236,9 @@ static void nfsd4_free_file(struct nfs4_file *f)
static inline void
put_nfs4_file(struct nfs4_file *fi)
{
- if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
+ if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) {
hlist_del(&fi->fi_hash);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
iput(fi->fi_inode);
nfsd4_free_file(fi);
}
@@ -364,6 +365,79 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
}
+/*
+ * When we recall a delegation, we should be careful not to hand it
+ * out again straight away.
+ * To ensure this we keep a pair of bloom filters ('new' and 'old')
+ * in which the filehandles of recalled delegations are "stored".
+ * If a filehandle appears in either filter, a delegation is blocked.
+ * When a delegation is recalled, the filehandle is stored in the "new"
+ * filter.
+ * Every 30 seconds we swap the filters and clear the "new" one,
+ * unless both are empty of course.
+ *
+ * Each filter is 256 bits. We hash the filehandle to 32bit and use the
+ * low 3 bytes as hash-table indices.
+ *
+ * 'state_lock', which is always held when block_delegations() is called,
+ * is used to manage concurrent access. Testing does not need the lock
+ * except when swapping the two filters.
+ *
+ * Fields:
+ *   entries     - incremented by block_delegations() on every call;
+ *                 delegation_blocked() returns early while it is zero and
+ *                 subtracts old_entries from it on each swap.
+ *   old_entries - set on each swap to the value of entries left after
+ *                 that subtraction.
+ *   swap_time   - seconds_since_boot() at the last swap, or at the moment
+ *                 an entry was added while entries was zero.
+ *   new         - which element of set[] block_delegations() writes to.
+ *   set         - the two 256-bit bitmaps.
+ */
+static struct bloom_pair {
+ int entries, old_entries;
+ time_t swap_time;
+ int new; /* index into 'set' */
+ DECLARE_BITMAP(set[2], 256);
+} blocked_delegations;
+
+/*
+ * Return 1 if @fh hashes into either bloom filter, meaning a delegation on
+ * it should not be handed out right now; return 0 otherwise (always 0
+ * while no entries are recorded).
+ *
+ * As a side effect, rotates the filters when more than 30 seconds have
+ * passed since the last swap.  Only the rotation takes state_lock; the
+ * bit tests run without it.
+ */
+static int delegation_blocked(struct knfsd_fh *fh)
+{
+	u32 hash;
+	struct bloom_pair *bd = &blocked_delegations;
+
+	if (bd->entries == 0)
+		return 0;
+	if (seconds_since_boot() - bd->swap_time > 30) {
+		spin_lock(&state_lock);
+		/* Re-check under the lock in case another caller already swapped. */
+		if (seconds_since_boot() - bd->swap_time > 30) {
+			bd->entries -= bd->old_entries;
+			bd->old_entries = bd->entries;
+			/*
+			 * Flip the index first and then clear the filter it
+			 * now selects.  Clearing before the flip would wipe
+			 * the filter holding the most recent recalls and keep
+			 * the stale one, so a just-recalled filehandle could
+			 * be unblocked immediately.
+			 */
+			bd->new = 1-bd->new;
+			memset(bd->set[bd->new], 0,
+			       sizeof(bd->set[0]));
+			bd->swap_time = seconds_since_boot();
+		}
+		spin_unlock(&state_lock);
+	}
+	hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+	/* A filter matches only if all three byte-indexed bits are set in it. */
+	if (test_bit(hash&255, bd->set[0]) &&
+	    test_bit((hash>>8)&255, bd->set[0]) &&
+	    test_bit((hash>>16)&255, bd->set[0]))
+		return 1;
+
+	if (test_bit(hash&255, bd->set[1]) &&
+	    test_bit((hash>>8)&255, bd->set[1]) &&
+	    test_bit((hash>>16)&255, bd->set[1]))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Record @fh in the filter currently selected by blocked_delegations.new
+ * by setting three bits, one per low byte of the filehandle hash, and
+ * count the new entry.
+ */
+static void block_delegations(struct knfsd_fh *fh)
+{
+	struct bloom_pair *bd = &blocked_delegations;
+	u32 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+	int shift;
+
+	/* Bytes 0, 1 and 2 of the hash each pick one bit in the filter. */
+	for (shift = 0; shift <= 16; shift += 8)
+		__set_bit((hash >> shift) & 255, bd->set[bd->new]);
+
+	/* The first entry added while the count is zero resets swap_time. */
+	if (bd->entries == 0)
+		bd->swap_time = seconds_since_boot();
+	bd->entries += 1;
+}
+
static struct nfs4_delegation *
alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
{
@@ -372,10 +446,11 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
dprintk("NFSD alloc_init_deleg\n");
if (num_delegations > max_delegations)
return NULL;
+ if (delegation_blocked(&current_fh->fh_handle))
+ return NULL;
dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
if (dp == NULL)
return dp;
- dp->dl_stid.sc_type = NFS4_DELEG_STID;
/*
* delegation seqid's are never incremented. The 4.1 special
* meaning of seqid 0 isn't meaningful, really, but let's avoid
@@ -418,6 +493,8 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
static void nfs4_put_deleg_lease(struct nfs4_file *fp)
{
+ if (!fp->fi_lease)
+ return;
if (atomic_dec_and_test(&fp->fi_delegees)) {
vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
fp->fi_lease = NULL;
@@ -431,18 +508,30 @@ static void unhash_stid(struct nfs4_stid *s)
s->sc_type = 0;
}
+/*
+ * Set @dp's stateid type to NFS4_DELEG_STID and add it to @fp's
+ * fi_delegations list and to its client's cl_delegations list.
+ * Asserts (via lockdep) that state_lock is held.
+ */
+static void
+hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
+{
+	struct nfs4_stid *stid = &dp->dl_stid;
+
+	lockdep_assert_held(&state_lock);
+
+	stid->sc_type = NFS4_DELEG_STID;
+	list_add(&dp->dl_perfile, &fp->fi_delegations);
+	list_add(&dp->dl_perclnt, &stid->sc_client->cl_delegations);
+}
+
/* Called under the state lock. */
static void
unhash_delegation(struct nfs4_delegation *dp)
{
+ spin_lock(&state_lock);
list_del_init(&dp->dl_perclnt);
- spin_lock(&recall_lock);
list_del_init(&dp->dl_perfile);
list_del_init(&dp->dl_recall_lru);
- spin_unlock(&recall_lock);
- nfs4_put_deleg_lease(dp->dl_file);
- put_nfs4_file(dp->dl_file);
- dp->dl_file = NULL;
+ spin_unlock(&state_lock);
+ if (dp->dl_file) {
+ nfs4_put_deleg_lease(dp->dl_file);
+ put_nfs4_file(dp->dl_file);
+ dp->dl_file = NULL;
+ }
}
@@ -645,6 +734,12 @@ static void unhash_lockowner(struct nfs4_lockowner *lo)
}
}
+static void nfs4_free_lockowner(struct nfs4_lockowner *lo)
+{
+ kfree(lo->lo_owner.so_owner.data);
+ kmem_cache_free(lockowner_slab, lo);
+}
+
static void release_lockowner(struct nfs4_lockowner *lo)
{
unhash_lockowner(lo);
@@ -699,6 +794,12 @@ static void release_last_closed_stateid(struct nfs4_openowner *oo)
}
}
+static void nfs4_free_openowner(struct nfs4_openowner *oo)
+{
+ kfree(oo->oo_owner.so_owner.data);
+ kmem_cache_free(openowner_slab, oo);
+}
+
static void release_openowner(struct nfs4_openowner *oo)
{
unhash_openowner(oo);
@@ -1093,7 +1194,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
return clp;
}
-static inline void
+static void
free_client(struct nfs4_client *clp)
{
struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id);
@@ -1136,13 +1237,13 @@ destroy_client(struct nfs4_client *clp)
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
INIT_LIST_HEAD(&reaplist);
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
while (!list_empty(&clp->cl_delegations)) {
dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
list_del_init(&dp->dl_perclnt);
list_move(&dp->dl_recall_lru, &reaplist);
}
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
while (!list_empty(&reaplist)) {
dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
destroy_delegation(dp);
@@ -1544,6 +1645,7 @@ out_err:
void
nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
{
+ struct xdr_buf *buf = resp->xdr.buf;
struct nfsd4_slot *slot = resp->cstate.slot;
unsigned int base;
@@ -1557,11 +1659,9 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
slot->sl_datalen = 0;
return;
}
- slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap;
- base = (char *)resp->cstate.datap -
- (char *)resp->xbuf->head[0].iov_base;
- if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data,
- slot->sl_datalen))
+ base = resp->cstate.data_offset;
+ slot->sl_datalen = buf->len - base;
+ if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen))
WARN("%s: sessions DRC could not cache compound\n", __func__);
return;
}
@@ -1602,6 +1702,8 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
struct nfsd4_sequence *seq)
{
struct nfsd4_slot *slot = resp->cstate.slot;
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p;
__be32 status;
dprintk("--> %s slot %p\n", __func__, slot);
@@ -1610,14 +1712,16 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
if (status)
return status;
- /* The sequence operation has been encoded, cstate->datap set. */
- memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen);
+ p = xdr_reserve_space(xdr, slot->sl_datalen);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ return nfserr_serverfault;
+ }
+ xdr_encode_opaque_fixed(p, slot->sl_data, slot->sl_datalen);
+ xdr_commit_encode(xdr);
resp->opcnt = slot->sl_opcnt;
- resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen);
- status = slot->sl_status;
-
- return status;
+ return slot->sl_status;
}
/*
@@ -2189,11 +2293,13 @@ nfsd4_sequence(struct svc_rqst *rqstp,
struct nfsd4_sequence *seq)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct xdr_stream *xdr = &resp->xdr;
struct nfsd4_session *session;
struct nfs4_client *clp;
struct nfsd4_slot *slot;
struct nfsd4_conn *conn;
__be32 status;
+ int buflen;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (resp->opcnt != 1)
@@ -2262,6 +2368,16 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (status)
goto out_put_session;
+ buflen = (seq->cachethis) ?
+ session->se_fchannel.maxresp_cached :
+ session->se_fchannel.maxresp_sz;
+ status = (seq->cachethis) ? nfserr_rep_too_big_to_cache :
+ nfserr_rep_too_big;
+ if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack))
+ goto out_put_session;
+ svc_reserve(rqstp, buflen);
+
+ status = nfs_ok;
/* Success! bump slot seqid */
slot->sl_seqid = seq->seqid;
slot->sl_flags |= NFSD4_SLOT_INUSE;
@@ -2499,28 +2615,19 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
fp->fi_lease = NULL;
memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
memset(fp->fi_access, 0, sizeof(fp->fi_access));
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]);
- spin_unlock(&recall_lock);
-}
-
-static void
-nfsd4_free_slab(struct kmem_cache **slab)
-{
- if (*slab == NULL)
- return;
- kmem_cache_destroy(*slab);
- *slab = NULL;
+ spin_unlock(&state_lock);
}
void
nfsd4_free_slabs(void)
{
- nfsd4_free_slab(&openowner_slab);
- nfsd4_free_slab(&lockowner_slab);
- nfsd4_free_slab(&file_slab);
- nfsd4_free_slab(&stateid_slab);
- nfsd4_free_slab(&deleg_slab);
+ kmem_cache_destroy(openowner_slab);
+ kmem_cache_destroy(lockowner_slab);
+ kmem_cache_destroy(file_slab);
+ kmem_cache_destroy(stateid_slab);
+ kmem_cache_destroy(deleg_slab);
}
int
@@ -2529,42 +2636,38 @@ nfsd4_init_slabs(void)
openowner_slab = kmem_cache_create("nfsd4_openowners",
sizeof(struct nfs4_openowner), 0, 0, NULL);
if (openowner_slab == NULL)
- goto out_nomem;
+ goto out;
lockowner_slab = kmem_cache_create("nfsd4_lockowners",
sizeof(struct nfs4_lockowner), 0, 0, NULL);
if (lockowner_slab == NULL)
- goto out_nomem;
+ goto out_free_openowner_slab;
file_slab = kmem_cache_create("nfsd4_files",
sizeof(struct nfs4_file), 0, 0, NULL);
if (file_slab == NULL)
- goto out_nomem;
+ goto out_free_lockowner_slab;
stateid_slab = kmem_cache_create("nfsd4_stateids",
sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
if (stateid_slab == NULL)
- goto out_nomem;
+ goto out_free_file_slab;
deleg_slab = kmem_cache_create("nfsd4_delegations",
sizeof(struct nfs4_delegation), 0, 0, NULL);
if (deleg_slab == NULL)
- goto out_nomem;
+ goto out_free_stateid_slab;
return 0;
-out_nomem:
- nfsd4_free_slabs();
+
+out_free_stateid_slab:
+ kmem_cache_destroy(stateid_slab);
+out_free_file_slab:
+ kmem_cache_destroy(file_slab);
+out_free_lockowner_slab:
+ kmem_cache_destroy(lockowner_slab);
+out_free_openowner_slab:
+ kmem_cache_destroy(openowner_slab);
+out:
dprintk("nfsd4: out of memory while initializing nfsv4\n");
return -ENOMEM;
}
-void nfs4_free_openowner(struct nfs4_openowner *oo)
-{
- kfree(oo->oo_owner.so_owner.data);
- kmem_cache_free(openowner_slab, oo);
-}
-
-void nfs4_free_lockowner(struct nfs4_lockowner *lo)
-{
- kfree(lo->lo_owner.so_owner.data);
- kmem_cache_free(lockowner_slab, lo);
-}
-
static void init_nfs4_replay(struct nfs4_replay *rp)
{
rp->rp_status = nfserr_serverfault;
@@ -2685,15 +2788,15 @@ find_file(struct inode *ino)
unsigned int hashval = file_hashval(ino);
struct nfs4_file *fp;
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
if (fp->fi_inode == ino) {
get_nfs4_file(fp);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
return fp;
}
}
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
return NULL;
}
@@ -2730,6 +2833,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
struct nfs4_client *clp = dp->dl_stid.sc_client;
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ lockdep_assert_held(&state_lock);
/* We're assuming the state code never drops its reference
* without first removing the lease. Since we're in this lease
* callback (and since the lease code is serialized by the kernel
@@ -2742,6 +2846,8 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
/* Only place dl_time is set; protected by i_lock: */
dp->dl_time = get_seconds();
+ block_delegations(&dp->dl_fh);
+
nfsd4_cb_recall(dp);
}
@@ -2766,11 +2872,11 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
*/
fl->fl_break_time = 0;
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
fp->fi_had_conflict = true;
list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
nfsd_break_one_deleg(dp);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
}
static
@@ -3047,11 +3153,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
if (status)
goto out_free;
- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
fp->fi_lease = fl;
fp->fi_deleg_file = get_file(fl->fl_file);
atomic_set(&fp->fi_delegees, 1);
- list_add(&dp->dl_perfile, &fp->fi_delegations);
+ spin_lock(&state_lock);
+ hash_delegation_locked(dp, fp);
+ spin_unlock(&state_lock);
return 0;
out_free:
locks_free_lock(fl);
@@ -3060,33 +3167,21 @@ out_free:
static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp)
{
- int status;
-
if (fp->fi_had_conflict)
return -EAGAIN;
get_nfs4_file(fp);
dp->dl_file = fp;
- if (!fp->fi_lease) {
- status = nfs4_setlease(dp);
- if (status)
- goto out_free;
- return 0;
- }
- spin_lock(&recall_lock);
+ if (!fp->fi_lease)
+ return nfs4_setlease(dp);
+ spin_lock(&state_lock);
+ atomic_inc(&fp->fi_delegees);
if (fp->fi_had_conflict) {
- spin_unlock(&recall_lock);
- status = -EAGAIN;
- goto out_free;
+ spin_unlock(&state_lock);
+ return -EAGAIN;
}
- atomic_inc(&fp->fi_delegees);
- list_add(&dp->dl_perfile, &fp->fi_delegations);
- spin_unlock(&recall_lock);
- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+ hash_delegation_locked(dp, fp);
+ spin_unlock(&state_lock);
return 0;
-out_free:
- put_nfs4_file(fp);
- dp->dl_file = fp;
- return status;
}
static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
@@ -3173,8 +3268,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
return;
out_free:
- remove_stid(&dp->dl_stid);
- nfs4_put_delegation(dp);
+ destroy_delegation(dp);
out_no_deleg:
open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
@@ -3391,8 +3485,7 @@ nfs4_laundromat(struct nfsd_net *nn)
struct nfs4_delegation *dp;
struct list_head *pos, *next, reaplist;
time_t cutoff = get_seconds() - nn->nfsd4_lease;
- time_t t, clientid_val = nn->nfsd4_lease;
- time_t u, test_val = nn->nfsd4_lease;
+ time_t t, new_timeo = nn->nfsd4_lease;
nfs4_lock_state();
@@ -3404,8 +3497,7 @@ nfs4_laundromat(struct nfsd_net *nn)
clp = list_entry(pos, struct nfs4_client, cl_lru);
if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
t = clp->cl_time - cutoff;
- if (clientid_val > t)
- clientid_val = t;
+ new_timeo = min(new_timeo, t);
break;
}
if (mark_client_expired_locked(clp)) {
@@ -3422,39 +3514,35 @@ nfs4_laundromat(struct nfsd_net *nn)
clp->cl_clientid.cl_id);
expire_client(clp);
}
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
continue;
if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
- u = dp->dl_time - cutoff;
- if (test_val > u)
- test_val = u;
+ t = dp->dl_time - cutoff;
+ new_timeo = min(new_timeo, t);
break;
}
list_move(&dp->dl_recall_lru, &reaplist);
}
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
revoke_delegation(dp);
}
- test_val = nn->nfsd4_lease;
list_for_each_safe(pos, next, &nn->close_lru) {
oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
- u = oo->oo_time - cutoff;
- if (test_val > u)
- test_val = u;
+ t = oo->oo_time - cutoff;
+ new_timeo = min(new_timeo, t);
break;
}
release_openowner(oo);
}
- if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
- clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
+ new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
nfs4_unlock_state();
- return clientid_val;
+ return new_timeo;
}
static struct workqueue_struct *laundry_wq;
@@ -3654,6 +3742,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
struct svc_fh *current_fh = &cstate->current_fh;
struct inode *ino = current_fh->fh_dentry->d_inode;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct file *file = NULL;
__be32 status;
if (filpp)
@@ -3665,10 +3754,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return check_special_stateids(net, current_fh, stateid, flags);
+ nfs4_lock_state();
+
status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
&s, cstate->minorversion, nn);
if (status)
- return status;
+ goto out;
status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
if (status)
goto out;
@@ -3679,8 +3770,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
if (status)
goto out;
if (filpp) {
- *filpp = dp->dl_file->fi_deleg_file;
- if (!*filpp) {
+ file = dp->dl_file->fi_deleg_file;
+ if (!file) {
WARN_ON_ONCE(1);
status = nfserr_serverfault;
goto out;
@@ -3701,16 +3792,20 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
goto out;
if (filpp) {
if (flags & RD_STATE)
- *filpp = find_readable_file(stp->st_file);
+ file = find_readable_file(stp->st_file);
else
- *filpp = find_writeable_file(stp->st_file);
+ file = find_writeable_file(stp->st_file);
}
break;
default:
- return nfserr_bad_stateid;
+ status = nfserr_bad_stateid;
+ goto out;
}
status = nfs_ok;
+ if (file)
+ *filpp = get_file(file);
out:
+ nfs4_unlock_state();
return status;
}
@@ -3726,7 +3821,7 @@ nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
* correspondance, and we have to delete the lockowner when we
* delete the lock stateid:
*/
- unhash_lockowner(lo);
+ release_lockowner(lo);
return nfs_ok;
}
@@ -4896,6 +4991,7 @@ static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
struct nfs4_delegation *dp, *next;
u64 count = 0;
+ lockdep_assert_held(&state_lock);
list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
if (victims)
list_move(&dp->dl_recall_lru, victims);
@@ -4911,9 +5007,9 @@ u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
LIST_HEAD(victims);
u64 count;
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
count = nfsd_find_all_delegations(clp, max, &victims);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
revoke_delegation(dp);
@@ -4927,11 +5023,11 @@ u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max)
LIST_HEAD(victims);
u64 count;
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
count = nfsd_find_all_delegations(clp, max, &victims);
list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
nfsd_break_one_deleg(dp);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
return count;
}
@@ -4940,9 +5036,9 @@ u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max)
{
u64 count = 0;
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
count = nfsd_find_all_delegations(clp, max, NULL);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
nfsd_print_count(clp, count, "delegations");
return count;
@@ -4983,13 +5079,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_
#endif /* CONFIG_NFSD_FAULT_INJECTION */
-/* initialization to perform at module load time: */
-
-void
-nfs4_state_init(void)
-{
-}
-
/*
* Since the lifetime of a delegation isn't limited to that of an open, a
* client may quite reasonably hang on to a delegation as long as it has
@@ -5160,12 +5249,12 @@ nfs4_state_shutdown_net(struct net *net)
nfs4_lock_state();
INIT_LIST_HEAD(&reaplist);
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
list_move(&dp->dl_recall_lru, &reaplist);
}
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
destroy_delegation(dp);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 18881f34737a..83baf2bfe9e9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -98,11 +98,6 @@ xdr_error: \
status = nfserr_bad_xdr; \
goto out
-#define READ32(x) (x) = ntohl(*p++)
-#define READ64(x) do { \
- (x) = (u64)ntohl(*p++) << 32; \
- (x) |= ntohl(*p++); \
-} while (0)
#define READMEM(x,nbytes) do { \
x = (char *)p; \
p += XDR_QUADLEN(nbytes); \
@@ -248,17 +243,17 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
bmval[2] = 0;
READ_BUF(4);
- READ32(bmlen);
+ bmlen = be32_to_cpup(p++);
if (bmlen > 1000)
goto xdr_error;
READ_BUF(bmlen << 2);
if (bmlen > 0)
- READ32(bmval[0]);
+ bmval[0] = be32_to_cpup(p++);
if (bmlen > 1)
- READ32(bmval[1]);
+ bmval[1] = be32_to_cpup(p++);
if (bmlen > 2)
- READ32(bmval[2]);
+ bmval[2] = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -270,6 +265,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
{
int expected_len, len = 0;
u32 dummy32;
+ u64 sec;
char *buf;
DECODE_HEAD;
@@ -278,12 +274,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
return status;
READ_BUF(4);
- READ32(expected_len);
+ expected_len = be32_to_cpup(p++);
if (bmval[0] & FATTR4_WORD0_SIZE) {
READ_BUF(8);
len += 8;
- READ64(iattr->ia_size);
+ p = xdr_decode_hyper(p, &iattr->ia_size);
iattr->ia_valid |= ATTR_SIZE;
}
if (bmval[0] & FATTR4_WORD0_ACL) {
@@ -291,7 +287,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
struct nfs4_ace *ace;
READ_BUF(4); len += 4;
- READ32(nace);
+ nace = be32_to_cpup(p++);
if (nace > NFS4_ACL_MAX)
return nfserr_fbig;
@@ -305,10 +301,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
(*acl)->naces = nace;
for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
READ_BUF(16); len += 16;
- READ32(ace->type);
- READ32(ace->flag);
- READ32(ace->access_mask);
- READ32(dummy32);
+ ace->type = be32_to_cpup(p++);
+ ace->flag = be32_to_cpup(p++);
+ ace->access_mask = be32_to_cpup(p++);
+ dummy32 = be32_to_cpup(p++);
READ_BUF(dummy32);
len += XDR_QUADLEN(dummy32) << 2;
READMEM(buf, dummy32);
@@ -330,14 +326,14 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (bmval[1] & FATTR4_WORD1_MODE) {
READ_BUF(4);
len += 4;
- READ32(iattr->ia_mode);
+ iattr->ia_mode = be32_to_cpup(p++);
iattr->ia_mode &= (S_IFMT | S_IALLUGO);
iattr->ia_valid |= ATTR_MODE;
}
if (bmval[1] & FATTR4_WORD1_OWNER) {
READ_BUF(4);
len += 4;
- READ32(dummy32);
+ dummy32 = be32_to_cpup(p++);
READ_BUF(dummy32);
len += (XDR_QUADLEN(dummy32) << 2);
READMEM(buf, dummy32);
@@ -348,7 +344,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
READ_BUF(4);
len += 4;
- READ32(dummy32);
+ dummy32 = be32_to_cpup(p++);
READ_BUF(dummy32);
len += (XDR_QUADLEN(dummy32) << 2);
READMEM(buf, dummy32);
@@ -359,15 +355,16 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
READ_BUF(4);
len += 4;
- READ32(dummy32);
+ dummy32 = be32_to_cpup(p++);
switch (dummy32) {
case NFS4_SET_TO_CLIENT_TIME:
/* We require the high 32 bits of 'seconds' to be 0, and we ignore
all 32 bits of 'nseconds'. */
READ_BUF(12);
len += 12;
- READ64(iattr->ia_atime.tv_sec);
- READ32(iattr->ia_atime.tv_nsec);
+ p = xdr_decode_hyper(p, &sec);
+ iattr->ia_atime.tv_sec = (time_t)sec;
+ iattr->ia_atime.tv_nsec = be32_to_cpup(p++);
if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
return nfserr_inval;
iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
@@ -382,15 +379,16 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
READ_BUF(4);
len += 4;
- READ32(dummy32);
+ dummy32 = be32_to_cpup(p++);
switch (dummy32) {
case NFS4_SET_TO_CLIENT_TIME:
/* We require the high 32 bits of 'seconds' to be 0, and we ignore
all 32 bits of 'nseconds'. */
READ_BUF(12);
len += 12;
- READ64(iattr->ia_mtime.tv_sec);
- READ32(iattr->ia_mtime.tv_nsec);
+ p = xdr_decode_hyper(p, &sec);
+ iattr->ia_mtime.tv_sec = sec;
+ iattr->ia_mtime.tv_nsec = be32_to_cpup(p++);
if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
return nfserr_inval;
iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
@@ -408,13 +406,13 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
READ_BUF(4);
len += 4;
- READ32(dummy32); /* lfs: we don't use it */
+ dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */
READ_BUF(4);
len += 4;
- READ32(dummy32); /* pi: we don't use it either */
+ dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */
READ_BUF(4);
len += 4;
- READ32(dummy32);
+ dummy32 = be32_to_cpup(p++);
READ_BUF(dummy32);
if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN)
return nfserr_badlabel;
@@ -445,7 +443,7 @@ nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
DECODE_HEAD;
READ_BUF(sizeof(stateid_t));
- READ32(sid->si_generation);
+ sid->si_generation = be32_to_cpup(p++);
COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
DECODE_TAIL;
@@ -457,7 +455,7 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
DECODE_HEAD;
READ_BUF(4);
- READ32(access->ac_req_access);
+ access->ac_req_access = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -472,7 +470,7 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
/* callback_sec_params4 */
READ_BUF(4);
- READ32(nr_secflavs);
+ nr_secflavs = be32_to_cpup(p++);
if (nr_secflavs)
cbs->flavor = (u32)(-1);
else
@@ -480,7 +478,7 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
cbs->flavor = 0;
for (i = 0; i < nr_secflavs; ++i) {
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
switch (dummy) {
case RPC_AUTH_NULL:
/* Nothing to read */
@@ -490,21 +488,21 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
case RPC_AUTH_UNIX:
READ_BUF(8);
/* stamp */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
/* machine name */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
SAVEMEM(machine_name, dummy);
/* uid, gid */
READ_BUF(8);
- READ32(uid);
- READ32(gid);
+ uid = be32_to_cpup(p++);
+ gid = be32_to_cpup(p++);
/* more gids */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy * 4);
if (cbs->flavor == (u32)(-1)) {
kuid_t kuid = make_kuid(&init_user_ns, uid);
@@ -524,14 +522,14 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
"not supported!\n");
READ_BUF(8);
/* gcbp_service */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
/* gcbp_handle_from_server */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
p += XDR_QUADLEN(dummy);
/* gcbp_handle_from_client */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
break;
default:
@@ -547,7 +545,7 @@ static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, stru
DECODE_HEAD;
READ_BUF(4);
- READ32(bc->bc_cb_program);
+ bc->bc_cb_program = be32_to_cpup(p++);
nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
DECODE_TAIL;
@@ -559,7 +557,7 @@ static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp,
READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- READ32(bcts->dir);
+ bcts->dir = be32_to_cpup(p++);
/* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker
* could help us figure out we should be using it. */
DECODE_TAIL;
@@ -571,7 +569,7 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
DECODE_HEAD;
READ_BUF(4);
- READ32(close->cl_seqid);
+ close->cl_seqid = be32_to_cpup(p++);
return nfsd4_decode_stateid(argp, &close->cl_stateid);
DECODE_TAIL;
@@ -584,8 +582,8 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit
DECODE_HEAD;
READ_BUF(12);
- READ64(commit->co_offset);
- READ32(commit->co_count);
+ p = xdr_decode_hyper(p, &commit->co_offset);
+ commit->co_count = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -596,19 +594,19 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
DECODE_HEAD;
READ_BUF(4);
- READ32(create->cr_type);
+ create->cr_type = be32_to_cpup(p++);
switch (create->cr_type) {
case NF4LNK:
READ_BUF(4);
- READ32(create->cr_linklen);
+ create->cr_linklen = be32_to_cpup(p++);
READ_BUF(create->cr_linklen);
SAVEMEM(create->cr_linkname, create->cr_linklen);
break;
case NF4BLK:
case NF4CHR:
READ_BUF(8);
- READ32(create->cr_specdata1);
- READ32(create->cr_specdata2);
+ create->cr_specdata1 = be32_to_cpup(p++);
+ create->cr_specdata2 = be32_to_cpup(p++);
break;
case NF4SOCK:
case NF4FIFO:
@@ -618,7 +616,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
}
READ_BUF(4);
- READ32(create->cr_namelen);
+ create->cr_namelen = be32_to_cpup(p++);
READ_BUF(create->cr_namelen);
SAVEMEM(create->cr_name, create->cr_namelen);
if ((status = check_filename(create->cr_name, create->cr_namelen)))
@@ -650,7 +648,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
DECODE_HEAD;
READ_BUF(4);
- READ32(link->li_namelen);
+ link->li_namelen = be32_to_cpup(p++);
READ_BUF(link->li_namelen);
SAVEMEM(link->li_name, link->li_namelen);
if ((status = check_filename(link->li_name, link->li_namelen)))
@@ -668,24 +666,24 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
* type, reclaim(boolean), offset, length, new_lock_owner(boolean)
*/
READ_BUF(28);
- READ32(lock->lk_type);
+ lock->lk_type = be32_to_cpup(p++);
if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT))
goto xdr_error;
- READ32(lock->lk_reclaim);
- READ64(lock->lk_offset);
- READ64(lock->lk_length);
- READ32(lock->lk_is_new);
+ lock->lk_reclaim = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &lock->lk_offset);
+ p = xdr_decode_hyper(p, &lock->lk_length);
+ lock->lk_is_new = be32_to_cpup(p++);
if (lock->lk_is_new) {
READ_BUF(4);
- READ32(lock->lk_new_open_seqid);
+ lock->lk_new_open_seqid = be32_to_cpup(p++);
status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
if (status)
return status;
READ_BUF(8 + sizeof(clientid_t));
- READ32(lock->lk_new_lock_seqid);
+ lock->lk_new_lock_seqid = be32_to_cpup(p++);
COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
- READ32(lock->lk_new_owner.len);
+ lock->lk_new_owner.len = be32_to_cpup(p++);
READ_BUF(lock->lk_new_owner.len);
READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
} else {
@@ -693,7 +691,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
if (status)
return status;
READ_BUF(4);
- READ32(lock->lk_old_lock_seqid);
+ lock->lk_old_lock_seqid = be32_to_cpup(p++);
}
DECODE_TAIL;
@@ -705,13 +703,13 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
DECODE_HEAD;
READ_BUF(32);
- READ32(lockt->lt_type);
+ lockt->lt_type = be32_to_cpup(p++);
if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT))
goto xdr_error;
- READ64(lockt->lt_offset);
- READ64(lockt->lt_length);
+ p = xdr_decode_hyper(p, &lockt->lt_offset);
+ p = xdr_decode_hyper(p, &lockt->lt_length);
COPYMEM(&lockt->lt_clientid, 8);
- READ32(lockt->lt_owner.len);
+ lockt->lt_owner.len = be32_to_cpup(p++);
READ_BUF(lockt->lt_owner.len);
READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
@@ -724,16 +722,16 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
DECODE_HEAD;
READ_BUF(8);
- READ32(locku->lu_type);
+ locku->lu_type = be32_to_cpup(p++);
if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
goto xdr_error;
- READ32(locku->lu_seqid);
+ locku->lu_seqid = be32_to_cpup(p++);
status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
if (status)
return status;
READ_BUF(16);
- READ64(locku->lu_offset);
- READ64(locku->lu_length);
+ p = xdr_decode_hyper(p, &locku->lu_offset);
+ p = xdr_decode_hyper(p, &locku->lu_length);
DECODE_TAIL;
}
@@ -744,7 +742,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
DECODE_HEAD;
READ_BUF(4);
- READ32(lookup->lo_len);
+ lookup->lo_len = be32_to_cpup(p++);
READ_BUF(lookup->lo_len);
SAVEMEM(lookup->lo_name, lookup->lo_len);
if ((status = check_filename(lookup->lo_name, lookup->lo_len)))
@@ -759,7 +757,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh
u32 w;
READ_BUF(4);
- READ32(w);
+ w = be32_to_cpup(p++);
*share_access = w & NFS4_SHARE_ACCESS_MASK;
*deleg_want = w & NFS4_SHARE_WANT_MASK;
if (deleg_when)
@@ -811,7 +809,7 @@ static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
__be32 *p;
READ_BUF(4);
- READ32(*x);
+ *x = be32_to_cpup(p++);
/* Note: unlinke access bits, deny bits may be zero. */
if (*x & ~NFS4_SHARE_DENY_BOTH)
return nfserr_bad_xdr;
@@ -825,7 +823,7 @@ static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_ne
__be32 *p;
READ_BUF(4);
- READ32(o->len);
+ o->len = be32_to_cpup(p++);
if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT)
return nfserr_bad_xdr;
@@ -850,7 +848,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
open->op_xdr_error = 0;
/* seqid, share_access, share_deny, clientid, ownerlen */
READ_BUF(4);
- READ32(open->op_seqid);
+ open->op_seqid = be32_to_cpup(p++);
/* decode, yet ignore deleg_when until supported */
status = nfsd4_decode_share_access(argp, &open->op_share_access,
&open->op_deleg_want, &dummy);
@@ -865,13 +863,13 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
if (status)
goto xdr_error;
READ_BUF(4);
- READ32(open->op_create);
+ open->op_create = be32_to_cpup(p++);
switch (open->op_create) {
case NFS4_OPEN_NOCREATE:
break;
case NFS4_OPEN_CREATE:
READ_BUF(4);
- READ32(open->op_createmode);
+ open->op_createmode = be32_to_cpup(p++);
switch (open->op_createmode) {
case NFS4_CREATE_UNCHECKED:
case NFS4_CREATE_GUARDED:
@@ -904,12 +902,12 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
/* open_claim */
READ_BUF(4);
- READ32(open->op_claim_type);
+ open->op_claim_type = be32_to_cpup(p++);
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_NULL:
case NFS4_OPEN_CLAIM_DELEGATE_PREV:
READ_BUF(4);
- READ32(open->op_fname.len);
+ open->op_fname.len = be32_to_cpup(p++);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
@@ -917,14 +915,14 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
READ_BUF(4);
- READ32(open->op_delegate_type);
+ open->op_delegate_type = be32_to_cpup(p++);
break;
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
if (status)
return status;
READ_BUF(4);
- READ32(open->op_fname.len);
+ open->op_fname.len = be32_to_cpup(p++);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
@@ -962,7 +960,7 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
if (status)
return status;
READ_BUF(4);
- READ32(open_conf->oc_seqid);
+ open_conf->oc_seqid = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -976,7 +974,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
if (status)
return status;
READ_BUF(4);
- READ32(open_down->od_seqid);
+ open_down->od_seqid = be32_to_cpup(p++);
status = nfsd4_decode_share_access(argp, &open_down->od_share_access,
&open_down->od_deleg_want, NULL);
if (status)
@@ -993,7 +991,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
DECODE_HEAD;
READ_BUF(4);
- READ32(putfh->pf_fhlen);
+ putfh->pf_fhlen = be32_to_cpup(p++);
if (putfh->pf_fhlen > NFS4_FHSIZE)
goto xdr_error;
READ_BUF(putfh->pf_fhlen);
@@ -1019,8 +1017,8 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
if (status)
return status;
READ_BUF(12);
- READ64(read->rd_offset);
- READ32(read->rd_length);
+ p = xdr_decode_hyper(p, &read->rd_offset);
+ read->rd_length = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -1031,10 +1029,10 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
DECODE_HEAD;
READ_BUF(24);
- READ64(readdir->rd_cookie);
+ p = xdr_decode_hyper(p, &readdir->rd_cookie);
COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data));
- READ32(readdir->rd_dircount); /* just in case you needed a useless field... */
- READ32(readdir->rd_maxcount);
+ readdir->rd_dircount = be32_to_cpup(p++);
+ readdir->rd_maxcount = be32_to_cpup(p++);
if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval)))
goto out;
@@ -1047,7 +1045,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
DECODE_HEAD;
READ_BUF(4);
- READ32(remove->rm_namelen);
+ remove->rm_namelen = be32_to_cpup(p++);
READ_BUF(remove->rm_namelen);
SAVEMEM(remove->rm_name, remove->rm_namelen);
if ((status = check_filename(remove->rm_name, remove->rm_namelen)))
@@ -1062,10 +1060,10 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
DECODE_HEAD;
READ_BUF(4);
- READ32(rename->rn_snamelen);
+ rename->rn_snamelen = be32_to_cpup(p++);
READ_BUF(rename->rn_snamelen + 4);
SAVEMEM(rename->rn_sname, rename->rn_snamelen);
- READ32(rename->rn_tnamelen);
+ rename->rn_tnamelen = be32_to_cpup(p++);
READ_BUF(rename->rn_tnamelen);
SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))
@@ -1097,7 +1095,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
DECODE_HEAD;
READ_BUF(4);
- READ32(secinfo->si_namelen);
+ secinfo->si_namelen = be32_to_cpup(p++);
READ_BUF(secinfo->si_namelen);
SAVEMEM(secinfo->si_name, secinfo->si_namelen);
status = check_filename(secinfo->si_name, secinfo->si_namelen);
@@ -1113,7 +1111,7 @@ nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
DECODE_HEAD;
READ_BUF(4);
- READ32(sin->sin_style);
+ sin->sin_style = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -1144,16 +1142,16 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
if (status)
return nfserr_bad_xdr;
READ_BUF(8);
- READ32(setclientid->se_callback_prog);
- READ32(setclientid->se_callback_netid_len);
+ setclientid->se_callback_prog = be32_to_cpup(p++);
+ setclientid->se_callback_netid_len = be32_to_cpup(p++);
READ_BUF(setclientid->se_callback_netid_len + 4);
SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
- READ32(setclientid->se_callback_addr_len);
+ setclientid->se_callback_addr_len = be32_to_cpup(p++);
READ_BUF(setclientid->se_callback_addr_len + 4);
SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
- READ32(setclientid->se_callback_ident);
+ setclientid->se_callback_ident = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -1186,7 +1184,7 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify
* nfsd4_proc_verify */
READ_BUF(4);
- READ32(verify->ve_attrlen);
+ verify->ve_attrlen = be32_to_cpup(p++);
READ_BUF(verify->ve_attrlen);
SAVEMEM(verify->ve_attrval, verify->ve_attrlen);
@@ -1204,11 +1202,11 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
if (status)
return status;
READ_BUF(16);
- READ64(write->wr_offset);
- READ32(write->wr_stable_how);
+ p = xdr_decode_hyper(p, &write->wr_offset);
+ write->wr_stable_how = be32_to_cpup(p++);
if (write->wr_stable_how > 2)
goto xdr_error;
- READ32(write->wr_buflen);
+ write->wr_buflen = be32_to_cpup(p++);
/* Sorry .. no magic macros for this.. *
* READ_BUF(write->wr_buflen);
@@ -1254,7 +1252,7 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
READ_BUF(12);
COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t));
- READ32(rlockowner->rl_owner.len);
+ rlockowner->rl_owner.len = be32_to_cpup(p++);
READ_BUF(rlockowner->rl_owner.len);
READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
@@ -1278,63 +1276,63 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
return nfserr_bad_xdr;
READ_BUF(4);
- READ32(exid->flags);
+ exid->flags = be32_to_cpup(p++);
/* Ignore state_protect4_a */
READ_BUF(4);
- READ32(exid->spa_how);
+ exid->spa_how = be32_to_cpup(p++);
switch (exid->spa_how) {
case SP4_NONE:
break;
case SP4_MACH_CRED:
/* spo_must_enforce */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy * 4);
p += dummy;
/* spo_must_allow */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy * 4);
p += dummy;
break;
case SP4_SSV:
/* ssp_ops */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy * 4);
p += dummy;
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy * 4);
p += dummy;
/* ssp_hash_algs<> */
READ_BUF(4);
- READ32(tmp);
+ tmp = be32_to_cpup(p++);
while (tmp--) {
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
p += XDR_QUADLEN(dummy);
}
/* ssp_encr_algs<> */
READ_BUF(4);
- READ32(tmp);
+ tmp = be32_to_cpup(p++);
while (tmp--) {
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
p += XDR_QUADLEN(dummy);
}
/* ssp_window and ssp_num_gss_handles */
READ_BUF(8);
- READ32(dummy);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
+ dummy = be32_to_cpup(p++);
break;
default:
goto xdr_error;
@@ -1342,7 +1340,7 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
/* Ignore Implementation ID */
READ_BUF(4); /* nfs_impl_id4 array length */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
if (dummy > 1)
goto xdr_error;
@@ -1350,13 +1348,13 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
if (dummy == 1) {
/* nii_domain */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
p += XDR_QUADLEN(dummy);
/* nii_name */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
p += XDR_QUADLEN(dummy);
@@ -1376,21 +1374,21 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
READ_BUF(16);
COPYMEM(&sess->clientid, 8);
- READ32(sess->seqid);
- READ32(sess->flags);
+ sess->seqid = be32_to_cpup(p++);
+ sess->flags = be32_to_cpup(p++);
/* Fore channel attrs */
READ_BUF(28);
- READ32(dummy); /* headerpadsz is always 0 */
- READ32(sess->fore_channel.maxreq_sz);
- READ32(sess->fore_channel.maxresp_sz);
- READ32(sess->fore_channel.maxresp_cached);
- READ32(sess->fore_channel.maxops);
- READ32(sess->fore_channel.maxreqs);
- READ32(sess->fore_channel.nr_rdma_attrs);
+ dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+ sess->fore_channel.maxreq_sz = be32_to_cpup(p++);
+ sess->fore_channel.maxresp_sz = be32_to_cpup(p++);
+ sess->fore_channel.maxresp_cached = be32_to_cpup(p++);
+ sess->fore_channel.maxops = be32_to_cpup(p++);
+ sess->fore_channel.maxreqs = be32_to_cpup(p++);
+ sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++);
if (sess->fore_channel.nr_rdma_attrs == 1) {
READ_BUF(4);
- READ32(sess->fore_channel.rdma_attrs);
+ sess->fore_channel.rdma_attrs = be32_to_cpup(p++);
} else if (sess->fore_channel.nr_rdma_attrs > 1) {
dprintk("Too many fore channel attr bitmaps!\n");
goto xdr_error;
@@ -1398,23 +1396,23 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
/* Back channel attrs */
READ_BUF(28);
- READ32(dummy); /* headerpadsz is always 0 */
- READ32(sess->back_channel.maxreq_sz);
- READ32(sess->back_channel.maxresp_sz);
- READ32(sess->back_channel.maxresp_cached);
- READ32(sess->back_channel.maxops);
- READ32(sess->back_channel.maxreqs);
- READ32(sess->back_channel.nr_rdma_attrs);
+ dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+ sess->back_channel.maxreq_sz = be32_to_cpup(p++);
+ sess->back_channel.maxresp_sz = be32_to_cpup(p++);
+ sess->back_channel.maxresp_cached = be32_to_cpup(p++);
+ sess->back_channel.maxops = be32_to_cpup(p++);
+ sess->back_channel.maxreqs = be32_to_cpup(p++);
+ sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++);
if (sess->back_channel.nr_rdma_attrs == 1) {
READ_BUF(4);
- READ32(sess->back_channel.rdma_attrs);
+ sess->back_channel.rdma_attrs = be32_to_cpup(p++);
} else if (sess->back_channel.nr_rdma_attrs > 1) {
dprintk("Too many back channel attr bitmaps!\n");
goto xdr_error;
}
READ_BUF(4);
- READ32(sess->callback_prog);
+ sess->callback_prog = be32_to_cpup(p++);
nfsd4_decode_cb_sec(argp, &sess->cb_sec);
DECODE_TAIL;
}
@@ -1437,7 +1435,7 @@ nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
DECODE_HEAD;
READ_BUF(sizeof(stateid_t));
- READ32(free_stateid->fr_stateid.si_generation);
+ free_stateid->fr_stateid.si_generation = be32_to_cpup(p++);
COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t));
DECODE_TAIL;
@@ -1451,10 +1449,10 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- READ32(seq->seqid);
- READ32(seq->slotid);
- READ32(seq->maxslots);
- READ32(seq->cachethis);
+ seq->seqid = be32_to_cpup(p++);
+ seq->slotid = be32_to_cpup(p++);
+ seq->maxslots = be32_to_cpup(p++);
+ seq->cachethis = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -1511,7 +1509,7 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
DECODE_HEAD;
READ_BUF(4);
- READ32(rc->rca_one_fs);
+ rc->rca_one_fs = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -1605,47 +1603,25 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
return true;
}
-/*
- * Return a rough estimate of the maximum possible reply size. Note the
- * estimate includes rpc headers so is meant to be passed to
- * svc_reserve, not svc_reserve_auth.
- *
- * Also note the current compound encoding permits only one operation to
- * use pages beyond the first one, so the maximum possible length is the
- * maximum over these values, not the sum.
- */
-static int nfsd4_max_reply(u32 opnum)
-{
- switch (opnum) {
- case OP_READLINK:
- case OP_READDIR:
- /*
- * Both of these ops take a single page for data and put
- * the head and tail in another page:
- */
- return 2 * PAGE_SIZE;
- case OP_READ:
- return INT_MAX;
- default:
- return PAGE_SIZE;
- }
-}
-
static __be32
nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
{
DECODE_HEAD;
struct nfsd4_op *op;
bool cachethis = false;
- int max_reply = PAGE_SIZE;
+ int auth_slack= argp->rqstp->rq_auth_slack;
+ int max_reply = auth_slack + 8; /* opcnt, status */
+ int readcount = 0;
+ int readbytes = 0;
int i;
READ_BUF(4);
- READ32(argp->taglen);
+ argp->taglen = be32_to_cpup(p++);
READ_BUF(argp->taglen + 8);
SAVEMEM(argp->tag, argp->taglen);
- READ32(argp->minorversion);
- READ32(argp->opcnt);
+ argp->minorversion = be32_to_cpup(p++);
+ argp->opcnt = be32_to_cpup(p++);
+ max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
if (argp->taglen > NFSD4_MAX_TAGLEN)
goto xdr_error;
@@ -1669,7 +1645,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
op->replay = NULL;
READ_BUF(4);
- READ32(op->opnum);
+ op->opnum = be32_to_cpup(p++);
if (nfsd4_opnum_in_range(argp, op))
op->status = nfsd4_dec_ops[op->opnum](argp, &op->u);
@@ -1677,97 +1653,82 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
op->opnum = OP_ILLEGAL;
op->status = nfserr_op_illegal;
}
-
- if (op->status) {
- argp->opcnt = i+1;
- break;
- }
/*
* We'll try to cache the result in the DRC if any one
* op in the compound wants to be cached:
*/
cachethis |= nfsd4_cache_this_op(op);
- max_reply = max(max_reply, nfsd4_max_reply(op->opnum));
+ if (op->opnum == OP_READ) {
+ readcount++;
+ readbytes += nfsd4_max_reply(argp->rqstp, op);
+ } else
+ max_reply += nfsd4_max_reply(argp->rqstp, op);
+
+ if (op->status) {
+ argp->opcnt = i+1;
+ break;
+ }
}
/* Sessions make the DRC unnecessary: */
if (argp->minorversion)
cachethis = false;
- if (max_reply != INT_MAX)
- svc_reserve(argp->rqstp, max_reply);
+ svc_reserve(argp->rqstp, max_reply + readbytes);
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
- DECODE_TAIL;
-}
-
-#define WRITE32(n) *p++ = htonl(n)
-#define WRITE64(n) do { \
- *p++ = htonl((u32)((n) >> 32)); \
- *p++ = htonl((u32)(n)); \
-} while (0)
-#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \
- *(p + XDR_QUADLEN(nbytes) -1) = 0; \
- memcpy(p, ptr, nbytes); \
- p += XDR_QUADLEN(nbytes); \
-}} while (0)
-
-static void write32(__be32 **p, u32 n)
-{
- *(*p)++ = htonl(n);
-}
+ if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
+ argp->rqstp->rq_splice_ok = false;
-static void write64(__be32 **p, u64 n)
-{
- write32(p, (n >> 32));
- write32(p, (u32)n);
+ DECODE_TAIL;
}
-static void write_change(__be32 **p, struct kstat *stat, struct inode *inode)
+static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode)
{
if (IS_I_VERSION(inode)) {
- write64(p, inode->i_version);
+ p = xdr_encode_hyper(p, inode->i_version);
} else {
- write32(p, stat->ctime.tv_sec);
- write32(p, stat->ctime.tv_nsec);
+ *p++ = cpu_to_be32(stat->ctime.tv_sec);
+ *p++ = cpu_to_be32(stat->ctime.tv_nsec);
}
+ return p;
}
-static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
+static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c)
{
- write32(p, c->atomic);
+ *p++ = cpu_to_be32(c->atomic);
if (c->change_supported) {
- write64(p, c->before_change);
- write64(p, c->after_change);
+ p = xdr_encode_hyper(p, c->before_change);
+ p = xdr_encode_hyper(p, c->after_change);
} else {
- write32(p, c->before_ctime_sec);
- write32(p, c->before_ctime_nsec);
- write32(p, c->after_ctime_sec);
- write32(p, c->after_ctime_nsec);
+ *p++ = cpu_to_be32(c->before_ctime_sec);
+ *p++ = cpu_to_be32(c->before_ctime_nsec);
+ *p++ = cpu_to_be32(c->after_ctime_sec);
+ *p++ = cpu_to_be32(c->after_ctime_nsec);
}
+ return p;
}
-#define RESERVE_SPACE(nbytes) do { \
- p = resp->p; \
- BUG_ON(p + XDR_QUADLEN(nbytes) > resp->end); \
-} while (0)
-#define ADJUST_ARGS() resp->p = p
-
/* Encode as an array of strings the string given with components
* separated @sep, escaped with esc_enter and esc_exit.
*/
-static __be32 nfsd4_encode_components_esc(char sep, char *components,
- __be32 **pp, int *buflen,
- char esc_enter, char esc_exit)
+static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
+ char *components, char esc_enter,
+ char esc_exit)
{
- __be32 *p = *pp;
- __be32 *countp = p;
+ __be32 *p;
+ __be32 pathlen;
+ int pathlen_offset;
int strlen, count=0;
char *str, *end, *next;
dprintk("nfsd4_encode_components(%s)\n", components);
- if ((*buflen -= 4) < 0)
+
+ pathlen_offset = xdr->buf->len;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
return nfserr_resource;
- WRITE32(0); /* We will fill this in with @count later */
+ p++; /* We will fill this in with @count later */
+
end = str = components;
while (*end) {
bool found_esc = false;
@@ -1789,59 +1750,57 @@ static __be32 nfsd4_encode_components_esc(char sep, char *components,
strlen = end - str;
if (strlen) {
- if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0)
+ p = xdr_reserve_space(xdr, strlen + 4);
+ if (!p)
return nfserr_resource;
- WRITE32(strlen);
- WRITEMEM(str, strlen);
+ p = xdr_encode_opaque(p, str, strlen);
count++;
}
else
end++;
str = end;
}
- *pp = p;
- p = countp;
- WRITE32(count);
+ pathlen = htonl(xdr->buf->len - pathlen_offset);
+ write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4);
return 0;
}
/* Encode as an array of strings the string given with components
* separated @sep.
*/
-static __be32 nfsd4_encode_components(char sep, char *components,
- __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep,
+ char *components)
{
- return nfsd4_encode_components_esc(sep, components, pp, buflen, 0, 0);
+ return nfsd4_encode_components_esc(xdr, sep, components, 0, 0);
}
/*
* encode a location element of a fs_locations structure
*/
-static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
- __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
+ struct nfsd4_fs_location *location)
{
__be32 status;
- __be32 *p = *pp;
- status = nfsd4_encode_components_esc(':', location->hosts, &p, buflen,
+ status = nfsd4_encode_components_esc(xdr, ':', location->hosts,
'[', ']');
if (status)
return status;
- status = nfsd4_encode_components('/', location->path, &p, buflen);
+ status = nfsd4_encode_components(xdr, '/', location->path);
if (status)
return status;
- *pp = p;
return 0;
}
/*
* Encode a path in RFC3530 'pathname4' format
*/
-static __be32 nfsd4_encode_path(const struct path *root,
- const struct path *path, __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
+ const struct path *root,
+ const struct path *path)
{
struct path cur = *path;
- __be32 *p = *pp;
+ __be32 *p;
struct dentry **components = NULL;
unsigned int ncomponents = 0;
__be32 err = nfserr_jukebox;
@@ -1872,11 +1831,11 @@ static __be32 nfsd4_encode_path(const struct path *root,
components[ncomponents++] = cur.dentry;
cur.dentry = dget_parent(cur.dentry);
}
-
- *buflen -= 4;
- if (*buflen < 0)
+ err = nfserr_resource;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_free;
- WRITE32(ncomponents);
+ *p++ = cpu_to_be32(ncomponents);
while (ncomponents) {
struct dentry *dentry = components[ncomponents - 1];
@@ -1884,20 +1843,18 @@ static __be32 nfsd4_encode_path(const struct path *root,
spin_lock(&dentry->d_lock);
len = dentry->d_name.len;
- *buflen -= 4 + (XDR_QUADLEN(len) << 2);
- if (*buflen < 0) {
+ p = xdr_reserve_space(xdr, len + 4);
+ if (!p) {
spin_unlock(&dentry->d_lock);
goto out_free;
}
- WRITE32(len);
- WRITEMEM(dentry->d_name.name, len);
+ p = xdr_encode_opaque(p, dentry->d_name.name, len);
dprintk("/%s", dentry->d_name.name);
spin_unlock(&dentry->d_lock);
dput(dentry);
ncomponents--;
}
- *pp = p;
err = 0;
out_free:
dprintk(")\n");
@@ -1908,8 +1865,8 @@ out_free:
return err;
}
-static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
- const struct path *path, __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp, const struct path *path)
{
struct svc_export *exp_ps;
__be32 res;
@@ -1917,7 +1874,7 @@ static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
exp_ps = rqst_find_fsidzero_export(rqstp);
if (IS_ERR(exp_ps))
return nfserrno(PTR_ERR(exp_ps));
- res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen);
+ res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path);
exp_put(exp_ps);
return res;
}
@@ -1925,28 +1882,26 @@ static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
/*
* encode a fs_locations structure
*/
-static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
- struct svc_export *exp,
- __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp, struct svc_export *exp)
{
__be32 status;
int i;
- __be32 *p = *pp;
+ __be32 *p;
struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
- status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen);
+ status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path);
if (status)
return status;
- if ((*buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
return nfserr_resource;
- WRITE32(fslocs->locations_count);
+ *p++ = cpu_to_be32(fslocs->locations_count);
for (i=0; i<fslocs->locations_count; i++) {
- status = nfsd4_encode_fs_location4(&fslocs->locations[i],
- &p, buflen);
+ status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]);
if (status)
return status;
}
- *pp = p;
return 0;
}
@@ -1965,15 +1920,15 @@ static u32 nfs4_file_type(umode_t mode)
}
static inline __be32
-nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
- __be32 **p, int *buflen)
+nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ struct nfs4_ace *ace)
{
if (ace->whotype != NFS4_ACL_WHO_NAMED)
- return nfs4_acl_write_who(ace->whotype, p, buflen);
+ return nfs4_acl_write_who(xdr, ace->whotype);
else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
- return nfsd4_encode_group(rqstp, ace->who_gid, p, buflen);
+ return nfsd4_encode_group(xdr, rqstp, ace->who_gid);
else
- return nfsd4_encode_user(rqstp, ace->who_uid, p, buflen);
+ return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
}
#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
@@ -1982,31 +1937,28 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
static inline __be32
-nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ void *context, int len)
{
- __be32 *p = *pp;
+ __be32 *p;
- if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4))
+ p = xdr_reserve_space(xdr, len + 4 + 4 + 4);
+ if (!p)
return nfserr_resource;
/*
* For now we use a 0 here to indicate the null translation; in
* the future we may place a call to translation code here.
*/
- if ((*buflen -= 8) < 0)
- return nfserr_resource;
-
- WRITE32(0); /* lfs */
- WRITE32(0); /* pi */
+ *p++ = cpu_to_be32(0); /* lfs */
+ *p++ = cpu_to_be32(0); /* pi */
p = xdr_encode_opaque(p, context, len);
- *buflen -= (XDR_QUADLEN(len) << 2) + 4;
-
- *pp = p;
return 0;
}
#else
static inline __be32
-nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ void *context, int len)
{ return 0; }
#endif
@@ -2045,12 +1997,11 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
/*
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
* ourselves.
- *
- * countp is the buffer size in _words_
*/
-__be32
-nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- struct dentry *dentry, __be32 **buffer, int count, u32 *bmval,
+static __be32
+nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ struct svc_export *exp,
+ struct dentry *dentry, u32 *bmval,
struct svc_rqst *rqstp, int ignore_crossmnt)
{
u32 bmval0 = bmval[0];
@@ -2059,12 +2010,13 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
struct kstat stat;
struct svc_fh *tempfh = NULL;
struct kstatfs statfs;
- int buflen = count << 2;
- __be32 *attrlenp;
+ __be32 *p;
+ int starting_len = xdr->buf->len;
+ int attrlen_offset;
+ __be32 attrlen;
u32 dummy;
u64 dummy64;
u32 rdattr_err = 0;
- __be32 *p = *buffer;
__be32 status;
int err;
int aclsupport = 0;
@@ -2095,8 +2047,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
err = vfs_getattr(&path, &stat);
if (err)
goto out_nfserr;
- if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL |
- FATTR4_WORD0_MAXNAME)) ||
+ if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+ FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
(bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
FATTR4_WORD1_SPACE_TOTAL))) {
err = vfs_statfs(&path, &statfs);
@@ -2145,25 +2097,33 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
if (bmval2) {
- if ((buflen -= 16) < 0)
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
goto out_resource;
- WRITE32(3);
- WRITE32(bmval0);
- WRITE32(bmval1);
- WRITE32(bmval2);
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(bmval0);
+ *p++ = cpu_to_be32(bmval1);
+ *p++ = cpu_to_be32(bmval2);
} else if (bmval1) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE32(2);
- WRITE32(bmval0);
- WRITE32(bmval1);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(bmval0);
+ *p++ = cpu_to_be32(bmval1);
} else {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE32(1);
- WRITE32(bmval0);
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(bmval0);
}
- attrlenp = p++; /* to be backfilled later */
+
+ attrlen_offset = xdr->buf->len;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ p++; /* to be backfilled later */
if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
u32 word0 = nfsd_suppattrs0(minorversion);
@@ -2175,296 +2135,343 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (!contextsupport)
word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
if (!word2) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE32(2);
- WRITE32(word0);
- WRITE32(word1);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(word0);
+ *p++ = cpu_to_be32(word1);
} else {
- if ((buflen -= 16) < 0)
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
goto out_resource;
- WRITE32(3);
- WRITE32(word0);
- WRITE32(word1);
- WRITE32(word2);
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(word0);
+ *p++ = cpu_to_be32(word1);
+ *p++ = cpu_to_be32(word2);
}
}
if (bmval0 & FATTR4_WORD0_TYPE) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
dummy = nfs4_file_type(stat.mode);
if (dummy == NF4BAD) {
status = nfserr_serverfault;
goto out;
}
- WRITE32(dummy);
+ *p++ = cpu_to_be32(dummy);
}
if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
- WRITE32(NFS4_FH_PERSISTENT);
+ *p++ = cpu_to_be32(NFS4_FH_PERSISTENT);
else
- WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME);
+ *p++ = cpu_to_be32(NFS4_FH_PERSISTENT|
+ NFS4_FH_VOL_RENAME);
}
if (bmval0 & FATTR4_WORD0_CHANGE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- write_change(&p, &stat, dentry->d_inode);
+ p = encode_change(p, &stat, dentry->d_inode);
}
if (bmval0 & FATTR4_WORD0_SIZE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64(stat.size);
+ p = xdr_encode_hyper(p, stat.size);
}
if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_NAMED_ATTR) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
}
if (bmval0 & FATTR4_WORD0_FSID) {
- if ((buflen -= 16) < 0)
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
goto out_resource;
if (exp->ex_fslocs.migrated) {
- WRITE64(NFS4_REFERRAL_FSID_MAJOR);
- WRITE64(NFS4_REFERRAL_FSID_MINOR);
+ p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
+ p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
} else switch(fsid_source(fhp)) {
case FSIDSOURCE_FSID:
- WRITE64((u64)exp->ex_fsid);
- WRITE64((u64)0);
+ p = xdr_encode_hyper(p, (u64)exp->ex_fsid);
+ p = xdr_encode_hyper(p, (u64)0);
break;
case FSIDSOURCE_DEV:
- WRITE32(0);
- WRITE32(MAJOR(stat.dev));
- WRITE32(0);
- WRITE32(MINOR(stat.dev));
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(MAJOR(stat.dev));
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(MINOR(stat.dev));
break;
case FSIDSOURCE_UUID:
- WRITEMEM(exp->ex_uuid, 16);
+ p = xdr_encode_opaque_fixed(p, exp->ex_uuid,
+ EX_UUID_LEN);
break;
}
}
if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
}
if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(nn->nfsd4_lease);
+ *p++ = cpu_to_be32(nn->nfsd4_lease);
}
if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(rdattr_err);
+ *p++ = cpu_to_be32(rdattr_err);
}
if (bmval0 & FATTR4_WORD0_ACL) {
struct nfs4_ace *ace;
if (acl == NULL) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
goto out_acl;
}
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(acl->naces);
+ *p++ = cpu_to_be32(acl->naces);
for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
- if ((buflen -= 4*3) < 0)
+ p = xdr_reserve_space(xdr, 4*3);
+ if (!p)
goto out_resource;
- WRITE32(ace->type);
- WRITE32(ace->flag);
- WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL);
- status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen);
+ *p++ = cpu_to_be32(ace->type);
+ *p++ = cpu_to_be32(ace->flag);
+ *p++ = cpu_to_be32(ace->access_mask &
+ NFS4_ACE_MASK_ALL);
+ status = nfsd4_encode_aclname(xdr, rqstp, ace);
if (status)
goto out;
}
}
out_acl:
if (bmval0 & FATTR4_WORD0_ACLSUPPORT) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(aclsupport ?
+ *p++ = cpu_to_be32(aclsupport ?
ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
}
if (bmval0 & FATTR4_WORD0_CANSETTIME) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
}
if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_FILEHANDLE) {
- buflen -= (XDR_QUADLEN(fhp->fh_handle.fh_size) << 2) + 4;
- if (buflen < 0)
+ p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
+ if (!p)
goto out_resource;
- WRITE32(fhp->fh_handle.fh_size);
- WRITEMEM(&fhp->fh_handle.fh_base, fhp->fh_handle.fh_size);
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base,
+ fhp->fh_handle.fh_size);
}
if (bmval0 & FATTR4_WORD0_FILEID) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64(stat.ino);
+ p = xdr_encode_hyper(p, stat.ino);
}
if (bmval0 & FATTR4_WORD0_FILES_AVAIL) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64((u64) statfs.f_ffree);
+ p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
}
if (bmval0 & FATTR4_WORD0_FILES_FREE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64((u64) statfs.f_ffree);
+ p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
}
if (bmval0 & FATTR4_WORD0_FILES_TOTAL) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64((u64) statfs.f_files);
+ p = xdr_encode_hyper(p, (u64) statfs.f_files);
}
if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
- status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen);
+ status = nfsd4_encode_fs_locations(xdr, rqstp, exp);
if (status)
goto out;
}
if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_MAXFILESIZE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64(exp->ex_path.mnt->mnt_sb->s_maxbytes);
+ p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes);
}
if (bmval0 & FATTR4_WORD0_MAXLINK) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(255);
+ *p++ = cpu_to_be32(255);
}
if (bmval0 & FATTR4_WORD0_MAXNAME) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(statfs.f_namelen);
+ *p++ = cpu_to_be32(statfs.f_namelen);
}
if (bmval0 & FATTR4_WORD0_MAXREAD) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64((u64) svc_max_payload(rqstp));
+ p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
}
if (bmval0 & FATTR4_WORD0_MAXWRITE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64((u64) svc_max_payload(rqstp));
+ p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
}
if (bmval1 & FATTR4_WORD1_MODE) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(stat.mode & S_IALLUGO);
+ *p++ = cpu_to_be32(stat.mode & S_IALLUGO);
}
if (bmval1 & FATTR4_WORD1_NO_TRUNC) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval1 & FATTR4_WORD1_NUMLINKS) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(stat.nlink);
+ *p++ = cpu_to_be32(stat.nlink);
}
if (bmval1 & FATTR4_WORD1_OWNER) {
- status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen);
+ status = nfsd4_encode_user(xdr, rqstp, stat.uid);
if (status)
goto out;
}
if (bmval1 & FATTR4_WORD1_OWNER_GROUP) {
- status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen);
+ status = nfsd4_encode_group(xdr, rqstp, stat.gid);
if (status)
goto out;
}
if (bmval1 & FATTR4_WORD1_RAWDEV) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE32((u32) MAJOR(stat.rdev));
- WRITE32((u32) MINOR(stat.rdev));
+ *p++ = cpu_to_be32((u32) MAJOR(stat.rdev));
+ *p++ = cpu_to_be32((u32) MINOR(stat.rdev));
}
if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize;
- WRITE64(dummy64);
+ p = xdr_encode_hyper(p, dummy64);
}
if (bmval1 & FATTR4_WORD1_SPACE_FREE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize;
- WRITE64(dummy64);
+ p = xdr_encode_hyper(p, dummy64);
}
if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize;
- WRITE64(dummy64);
+ p = xdr_encode_hyper(p, dummy64);
}
if (bmval1 & FATTR4_WORD1_SPACE_USED) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
dummy64 = (u64)stat.blocks << 9;
- WRITE64(dummy64);
+ p = xdr_encode_hyper(p, dummy64);
}
if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE64((s64)stat.atime.tv_sec);
- WRITE32(stat.atime.tv_nsec);
+ p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec);
+ *p++ = cpu_to_be32(stat.atime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE32(0);
- WRITE32(1);
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(0);
}
if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE64((s64)stat.ctime.tv_sec);
- WRITE32(stat.ctime.tv_nsec);
+ p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec);
+ *p++ = cpu_to_be32(stat.ctime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE64((s64)stat.mtime.tv_sec);
- WRITE32(stat.mtime.tv_nsec);
+ p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
+ *p++ = cpu_to_be32(stat.mtime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
/*
* Get parent's attributes if not ignoring crossmount
@@ -2473,25 +2480,26 @@ out_acl:
if (ignore_crossmnt == 0 &&
dentry == exp->ex_path.mnt->mnt_root)
get_parent_attributes(exp, &stat);
- WRITE64(stat.ino);
+ p = xdr_encode_hyper(p, stat.ino);
}
if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
- status = nfsd4_encode_security_label(rqstp, context,
- contextlen, &p, &buflen);
+ status = nfsd4_encode_security_label(xdr, rqstp, context,
+ contextlen);
if (status)
goto out;
}
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- if ((buflen -= 16) < 0)
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
goto out_resource;
- WRITE32(3);
- WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
- WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
- WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
+ *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
+ *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
}
- *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
- *buffer = p;
+ attrlen = htonl(xdr->buf->len - attrlen_offset - 4);
+ write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4);
status = nfs_ok;
out:
@@ -2504,6 +2512,8 @@ out:
fh_put(tempfh);
kfree(tempfh);
}
+ if (status)
+ xdr_truncate_encode(xdr, starting_len);
return status;
out_nfserr:
status = nfserrno(err);
@@ -2513,6 +2523,37 @@ out_resource:
goto out;
}
+static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr,
+ struct xdr_buf *buf, __be32 *p, int bytes)
+{
+ xdr->scratch.iov_len = 0;
+ memset(buf, 0, sizeof(struct xdr_buf));
+ buf->head[0].iov_base = p;
+ buf->head[0].iov_len = 0;
+ buf->len = 0;
+ xdr->buf = buf;
+ xdr->iov = buf->head;
+ xdr->p = p;
+ xdr->end = (void *)p + bytes;
+ buf->buflen = bytes;
+}
+
+__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
+ struct svc_fh *fhp, struct svc_export *exp,
+ struct dentry *dentry, u32 *bmval,
+ struct svc_rqst *rqstp, int ignore_crossmnt)
+{
+ struct xdr_buf dummy;
+ struct xdr_stream xdr;
+ __be32 ret;
+
+ svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2);
+ ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp,
+ ignore_crossmnt);
+ *p = xdr.p;
+ return ret;
+}
+
static inline int attributes_need_mount(u32 *bmval)
{
if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME))
@@ -2523,8 +2564,8 @@ static inline int attributes_need_mount(u32 *bmval)
}
static __be32
-nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
- const char *name, int namlen, __be32 **p, int buflen)
+nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
+ const char *name, int namlen)
{
struct svc_export *exp = cd->rd_fhp->fh_export;
struct dentry *dentry;
@@ -2576,7 +2617,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
}
out_encode:
- nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
+ nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval,
cd->rd_rqstp, ignore_crossmnt);
out_put:
dput(dentry);
@@ -2585,9 +2626,12 @@ out_put:
}
static __be32 *
-nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr)
+nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
{
- if (buflen < 6)
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 6);
+ if (!p)
return NULL;
*p++ = htonl(2);
*p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */
@@ -2604,10 +2648,13 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
{
struct readdir_cd *ccd = ccdv;
struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
- int buflen;
- __be32 *p = cd->buffer;
- __be32 *cookiep;
+ struct xdr_stream *xdr = cd->xdr;
+ int start_offset = xdr->buf->len;
+ int cookie_offset;
+ int entry_bytes;
__be32 nfserr = nfserr_toosmall;
+ __be64 wire_offset;
+ __be32 *p;
/* In nfsv4, "." and ".." never make it onto the wire.. */
if (name && isdotent(name, namlen)) {
@@ -2615,19 +2662,24 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
return 0;
}
- if (cd->offset)
- xdr_encode_hyper(cd->offset, (u64) offset);
+ if (cd->cookie_offset) {
+ wire_offset = cpu_to_be64(offset);
+ write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset,
+ &wire_offset, 8);
+ }
- buflen = cd->buflen - 4 - XDR_QUADLEN(namlen);
- if (buflen < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto fail;
-
*p++ = xdr_one; /* mark entry present */
- cookiep = p;
+ cookie_offset = xdr->buf->len;
+ p = xdr_reserve_space(xdr, 3*4 + namlen);
+ if (!p)
+ goto fail;
p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
p = xdr_encode_array(p, name, namlen); /* name length & name */
- nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen);
+ nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
switch (nfserr) {
case nfs_ok:
break;
@@ -2635,6 +2687,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
nfserr = nfserr_toosmall;
goto fail;
case nfserr_noent:
+ xdr_truncate_encode(xdr, start_offset);
goto skip_entry;
default:
/*
@@ -2646,59 +2699,74 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
*/
if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR))
goto fail;
- p = nfsd4_encode_rdattr_error(p, buflen, nfserr);
+ p = nfsd4_encode_rdattr_error(xdr, nfserr);
if (p == NULL) {
nfserr = nfserr_toosmall;
goto fail;
}
}
- cd->buflen -= (p - cd->buffer);
- cd->buffer = p;
- cd->offset = cookiep;
+ nfserr = nfserr_toosmall;
+ entry_bytes = xdr->buf->len - start_offset;
+ if (entry_bytes > cd->rd_maxcount)
+ goto fail;
+ cd->rd_maxcount -= entry_bytes;
+ if (!cd->rd_dircount)
+ goto fail;
+ cd->rd_dircount--;
+ cd->cookie_offset = cookie_offset;
skip_entry:
cd->common.err = nfs_ok;
return 0;
fail:
+ xdr_truncate_encode(xdr, start_offset);
cd->common.err = nfserr;
return -EINVAL;
}
-static void
-nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
+static __be32
+nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
{
__be32 *p;
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(sid->si_generation);
- WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, sizeof(stateid_t));
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(sid->si_generation);
+ p = xdr_encode_opaque_fixed(p, &sid->si_opaque,
+ sizeof(stateid_opaque_t));
+ return 0;
}
static __be32
nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(8);
- WRITE32(access->ac_supported);
- WRITE32(access->ac_resp_access);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(access->ac_supported);
+ *p++ = cpu_to_be32(access->ac_resp_access);
}
return nfserr;
}
static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
- WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- WRITE32(bcts->dir);
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(bcts->dir);
/* Sorry, we do not yet support RDMA over 4.1: */
- WRITE32(0);
- ADJUST_ARGS();
+ *p++ = cpu_to_be32(0);
}
return nfserr;
}
@@ -2706,8 +2774,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
static __be32
nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (!nfserr)
- nfsd4_encode_stateid(resp, &close->cl_stateid);
+ nfserr = nfsd4_encode_stateid(xdr, &close->cl_stateid);
return nfserr;
}
@@ -2716,12 +2786,15 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
static __be32
nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(NFS4_VERIFIER_SIZE);
- WRITEMEM(commit->co_verf.data, NFS4_VERIFIER_SIZE);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, commit->co_verf.data,
+ NFS4_VERIFIER_SIZE);
}
return nfserr;
}
@@ -2729,15 +2802,17 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
static __be32
nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(32);
- write_cinfo(&p, &create->cr_cinfo);
- WRITE32(2);
- WRITE32(create->cr_bmval[0]);
- WRITE32(create->cr_bmval[1]);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 32);
+ if (!p)
+ return nfserr_resource;
+ p = encode_cinfo(p, &create->cr_cinfo);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(create->cr_bmval[0]);
+ *p++ = cpu_to_be32(create->cr_bmval[1]);
}
return nfserr;
}
@@ -2746,14 +2821,13 @@ static __be32
nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
{
struct svc_fh *fhp = getattr->ga_fhp;
- int buflen;
+ struct xdr_stream *xdr = &resp->xdr;
if (nfserr)
return nfserr;
- buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);
- nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
- &resp->p, buflen, getattr->ga_bmval,
+ nfserr = nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry,
+ getattr->ga_bmval,
resp->rqstp, 0);
return nfserr;
}
@@ -2761,16 +2835,17 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
static __be32
nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
{
+ struct xdr_stream *xdr = &resp->xdr;
struct svc_fh *fhp = *fhpp;
unsigned int len;
__be32 *p;
if (!nfserr) {
len = fhp->fh_handle.fh_size;
- RESERVE_SPACE(len + 4);
- WRITE32(len);
- WRITEMEM(&fhp->fh_handle.fh_base, len);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, len + 4);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len);
}
return nfserr;
}
@@ -2779,52 +2854,69 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
* Including all fields other than the name, a LOCK4denied structure requires
* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes.
*/
-static void
-nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld)
+static __be32
+nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld)
{
struct xdr_netobj *conf = &ld->ld_owner;
__be32 *p;
- RESERVE_SPACE(32 + XDR_LEN(conf->len));
- WRITE64(ld->ld_start);
- WRITE64(ld->ld_length);
- WRITE32(ld->ld_type);
+again:
+ p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len));
+ if (!p) {
+ /*
+ * Don't fail to return the result just because we can't
+ * return the conflicting open:
+ */
+ if (conf->len) {
+ conf->len = 0;
+ conf->data = NULL;
+ goto again;
+ }
+ return nfserr_resource;
+ }
+ p = xdr_encode_hyper(p, ld->ld_start);
+ p = xdr_encode_hyper(p, ld->ld_length);
+ *p++ = cpu_to_be32(ld->ld_type);
if (conf->len) {
- WRITEMEM(&ld->ld_clientid, 8);
- WRITE32(conf->len);
- WRITEMEM(conf->data, conf->len);
- kfree(conf->data);
+ p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8);
+ p = xdr_encode_opaque(p, conf->data, conf->len);
} else { /* non - nfsv4 lock in conflict, no clientid nor owner */
- WRITE64((u64)0); /* clientid */
- WRITE32(0); /* length of owner name */
+ p = xdr_encode_hyper(p, (u64)0); /* clientid */
+ *p++ = cpu_to_be32(0); /* length of owner name */
}
- ADJUST_ARGS();
+ return nfserr_denied;
}
static __be32
nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (!nfserr)
- nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
+ nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
else if (nfserr == nfserr_denied)
- nfsd4_encode_lock_denied(resp, &lock->lk_denied);
-
+ nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied);
+ kfree(lock->lk_denied.ld_owner.data);
return nfserr;
}
static __be32
nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (nfserr == nfserr_denied)
- nfsd4_encode_lock_denied(resp, &lockt->lt_denied);
+ nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
return nfserr;
}
static __be32
nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (!nfserr)
- nfsd4_encode_stateid(resp, &locku->lu_stateid);
+ nfserr = nfsd4_encode_stateid(xdr, &locku->lu_stateid);
return nfserr;
}
@@ -2833,12 +2925,14 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
static __be32
nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(20);
- write_cinfo(&p, &link->li_cinfo);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 20);
+ if (!p)
+ return nfserr_resource;
+ p = encode_cinfo(p, &link->li_cinfo);
}
return nfserr;
}
@@ -2847,72 +2941,86 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
static __be32
nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (nfserr)
goto out;
- nfsd4_encode_stateid(resp, &open->op_stateid);
- RESERVE_SPACE(40);
- write_cinfo(&p, &open->op_cinfo);
- WRITE32(open->op_rflags);
- WRITE32(2);
- WRITE32(open->op_bmval[0]);
- WRITE32(open->op_bmval[1]);
- WRITE32(open->op_delegate_type);
- ADJUST_ARGS();
+ nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
+ if (nfserr)
+ goto out;
+ p = xdr_reserve_space(xdr, 40);
+ if (!p)
+ return nfserr_resource;
+ p = encode_cinfo(p, &open->op_cinfo);
+ *p++ = cpu_to_be32(open->op_rflags);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(open->op_bmval[0]);
+ *p++ = cpu_to_be32(open->op_bmval[1]);
+ *p++ = cpu_to_be32(open->op_delegate_type);
switch (open->op_delegate_type) {
case NFS4_OPEN_DELEGATE_NONE:
break;
case NFS4_OPEN_DELEGATE_READ:
- nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
- RESERVE_SPACE(20);
- WRITE32(open->op_recall);
+ nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
+ if (nfserr)
+ return nfserr;
+ p = xdr_reserve_space(xdr, 20);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(open->op_recall);
/*
* TODO: ACE's in delegations
*/
- WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
- WRITE32(0);
- WRITE32(0);
- WRITE32(0); /* XXX: is NULL principal ok? */
- ADJUST_ARGS();
+ *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */
break;
case NFS4_OPEN_DELEGATE_WRITE:
- nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
- RESERVE_SPACE(32);
- WRITE32(0);
+ nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
+ if (nfserr)
+ return nfserr;
+ p = xdr_reserve_space(xdr, 32);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0);
/*
* TODO: space_limit's in delegations
*/
- WRITE32(NFS4_LIMIT_SIZE);
- WRITE32(~(u32)0);
- WRITE32(~(u32)0);
+ *p++ = cpu_to_be32(NFS4_LIMIT_SIZE);
+ *p++ = cpu_to_be32(~(u32)0);
+ *p++ = cpu_to_be32(~(u32)0);
/*
* TODO: ACE's in delegations
*/
- WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
- WRITE32(0);
- WRITE32(0);
- WRITE32(0); /* XXX: is NULL principal ok? */
- ADJUST_ARGS();
+ *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */
break;
case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */
switch (open->op_why_no_deleg) {
case WND4_CONTENTION:
case WND4_RESOURCE:
- RESERVE_SPACE(8);
- WRITE32(open->op_why_no_deleg);
- WRITE32(0); /* deleg signaling not supported yet */
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(open->op_why_no_deleg);
+ /* deleg signaling not supported yet: */
+ *p++ = cpu_to_be32(0);
break;
default:
- RESERVE_SPACE(4);
- WRITE32(open->op_why_no_deleg);
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(open->op_why_no_deleg);
}
- ADJUST_ARGS();
break;
default:
BUG();
@@ -2925,8 +3033,10 @@ out:
static __be32
nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (!nfserr)
- nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
+ nfserr = nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
return nfserr;
}
@@ -2934,127 +3044,233 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
static __be32
nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (!nfserr)
- nfsd4_encode_stateid(resp, &od->od_stateid);
+ nfserr = nfsd4_encode_stateid(xdr, &od->od_stateid);
return nfserr;
}
-static __be32
-nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_read *read)
+static __be32 nfsd4_encode_splice_read(
+ struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ struct file *file, unsigned long maxcount)
{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_buf *buf = xdr->buf;
u32 eof;
- int v;
- struct page *page;
- unsigned long maxcount;
- long len;
- __be32 *p;
+ int space_left;
+ __be32 nfserr;
+ __be32 *p = xdr->p - 2;
- if (nfserr)
- return nfserr;
- if (resp->xbuf->page_len)
+ /*
+ * Don't inline pages unless we know there's room for eof,
+ * count, and possible padding:
+ */
+ if (xdr->end - xdr->p < 3)
return nfserr_resource;
- RESERVE_SPACE(8); /* eof flag and byte count */
+ nfserr = nfsd_splice_read(read->rd_rqstp, file,
+ read->rd_offset, &maxcount);
+ if (nfserr) {
+ /*
+ * nfsd_splice_actor may have already messed with the
+ * page length; reset it so as not to confuse
+ * xdr_truncate_encode:
+ */
+ buf->page_len = 0;
+ return nfserr;
+ }
- maxcount = svc_max_payload(resp->rqstp);
- if (maxcount > read->rd_length)
- maxcount = read->rd_length;
+ eof = (read->rd_offset + maxcount >=
+ read->rd_fhp->fh_dentry->d_inode->i_size);
+
+ *(p++) = htonl(eof);
+ *(p++) = htonl(maxcount);
+
+ buf->page_len = maxcount;
+ buf->len += maxcount;
+ xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ /* Use rest of head for padding and remaining ops: */
+ buf->tail[0].iov_base = xdr->p;
+ buf->tail[0].iov_len = 0;
+ xdr->iov = buf->tail;
+ if (maxcount&3) {
+ int pad = 4 - (maxcount&3);
+
+ *(xdr->p++) = 0;
+
+ buf->tail[0].iov_base += maxcount&3;
+ buf->tail[0].iov_len = pad;
+ buf->len += pad;
+ }
+
+ space_left = min_t(int, (void *)xdr->end - (void *)xdr->p,
+ buf->buflen - buf->len);
+ buf->buflen = buf->len + space_left;
+ xdr->end = (__be32 *)((void *)xdr->end + space_left);
+
+ return 0;
+}
+
+static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ struct file *file, unsigned long maxcount)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ u32 eof;
+ int v;
+ int starting_len = xdr->buf->len - 8;
+ long len;
+ int thislen;
+ __be32 nfserr;
+ __be32 tmp;
+ __be32 *p;
+ u32 zzz = 0;
+ int pad;
len = maxcount;
v = 0;
- while (len > 0) {
- page = *(resp->rqstp->rq_next_page);
- if (!page) { /* ran out of pages */
- maxcount -= len;
- break;
- }
- resp->rqstp->rq_vec[v].iov_base = page_address(page);
- resp->rqstp->rq_vec[v].iov_len =
- len < PAGE_SIZE ? len : PAGE_SIZE;
- resp->rqstp->rq_next_page++;
+
+ thislen = (void *)xdr->end - (void *)xdr->p;
+ if (len < thislen)
+ thislen = len;
+ p = xdr_reserve_space(xdr, (thislen+3)&~3);
+ WARN_ON_ONCE(!p);
+ resp->rqstp->rq_vec[v].iov_base = p;
+ resp->rqstp->rq_vec[v].iov_len = thislen;
+ v++;
+ len -= thislen;
+
+ while (len) {
+ thislen = min_t(long, len, PAGE_SIZE);
+ p = xdr_reserve_space(xdr, (thislen+3)&~3);
+ WARN_ON_ONCE(!p);
+ resp->rqstp->rq_vec[v].iov_base = p;
+ resp->rqstp->rq_vec[v].iov_len = thislen;
v++;
- len -= PAGE_SIZE;
+ len -= thislen;
}
read->rd_vlen = v;
- nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
- read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
- &maxcount);
-
+ nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
+ read->rd_vlen, &maxcount);
if (nfserr)
return nfserr;
+ xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
+
eof = (read->rd_offset + maxcount >=
read->rd_fhp->fh_dentry->d_inode->i_size);
- WRITE32(eof);
- WRITE32(maxcount);
- ADJUST_ARGS();
- resp->xbuf->head[0].iov_len = (char*)p
- - (char*)resp->xbuf->head[0].iov_base;
- resp->xbuf->page_len = maxcount;
+ tmp = htonl(eof);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
+ tmp = htonl(maxcount);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
- /* Use rest of head for padding and remaining ops: */
- resp->xbuf->tail[0].iov_base = p;
- resp->xbuf->tail[0].iov_len = 0;
- if (maxcount&3) {
- RESERVE_SPACE(4);
- WRITE32(0);
- resp->xbuf->tail[0].iov_base += maxcount&3;
- resp->xbuf->tail[0].iov_len = 4 - (maxcount&3);
- ADJUST_ARGS();
- }
+ pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
+ &zzz, pad);
return 0;
+
}
static __be32
-nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
+nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_read *read)
{
- int maxcount;
- char *page;
+ unsigned long maxcount;
+ struct xdr_stream *xdr = &resp->xdr;
+ struct file *file = read->rd_filp;
+ int starting_len = xdr->buf->len;
+ struct raparms *ra;
__be32 *p;
+ __be32 err;
if (nfserr)
return nfserr;
- if (resp->xbuf->page_len)
+
+ p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
+ if (!p) {
+ WARN_ON_ONCE(resp->rqstp->rq_splice_ok);
return nfserr_resource;
- if (!*resp->rqstp->rq_next_page)
+ }
+ if (resp->xdr.buf->page_len && resp->rqstp->rq_splice_ok) {
+ WARN_ON_ONCE(1);
return nfserr_resource;
+ }
+ xdr_commit_encode(xdr);
+
+ maxcount = svc_max_payload(resp->rqstp);
+ if (maxcount > xdr->buf->buflen - xdr->buf->len)
+ maxcount = xdr->buf->buflen - xdr->buf->len;
+ if (maxcount > read->rd_length)
+ maxcount = read->rd_length;
+
+ if (!read->rd_filp) {
+ err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
+ &file, &ra);
+ if (err)
+ goto err_truncate;
+ }
+
+ if (file->f_op->splice_read && resp->rqstp->rq_splice_ok)
+ err = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ else
+ err = nfsd4_encode_readv(resp, read, file, maxcount);
+
+ if (!read->rd_filp)
+ nfsd_put_tmp_read_open(file, ra);
+
+err_truncate:
+ if (err)
+ xdr_truncate_encode(xdr, starting_len);
+ return err;
+}
+
+static __be32
+nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
+{
+ int maxcount;
+ __be32 wire_count;
+ int zero = 0;
+ struct xdr_stream *xdr = &resp->xdr;
+ int length_offset = xdr->buf->len;
+ __be32 *p;
- page = page_address(*(resp->rqstp->rq_next_page++));
+ if (nfserr)
+ return nfserr;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
maxcount = PAGE_SIZE;
- RESERVE_SPACE(4);
+ p = xdr_reserve_space(xdr, maxcount);
+ if (!p)
+ return nfserr_resource;
/*
* XXX: By default, the ->readlink() VFS op will truncate symlinks
* if they would overflow the buffer. Is this kosher in NFSv4? If
* not, one easy fix is: if ->readlink() precisely fills the buffer,
* assume that truncation occurred, and return NFS4ERR_RESOURCE.
*/
- nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, page, &maxcount);
+ nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp,
+ (char *)p, &maxcount);
if (nfserr == nfserr_isdir)
- return nfserr_inval;
- if (nfserr)
+ nfserr = nfserr_inval;
+ if (nfserr) {
+ xdr_truncate_encode(xdr, length_offset);
return nfserr;
-
- WRITE32(maxcount);
- ADJUST_ARGS();
- resp->xbuf->head[0].iov_len = (char*)p
- - (char*)resp->xbuf->head[0].iov_base;
- resp->xbuf->page_len = maxcount;
-
- /* Use rest of head for padding and remaining ops: */
- resp->xbuf->tail[0].iov_base = p;
- resp->xbuf->tail[0].iov_len = 0;
- if (maxcount&3) {
- RESERVE_SPACE(4);
- WRITE32(0);
- resp->xbuf->tail[0].iov_base += maxcount&3;
- resp->xbuf->tail[0].iov_len = 4 - (maxcount&3);
- ADJUST_ARGS();
}
+
+ wire_count = htonl(maxcount);
+ write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4);
+ xdr_truncate_encode(xdr, length_offset + 4 + maxcount);
+ if (maxcount & 3)
+ write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount,
+ &zero, 4 - (maxcount&3));
return 0;
}
@@ -3062,47 +3278,52 @@ static __be32
nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
{
int maxcount;
+ int bytes_left;
loff_t offset;
- __be32 *page, *savep, *tailbase;
+ __be64 wire_offset;
+ struct xdr_stream *xdr = &resp->xdr;
+ int starting_len = xdr->buf->len;
__be32 *p;
if (nfserr)
return nfserr;
- if (resp->xbuf->page_len)
- return nfserr_resource;
- if (!*resp->rqstp->rq_next_page)
- return nfserr_resource;
- RESERVE_SPACE(NFS4_VERIFIER_SIZE);
- savep = p;
+ p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_resource;
/* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
- WRITE32(0);
- WRITE32(0);
- ADJUST_ARGS();
- resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base;
- tailbase = p;
-
- maxcount = PAGE_SIZE;
- if (maxcount > readdir->rd_maxcount)
- maxcount = readdir->rd_maxcount;
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
+ resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p)
+ - (char *)resp->xdr.buf->head[0].iov_base;
/*
- * Convert from bytes to words, account for the two words already
- * written, make sure to leave two words at the end for the next
- * pointer and eof field.
+ * Number of bytes left for directory entries allowing for the
+ * final 8 bytes of the readdir and a following failed op:
+ */
+ bytes_left = xdr->buf->buflen - xdr->buf->len
+ - COMPOUND_ERR_SLACK_SPACE - 8;
+ if (bytes_left < 0) {
+ nfserr = nfserr_resource;
+ goto err_no_verf;
+ }
+ maxcount = min_t(u32, readdir->rd_maxcount, INT_MAX);
+ /*
+ * Note the rfc defines rd_maxcount as the size of the
+ * READDIR4resok structure, which includes the verifier above
+ * and the 8 bytes encoded at the end of this function:
*/
- maxcount = (maxcount >> 2) - 4;
- if (maxcount < 0) {
- nfserr = nfserr_toosmall;
+ if (maxcount < 16) {
+ nfserr = nfserr_toosmall;
goto err_no_verf;
}
+ maxcount = min_t(int, maxcount-16, bytes_left);
- page = page_address(*(resp->rqstp->rq_next_page++));
+ readdir->xdr = xdr;
+ readdir->rd_maxcount = maxcount;
readdir->common.err = 0;
- readdir->buflen = maxcount;
- readdir->buffer = page;
- readdir->offset = NULL;
+ readdir->cookie_offset = 0;
offset = readdir->rd_cookie;
nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp,
@@ -3110,42 +3331,49 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
&readdir->common, nfsd4_encode_dirent);
if (nfserr == nfs_ok &&
readdir->common.err == nfserr_toosmall &&
- readdir->buffer == page)
- nfserr = nfserr_toosmall;
+ xdr->buf->len == starting_len + 8) {
+ /* nothing encoded; which limit did we hit?: */
+ if (maxcount - 16 < bytes_left)
+ /* It was the fault of rd_maxcount: */
+ nfserr = nfserr_toosmall;
+ else
+ /* We ran out of buffer space: */
+ nfserr = nfserr_resource;
+ }
if (nfserr)
goto err_no_verf;
- if (readdir->offset)
- xdr_encode_hyper(readdir->offset, offset);
+ if (readdir->cookie_offset) {
+ wire_offset = cpu_to_be64(offset);
+ write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset,
+ &wire_offset, 8);
+ }
- p = readdir->buffer;
+ p = xdr_reserve_space(xdr, 8);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ goto err_no_verf;
+ }
*p++ = 0; /* no more entries */
*p++ = htonl(readdir->common.err == nfserr_eof);
- resp->xbuf->page_len = ((char*)p) -
- (char*)page_address(*(resp->rqstp->rq_next_page-1));
-
- /* Use rest of head for padding and remaining ops: */
- resp->xbuf->tail[0].iov_base = tailbase;
- resp->xbuf->tail[0].iov_len = 0;
- resp->p = resp->xbuf->tail[0].iov_base;
- resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4;
return 0;
err_no_verf:
- p = savep;
- ADJUST_ARGS();
+ xdr_truncate_encode(xdr, starting_len);
return nfserr;
}
static __be32
nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(20);
- write_cinfo(&p, &remove->rm_cinfo);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 20);
+ if (!p)
+ return nfserr_resource;
+ p = encode_cinfo(p, &remove->rm_cinfo);
}
return nfserr;
}
@@ -3153,19 +3381,21 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
static __be32
nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(40);
- write_cinfo(&p, &rename->rn_sinfo);
- write_cinfo(&p, &rename->rn_tinfo);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 40);
+ if (!p)
+ return nfserr_resource;
+ p = encode_cinfo(p, &rename->rn_sinfo);
+ p = encode_cinfo(p, &rename->rn_tinfo);
}
return nfserr;
}
static __be32
-nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
+nfsd4_do_encode_secinfo(struct xdr_stream *xdr,
__be32 nfserr, struct svc_export *exp)
{
u32 i, nflavs, supported;
@@ -3176,6 +3406,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
if (nfserr)
goto out;
+ nfserr = nfserr_resource;
if (exp->ex_nflavors) {
flavs = exp->ex_flavors;
nflavs = exp->ex_nflavors;
@@ -3197,9 +3428,10 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
}
supported = 0;
- RESERVE_SPACE(4);
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out;
flavorsp = p++; /* to be backfilled later */
- ADJUST_ARGS();
for (i = 0; i < nflavs; i++) {
rpc_authflavor_t pf = flavs[i].pseudoflavor;
@@ -3207,18 +3439,20 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
if (rpcauth_get_gssinfo(pf, &info) == 0) {
supported++;
- RESERVE_SPACE(4 + 4 + XDR_LEN(info.oid.len) + 4 + 4);
- WRITE32(RPC_AUTH_GSS);
- WRITE32(info.oid.len);
- WRITEMEM(info.oid.data, info.oid.len);
- WRITE32(info.qop);
- WRITE32(info.service);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 4 + 4 +
+ XDR_LEN(info.oid.len) + 4 + 4);
+ if (!p)
+ goto out;
+ *p++ = cpu_to_be32(RPC_AUTH_GSS);
+ p = xdr_encode_opaque(p, info.oid.data, info.oid.len);
+ *p++ = cpu_to_be32(info.qop);
+ *p++ = cpu_to_be32(info.service);
} else if (pf < RPC_AUTH_MAXFLAVOR) {
supported++;
- RESERVE_SPACE(4);
- WRITE32(pf);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out;
+ *p++ = cpu_to_be32(pf);
} else {
if (report)
pr_warn("NFS: SECINFO: security flavor %u "
@@ -3229,7 +3463,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
if (nflavs != supported)
report = false;
*flavorsp = htonl(supported);
-
+ nfserr = 0;
out:
if (exp)
exp_put(exp);
@@ -3240,14 +3474,18 @@ static __be32
nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_secinfo *secinfo)
{
- return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp);
+ struct xdr_stream *xdr = &resp->xdr;
+
+ return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->si_exp);
}
static __be32
nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_secinfo_no_name *secinfo)
{
- return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp);
+ struct xdr_stream *xdr = &resp->xdr;
+
+ return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->sin_exp);
}
/*
@@ -3257,41 +3495,47 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
- RESERVE_SPACE(16);
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ return nfserr_resource;
if (nfserr) {
- WRITE32(3);
- WRITE32(0);
- WRITE32(0);
- WRITE32(0);
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
}
else {
- WRITE32(3);
- WRITE32(setattr->sa_bmval[0]);
- WRITE32(setattr->sa_bmval[1]);
- WRITE32(setattr->sa_bmval[2]);
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(setattr->sa_bmval[0]);
+ *p++ = cpu_to_be32(setattr->sa_bmval[1]);
+ *p++ = cpu_to_be32(setattr->sa_bmval[2]);
}
- ADJUST_ARGS();
return nfserr;
}
static __be32
nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(8 + NFS4_VERIFIER_SIZE);
- WRITEMEM(&scd->se_clientid, 8);
- WRITEMEM(&scd->se_confirm, NFS4_VERIFIER_SIZE);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8);
+ p = xdr_encode_opaque_fixed(p, &scd->se_confirm,
+ NFS4_VERIFIER_SIZE);
}
else if (nfserr == nfserr_clid_inuse) {
- RESERVE_SPACE(8);
- WRITE32(0);
- WRITE32(0);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
}
return nfserr;
}
@@ -3299,14 +3543,17 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
static __be32
nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(16);
- WRITE32(write->wr_bytes_written);
- WRITE32(write->wr_how_written);
- WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(write->wr_bytes_written);
+ *p++ = cpu_to_be32(write->wr_how_written);
+ p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
+ NFS4_VERIFIER_SIZE);
}
return nfserr;
}
@@ -3323,6 +3570,7 @@ static __be32
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_exchange_id *exid)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
char *major_id;
char *server_scope;
@@ -3338,60 +3586,61 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
server_scope = utsname()->nodename;
server_scope_sz = strlen(server_scope);
- RESERVE_SPACE(
+ p = xdr_reserve_space(xdr,
8 /* eir_clientid */ +
4 /* eir_sequenceid */ +
4 /* eir_flags */ +
4 /* spr_how */);
+ if (!p)
+ return nfserr_resource;
- WRITEMEM(&exid->clientid, 8);
- WRITE32(exid->seqid);
- WRITE32(exid->flags);
+ p = xdr_encode_opaque_fixed(p, &exid->clientid, 8);
+ *p++ = cpu_to_be32(exid->seqid);
+ *p++ = cpu_to_be32(exid->flags);
- WRITE32(exid->spa_how);
- ADJUST_ARGS();
+ *p++ = cpu_to_be32(exid->spa_how);
switch (exid->spa_how) {
case SP4_NONE:
break;
case SP4_MACH_CRED:
/* spo_must_enforce, spo_must_allow */
- RESERVE_SPACE(16);
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ return nfserr_resource;
/* spo_must_enforce bitmap: */
- WRITE32(2);
- WRITE32(nfs4_minimal_spo_must_enforce[0]);
- WRITE32(nfs4_minimal_spo_must_enforce[1]);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]);
+ *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]);
/* empty spo_must_allow bitmap: */
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
- ADJUST_ARGS();
break;
default:
WARN_ON_ONCE(1);
}
- RESERVE_SPACE(
+ p = xdr_reserve_space(xdr,
8 /* so_minor_id */ +
4 /* so_major_id.len */ +
(XDR_QUADLEN(major_id_sz) * 4) +
4 /* eir_server_scope.len */ +
(XDR_QUADLEN(server_scope_sz) * 4) +
4 /* eir_server_impl_id.count (0) */);
+ if (!p)
+ return nfserr_resource;
/* The server_owner struct */
- WRITE64(minor_id); /* Minor id */
+ p = xdr_encode_hyper(p, minor_id); /* Minor id */
/* major id */
- WRITE32(major_id_sz);
- WRITEMEM(major_id, major_id_sz);
+ p = xdr_encode_opaque(p, major_id, major_id_sz);
/* Server scope */
- WRITE32(server_scope_sz);
- WRITEMEM(server_scope, server_scope_sz);
+ p = xdr_encode_opaque(p, server_scope, server_scope_sz);
/* Implementation id */
- WRITE32(0); /* zero length nfs_impl_id4 array */
- ADJUST_ARGS();
+ *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */
return 0;
}
@@ -3399,47 +3648,54 @@ static __be32
nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_create_session *sess)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (nfserr)
return nfserr;
- RESERVE_SPACE(24);
- WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- WRITE32(sess->seqid);
- WRITE32(sess->flags);
- ADJUST_ARGS();
-
- RESERVE_SPACE(28);
- WRITE32(0); /* headerpadsz */
- WRITE32(sess->fore_channel.maxreq_sz);
- WRITE32(sess->fore_channel.maxresp_sz);
- WRITE32(sess->fore_channel.maxresp_cached);
- WRITE32(sess->fore_channel.maxops);
- WRITE32(sess->fore_channel.maxreqs);
- WRITE32(sess->fore_channel.nr_rdma_attrs);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 24);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, sess->sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(sess->seqid);
+ *p++ = cpu_to_be32(sess->flags);
+
+ p = xdr_reserve_space(xdr, 28);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0); /* headerpadsz */
+ *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz);
+ *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz);
+ *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached);
+ *p++ = cpu_to_be32(sess->fore_channel.maxops);
+ *p++ = cpu_to_be32(sess->fore_channel.maxreqs);
+ *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs);
if (sess->fore_channel.nr_rdma_attrs) {
- RESERVE_SPACE(4);
- WRITE32(sess->fore_channel.rdma_attrs);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs);
}
- RESERVE_SPACE(28);
- WRITE32(0); /* headerpadsz */
- WRITE32(sess->back_channel.maxreq_sz);
- WRITE32(sess->back_channel.maxresp_sz);
- WRITE32(sess->back_channel.maxresp_cached);
- WRITE32(sess->back_channel.maxops);
- WRITE32(sess->back_channel.maxreqs);
- WRITE32(sess->back_channel.nr_rdma_attrs);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 28);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0); /* headerpadsz */
+ *p++ = cpu_to_be32(sess->back_channel.maxreq_sz);
+ *p++ = cpu_to_be32(sess->back_channel.maxresp_sz);
+ *p++ = cpu_to_be32(sess->back_channel.maxresp_cached);
+ *p++ = cpu_to_be32(sess->back_channel.maxops);
+ *p++ = cpu_to_be32(sess->back_channel.maxreqs);
+ *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs);
if (sess->back_channel.nr_rdma_attrs) {
- RESERVE_SPACE(4);
- WRITE32(sess->back_channel.rdma_attrs);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(sess->back_channel.rdma_attrs);
}
return 0;
}
@@ -3448,22 +3704,25 @@ static __be32
nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_sequence *seq)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (nfserr)
return nfserr;
- RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
- WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- WRITE32(seq->seqid);
- WRITE32(seq->slotid);
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, seq->sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(seq->seqid);
+ *p++ = cpu_to_be32(seq->slotid);
/* Note slotid's are numbered from zero: */
- WRITE32(seq->maxslots - 1); /* sr_highest_slotid */
- WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */
- WRITE32(seq->status_flags);
+ *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */
+ *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */
+ *p++ = cpu_to_be32(seq->status_flags);
- ADJUST_ARGS();
- resp->cstate.datap = p; /* DRC cache data pointer */
+ resp->cstate.data_offset = xdr->buf->len; /* DRC cache data offset */
return 0;
}
@@ -3471,20 +3730,22 @@ static __be32
nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_test_stateid *test_stateid)
{
+ struct xdr_stream *xdr = &resp->xdr;
struct nfsd4_test_stateid_id *stateid, *next;
__be32 *p;
if (nfserr)
return nfserr;
- RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids));
+ p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids));
+ if (!p)
+ return nfserr_resource;
*p++ = htonl(test_stateid->ts_num_ids);
list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) {
*p++ = stateid->ts_id_status;
}
- ADJUST_ARGS();
return nfserr;
}
@@ -3563,81 +3824,99 @@ static nfsd4_enc nfsd4_enc_ops[] = {
};
/*
- * Calculate the total amount of memory that the compound response has taken
- * after encoding the current operation with pad.
- *
- * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop()
- * which was specified at nfsd4_operation, else pad is zero.
- *
- * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached.
+ * Calculate whether we still have space to encode respsize bytes.
+ * There are two considerations:
+ * - For NFS versions >=4.1, the size of the reply must stay within
+ * session limits
+ * - For all NFS versions, we must stay within limited preallocated
+ * buffer space.
*
- * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
- * will be at least a page and will therefore hold the xdr_buf head.
+ * This is called before the operation is processed, so can only provide
+ * an upper estimate. For some idempotent operations (such as
+ * getattr), it's not necessarily a problem if that estimate is wrong,
+ * as we can fail it after processing without significant side effects.
*/
-__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize)
{
- struct xdr_buf *xb = &resp->rqstp->rq_res;
- struct nfsd4_session *session = NULL;
+ struct xdr_buf *buf = &resp->rqstp->rq_res;
struct nfsd4_slot *slot = resp->cstate.slot;
- u32 length, tlen = 0;
+ if (buf->len + respsize <= buf->buflen)
+ return nfs_ok;
if (!nfsd4_has_session(&resp->cstate))
- return 0;
-
- session = resp->cstate.session;
-
- if (xb->page_len == 0) {
- length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
- } else {
- if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
- tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
-
- length = xb->head[0].iov_len + xb->page_len + tlen + pad;
- }
- dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
- length, xb->page_len, tlen, pad);
-
- if (length > session->se_fchannel.maxresp_sz)
- return nfserr_rep_too_big;
-
- if ((slot->sl_flags & NFSD4_SLOT_CACHETHIS) &&
- length > session->se_fchannel.maxresp_cached)
+ return nfserr_resource;
+ if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) {
+ WARN_ON_ONCE(1);
return nfserr_rep_too_big_to_cache;
-
- return 0;
+ }
+ return nfserr_rep_too_big;
}
void
nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
{
+ struct xdr_stream *xdr = &resp->xdr;
struct nfs4_stateowner *so = resp->cstate.replay_owner;
- __be32 *statp;
+ struct svc_rqst *rqstp = resp->rqstp;
+ int post_err_offset;
+ nfsd4_enc encoder;
__be32 *p;
- RESERVE_SPACE(8);
- WRITE32(op->opnum);
- statp = p++; /* to be backfilled at the end */
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 8);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ *p++ = cpu_to_be32(op->opnum);
+ post_err_offset = xdr->buf->len;
if (op->opnum == OP_ILLEGAL)
goto status;
BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
!nfsd4_enc_ops[op->opnum]);
- op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
+ encoder = nfsd4_enc_ops[op->opnum];
+ op->status = encoder(resp, op->status, &op->u);
+ xdr_commit_encode(xdr);
+
/* nfsd4_check_resp_size guarantees enough room for error status */
- if (!op->status)
- op->status = nfsd4_check_resp_size(resp, 0);
+ if (!op->status) {
+ int space_needed = 0;
+ if (!nfsd4_last_compound_op(rqstp))
+ space_needed = COMPOUND_ERR_SLACK_SPACE;
+ op->status = nfsd4_check_resp_size(resp, space_needed);
+ }
+ if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) {
+ struct nfsd4_slot *slot = resp->cstate.slot;
+
+ if (slot->sl_flags & NFSD4_SLOT_CACHETHIS)
+ op->status = nfserr_rep_too_big_to_cache;
+ else
+ op->status = nfserr_rep_too_big;
+ }
+ if (op->status == nfserr_resource ||
+ op->status == nfserr_rep_too_big ||
+ op->status == nfserr_rep_too_big_to_cache) {
+ /*
+ * The operation may have already been encoded or
+ * partially encoded. No op returns anything additional
+ * in the case of one of these three errors, so we can
+ * just truncate back to after the status. But it's a
+ * bug if we had to do this on a non-idempotent op:
+ */
+ warn_on_nonidempotent_op(op);
+ xdr_truncate_encode(xdr, post_err_offset);
+ }
if (so) {
+ int len = xdr->buf->len - post_err_offset;
+
so->so_replay.rp_status = op->status;
- so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1);
- memcpy(so->so_replay.rp_buf, statp+1, so->so_replay.rp_buflen);
+ so->so_replay.rp_buflen = len;
+ read_bytes_from_xdr_buf(xdr->buf, post_err_offset,
+ so->so_replay.rp_buf, len);
}
status:
- /*
- * Note: We write the status directly, instead of using WRITE32(),
- * since it is already in network byte order.
- */
- *statp = op->status;
+ /* Note that op->status is already in network byte order: */
+ write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4);
}
/*
@@ -3649,21 +3928,22 @@ status:
* called with nfs4_lock_state() held
*/
void
-nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
+nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
{
__be32 *p;
struct nfs4_replay *rp = op->replay;
BUG_ON(!rp);
- RESERVE_SPACE(8);
- WRITE32(op->opnum);
+ p = xdr_reserve_space(xdr, 8 + rp->rp_buflen);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ *p++ = cpu_to_be32(op->opnum);
*p++ = rp->rp_status; /* already xdr'ed */
- ADJUST_ARGS();
- RESERVE_SPACE(rp->rp_buflen);
- WRITEMEM(rp->rp_buf, rp->rp_buflen);
- ADJUST_ARGS();
+ p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen);
}
int
@@ -3720,19 +4000,19 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
* All that remains is to write the tag and operation count...
*/
struct nfsd4_compound_state *cs = &resp->cstate;
- struct kvec *iov;
+ struct xdr_buf *buf = resp->xdr.buf;
+
+ WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
+ buf->tail[0].iov_len);
+
+ rqstp->rq_next_page = resp->xdr.page_ptr + 1;
+
p = resp->tagp;
*p++ = htonl(resp->taglen);
memcpy(p, resp->tag, resp->taglen);
p += XDR_QUADLEN(resp->taglen);
*p++ = htonl(resp->opcnt);
- if (rqstp->rq_res.page_len)
- iov = &rqstp->rq_res.tail[0];
- else
- iov = &rqstp->rq_res.head[0];
- iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
- BUG_ON(iov->iov_len > PAGE_SIZE);
if (nfsd4_has_session(cs)) {
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct nfs4_client *clp = cs->session->se_client;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index f8f060ffbf4f..6040da8830ff 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -224,13 +224,6 @@ hash_refile(struct svc_cacherep *rp)
hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits));
}
-static inline bool
-nfsd_cache_entry_expired(struct svc_cacherep *rp)
-{
- return rp->c_state != RC_INPROG &&
- time_after(jiffies, rp->c_timestamp + RC_EXPIRE);
-}
-
/*
* Walk the LRU list and prune off entries that are older than RC_EXPIRE.
* Also prune the oldest ones when the total exceeds the max number of entries.
@@ -242,8 +235,14 @@ prune_cache_entries(void)
long freed = 0;
list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
- if (!nfsd_cache_entry_expired(rp) &&
- num_drc_entries <= max_drc_entries)
+ /*
+ * Don't free entries attached to calls that are still
+ * in-progress, but do keep scanning the list.
+ */
+ if (rp->c_state == RC_INPROG)
+ continue;
+ if (num_drc_entries <= max_drc_entries &&
+ time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
nfsd_reply_cache_free_locked(rp);
freed++;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index f34d9de802ab..51844048937f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1179,7 +1179,6 @@ static int __init init_nfsd(void)
retval = nfsd4_init_slabs();
if (retval)
goto out_unregister_pernet;
- nfs4_state_init();
retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
if (retval)
goto out_free_slabs;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 479eb681c27c..847daf37e566 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -15,11 +15,20 @@
#include <linux/nfs2.h>
#include <linux/nfs3.h>
#include <linux/nfs4.h>
+#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/msg_prot.h>
-#include <linux/nfsd/debug.h>
-#include <linux/nfsd/export.h>
-#include <linux/nfsd/stats.h>
+#include <uapi/linux/nfsd/debug.h>
+
+#include "stats.h"
+#include "export.h"
+
+#undef ifdebug
+#ifdef NFSD_DEBUG
+# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag)
+#else
+# define ifdebug(flag) if (0)
+#endif
/*
* nfsd version
@@ -106,7 +115,6 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
*/
#ifdef CONFIG_NFSD_V4
extern unsigned long max_delegations;
-void nfs4_state_init(void);
int nfsd4_init_slabs(void);
void nfsd4_free_slabs(void);
int nfs4_state_start(void);
@@ -117,7 +125,6 @@ void nfs4_reset_lease(time_t leasetime);
int nfs4_reset_recoverydir(char *recdir);
char * nfs4_recoverydir(void);
#else
-static inline void nfs4_state_init(void) { }
static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
static inline int nfs4_state_start(void) { return 0; }
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 3c37b160dcad..ec8393418154 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -88,9 +88,8 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
/* Check if the request originated from a secure port. */
if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) {
RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
- dprintk(KERN_WARNING
- "nfsd: request from insecure port %s!\n",
- svc_print_addr(rqstp, buf, sizeof(buf)));
+ dprintk("nfsd: request from insecure port %s!\n",
+ svc_print_addr(rqstp, buf, sizeof(buf)));
return nfserr_perm;
}
@@ -169,8 +168,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
data_left -= len;
if (data_left < 0)
return error;
- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth);
- fid = (struct fid *)(fh->fh_auth + len);
+ exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
+ fid = (struct fid *)(fh->fh_fsid + len);
} else {
__u32 tfh[2];
dev_t xdev;
@@ -385,7 +384,7 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
{
if (dentry != exp->ex_path.dentry) {
struct fid *fid = (struct fid *)
- (fhp->fh_handle.fh_auth + fhp->fh_handle.fh_size/4 - 1);
+ (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1);
int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4;
int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK);
@@ -513,7 +512,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
*/
struct inode * inode = dentry->d_inode;
- __u32 *datap;
dev_t ex_dev = exp_sb(exp)->s_dev;
dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n",
@@ -557,17 +555,16 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
if (inode)
_fh_update_old(dentry, exp, &fhp->fh_handle);
} else {
- int len;
+ fhp->fh_handle.fh_size =
+ key_len(fhp->fh_handle.fh_fsid_type) + 4;
fhp->fh_handle.fh_auth_type = 0;
- datap = fhp->fh_handle.fh_auth+0;
- mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev,
+
+ mk_fsid(fhp->fh_handle.fh_fsid_type,
+ fhp->fh_handle.fh_fsid,
+ ex_dev,
exp->ex_path.dentry->d_inode->i_ino,
exp->ex_fsid, exp->ex_uuid);
- len = key_len(fhp->fh_handle.fh_fsid_type);
- datap += len/4;
- fhp->fh_handle.fh_size = 4 + len;
-
if (inode)
_fh_update(fhp, exp, dentry);
if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index ad67964d0bb1..2e89e70ac15c 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -1,9 +1,58 @@
-/* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */
+/*
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ *
+ * This file describes the layout of the file handles as passed
+ * over the wire.
+ */
+#ifndef _LINUX_NFSD_NFSFH_H
+#define _LINUX_NFSD_NFSFH_H
+
+#include <linux/sunrpc/svc.h>
+#include <uapi/linux/nfsd/nfsfh.h>
+
+static inline __u32 ino_t_to_u32(ino_t ino)
+{
+ return (__u32) ino;
+}
+
+static inline ino_t u32_to_ino_t(__u32 uino)
+{
+ return (ino_t) uino;
+}
-#ifndef _LINUX_NFSD_FH_INT_H
-#define _LINUX_NFSD_FH_INT_H
+/*
+ * This is the internal representation of an NFS handle used in knfsd.
+ * pre_mtime/post_version will be used to support wcc_attr's in NFSv3.
+ */
+typedef struct svc_fh {
+ struct knfsd_fh fh_handle; /* FH data */
+ struct dentry * fh_dentry; /* validated dentry */
+ struct svc_export * fh_export; /* export pointer */
+ int fh_maxsize; /* max size for fh_handle */
+
+ unsigned char fh_locked; /* inode locked by us */
+ unsigned char fh_want_write; /* remount protection taken */
+
+#ifdef CONFIG_NFSD_V3
+ unsigned char fh_post_saved; /* post-op attrs saved */
+ unsigned char fh_pre_saved; /* pre-op attrs saved */
+
+ /* Pre-op attributes saved during fh_lock */
+ __u64 fh_pre_size; /* size before operation */
+ struct timespec fh_pre_mtime; /* mtime before oper */
+ struct timespec fh_pre_ctime; /* ctime before oper */
+ /*
+ * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode)
+ * to find out if it is valid.
+ */
+ u64 fh_pre_change;
+
+ /* Post-op attributes saved in fh_unlock */
+ struct kstat fh_post_attr; /* full attrs after operation */
+ u64 fh_post_change; /* nfsv4 change; see above */
+#endif /* CONFIG_NFSD_V3 */
-#include <linux/nfsd/nfsfh.h>
+} svc_fh;
enum nfsd_fsid {
FSID_DEV = 0,
@@ -215,4 +264,4 @@ fh_unlock(struct svc_fh *fhp)
}
}
-#endif /* _LINUX_NFSD_FH_INT_H */
+#endif /* _LINUX_NFSD_NFSFH_H */
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 9a4a5f9e7468..1879e43f2868 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -591,12 +591,6 @@ nfsd(void *vrqstp)
nfsdstats.th_cnt++;
mutex_unlock(&nfsd_mutex);
- /*
- * We want less throttling in balance_dirty_pages() so that nfs to
- * localhost doesn't cause nfsd to lock up due to all the client's
- * dirty pages.
- */
- current->flags |= PF_LESS_THROTTLE;
set_freezable();
/*
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 9c769a47ac5a..1ac306b769df 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -214,7 +214,8 @@ nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
int
nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
return xdr_argsize_check(rqstp, p);
}
@@ -248,7 +249,8 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
{
unsigned int len;
int v;
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->offset = ntohl(*p++);
@@ -281,7 +283,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
unsigned int len, hdr, dlen;
int v;
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p++; /* beginoffset */
@@ -355,7 +358,8 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
int
nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->buffer = page_address(*(rqstp->rq_next_page++));
@@ -391,7 +395,8 @@ int
nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_readdirargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->cookie = ntohl(*p++);
args->count = ntohl(*p++);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 424d8f5f2317..374c66283ac5 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -37,7 +37,6 @@
#include <linux/idr.h>
#include <linux/sunrpc/svc_xprt.h>
-#include <linux/nfsd/nfsfh.h>
#include "nfsfh.h"
typedef struct {
@@ -123,7 +122,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
/* Maximum number of operations per session compound */
#define NFSD_MAX_OPS_PER_COMPOUND 16
/* Maximum session per slot cache size */
-#define NFSD_SLOT_CACHE_SIZE 1024
+#define NFSD_SLOT_CACHE_SIZE 2048
/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32
#define NFSD_MAX_MEM_PER_SESSION \
@@ -464,8 +463,6 @@ extern void nfs4_release_reclaim(struct nfsd_net *);
extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
struct nfsd_net *nn);
extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn);
-extern void nfs4_free_openowner(struct nfs4_openowner *);
-extern void nfs4_free_lockowner(struct nfs4_lockowner *);
extern int set_callback_cred(void);
extern void nfsd4_init_callback(struct nfsd4_callback *);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 6d4521feb6e3..cd90878a76aa 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -24,7 +24,6 @@
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/sunrpc/stats.h>
-#include <linux/nfsd/stats.h>
#include <net/net_namespace.h>
#include "nfsd.h"
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
new file mode 100644
index 000000000000..a5c944b771c6
--- /dev/null
+++ b/fs/nfsd/stats.h
@@ -0,0 +1,43 @@
+/*
+ * Statistics for NFS server.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef _NFSD_STATS_H
+#define _NFSD_STATS_H
+
+#include <uapi/linux/nfsd/stats.h>
+
+
+struct nfsd_stats {
+ unsigned int rchits; /* repcache hits */
+ unsigned int rcmisses; /* repcache misses */
+ unsigned int rcnocache; /* uncached reqs */
+ unsigned int fh_stale; /* FH stale error */
+ unsigned int fh_lookup; /* dentry cached */
+ unsigned int fh_anon; /* anon file dentry returned */
+ unsigned int fh_nocache_dir; /* filehandle not found in dcache */
+ unsigned int fh_nocache_nondir; /* filehandle not found in dcache */
+ unsigned int io_read; /* bytes returned to read requests */
+ unsigned int io_write; /* bytes passed in write requests */
+ unsigned int th_cnt; /* number of available threads */
+ unsigned int th_usage[10]; /* number of ticks during which n perdeciles
+ * of available threads were in use */
+ unsigned int th_fullcnt; /* number of times last free thread was used */
+ unsigned int ra_size; /* size of ra cache */
+ unsigned int ra_depth[11]; /* number of times ra entry was found that deep
+ * in the cache (10percentiles). [10] = not found */
+#ifdef CONFIG_NFSD_V4
+ unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */
+#endif
+
+};
+
+
+extern struct nfsd_stats nfsdstats;
+extern struct svc_stat nfsd_svcstats;
+
+void nfsd_stat_init(void);
+void nfsd_stat_shutdown(void);
+
+#endif /* _NFSD_STATS_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 16f0673a423c..140c496f612c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -820,55 +820,54 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
}
-static __be32
-nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
- loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+__be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
{
- mm_segment_t oldfs;
- __be32 err;
- int host_err;
-
- err = nfserr_perm;
-
- if (file->f_op->splice_read && rqstp->rq_splice_ok) {
- struct splice_desc sd = {
- .len = 0,
- .total_len = *count,
- .pos = offset,
- .u.data = rqstp,
- };
-
- rqstp->rq_next_page = rqstp->rq_respages + 1;
- host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
- } else {
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
- set_fs(oldfs);
- }
-
if (host_err >= 0) {
nfsdstats.io_read += host_err;
*count = host_err;
- err = 0;
fsnotify_access(file);
+ return 0;
} else
- err = nfserrno(host_err);
- return err;
+ return nfserrno(host_err);
+}
+
+int nfsd_splice_read(struct svc_rqst *rqstp,
+ struct file *file, loff_t offset, unsigned long *count)
+{
+ struct splice_desc sd = {
+ .len = 0,
+ .total_len = *count,
+ .pos = offset,
+ .u.data = rqstp,
+ };
+ int host_err;
+
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+ host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
+ return nfsd_finish_read(file, count, host_err);
}
-static void kill_suid(struct dentry *dentry)
+int nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
+ unsigned long *count)
{
- struct iattr ia;
- ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+ mm_segment_t oldfs;
+ int host_err;
- mutex_lock(&dentry->d_inode->i_mutex);
- /*
- * Note we call this on write, so notify_change will not
- * encounter any conflicting delegations:
- */
- notify_change(dentry, &ia, NULL);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+ set_fs(oldfs);
+ return nfsd_finish_read(file, count, host_err);
+}
+
+static __be32
+nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file,
+ loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+{
+ if (file->f_op->splice_read && rqstp->rq_splice_ok)
+ return nfsd_splice_read(rqstp, file, offset, count);
+ else
+ return nfsd_readv(file, offset, vec, vlen, count);
}
/*
@@ -922,6 +921,16 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
int stable = *stablep;
int use_wgather;
loff_t pos = offset;
+ unsigned int pflags = current->flags;
+
+ if (rqstp->rq_local)
+ /*
+ * We want less throttling in balance_dirty_pages()
+ * and shrink_inactive_list() so that nfs to
+ * localhost doesn't cause nfsd to lock up due to all
+ * the client's dirty pages or its congested queue.
+ */
+ current->flags |= PF_LESS_THROTTLE;
dentry = file->f_path.dentry;
inode = dentry->d_inode;
@@ -942,10 +951,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
nfsdstats.io_write += host_err;
fsnotify_modify(file);
- /* clear setuid/setgid flag after write */
- if (inode->i_mode & (S_ISUID | S_ISGID))
- kill_suid(dentry);
-
if (stable) {
if (use_wgather)
host_err = wait_for_concurrent_writes(file);
@@ -959,36 +964,33 @@ out_nfserr:
err = 0;
else
err = nfserrno(host_err);
+ if (rqstp->rq_local)
+ tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
return err;
}
-/*
- * Read data from a file. count must contain the requested read count
- * on entry. On return, *count contains the number of bytes actually read.
- * N.B. After this call fhp needs an fh_put
- */
-__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
- loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file **file, struct raparms **ra)
{
- struct file *file;
struct inode *inode;
- struct raparms *ra;
__be32 err;
- err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+ err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file);
if (err)
return err;
- inode = file_inode(file);
+ inode = file_inode(*file);
/* Get readahead parameters */
- ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
+ *ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
- if (ra && ra->p_set)
- file->f_ra = ra->p_ra;
-
- err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
+ if (*ra && (*ra)->p_set)
+ (*file)->f_ra = (*ra)->p_ra;
+ return nfs_ok;
+}
+void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra)
+{
/* Write back readahead params */
if (ra) {
struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
@@ -998,28 +1000,29 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
ra->p_count--;
spin_unlock(&rab->pb_lock);
}
-
nfsd_close(file);
- return err;
}
-/* As above, but use the provided file descriptor. */
-__be32
-nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
- loff_t offset, struct kvec *vec, int vlen,
- unsigned long *count)
+/*
+ * Read data from a file. count must contain the requested read count
+ * on entry. On return, *count contains the number of bytes actually read.
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
{
- __be32 err;
+ struct file *file;
+ struct raparms *ra;
+ __be32 err;
+
+ err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra);
+ if (err)
+ return err;
+
+ err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
+
+ nfsd_put_tmp_read_open(file, ra);
- if (file) {
- err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
- NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE);
- if (err)
- goto out;
- err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
- } else /* Note file may still be NULL in NFSv4 special stateid case: */
- err = nfsd_read(rqstp, fhp, offset, vec, vlen, count);
-out:
return err;
}
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index fbe90bdb2214..91b6ae3f658b 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -70,10 +70,16 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
void nfsd_close(struct file *);
+struct raparms;
+__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *,
+ struct file **, struct raparms **);
+void nfsd_put_tmp_read_open(struct file *, struct raparms *);
+int nfsd_splice_read(struct svc_rqst *,
+ struct file *, loff_t, unsigned long *);
+int nfsd_readv(struct file *, loff_t, struct kvec *, int,
+ unsigned long *);
__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
loff_t, struct kvec *, int, unsigned long *);
-__be32 nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *,
- loff_t, struct kvec *, int, unsigned long *);
__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
loff_t, struct kvec *,int, unsigned long *, int *);
__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 5ea7df305083..18cbb6d9c8a9 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -58,7 +58,7 @@ struct nfsd4_compound_state {
/* For sessions DRC */
struct nfsd4_session *session;
struct nfsd4_slot *slot;
- __be32 *datap;
+ int data_offset;
size_t iovlen;
u32 minorversion;
__be32 status;
@@ -287,9 +287,8 @@ struct nfsd4_readdir {
struct svc_fh * rd_fhp; /* response */
struct readdir_cd common;
- __be32 * buffer;
- int buflen;
- __be32 * offset;
+ struct xdr_stream *xdr;
+ int cookie_offset;
};
struct nfsd4_release_lockowner {
@@ -506,9 +505,7 @@ struct nfsd4_compoundargs {
struct nfsd4_compoundres {
/* scratch variables for XDR encode */
- __be32 * p;
- __be32 * end;
- struct xdr_buf * xbuf;
+ struct xdr_stream xdr;
struct svc_rqst * rqstp;
u32 taglen;
@@ -538,6 +535,9 @@ static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
return argp->opcnt == resp->opcnt;
}
+int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op);
+void warn_on_nonidempotent_op(struct nfsd4_op *op);
+
#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
static inline void
@@ -563,10 +563,11 @@ int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
struct nfsd4_compoundres *);
__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
-void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
-__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- struct dentry *dentry, __be32 **buffer, int countp,
- u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
+void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op);
+__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
+ struct svc_fh *fhp, struct svc_export *exp,
+ struct dentry *dentry,
+ u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *,
struct nfsd4_setclientid *setclid);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index f3a82fbcae02..24978153c0c4 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -152,10 +152,10 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
*/
const struct file_operations nilfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl = nilfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = nilfs_compat_ioctl,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index b9c5726120e3..6252b173a465 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -298,19 +298,20 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
}
static ssize_t
-nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t size;
if (rw == WRITE)
return 0;
/* Needs synchronization with the cleaner */
- size = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ size = blockdev_direct_IO(rw, iocb, inode, iter, offset,
nilfs_get_block);
/*
@@ -319,7 +320,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
*/
if (unlikely((rw & WRITE) && size < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
nilfs_write_failed(mapping, end);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 86ddab916b66..5c9e2c81cb11 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2090,10 +2090,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
size_t count; /* after file limit checks */
ssize_t written, err;
- count = 0;
- err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
- if (err)
- return err;
+ count = iov_length(iov, nr_segs);
pos = *ppos;
/* We can write back this queue in page reclaim. */
current->backing_dev_info = mapping->backing_dev_info;
@@ -2202,8 +2199,8 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
const struct file_operations ntfs_file_ops = {
.llseek = generic_file_llseek, /* Seek inside file. */
- .read = do_sync_read, /* Read from file. */
- .aio_read = generic_file_aio_read, /* Async read from file. */
+ .read = new_sync_read, /* Read from file. */
+ .read_iter = generic_file_read_iter, /* Async read from file. */
#ifdef NTFS_RW
.write = do_sync_write, /* Write to file. */
.aio_write = ntfs_file_aio_write, /* Async write to file. */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index d310d12a9adc..4a231a166cf8 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -599,9 +599,8 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
static ssize_t ocfs2_direct_IO(int rw,
struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file)->i_mapping->host;
@@ -618,7 +617,7 @@ static ssize_t ocfs2_direct_IO(int rw,
return 0;
return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
- iov, offset, nr_segs,
+ iter, offset,
ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io, NULL, 0);
}
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index a68e07a9bd46..681691bc233a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1799,7 +1799,7 @@ int o2net_register_hb_callbacks(void)
/* ------------------------------------------------------------ */
-static int o2net_accept_one(struct socket *sock)
+static int o2net_accept_one(struct socket *sock, int *more)
{
int ret, slen;
struct sockaddr_in sin;
@@ -1810,6 +1810,7 @@ static int o2net_accept_one(struct socket *sock)
struct o2net_node *nn;
BUG_ON(sock == NULL);
+ *more = 0;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
if (ret)
@@ -1821,6 +1822,7 @@ static int o2net_accept_one(struct socket *sock)
if (ret < 0)
goto out;
+ *more = 1;
new_sock->sk->sk_allocation = GFP_ATOMIC;
ret = o2net_set_nodelay(new_sock);
@@ -1919,11 +1921,36 @@ out:
return ret;
}
+/*
+ * This function is invoked in response to one or more
+ * pending accepts at softIRQ level. We must drain the
+ * entire queue before returning.
+ */
+
static void o2net_accept_many(struct work_struct *work)
{
struct socket *sock = o2net_listen_sock;
- while (o2net_accept_one(sock) == 0)
+ int more;
+ int err;
+
+ /*
+ * It is critical to note that due to interrupt moderation
+ * at the network driver level, we can't assume to get a
+ * softIRQ for every single conn since tcp SYN packets
+ * can arrive back-to-back, and therefore many pending
+ * accepts may result in just 1 softIRQ. If we terminate
+ * the o2net_accept_one() loop upon seeing an err, what happens
+ * to the rest of the conns in the queue? If no new SYN
+ * arrives for hours, no softIRQ will be delivered,
+ * and the connections will just sit in the queue.
+ */
+
+ for (;;) {
+ err = o2net_accept_one(sock, &more);
+ if (!more)
+ break;
cond_resched();
+ }
}
static void o2net_listen_data_ready(struct sock *sk)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8eb6e5732d3b..2930e231f3f9 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2233,16 +2233,13 @@ out:
return ret;
}
-static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
int can_do_direct, has_refcount = 0;
ssize_t written = 0;
- size_t ocount; /* original count */
- size_t count; /* after file limit checks */
+ size_t count = iov_iter_count(from);
loff_t old_size, *ppos = &iocb->ki_pos;
u32 old_clusters;
struct file *file = iocb->ki_filp;
@@ -2256,7 +2253,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name,
- (unsigned int)nr_segs);
+ (unsigned int)from->nr_segs); /* GRRRRR */
if (iocb->ki_nbytes == 0)
return 0;
@@ -2354,29 +2351,21 @@ relock:
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
- ret = generic_segment_checks(iov, &nr_segs, &ocount,
- VERIFY_READ);
- if (ret)
- goto out_dio;
-
- count = ocount;
ret = generic_write_checks(file, ppos, &count,
S_ISBLK(inode->i_mode));
if (ret)
goto out_dio;
+ iov_iter_truncate(from, count);
if (direct_io) {
- written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
- count, ocount);
+ written = generic_file_direct_write(iocb, from, *ppos);
if (written < 0) {
ret = written;
goto out_dio;
}
} else {
- struct iov_iter from;
- iov_iter_init(&from, iov, nr_segs, count, 0);
current->backing_dev_info = file->f_mapping->backing_dev_info;
- written = generic_perform_write(file, &from, *ppos);
+ written = generic_perform_write(file, from, *ppos);
if (likely(written >= 0))
iocb->ki_pos = *ppos + written;
current->backing_dev_info = NULL;
@@ -2441,84 +2430,6 @@ out_sems:
return ret;
}
-static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
- struct file *out,
- struct splice_desc *sd)
-{
- int ret;
-
- ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
- sd->total_len, 0, NULL, NULL);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- return splice_from_pipe_feed(pipe, sd, pipe_to_file);
-}
-
-static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
- struct file *out,
- loff_t *ppos,
- size_t len,
- unsigned int flags)
-{
- int ret;
- struct address_space *mapping = out->f_mapping;
- struct inode *inode = mapping->host;
- struct splice_desc sd = {
- .total_len = len,
- .flags = flags,
- .pos = *ppos,
- .u.file = out,
- };
-
-
- trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- out->f_path.dentry->d_name.len,
- out->f_path.dentry->d_name.name, len);
-
- pipe_lock(pipe);
-
- splice_from_pipe_begin(&sd);
- do {
- ret = splice_from_pipe_next(pipe, &sd);
- if (ret <= 0)
- break;
-
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- ret = ocfs2_rw_lock(inode, 1);
- if (ret < 0)
- mlog_errno(ret);
- else {
- ret = ocfs2_splice_to_file(pipe, out, &sd);
- ocfs2_rw_unlock(inode, 1);
- }
- mutex_unlock(&inode->i_mutex);
- } while (ret > 0);
- splice_from_pipe_end(pipe, &sd);
-
- pipe_unlock(pipe);
-
- if (sd.num_spliced)
- ret = sd.num_spliced;
-
- if (ret > 0) {
- int err;
-
- err = generic_write_sync(out, *ppos, ret);
- if (err)
- ret = err;
- else
- *ppos += ret;
-
- balance_dirty_pages_ratelimited(mapping);
- }
-
- return ret;
-}
-
static ssize_t ocfs2_file_splice_read(struct file *in,
loff_t *ppos,
struct pipe_inode_info *pipe,
@@ -2534,7 +2445,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
in->f_path.dentry->d_name.name, len);
/*
- * See the comment in ocfs2_file_aio_read()
+ * See the comment in ocfs2_file_read_iter()
*/
ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
if (ret < 0) {
@@ -2549,10 +2460,8 @@ bail:
return ret;
}
-static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
{
int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
struct file *filp = iocb->ki_filp;
@@ -2561,7 +2470,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
filp->f_path.dentry->d_name.len,
- filp->f_path.dentry->d_name.name, nr_segs);
+ filp->f_path.dentry->d_name.name,
+ to->nr_segs); /* GRRRRR */
if (!inode) {
@@ -2606,13 +2516,13 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
}
ocfs2_inode_unlock(inode, lock_level);
- ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
+ ret = generic_file_read_iter(iocb, to);
trace_generic_file_aio_read_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
- /* see ocfs2_file_aio_write */
+ /* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
rw_level = -1;
have_alloc_sem = 0;
@@ -2705,14 +2615,14 @@ const struct inode_operations ocfs2_special_file_iops = {
*/
const struct file_operations ocfs2_fops = {
.llseek = ocfs2_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
@@ -2720,7 +2630,7 @@ const struct file_operations ocfs2_fops = {
.lock = ocfs2_lock,
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
- .splice_write = ocfs2_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
};
@@ -2753,21 +2663,21 @@ const struct file_operations ocfs2_dops = {
*/
const struct file_operations ocfs2_fops_no_plocks = {
.llseek = ocfs2_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
- .splice_write = ocfs2_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
};
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 54d57d6ba68d..902e88527fce 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -337,10 +337,10 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
const struct file_operations omfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
diff --git a/fs/open.c b/fs/open.c
index 9d64679cec73..36662d036237 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -725,6 +725,12 @@ static int do_dentry_open(struct file *f,
}
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(inode);
+ if ((f->f_mode & FMODE_READ) &&
+ likely(f->f_op->read || f->f_op->aio_read || f->f_op->read_iter))
+ f->f_mode |= FMODE_CAN_READ;
+ if ((f->f_mode & FMODE_WRITE) &&
+ likely(f->f_op->write || f->f_op->aio_write || f->f_op->write_iter))
+ f->f_mode |= FMODE_CAN_WRITE;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
diff --git a/fs/pipe.c b/fs/pipe.c
index 034bffac3f97..21981e58e2a6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -116,50 +116,6 @@ void pipe_wait(struct pipe_inode_info *pipe)
pipe_lock(pipe);
}
-static int
-pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
- int atomic)
-{
- unsigned long copy;
-
- while (len > 0) {
- while (!iov->iov_len)
- iov++;
- copy = min_t(unsigned long, len, iov->iov_len);
-
- if (atomic) {
- if (__copy_from_user_inatomic(to, iov->iov_base, copy))
- return -EFAULT;
- } else {
- if (copy_from_user(to, iov->iov_base, copy))
- return -EFAULT;
- }
- to += copy;
- len -= copy;
- iov->iov_base += copy;
- iov->iov_len -= copy;
- }
- return 0;
-}
-
-/*
- * Pre-fault in the user memory, so we can use atomic copies.
- */
-static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
-{
- while (!iov->iov_len)
- iov++;
-
- while (len > 0) {
- unsigned long this_len;
-
- this_len = min_t(unsigned long, len, iov->iov_len);
- fault_in_pages_readable(iov->iov_base, this_len);
- len -= this_len;
- iov++;
- }
-}
-
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -271,24 +227,18 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = {
};
static ssize_t
-pipe_read(struct kiocb *iocb, const struct iovec *_iov,
- unsigned long nr_segs, loff_t pos)
+pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
+ size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
int do_wakeup;
ssize_t ret;
- struct iovec *iov = (struct iovec *)_iov;
- size_t total_len;
- struct iov_iter iter;
- total_len = iov_length(iov, nr_segs);
/* Null read succeeds. */
if (unlikely(total_len == 0))
return 0;
- iov_iter_init(&iter, iov, nr_segs, total_len, 0);
-
do_wakeup = 0;
ret = 0;
__pipe_lock(pipe);
@@ -312,7 +262,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
break;
}
- written = copy_page_to_iter(buf->page, buf->offset, chars, &iter);
+ written = copy_page_to_iter(buf->page, buf->offset, chars, to);
if (unlikely(written < chars)) {
if (!ret)
ret = -EFAULT;
@@ -386,24 +336,19 @@ static inline int is_packetized(struct file *file)
}
static ssize_t
-pipe_write(struct kiocb *iocb, const struct iovec *_iov,
- unsigned long nr_segs, loff_t ppos)
+pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
- ssize_t ret;
- int do_wakeup;
- struct iovec *iov = (struct iovec *)_iov;
- size_t total_len;
+ ssize_t ret = 0;
+ int do_wakeup = 0;
+ size_t total_len = iov_iter_count(from);
ssize_t chars;
- total_len = iov_length(iov, nr_segs);
/* Null write succeeds. */
if (unlikely(total_len == 0))
return 0;
- do_wakeup = 0;
- ret = 0;
__pipe_lock(pipe);
if (!pipe->readers) {
@@ -422,38 +367,19 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
int offset = buf->offset + buf->len;
if (ops->can_merge && offset + chars <= PAGE_SIZE) {
- int error, atomic = 1;
- void *addr;
-
- error = ops->confirm(pipe, buf);
+ int error = ops->confirm(pipe, buf);
if (error)
goto out;
- iov_fault_in_pages_read(iov, chars);
-redo1:
- if (atomic)
- addr = kmap_atomic(buf->page);
- else
- addr = kmap(buf->page);
- error = pipe_iov_copy_from_user(offset + addr, iov,
- chars, atomic);
- if (atomic)
- kunmap_atomic(addr);
- else
- kunmap(buf->page);
- ret = error;
- do_wakeup = 1;
- if (error) {
- if (atomic) {
- atomic = 0;
- goto redo1;
- }
+ ret = copy_page_from_iter(buf->page, offset, chars, from);
+ if (unlikely(ret < chars)) {
+ ret = -EFAULT; /* 'error' is block-local; report the fault via ret */
goto out;
}
+ do_wakeup = 1;
buf->len += chars;
- total_len -= chars;
ret = chars;
- if (!total_len)
+ if (!iov_iter_count(from))
goto out;
}
}
@@ -472,8 +398,7 @@ redo1:
int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
struct pipe_buffer *buf = pipe->bufs + newbuf;
struct page *page = pipe->tmp_page;
- char *src;
- int error, atomic = 1;
+ int copied;
if (!page) {
page = alloc_page(GFP_HIGHUSER);
@@ -489,40 +414,19 @@ redo1:
* FIXME! Is this really true?
*/
do_wakeup = 1;
- chars = PAGE_SIZE;
- if (chars > total_len)
- chars = total_len;
-
- iov_fault_in_pages_read(iov, chars);
-redo2:
- if (atomic)
- src = kmap_atomic(page);
- else
- src = kmap(page);
-
- error = pipe_iov_copy_from_user(src, iov, chars,
- atomic);
- if (atomic)
- kunmap_atomic(src);
- else
- kunmap(page);
-
- if (unlikely(error)) {
- if (atomic) {
- atomic = 0;
- goto redo2;
- }
+ copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
+ if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
if (!ret)
- ret = error;
+ ret = -EFAULT;
break;
}
- ret += chars;
+ ret += copied;
/* Insert it into the buffer array */
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
- buf->len = chars;
+ buf->len = copied;
buf->flags = 0;
if (is_packetized(filp)) {
buf->ops = &packet_pipe_buf_ops;
@@ -531,8 +435,7 @@ redo2:
pipe->nrbufs = ++bufs;
pipe->tmp_page = NULL;
- total_len -= chars;
- if (!total_len)
+ if (!iov_iter_count(from))
break;
}
if (bufs < pipe->buffers)
@@ -1044,10 +947,10 @@ err:
const struct file_operations pipefifo_fops = {
.open = fifo_open,
.llseek = no_llseek,
- .read = do_sync_read,
- .aio_read = pipe_read,
- .write = do_sync_write,
- .aio_write = pipe_write,
+ .read = new_sync_read,
+ .read_iter = pipe_read,
+ .write = new_sync_write,
+ .write_iter = pipe_write,
.poll = pipe_poll,
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2b363e23f36e..ff3f0b3cfdb3 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -278,6 +278,17 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
return ret;
}
+static int quota_rmxquota(struct super_block *sb, void __user *addr)
+{
+ __u32 flags;
+
+ if (copy_from_user(&flags, addr, sizeof(flags)))
+ return -EFAULT;
+ if (!sb->s_qcop->rm_xquota)
+ return -ENOSYS;
+ return sb->s_qcop->rm_xquota(sb, flags);
+}
+
/* Copy parameters and call proper function */
static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
void __user *addr, struct path *path)
@@ -316,8 +327,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
return sb->s_qcop->quota_sync(sb, type);
case Q_XQUOTAON:
case Q_XQUOTAOFF:
- case Q_XQUOTARM:
return quota_setxstate(sb, cmd, addr);
+ case Q_XQUOTARM:
+ return quota_rmxquota(sb, addr);
case Q_XGETQSTAT:
return quota_getxstate(sb, addr);
case Q_XGETQSTATV:
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 1e56a4e8cf7c..4f56de822d2f 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -31,14 +31,14 @@
#include "internal.h"
const struct file_operations ramfs_file_operations = {
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = noop_fsync,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0b3d8e4cb2fa..dda012ad4208 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -37,13 +37,13 @@ static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
const struct file_operations ramfs_file_operations = {
.mmap = ramfs_nommu_mmap,
.get_unmapped_area = ramfs_nommu_get_unmapped_area,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.fsync = noop_fsync,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index 31c6efa43183..009d8542a889 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -25,11 +25,12 @@
typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
unsigned long, loff_t);
+typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
const struct file_operations generic_ro_fops = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
.mmap = generic_file_readonly_mmap,
.splice_read = generic_file_splice_read,
};
@@ -390,13 +391,34 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
EXPORT_SYMBOL(do_sync_read);
+ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+ struct iovec iov = { .iov_base = buf, .iov_len = len };
+ struct kiocb kiocb;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = *ppos;
+ kiocb.ki_nbytes = len;
+ iov_iter_init(&iter, READ, &iov, 1, len);
+
+ ret = filp->f_op->read_iter(&kiocb, &iter);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+
+EXPORT_SYMBOL(new_sync_read);
+
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_READ))
return -EBADF;
- if (!file->f_op->read && !file->f_op->aio_read)
+ if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
return -EFAULT;
@@ -406,8 +428,10 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
count = ret;
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
- else
+ else if (file->f_op->aio_read)
ret = do_sync_read(file, buf, count, pos);
+ else
+ ret = new_sync_read(file, buf, count, pos);
if (ret > 0) {
fsnotify_access(file);
add_rchar(current, ret);
@@ -439,13 +463,34 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
EXPORT_SYMBOL(do_sync_write);
+ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
+{
+ struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
+ struct kiocb kiocb;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = *ppos;
+ kiocb.ki_nbytes = len;
+ iov_iter_init(&iter, WRITE, &iov, 1, len);
+
+ ret = filp->f_op->write_iter(&kiocb, &iter);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+
+EXPORT_SYMBOL(new_sync_write);
+
ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
{
mm_segment_t old_fs;
const char __user *p;
ssize_t ret;
- if (!file->f_op->write && !file->f_op->aio_write)
+ if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
old_fs = get_fs();
@@ -455,8 +500,10 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t
count = MAX_RW_COUNT;
if (file->f_op->write)
ret = file->f_op->write(file, p, count, pos);
- else
+ else if (file->f_op->aio_write)
ret = do_sync_write(file, p, count, pos);
+ else
+ ret = new_sync_write(file, p, count, pos);
set_fs(old_fs);
if (ret > 0) {
fsnotify_modify(file);
@@ -472,7 +519,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
- if (!file->f_op->write && !file->f_op->aio_write)
+ if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT;
@@ -483,8 +530,10 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
file_start_write(file);
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
- else
+ else if (file->f_op->aio_write)
ret = do_sync_write(file, buf, count, pos);
+ else
+ ret = new_sync_write(file, buf, count, pos);
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
@@ -601,6 +650,25 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
}
EXPORT_SYMBOL(iov_shorten);
+static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
+ unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
+{
+ struct kiocb kiocb;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = *ppos;
+ kiocb.ki_nbytes = len;
+
+ iov_iter_init(&iter, rw, iov, nr_segs, len);
+ ret = fn(&kiocb, &iter);
+ if (ret == -EIOCBQUEUED)
+ ret = wait_on_sync_kiocb(&kiocb);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+
static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
{
@@ -738,6 +806,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
ssize_t ret;
io_fn_t fn;
iov_fn_t fnv;
+ iter_fn_t iter_fn;
ret = rw_copy_check_uvector(type, uvector, nr_segs,
ARRAY_SIZE(iovstack), iovstack, &iov);
@@ -753,13 +822,18 @@ static ssize_t do_readv_writev(int type, struct file *file,
if (type == READ) {
fn = file->f_op->read;
fnv = file->f_op->aio_read;
+ iter_fn = file->f_op->read_iter;
} else {
fn = (io_fn_t)file->f_op->write;
fnv = file->f_op->aio_write;
+ iter_fn = file->f_op->write_iter;
file_start_write(file);
}
- if (fnv)
+ if (iter_fn)
+ ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
+ pos, iter_fn);
+ else if (fnv)
ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
pos, fnv);
else
@@ -785,7 +859,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
{
if (!(file->f_mode & FMODE_READ))
return -EBADF;
- if (!file->f_op->aio_read && !file->f_op->read)
+ if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
return do_readv_writev(READ, file, vec, vlen, pos);
@@ -798,7 +872,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
{
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
- if (!file->f_op->aio_write && !file->f_op->write)
+ if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
return do_readv_writev(WRITE, file, vec, vlen, pos);
@@ -912,6 +986,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
ssize_t ret;
io_fn_t fn;
iov_fn_t fnv;
+ iter_fn_t iter_fn;
ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
UIO_FASTIOV, iovstack, &iov);
@@ -927,13 +1002,18 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
if (type == READ) {
fn = file->f_op->read;
fnv = file->f_op->aio_read;
+ iter_fn = file->f_op->read_iter;
} else {
fn = (io_fn_t)file->f_op->write;
fnv = file->f_op->aio_write;
+ iter_fn = file->f_op->write_iter;
file_start_write(file);
}
- if (fnv)
+ if (iter_fn)
+ ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
+ pos, iter_fn);
+ else if (fnv)
ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
pos, fnv);
else
@@ -964,7 +1044,7 @@ static size_t compat_readv(struct file *file,
goto out;
ret = -EINVAL;
- if (!file->f_op->aio_read && !file->f_op->read)
+ if (!(file->f_mode & FMODE_CAN_READ))
goto out;
ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
@@ -1041,7 +1121,7 @@ static size_t compat_writev(struct file *file,
goto out;
ret = -EINVAL;
- if (!file->f_op->aio_write && !file->f_op->write)
+ if (!(file->f_mode & FMODE_CAN_WRITE))
goto out;
ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 1bcffeab713c..dc198bc64c61 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -50,8 +50,10 @@ static inline void get_bit_address(struct super_block *s,
unsigned int *bmap_nr,
unsigned int *offset)
{
- /* It is in the bitmap block number equal to the block
- * number divided by the number of bits in a block. */
+ /*
+ * It is in the bitmap block number equal to the block
+ * number divided by the number of bits in a block.
+ */
*bmap_nr = block >> (s->s_blocksize_bits + 3);
/* Within that bitmap block it is located at bit offset *offset. */
*offset = block & ((s->s_blocksize << 3) - 1);
@@ -71,10 +73,12 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
get_bit_address(s, block, &bmap, &offset);
- /* Old format filesystem? Unlikely, but the bitmaps are all up front so
- * we need to account for it. */
+ /*
+ * Old format filesystem? Unlikely, but the bitmaps are all
+ * up front so we need to account for it.
+ */
if (unlikely(test_bit(REISERFS_OLD_FORMAT,
- &(REISERFS_SB(s)->s_properties)))) {
+ &REISERFS_SB(s)->s_properties))) {
b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
if (block >= bmap1 &&
block <= bmap1 + bmap_count) {
@@ -108,8 +112,11 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
return 1;
}
-/* searches in journal structures for a given block number (bmap, off). If block
- is found in reiserfs journal it suggests next free block candidate to test. */
+/*
+ * Searches in journal structures for a given block number (bmap, off).
+ * If block is found in reiserfs journal it suggests next free block
+ * candidate to test.
+ */
static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
int off, int *next)
{
@@ -120,7 +127,7 @@ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
*next = tmp;
PROC_INFO_INC(s, scan_bitmap.in_journal_hint);
} else {
- (*next) = off + 1; /* inc offset to avoid looping. */
+ (*next) = off + 1; /* inc offset to avoid looping. */
PROC_INFO_INC(s, scan_bitmap.in_journal_nohint);
}
PROC_INFO_INC(s, scan_bitmap.retry);
@@ -129,8 +136,10 @@ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
return 0;
}
-/* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap
- * block; */
+/*
+ * Searches for a window of zero bits with given minimum and maximum
+ * lengths in one bitmap block
+ */
static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
unsigned int bmap_n, int *beg, int boundary,
int min, int max, int unfm)
@@ -145,10 +154,6 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of "
"range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1);
PROC_INFO_INC(s, scan_bitmap.bmap);
-/* this is unclear and lacks comments, explain how journal bitmaps
- work here for the reader. Convey a sense of the design here. What
- is a window? */
-/* - I mean `a window of zero bits' as in description of this function - Zam. */
if (!bi) {
reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer "
@@ -161,18 +166,21 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
return 0;
while (1) {
- cont:
+cont:
if (bi->free_count < min) {
brelse(bh);
- return 0; // No free blocks in this bitmap
+ return 0; /* No free blocks in this bitmap */
}
/* search for a first zero bit -- beginning of a window */
*beg = reiserfs_find_next_zero_le_bit
((unsigned long *)(bh->b_data), boundary, *beg);
- if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block
- * cannot contain a zero window of minimum size */
+ /*
+ * search for a zero bit fails or the rest of bitmap block
+ * cannot contain a zero window of minimum size
+ */
+ if (*beg + min > boundary) {
brelse(bh);
return 0;
}
@@ -186,49 +194,75 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
next = end;
break;
}
- /* finding the other end of zero bit window requires looking into journal structures (in
- * case of searching for free blocks for unformatted nodes) */
+
+ /*
+ * finding the other end of zero bit window requires
+ * looking into journal structures (in case of
+ * searching for free blocks for unformatted nodes)
+ */
if (unfm && is_block_in_journal(s, bmap_n, end, &next))
break;
}
- /* now (*beg) points to beginning of zero bits window,
- * (end) points to one bit after the window end */
- if (end - *beg >= min) { /* it seems we have found window of proper size */
+ /*
+ * now (*beg) points to beginning of zero bits window,
+ * (end) points to one bit after the window end
+ */
+
+ /* found window of proper size */
+ if (end - *beg >= min) {
int i;
reiserfs_prepare_for_journal(s, bh, 1);
- /* try to set all blocks used checking are they still free */
+ /*
+ * try to mark all blocks as used, checking that
+ * they are still free
+ */
for (i = *beg; i < end; i++) {
- /* It seems that we should not check in journal again. */
+ /* Don't check in journal again. */
if (reiserfs_test_and_set_le_bit
(i, bh->b_data)) {
- /* bit was set by another process
- * while we slept in prepare_for_journal() */
+ /*
+ * bit was set by another process while
+ * we slept in prepare_for_journal()
+ */
PROC_INFO_INC(s, scan_bitmap.stolen);
- if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks,
- * if length of this set is more or equal to `min' */
+
+ /*
+ * we can continue with smaller set
+ * of allocated blocks, if length of
+ * this set is greater than or equal to `min'
+ */
+ if (i >= *beg + min) {
end = i;
break;
}
- /* otherwise we clear all bit were set ... */
+
+ /*
+ * otherwise we clear all the bits
+ * that were set ...
+ */
while (--i >= *beg)
reiserfs_clear_le_bit
(i, bh->b_data);
reiserfs_restore_prepared_buffer(s, bh);
*beg = org;
- /* ... and search again in current block from beginning */
+
+ /*
+ * Search again in current block
+ * from beginning
+ */
goto cont;
}
}
bi->free_count -= (end - *beg);
- journal_mark_dirty(th, s, bh);
+ journal_mark_dirty(th, bh);
brelse(bh);
/* free block count calculation */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
1);
PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
- journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
return end - (*beg);
} else {
@@ -267,11 +301,13 @@ static inline int block_group_used(struct super_block *s, u32 id)
int bm = bmap_hash_id(s, id);
struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm];
- /* If we don't have cached information on this bitmap block, we're
+ /*
+ * If we don't have cached information on this bitmap block, we're
* going to have to load it later anyway. Loading it here allows us
* to make a better decision. This favors long-term performance gain
* with a better on-disk layout vs. a short term gain of skipping the
- * read and potentially having a bad placement. */
+ * read and potentially having a bad placement.
+ */
if (info->free_count == UINT_MAX) {
struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm);
brelse(bh);
@@ -304,25 +340,26 @@ __le32 reiserfs_choose_packing(struct inode * dir)
return packing;
}
-/* Tries to find contiguous zero bit window (given size) in given region of
- * bitmap and place new blocks there. Returns number of allocated blocks. */
+/*
+ * Tries to find contiguous zero bit window (given size) in given region of
+ * bitmap and place new blocks there. Returns number of allocated blocks.
+ */
static int scan_bitmap(struct reiserfs_transaction_handle *th,
b_blocknr_t * start, b_blocknr_t finish,
int min, int max, int unfm, sector_t file_block)
{
int nr_allocated = 0;
struct super_block *s = th->t_super;
- /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr
- * - Hans, it is not a block number - Zam. */
-
unsigned int bm, off;
unsigned int end_bm, end_off;
unsigned int off_max = s->s_blocksize << 3;
BUG_ON(!th->t_trans_id);
PROC_INFO_INC(s, scan_bitmap.call);
+
+ /* No point in looking for more free blocks */
if (SB_FREE_BLOCKS(s) <= 0)
- return 0; // No point in looking for more free blocks
+ return 0;
get_bit_address(s, *start, &bm, &off);
get_bit_address(s, finish, &end_bm, &end_off);
@@ -331,7 +368,8 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th,
if (end_bm > reiserfs_bmap_count(s))
end_bm = reiserfs_bmap_count(s);
- /* When the bitmap is more than 10% free, anyone can allocate.
+ /*
+ * When the bitmap is more than 10% free, anyone can allocate.
* When it's less than 10% free, only files that already use the
* bitmap are allowed. Once we pass 80% full, this restriction
* is lifted.
@@ -369,7 +407,7 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th,
nr_allocated =
scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
- ret:
+ret:
*start = bm * off_max + off;
return nr_allocated;
@@ -411,14 +449,14 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
"block %lu: bit already cleared", block);
}
apbi[nr].free_count++;
- journal_mark_dirty(th, s, bmbh);
+ journal_mark_dirty(th, bmbh);
brelse(bmbh);
reiserfs_prepare_for_journal(s, sbh, 1);
/* update super block */
set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
- journal_mark_dirty(th, s, sbh);
+ journal_mark_dirty(th, sbh);
if (for_unformatted) {
int depth = reiserfs_write_unlock_nested(s);
dquot_free_block_nodirty(inode, 1);
@@ -483,7 +521,7 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th,
if (dirty)
reiserfs_update_sd(th, inode);
ei->i_prealloc_block = save;
- list_del_init(&(ei->i_prealloc_list));
+ list_del_init(&ei->i_prealloc_list);
}
/* FIXME: It should be inline function */
@@ -529,7 +567,8 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options)
{
char *this_char, *value;
- REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */
+ /* clear default settings */
+ REISERFS_SB(s)->s_alloc_options.bits = 0;
while ((this_char = strsep(&options, ":")) != NULL) {
if ((value = strchr(this_char, '=')) != NULL)
@@ -731,7 +770,7 @@ static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
hash_in = (char *)&hint->key.k_dir_id;
} else {
if (!hint->inode) {
- //hint->search_start = hint->beg;
+ /*hint->search_start = hint->beg;*/
hash_in = (char *)&hint->key.k_dir_id;
} else
if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
@@ -785,7 +824,8 @@ static void oid_groups(reiserfs_blocknr_hint_t * hint)
dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
- /* keep the root dir and it's first set of subdirs close to
+ /*
+ * keep the root dir and its first set of subdirs close to
* the start of the disk
*/
if (dirid <= 2)
@@ -799,7 +839,8 @@ static void oid_groups(reiserfs_blocknr_hint_t * hint)
}
}
-/* returns 1 if it finds an indirect item and gets valid hint info
+/*
+ * returns 1 if it finds an indirect item and gets valid hint info
* from it, otherwise 0
*/
static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
@@ -811,25 +852,29 @@ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
__le32 *item;
int ret = 0;
- if (!hint->path) /* reiserfs code can call this function w/o pointer to path
- * structure supplied; then we rely on supplied search_start */
+ /*
+ * reiserfs code can call this function w/o pointer to path
+ * structure supplied; then we rely on supplied search_start
+ */
+ if (!hint->path)
return 0;
path = hint->path;
bh = get_last_bh(path);
RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor");
- ih = get_ih(path);
+ ih = tp_item_head(path);
pos_in_item = path->pos_in_item;
- item = get_item(path);
+ item = tp_item_body(path);
hint->search_start = bh->b_blocknr;
+ /*
+ * for indirect item: go to left and look for the first non-hole entry
+ * in the indirect item
+ */
if (!hint->formatted_node && is_indirect_le_ih(ih)) {
- /* for indirect item: go to left and look for the first non-hole entry
- in the indirect item */
if (pos_in_item == I_UNFM_NUM(ih))
pos_in_item--;
-// pos_in_item = I_UNFM_NUM (ih) - 1;
while (pos_in_item >= 0) {
int t = get_block_num(item, pos_in_item);
if (t) {
@@ -845,10 +890,12 @@ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
return ret;
}
-/* should be, if formatted node, then try to put on first part of the device
- specified as number of percent with mount option device, else try to put
- on last of device. This is not to say it is good code to do so,
- but the effect should be measured. */
+/*
+ * should be, if formatted node, then try to put on first part of the device
+ * specified as number of percent with mount option device, else try to put
+ * on last of device. This is not to say it is good code to do so,
+ * but the effect should be measured.
+ */
static inline void set_border_in_hint(struct super_block *s,
reiserfs_blocknr_hint_t * hint)
{
@@ -974,21 +1021,27 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint,
set_border_in_hint(s, hint);
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- /* whenever we create a new directory, we displace it. At first we will
- hash for location, later we might look for a moderately empty place for
- it */
+ /*
+ * whenever we create a new directory, we displace it. At first
+ * we will hash for location, later we might look for a moderately
+ * empty place for it
+ */
if (displacing_new_packing_localities(s)
&& hint->th->displace_new_blocks) {
displace_new_packing_locality(hint);
- /* we do not continue determine_search_start,
- * if new packing locality is being displaced */
+ /*
+ * we do not continue determine_search_start,
+ * if new packing locality is being displaced
+ */
return;
}
#endif
- /* all persons should feel encouraged to add more special cases here and
- * test them */
+ /*
+ * all persons should feel encouraged to add more special cases
+ * here and test them
+ */
if (displacing_large_files(s) && !hint->formatted_node
&& this_blocknr_allocation_would_make_it_a_large_file(hint)) {
@@ -996,8 +1049,10 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint,
return;
}
- /* if none of our special cases is relevant, use the left neighbor in the
- tree order of the new node we are allocating for */
+ /*
+ * if none of our special cases is relevant, use the left
+ * neighbor in the tree order of the new node we are allocating for
+ */
if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) {
hash_formatted_node(hint);
return;
@@ -1005,10 +1060,13 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint,
unfm_hint = get_left_neighbor(hint);
- /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
- new blocks are displaced based on directory ID. Also, if suggested search_start
- is less than last preallocated block, we start searching from it, assuming that
- HDD dataflow is faster in forward direction */
+ /*
+ * Mimic old block allocator behaviour, that is if VFS allowed for
+ * preallocation, new blocks are displaced based on directory ID.
+ * Also, if suggested search_start is less than last preallocated
+ * block, we start searching from it, assuming that HDD dataflow
+ * is faster in forward direction
+ */
if (TEST_OPTION(old_way, s)) {
if (!hint->formatted_node) {
if (!reiserfs_hashed_relocation(s))
@@ -1037,11 +1095,13 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint,
TEST_OPTION(old_hashed_relocation, s)) {
old_hashed_relocation(hint);
}
+
/* new_hashed_relocation works with both formatted/unformatted nodes */
if ((!unfm_hint || hint->formatted_node) &&
TEST_OPTION(new_hashed_relocation, s)) {
new_hashed_relocation(hint);
}
+
/* dirid grouping works only on unformatted nodes */
if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
dirid_groups(hint);
@@ -1079,8 +1139,6 @@ static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
return CARRY_ON;
}
-/* XXX I know it could be merged with upper-level function;
- but may be result function would be too complex. */
static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
b_blocknr_t * new_blocknrs,
b_blocknr_t start,
@@ -1108,7 +1166,10 @@ static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
/* do we have something to fill prealloc. array also ? */
if (nr_allocated > 0) {
- /* it means prealloc_size was greater that 0 and we do preallocation */
+ /*
+ * it means prealloc_size was greater than 0 and
+ * we do preallocation
+ */
list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
&SB_JOURNAL(hint->th->t_super)->
j_prealloc_list);
@@ -1176,7 +1237,8 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
start = 0;
finish = hint->beg;
break;
- default: /* We've tried searching everywhere, not enough space */
+ default:
+ /* We've tried searching everywhere, not enough space */
/* Free the blocks */
if (!hint->formatted_node) {
#ifdef REISERQUOTA_DEBUG
@@ -1261,8 +1323,11 @@ static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint,
return amount_needed;
}
-int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, int amount_needed, int reserved_by_us /* Amount of blocks we have
- already reserved */ )
+int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
+ b_blocknr_t *new_blocknrs,
+ int amount_needed,
+ /* Amount of blocks we have already reserved */
+ int reserved_by_us)
{
int initial_amount_needed = amount_needed;
int ret;
@@ -1274,15 +1339,21 @@ int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new
return NO_DISK_SPACE;
/* should this be if !hint->inode && hint->preallocate? */
/* do you mean hint->formatted_node can be removed ? - Zam */
- /* hint->formatted_node cannot be removed because we try to access
- inode information here, and there is often no inode assotiated with
- metadata allocations - green */
+ /*
+ * hint->formatted_node cannot be removed because we try to access
+ * inode information here, and there is often no inode associated with
+ * metadata allocations - green
+ */
if (!hint->formatted_node && hint->preallocate) {
amount_needed = use_preallocated_list_if_available
(hint, new_blocknrs, amount_needed);
- if (amount_needed == 0) /* all blocknrs we need we got from
- prealloc. list */
+
+ /*
+ * We have all the block numbers we need from the
+ * prealloc list
+ */
+ if (amount_needed == 0)
return CARRY_ON;
new_blocknrs += (initial_amount_needed - amount_needed);
}
@@ -1296,10 +1367,12 @@ int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new
ret = blocknrs_and_prealloc_arrays_from_search_start
(hint, new_blocknrs, amount_needed);
- /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we
- * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second
- * variant) */
-
+ /*
+ * We used prealloc. list to fill (partially) new_blocknrs array.
+ * If final allocation fails we need to return blocks back to
+ * prealloc. list or just free them. -- Zam (I chose second
+ * variant)
+ */
if (ret != CARRY_ON) {
while (amount_needed++ < initial_amount_needed) {
reiserfs_free_block(hint->th, hint->inode,
@@ -1338,10 +1411,12 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap;
struct buffer_head *bh;
- /* Way old format filesystems had the bitmaps packed up front.
- * I doubt there are any of these left, but just in case... */
+ /*
+ * Way old format filesystems had the bitmaps packed up front.
+ * I doubt there are any of these left, but just in case...
+ */
if (unlikely(test_bit(REISERFS_OLD_FORMAT,
- &(REISERFS_SB(sb)->s_properties))))
+ &REISERFS_SB(sb)->s_properties)))
block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap;
else if (bitmap == 0)
block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index af677353a3f5..d9f5a60dd59b 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -59,7 +59,10 @@ static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *d
int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
{
- struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */
+
+ /* key of current position in the directory (key of directory entry) */
+ struct cpu_key pos_key;
+
INITIALIZE_PATH(path_to_entry);
struct buffer_head *bh;
int item_num, entry_num;
@@ -77,21 +80,28 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
reiserfs_check_lock_depth(inode->i_sb, "readdir");
- /* form key for search the next directory entry using f_pos field of
- file structure */
+ /*
+ * form the key for finding the next directory entry using
+ * the f_pos field of the file structure
+ */
make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
next_pos = cpu_key_k_offset(&pos_key);
path_to_entry.reada = PATH_READA;
while (1) {
- research:
- /* search the directory item, containing entry with specified key */
+research:
+ /*
+ * search the directory item, containing entry with
+ * specified key
+ */
search_res =
search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
&de);
if (search_res == IO_ERROR) {
- // FIXME: we could just skip part of directory which could
- // not be read
+ /*
+ * FIXME: we could just skip part of directory
+ * which could not be read
+ */
ret = -EIO;
goto out;
}
@@ -102,41 +112,49 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
store_ih(&tmp_ih, ih);
/* we must have found item, that is item of this directory, */
- RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key),
+ RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key),
"vs-9000: found item %h does not match to dir we readdir %K",
ih, &pos_key);
RFALSE(item_num > B_NR_ITEMS(bh) - 1,
"vs-9005 item_num == %d, item amount == %d",
item_num, B_NR_ITEMS(bh));
- /* and entry must be not more than number of entries in the item */
- RFALSE(I_ENTRY_COUNT(ih) < entry_num,
+ /*
+ * and entry must be not more than number of entries
+ * in the item
+ */
+ RFALSE(ih_entry_count(ih) < entry_num,
"vs-9010: entry number is too big %d (%d)",
- entry_num, I_ENTRY_COUNT(ih));
+ entry_num, ih_entry_count(ih));
+ /*
+ * go through all entries in the directory item beginning
+ * from the entry, that has been found
+ */
if (search_res == POSITION_FOUND
- || entry_num < I_ENTRY_COUNT(ih)) {
- /* go through all entries in the directory item beginning from the entry, that has been found */
+ || entry_num < ih_entry_count(ih)) {
struct reiserfs_de_head *deh =
B_I_DEH(bh, ih) + entry_num;
- for (; entry_num < I_ENTRY_COUNT(ih);
+ for (; entry_num < ih_entry_count(ih);
entry_num++, deh++) {
int d_reclen;
char *d_name;
ino_t d_ino;
loff_t cur_pos = deh_offset(deh);
+ /* it is hidden entry */
if (!de_visible(deh))
- /* it is hidden entry */
continue;
d_reclen = entry_length(bh, ih, entry_num);
d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
if (d_reclen <= 0 ||
d_name + d_reclen > bh->b_data + bh->b_size) {
- /* There is corrupted data in entry,
- * We'd better stop here */
+ /*
+ * There is corrupted data in entry,
+ * We'd better stop here
+ */
pathrelse(&path_to_entry);
ret = -EIO;
goto out;
@@ -145,10 +163,10 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
if (!d_name[d_reclen - 1])
d_reclen = strlen(d_name);
+ /* too big to send back to VFS */
if (d_reclen >
REISERFS_MAX_NAME(inode->i_sb->
s_blocksize)) {
- /* too big to send back to VFS */
continue;
}
@@ -173,10 +191,14 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
goto research;
}
}
- // Note, that we copy name to user space via temporary
- // buffer (local_buf) because filldir will block if
- // user space buffer is swapped out. At that time
- // entry can move to somewhere else
+
+ /*
+ * Note, that we copy name to user space via
+ * temporary buffer (local_buf) because
+ * filldir will block if user space buffer is
+ * swapped out. At that time entry can move to
+ * somewhere else
+ */
memcpy(local_buf, d_name, d_reclen);
/*
@@ -209,22 +231,26 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
} /* for */
}
+ /* end of directory has been reached */
if (item_num != B_NR_ITEMS(bh) - 1)
- // end of directory has been reached
goto end;
- /* item we went through is last item of node. Using right
- delimiting key check is it directory end */
+ /*
+ * item we went through is the last item of the node. Use the right
+ * delimiting key to check whether it is the directory end
+ */
rkey = get_rkey(&path_to_entry, inode->i_sb);
if (!comp_le_keys(rkey, &MIN_KEY)) {
- /* set pos_key to key, that is the smallest and greater
- that key of the last entry in the item */
+ /*
+ * set pos_key to the smallest key that is greater
+ * than the key of the last entry in the item
+ */
set_cpu_key_k_offset(&pos_key, next_pos);
continue;
}
+ /* end of directory has been reached */
if (COMP_SHORT_KEYS(rkey, &pos_key)) {
- // end of directory has been reached
goto end;
}
@@ -248,71 +274,73 @@ static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
return reiserfs_readdir_inode(file_inode(file), ctx);
}
-/* compose directory item containing "." and ".." entries (entries are
- not aligned to 4 byte boundary) */
-/* the last four params are LE */
+/*
+ * compose directory item containing "." and ".." entries (entries are
+ * not aligned to 4 byte boundary)
+ */
void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
__le32 par_dirid, __le32 par_objid)
{
- struct reiserfs_de_head *deh;
+ struct reiserfs_de_head *dot, *dotdot;
memset(body, 0, EMPTY_DIR_SIZE_V1);
- deh = (struct reiserfs_de_head *)body;
+ dot = (struct reiserfs_de_head *)body;
+ dotdot = dot + 1;
/* direntry header of "." */
- put_deh_offset(&(deh[0]), DOT_OFFSET);
+ put_deh_offset(dot, DOT_OFFSET);
/* these two are from make_le_item_head, and are are LE */
- deh[0].deh_dir_id = dirid;
- deh[0].deh_objectid = objid;
- deh[0].deh_state = 0; /* Endian safe if 0 */
- put_deh_location(&(deh[0]), EMPTY_DIR_SIZE_V1 - strlen("."));
- mark_de_visible(&(deh[0]));
+ dot->deh_dir_id = dirid;
+ dot->deh_objectid = objid;
+ dot->deh_state = 0; /* Endian safe if 0 */
+ put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen("."));
+ mark_de_visible(dot);
/* direntry header of ".." */
- put_deh_offset(&(deh[1]), DOT_DOT_OFFSET);
+ put_deh_offset(dotdot, DOT_DOT_OFFSET);
/* key of ".." for the root directory */
/* these two are from the inode, and are are LE */
- deh[1].deh_dir_id = par_dirid;
- deh[1].deh_objectid = par_objid;
- deh[1].deh_state = 0; /* Endian safe if 0 */
- put_deh_location(&(deh[1]), deh_location(&(deh[0])) - strlen(".."));
- mark_de_visible(&(deh[1]));
+ dotdot->deh_dir_id = par_dirid;
+ dotdot->deh_objectid = par_objid;
+ dotdot->deh_state = 0; /* Endian safe if 0 */
+ put_deh_location(dotdot, deh_location(dot) - strlen(".."));
+ mark_de_visible(dotdot);
/* copy ".." and "." */
- memcpy(body + deh_location(&(deh[0])), ".", 1);
- memcpy(body + deh_location(&(deh[1])), "..", 2);
+ memcpy(body + deh_location(dot), ".", 1);
+ memcpy(body + deh_location(dotdot), "..", 2);
}
/* compose directory item containing "." and ".." entries */
void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
__le32 par_dirid, __le32 par_objid)
{
- struct reiserfs_de_head *deh;
+ struct reiserfs_de_head *dot, *dotdot;
memset(body, 0, EMPTY_DIR_SIZE);
- deh = (struct reiserfs_de_head *)body;
+ dot = (struct reiserfs_de_head *)body;
+ dotdot = dot + 1;
/* direntry header of "." */
- put_deh_offset(&(deh[0]), DOT_OFFSET);
+ put_deh_offset(dot, DOT_OFFSET);
/* these two are from make_le_item_head, and are are LE */
- deh[0].deh_dir_id = dirid;
- deh[0].deh_objectid = objid;
- deh[0].deh_state = 0; /* Endian safe if 0 */
- put_deh_location(&(deh[0]), EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
- mark_de_visible(&(deh[0]));
+ dot->deh_dir_id = dirid;
+ dot->deh_objectid = objid;
+ dot->deh_state = 0; /* Endian safe if 0 */
+ put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
+ mark_de_visible(dot);
/* direntry header of ".." */
- put_deh_offset(&(deh[1]), DOT_DOT_OFFSET);
+ put_deh_offset(dotdot, DOT_DOT_OFFSET);
/* key of ".." for the root directory */
/* these two are from the inode, and are are LE */
- deh[1].deh_dir_id = par_dirid;
- deh[1].deh_objectid = par_objid;
- deh[1].deh_state = 0; /* Endian safe if 0 */
- put_deh_location(&(deh[1]),
- deh_location(&(deh[0])) - ROUND_UP(strlen("..")));
- mark_de_visible(&(deh[1]));
+ dotdot->deh_dir_id = par_dirid;
+ dotdot->deh_objectid = par_objid;
+ dotdot->deh_state = 0; /* Endian safe if 0 */
+ put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen("..")));
+ mark_de_visible(dotdot);
/* copy ".." and "." */
- memcpy(body + deh_location(&(deh[0])), ".", 1);
- memcpy(body + deh_location(&(deh[1])), "..", 2);
+ memcpy(body + deh_location(dot), ".", 1);
+ memcpy(body + deh_location(dotdot), "..", 2);
}
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 9a3c68cf6026..54fdf196bfb2 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -2,18 +2,13 @@
* Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
*/
-/* Now we have all buffers that must be used in balancing of the tree */
-/* Further calculations can not cause schedule(), and thus the buffer */
-/* tree will be stable until the balancing will be finished */
-/* balance the tree according to the analysis made before, */
-/* and using buffers obtained after all above. */
-
-/**
- ** balance_leaf_when_delete
- ** balance_leaf
- ** do_balance
- **
- **/
+/*
+ * Now we have all buffers that must be used in balancing of the tree
+ * Further calculations can not cause schedule(), and thus the buffer
+ * tree will be stable until the balancing will be finished
+ * balance the tree according to the analysis made before,
+ * and using buffers obtained after all above.
+ */
#include <asm/uaccess.h>
#include <linux/time.h>
@@ -61,48 +56,190 @@ static inline void buffer_info_init_bh(struct tree_balance *tb,
inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
struct buffer_head *bh, int flag)
{
- journal_mark_dirty(tb->transaction_handle,
- tb->transaction_handle->t_super, bh);
+ journal_mark_dirty(tb->transaction_handle, bh);
}
#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
-/* summary:
- if deleting something ( tb->insert_size[0] < 0 )
- return(balance_leaf_when_delete()); (flag d handled here)
- else
- if lnum is larger than 0 we put items into the left node
- if rnum is larger than 0 we put items into the right node
- if snum1 is larger than 0 we put items into the new node s1
- if snum2 is larger than 0 we put items into the new node s2
-Note that all *num* count new items being created.
-
-It would be easier to read balance_leaf() if each of these summary
-lines was a separate procedure rather than being inlined. I think
-that there are many passages here and in balance_leaf_when_delete() in
-which two calls to one procedure can replace two passages, and it
-might save cache space and improve software maintenance costs to do so.
-
-Vladimir made the perceptive comment that we should offload most of
-the decision making in this function into fix_nodes/check_balance, and
-then create some sort of structure in tb that says what actions should
-be performed by do_balance.
-
--Hans */
-
-/* Balance leaf node in case of delete or cut: insert_size[0] < 0
+/*
+ * summary:
+ * if deleting something ( tb->insert_size[0] < 0 )
+ * return(balance_leaf_when_delete()); (flag d handled here)
+ * else
+ * if lnum is larger than 0 we put items into the left node
+ * if rnum is larger than 0 we put items into the right node
+ * if snum1 is larger than 0 we put items into the new node s1
+ * if snum2 is larger than 0 we put items into the new node s2
+ * Note that all *num* count new items being created.
+ */
+
+static void balance_leaf_when_delete_del(struct tree_balance *tb)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int item_pos = PATH_LAST_POSITION(tb->tb_path);
+ struct buffer_info bi;
+#ifdef CONFIG_REISERFS_CHECK
+ struct item_head *ih = item_head(tbS0, item_pos);
+#endif
+
+ RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
+ "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
+ -tb->insert_size[0], ih);
+
+ buffer_info_init_tbS0(tb, &bi);
+ leaf_delete_items(&bi, 0, item_pos, 1, -1);
+
+ if (!item_pos && tb->CFL[0]) {
+ if (B_NR_ITEMS(tbS0)) {
+ replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
+ } else {
+ if (!PATH_H_POSITION(tb->tb_path, 1))
+ replace_key(tb, tb->CFL[0], tb->lkey[0],
+ PATH_H_PPARENT(tb->tb_path, 0), 0);
+ }
+ }
+
+ RFALSE(!item_pos && !tb->CFL[0],
+ "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
+ tb->L[0]);
+}
+
+/* cut item in S[0] */
+static void balance_leaf_when_delete_cut(struct tree_balance *tb)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int item_pos = PATH_LAST_POSITION(tb->tb_path);
+ struct item_head *ih = item_head(tbS0, item_pos);
+ int pos_in_item = tb->tb_path->pos_in_item;
+ struct buffer_info bi;
+ buffer_info_init_tbS0(tb, &bi);
+
+ if (is_direntry_le_ih(ih)) {
+ /*
+ * UFS unlink semantics are such that you can only
+ * delete one directory entry at a time.
+ *
+ * when we cut a directory tb->insert_size[0] means
+ * number of entries to be cut (always 1)
+ */
+ tb->insert_size[0] = -1;
+ leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
+ -tb->insert_size[0]);
+
+ RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
+ "PAP-12030: can not change delimiting key. CFL[0]=%p",
+ tb->CFL[0]);
+
+ if (!item_pos && !pos_in_item && tb->CFL[0])
+ replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
+ } else {
+ leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
+ -tb->insert_size[0]);
+
+ RFALSE(!ih_item_len(ih),
+ "PAP-12035: cut must leave non-zero dynamic "
+ "length of item");
+ }
+}
+
+static int balance_leaf_when_delete_left(struct tree_balance *tb)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+
+ /* L[0] must be joined with S[0] */
+ if (tb->lnum[0] == -1) {
+ /* R[0] must be also joined with S[0] */
+ if (tb->rnum[0] == -1) {
+ if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
+ /*
+ * all contents of all the
+ * 3 buffers will be in L[0]
+ */
+ if (PATH_H_POSITION(tb->tb_path, 1) == 0 &&
+ 1 < B_NR_ITEMS(tb->FR[0]))
+ replace_key(tb, tb->CFL[0],
+ tb->lkey[0], tb->FR[0], 1);
+
+ leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1,
+ NULL);
+ leaf_move_items(LEAF_FROM_R_TO_L, tb,
+ B_NR_ITEMS(tb->R[0]), -1,
+ NULL);
+
+ reiserfs_invalidate_buffer(tb, tbS0);
+ reiserfs_invalidate_buffer(tb, tb->R[0]);
+
+ return 0;
+ }
+
+ /* all contents of all the 3 buffers will be in R[0] */
+ leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL);
+ leaf_move_items(LEAF_FROM_L_TO_R, tb,
+ B_NR_ITEMS(tb->L[0]), -1, NULL);
+
+ /* right_delimiting_key is correct in R[0] */
+ replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+ reiserfs_invalidate_buffer(tb, tbS0);
+ reiserfs_invalidate_buffer(tb, tb->L[0]);
+
+ return -1;
+ }
+
+ RFALSE(tb->rnum[0] != 0,
+ "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
+ /* all contents of L[0] and S[0] will be in L[0] */
+ leaf_shift_left(tb, n, -1);
+
+ reiserfs_invalidate_buffer(tb, tbS0);
+
+ return 0;
+ }
+
+ /*
+ * a part of contents of S[0] will be in L[0] and
+ * the rest part of S[0] will be in R[0]
+ */
+
+ RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
+ (tb->lnum[0] + tb->rnum[0] > n + 1),
+ "PAP-12050: rnum(%d) and lnum(%d) and item "
+ "number(%d) in S[0] are not consistent",
+ tb->rnum[0], tb->lnum[0], n);
+ RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
+ (tb->lbytes != -1 || tb->rbytes != -1),
+ "PAP-12055: bad rbytes (%d)/lbytes (%d) "
+ "parameters when items are not split",
+ tb->rbytes, tb->lbytes);
+ RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
+ (tb->lbytes < 1 || tb->rbytes != -1),
+ "PAP-12060: bad rbytes (%d)/lbytes (%d) "
+ "parameters when items are split",
+ tb->rbytes, tb->lbytes);
+
+ leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+
+ reiserfs_invalidate_buffer(tb, tbS0);
+
+ return 0;
+}
+
+/*
+ * Balance leaf node in case of delete or cut: insert_size[0] < 0
*
* lnum, rnum can have values >= -1
* -1 means that the neighbor must be joined with S
* 0 means that nothing should be done with the neighbor
- * >0 means to shift entirely or partly the specified number of items to the neighbor
+ * >0 means to shift entirely or partly the specified number of items
+ * to the neighbor
*/
static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
int item_pos = PATH_LAST_POSITION(tb->tb_path);
- int pos_in_item = tb->tb_path->pos_in_item;
struct buffer_info bi;
int n;
struct item_head *ih;
@@ -114,1022 +251,1202 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
"PAP-12010: tree can not be empty");
- ih = B_N_PITEM_HEAD(tbS0, item_pos);
+ ih = item_head(tbS0, item_pos);
buffer_info_init_tbS0(tb, &bi);
/* Delete or truncate the item */
- switch (flag) {
- case M_DELETE: /* delete item in S[0] */
+ BUG_ON(flag != M_DELETE && flag != M_CUT);
+ if (flag == M_DELETE)
+ balance_leaf_when_delete_del(tb);
+ else /* M_CUT */
+ balance_leaf_when_delete_cut(tb);
- RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
- "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
- -tb->insert_size[0], ih);
- leaf_delete_items(&bi, 0, item_pos, 1, -1);
+ /*
+ * the rule is that no shifting occurs unless by shifting
+ * a node can be freed
+ */
+ n = B_NR_ITEMS(tbS0);
- if (!item_pos && tb->CFL[0]) {
- if (B_NR_ITEMS(tbS0)) {
- replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0,
- 0);
- } else {
- if (!PATH_H_POSITION(tb->tb_path, 1))
- replace_key(tb, tb->CFL[0], tb->lkey[0],
- PATH_H_PPARENT(tb->tb_path,
- 0), 0);
- }
- }
- RFALSE(!item_pos && !tb->CFL[0],
- "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
- tb->L[0]);
+ /* L[0] takes part in balancing */
+ if (tb->lnum[0])
+ return balance_leaf_when_delete_left(tb);
- break;
+ if (tb->rnum[0] == -1) {
+ /* all contents of R[0] and S[0] will be in R[0] */
+ leaf_shift_right(tb, n, -1);
+ reiserfs_invalidate_buffer(tb, tbS0);
+ return 0;
+ }
- case M_CUT:{ /* cut item in S[0] */
- if (is_direntry_le_ih(ih)) {
+ RFALSE(tb->rnum[0],
+ "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
+ return 0;
+}
- /* UFS unlink semantics are such that you can only delete one directory entry at a time. */
- /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */
- tb->insert_size[0] = -1;
- leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
- -tb->insert_size[0]);
+static void balance_leaf_insert_left(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+ int ret;
+ struct buffer_info bi;
+ int n = B_NR_ITEMS(tb->L[0]);
+
+ if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
+ /* part of new item falls into L[0] */
+ int new_item_len, shift;
+ int version;
+
+ ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
+
+ /* Calculate item length to insert to S[0] */
+ new_item_len = ih_item_len(ih) - tb->lbytes;
+
+ /* Calculate and check item length to insert to L[0] */
+ put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
+
+ RFALSE(ih_item_len(ih) <= 0,
+ "PAP-12080: there is nothing to insert into L[0]: "
+ "ih_item_len=%d", ih_item_len(ih));
+
+ /* Insert new item into L[0] */
+ buffer_info_init_left(tb, &bi);
+ leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
+ min_t(int, tb->zeroes_num, ih_item_len(ih)));
+
+ version = ih_version(ih);
+
+ /*
+ * Calculate key component, item length and body to
+ * insert into S[0]
+ */
+ shift = 0;
+ if (is_indirect_le_ih(ih))
+ shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+
+ add_le_ih_k_offset(ih, tb->lbytes << shift);
+
+ put_ih_item_len(ih, new_item_len);
+ if (tb->lbytes > tb->zeroes_num) {
+ body += (tb->lbytes - tb->zeroes_num);
+ tb->zeroes_num = 0;
+ } else
+ tb->zeroes_num -= tb->lbytes;
+
+ RFALSE(ih_item_len(ih) <= 0,
+ "PAP-12085: there is nothing to insert into S[0]: "
+ "ih_item_len=%d", ih_item_len(ih));
+ } else {
+ /* new item in whole falls into L[0] */
+ /* Shift lnum[0]-1 items to L[0] */
+ ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
+
+ /* Insert new item into L[0] */
+ buffer_info_init_left(tb, &bi);
+ leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
+ tb->zeroes_num);
+ tb->insert_size[0] = 0;
+ tb->zeroes_num = 0;
+ }
+}
- RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
- "PAP-12030: can not change delimiting key. CFL[0]=%p",
- tb->CFL[0]);
+static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ int n = B_NR_ITEMS(tb->L[0]);
+ struct buffer_info bi;
- if (!item_pos && !pos_in_item && tb->CFL[0]) {
- replace_key(tb, tb->CFL[0], tb->lkey[0],
- tbS0, 0);
- }
- } else {
- leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
- -tb->insert_size[0]);
+ RFALSE(tb->zeroes_num,
+ "PAP-12090: invalid parameter in case of a directory");
+
+ /* directory item */
+ if (tb->lbytes > tb->pos_in_item) {
+ /* new directory entry falls into L[0] */
+ struct item_head *pasted;
+ int ret, l_pos_in_item = tb->pos_in_item;
+
+ /*
+ * Shift lnum[0] - 1 items in whole.
+ * Shift lbytes - 1 entries from given directory item
+ */
+ ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
+ if (ret && !tb->item_pos) {
+ pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
+ l_pos_in_item += ih_entry_count(pasted) -
+ (tb->lbytes - 1);
+ }
- RFALSE(!ih_item_len(ih),
- "PAP-12035: cut must leave non-zero dynamic length of item");
- }
- break;
+ /* Append given directory entry to directory item */
+ buffer_info_init_left(tb, &bi);
+ leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
+ l_pos_in_item, tb->insert_size[0],
+ body, tb->zeroes_num);
+
+ /*
+ * previous string prepared space for pasting new entry,
+ * following string pastes this entry
+ */
+
+ /*
+ * when we have merged the directory item, pos_in_item
+ * has been changed too
+ */
+
+ /* paste new directory entry. 1 is entry number */
+ leaf_paste_entries(&bi, n + tb->item_pos - ret,
+ l_pos_in_item, 1,
+ (struct reiserfs_de_head *) body,
+ body + DEH_SIZE, tb->insert_size[0]);
+ tb->insert_size[0] = 0;
+ } else {
+ /* new directory item doesn't fall into L[0] */
+ /*
+ * Shift lnum[0]-1 items in whole. Shift lbytes
+ * directory entries from directory item number lnum[0]
+ */
+ leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ }
+
+ /* Calculate new position to append in item body */
+ tb->pos_in_item -= tb->lbytes;
+}
+
+static void balance_leaf_paste_left_shift(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tb->L[0]);
+ struct buffer_info bi;
+
+ if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
+ balance_leaf_paste_left_shift_dirent(tb, ih, body);
+ return;
+ }
+
+ RFALSE(tb->lbytes <= 0,
+ "PAP-12095: there is nothing to shift to L[0]. "
+ "lbytes=%d", tb->lbytes);
+ RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
+ "PAP-12100: incorrect position to paste: "
+ "item_len=%d, pos_in_item=%d",
+ ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item);
+
+ /* appended item will be in L[0] in whole */
+ if (tb->lbytes >= tb->pos_in_item) {
+ struct item_head *tbS0_pos_ih, *tbL0_ih;
+ struct item_head *tbS0_0_ih;
+ struct reiserfs_key *left_delim_key;
+ int ret, l_n, version, temp_l;
+
+ tbS0_pos_ih = item_head(tbS0, tb->item_pos);
+ tbS0_0_ih = item_head(tbS0, 0);
+
+ /*
+ * this number of bytes must be appended
+ * to the last item of L[h]
+ */
+ l_n = tb->lbytes - tb->pos_in_item;
+
+ /* Calculate new insert_size[0] */
+ tb->insert_size[0] -= l_n;
+
+ RFALSE(tb->insert_size[0] <= 0,
+ "PAP-12105: there is nothing to paste into "
+ "L[0]. insert_size=%d", tb->insert_size[0]);
+
+ ret = leaf_shift_left(tb, tb->lnum[0],
+ ih_item_len(tbS0_pos_ih));
+
+ tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret);
+
+ /* Append to body of item in L[0] */
+ buffer_info_init_left(tb, &bi);
+ leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
+ ih_item_len(tbL0_ih), l_n, body,
+ min_t(int, l_n, tb->zeroes_num));
+
+ /*
+ * 0-th item in S0 can be only of DIRECT type
+ * when l_n != 0
+ */
+ temp_l = l_n;
+
+ RFALSE(ih_item_len(tbS0_0_ih),
+ "PAP-12106: item length must be 0");
+ RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
+ leaf_key(tb->L[0], n + tb->item_pos - ret)),
+ "PAP-12107: items must be of the same file");
+
+ if (is_indirect_le_ih(tbL0_ih)) {
+ int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+ temp_l = l_n << shift;
}
+ /* update key of first item in S0 */
+ version = ih_version(tbS0_0_ih);
+ add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l);
+
+ /* update left delimiting key */
+ left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]);
+ add_le_key_k_offset(version, left_delim_key, temp_l);
+
+ /*
+ * Calculate new body, position in item and
+ * insert_size[0]
+ */
+ if (l_n > tb->zeroes_num) {
+ body += (l_n - tb->zeroes_num);
+ tb->zeroes_num = 0;
+ } else
+ tb->zeroes_num -= l_n;
+ tb->pos_in_item = 0;
+
+ RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
+ leaf_key(tb->L[0],
+ B_NR_ITEMS(tb->L[0]) - 1)) ||
+ !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) ||
+ !op_is_left_mergeable(left_delim_key, tbS0->b_size),
+ "PAP-12120: item must be merge-able with left "
+ "neighboring item");
+ } else {
+ /* only part of the appended item will be in L[0] */
+
+ /* Calculate position in item for append in S[0] */
+ tb->pos_in_item -= tb->lbytes;
+
+ RFALSE(tb->pos_in_item <= 0,
+ "PAP-12125: no place for paste. pos_in_item=%d",
+ tb->pos_in_item);
+
+ /*
+ * Shift lnum[0] - 1 items in whole.
+ * Shift lbytes - 1 byte from item number lnum[0]
+ */
+ leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ }
+}
- default:
- print_cur_tb("12040");
- reiserfs_panic(tb->tb_sb, "PAP-12040",
- "unexpected mode: %s(%d)",
- (flag ==
- M_PASTE) ? "PASTE" : ((flag ==
- M_INSERT) ? "INSERT" :
- "UNKNOWN"), flag);
+
+/* appended item will be in L[0] in whole */
+static void balance_leaf_paste_left_whole(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tb->L[0]);
+ struct buffer_info bi;
+ struct item_head *pasted;
+ int ret;
+
+ /* if we paste into first item of S[0] and it is left mergeable */
+ if (!tb->item_pos &&
+ op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) {
+ /*
+ * then increment pos_in_item by the size of the
+ * last item in L[0]
+ */
+ pasted = item_head(tb->L[0], n - 1);
+ if (is_direntry_le_ih(pasted))
+ tb->pos_in_item += ih_entry_count(pasted);
+ else
+ tb->pos_in_item += ih_item_len(pasted);
}
- /* the rule is that no shifting occurs unless by shifting a node can be freed */
- n = B_NR_ITEMS(tbS0);
- if (tb->lnum[0]) { /* L[0] takes part in balancing */
- if (tb->lnum[0] == -1) { /* L[0] must be joined with S[0] */
- if (tb->rnum[0] == -1) { /* R[0] must be also joined with S[0] */
- if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
- /* all contents of all the 3 buffers will be in L[0] */
- if (PATH_H_POSITION(tb->tb_path, 1) == 0
- && 1 < B_NR_ITEMS(tb->FR[0]))
- replace_key(tb, tb->CFL[0],
- tb->lkey[0],
- tb->FR[0], 1);
-
- leaf_move_items(LEAF_FROM_S_TO_L, tb, n,
- -1, NULL);
- leaf_move_items(LEAF_FROM_R_TO_L, tb,
- B_NR_ITEMS(tb->R[0]),
- -1, NULL);
-
- reiserfs_invalidate_buffer(tb, tbS0);
- reiserfs_invalidate_buffer(tb,
- tb->R[0]);
-
- return 0;
- }
- /* all contents of all the 3 buffers will be in R[0] */
- leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1,
- NULL);
- leaf_move_items(LEAF_FROM_L_TO_R, tb,
- B_NR_ITEMS(tb->L[0]), -1, NULL);
+ /*
+ * Shift lnum[0] - 1 items in whole.
+ * Shift lbytes - 1 byte from item number lnum[0]
+ */
+ ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+
+ /* Append to body of item in L[0] */
+ buffer_info_init_left(tb, &bi);
+ leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item,
+ tb->insert_size[0], body, tb->zeroes_num);
+
+ /* if appended item is directory, paste entry */
+ pasted = item_head(tb->L[0], n + tb->item_pos - ret);
+ if (is_direntry_le_ih(pasted))
+ leaf_paste_entries(&bi, n + tb->item_pos - ret,
+ tb->pos_in_item, 1,
+ (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]);
+
+ /*
+ * if appended item is indirect item, put unformatted node
+ * into un list
+ */
+ if (is_indirect_le_ih(pasted))
+ set_ih_free_space(pasted, 0);
- /* right_delimiting_key is correct in R[0] */
- replace_key(tb, tb->CFR[0], tb->rkey[0],
- tb->R[0], 0);
+ tb->insert_size[0] = 0;
+ tb->zeroes_num = 0;
+}
- reiserfs_invalidate_buffer(tb, tbS0);
- reiserfs_invalidate_buffer(tb, tb->L[0]);
+static void balance_leaf_paste_left(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+ /* we must shift the part of the appended item */
+ if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
+ balance_leaf_paste_left_shift(tb, ih, body);
+ else
+ balance_leaf_paste_left_whole(tb, ih, body);
+}
- return -1;
- }
+/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
+static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih,
+ const char *body, int flag)
+{
+ if (tb->lnum[0] <= 0)
+ return;
- RFALSE(tb->rnum[0] != 0,
- "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
- /* all contents of L[0] and S[0] will be in L[0] */
- leaf_shift_left(tb, n, -1);
+ /* new item or its part falls to L[0], shift it too */
+ if (tb->item_pos < tb->lnum[0]) {
+ BUG_ON(flag != M_INSERT && flag != M_PASTE);
+
+ if (flag == M_INSERT)
+ balance_leaf_insert_left(tb, ih, body);
+ else /* M_PASTE */
+ balance_leaf_paste_left(tb, ih, body);
+ } else
+ /* new item doesn't fall into L[0] */
+ leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+}
- reiserfs_invalidate_buffer(tb, tbS0);
- return 0;
+static void balance_leaf_insert_right(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+ struct buffer_info bi;
+ int ret;
+
+ /* new item or part of it doesn't fall into R[0] */
+ if (n - tb->rnum[0] >= tb->item_pos) {
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+ return;
+ }
+
+ /* new item or its part falls to R[0] */
+
+ /* part of new item falls into R[0] */
+ if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
+ loff_t old_key_comp, old_len, r_zeroes_number;
+ const char *r_body;
+ int version, shift;
+ loff_t offset;
+
+ leaf_shift_right(tb, tb->rnum[0] - 1, -1);
+
+ version = ih_version(ih);
+
+ /* Remember key component and item length */
+ old_key_comp = le_ih_k_offset(ih);
+ old_len = ih_item_len(ih);
+
+ /*
+ * Calculate key component and item length to insert
+ * into R[0]
+ */
+ shift = 0;
+ if (is_indirect_le_ih(ih))
+ shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+ offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift);
+ set_le_ih_k_offset(ih, offset);
+ put_ih_item_len(ih, tb->rbytes);
+
+ /* Insert part of the item into R[0] */
+ buffer_info_init_right(tb, &bi);
+ if ((old_len - tb->rbytes) > tb->zeroes_num) {
+ r_zeroes_number = 0;
+ r_body = body + (old_len - tb->rbytes) - tb->zeroes_num;
+ } else {
+ r_body = body;
+ r_zeroes_number = tb->zeroes_num -
+ (old_len - tb->rbytes);
+ tb->zeroes_num -= r_zeroes_number;
}
- /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */
-
- RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
- (tb->lnum[0] + tb->rnum[0] > n + 1),
- "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent",
- tb->rnum[0], tb->lnum[0], n);
- RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
- (tb->lbytes != -1 || tb->rbytes != -1),
- "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split",
- tb->rbytes, tb->lbytes);
- RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
- (tb->lbytes < 1 || tb->rbytes != -1),
- "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split",
- tb->rbytes, tb->lbytes);
- leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
+
+ /* Replace right delimiting key by first key in R[0] */
+ replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+ /*
+ * Calculate key component and item length to
+ * insert into S[0]
+ */
+ set_le_ih_k_offset(ih, old_key_comp);
+ put_ih_item_len(ih, old_len - tb->rbytes);
+
+ tb->insert_size[0] -= tb->rbytes;
+
+ } else {
+ /* whole new item falls into R[0] */
+
+ /* Shift rnum[0]-1 items to R[0] */
+ ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
+
+ /* Insert new item into R[0] */
+ buffer_info_init_right(tb, &bi);
+ leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1,
+ ih, body, tb->zeroes_num);
+
+ if (tb->item_pos - n + tb->rnum[0] - 1 == 0)
+ replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+ tb->zeroes_num = tb->insert_size[0] = 0;
+ }
+}
+
+
+static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct buffer_info bi;
+ int entry_count;
+
+ RFALSE(tb->zeroes_num,
+ "PAP-12145: invalid parameter in case of a directory");
+ entry_count = ih_entry_count(item_head(tbS0, tb->item_pos));
+
+ /* new directory entry falls into R[0] */
+ if (entry_count - tb->rbytes < tb->pos_in_item) {
+ int paste_entry_position;
+
+ RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0],
+ "PAP-12150: no enough of entries to shift to R[0]: "
+ "rbytes=%d, entry_count=%d", tb->rbytes, entry_count);
+
+ /*
+ * Shift rnum[0]-1 items in whole.
+ * Shift rbytes-1 directory entries from directory
+ * item number rnum[0]
+ */
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
+
+ /* Paste given directory entry to directory item */
+ paste_entry_position = tb->pos_in_item - entry_count +
+ tb->rbytes - 1;
+ buffer_info_init_right(tb, &bi);
+ leaf_paste_in_buffer(&bi, 0, paste_entry_position,
+ tb->insert_size[0], body, tb->zeroes_num);
+
+ /* paste entry */
+ leaf_paste_entries(&bi, 0, paste_entry_position, 1,
+ (struct reiserfs_de_head *) body,
+ body + DEH_SIZE, tb->insert_size[0]);
+
+ /* change delimiting keys */
+ if (paste_entry_position == 0)
+ replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+ tb->insert_size[0] = 0;
+ tb->pos_in_item++;
+ } else {
+ /* new directory entry doesn't fall into R[0] */
leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+ }
+}
- reiserfs_invalidate_buffer(tb, tbS0);
+static void balance_leaf_paste_right_shift(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n_shift, n_rem, r_zeroes_number, version;
+ unsigned long temp_rem;
+ const char *r_body;
+ struct buffer_info bi;
- return 0;
+ /* we append to directory item */
+ if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
+ balance_leaf_paste_right_shift_dirent(tb, ih, body);
+ return;
}
- if (tb->rnum[0] == -1) {
- /* all contents of R[0] and S[0] will be in R[0] */
- leaf_shift_right(tb, n, -1);
- reiserfs_invalidate_buffer(tb, tbS0);
- return 0;
+ /* regular object */
+
+ /*
+ * Calculate number of bytes which must be shifted
+ * from appended item
+ */
+ n_shift = tb->rbytes - tb->insert_size[0];
+ if (n_shift < 0)
+ n_shift = 0;
+
+ RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
+ "PAP-12155: invalid position to paste. ih_item_len=%d, "
+ "pos_in_item=%d", tb->pos_in_item,
+ ih_item_len(item_head(tbS0, tb->item_pos)));
+
+ leaf_shift_right(tb, tb->rnum[0], n_shift);
+
+ /*
+ * Calculate number of bytes which must remain in body
+ * after appending to R[0]
+ */
+ n_rem = tb->insert_size[0] - tb->rbytes;
+ if (n_rem < 0)
+ n_rem = 0;
+
+ temp_rem = n_rem;
+
+ version = ih_version(item_head(tb->R[0], 0));
+
+ if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) {
+ int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+ temp_rem = n_rem << shift;
}
- RFALSE(tb->rnum[0],
- "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
- return 0;
+ add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem);
+ add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]),
+ temp_rem);
+
+ do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
+
+ /* Append part of body into R[0] */
+ buffer_info_init_right(tb, &bi);
+ if (n_rem > tb->zeroes_num) {
+ r_zeroes_number = 0;
+ r_body = body + n_rem - tb->zeroes_num;
+ } else {
+ r_body = body;
+ r_zeroes_number = tb->zeroes_num - n_rem;
+ tb->zeroes_num -= r_zeroes_number;
+ }
+
+ leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
+ r_body, r_zeroes_number);
+
+ if (is_indirect_le_ih(item_head(tb->R[0], 0)))
+ set_ih_free_space(item_head(tb->R[0], 0), 0);
+
+ tb->insert_size[0] = n_rem;
+ if (!n_rem)
+ tb->pos_in_item++;
}
-static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item header of inserted item (this is on little endian) */
- const char *body, /* body of inserted item or bytes to paste */
- int flag, /* i - insert, d - delete, c - cut, p - paste
- (see comment to do_balance) */
- struct item_head *insert_key, /* in our processing of one level we sometimes determine what
- must be inserted into the next higher level. This insertion
- consists of a key or two keys and their corresponding
- pointers */
- struct buffer_head **insert_ptr /* inserted node-ptrs for the next level */
- )
+static void balance_leaf_paste_right_whole(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0]
- of the affected item */
+ int n = B_NR_ITEMS(tbS0);
+ struct item_head *pasted;
struct buffer_info bi;
- struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */
- int snum[2]; /* number of items that will be placed
- into S_new (includes partially shifted
- items) */
- int sbytes[2]; /* if an item is partially shifted into S_new then
- if it is a directory item
- it is the number of entries from the item that are shifted into S_new
- else
- it is the number of bytes from the item that are shifted into S_new
- */
- int n, i;
- int ret_val;
- int pos_in_item;
- int zeros_num;
- PROC_INFO_INC(tb->tb_sb, balance_at[0]);
+ buffer_info_init_right(tb, &bi);
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+
+ /* append item in R[0] */
+ if (tb->pos_in_item >= 0) {
+ buffer_info_init_right(tb, &bi);
+ leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0],
+ tb->pos_in_item, tb->insert_size[0], body,
+ tb->zeroes_num);
+ }
- /* Make balance in case insert_size[0] < 0 */
- if (tb->insert_size[0] < 0)
- return balance_leaf_when_delete(tb, flag);
+ /* paste new entry, if item is directory item */
+ pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]);
+ if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) {
+ leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0],
+ tb->pos_in_item, 1,
+ (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]);
- zeros_num = 0;
- if (flag == M_INSERT && !body)
- zeros_num = ih_item_len(ih);
+ if (!tb->pos_in_item) {
- pos_in_item = tb->tb_path->pos_in_item;
- /* for indirect item pos_in_item is measured in unformatted node
- pointers. Recalculate to bytes */
- if (flag != M_INSERT
- && is_indirect_le_ih(B_N_PITEM_HEAD(tbS0, item_pos)))
- pos_in_item *= UNFM_P_SIZE;
-
- if (tb->lnum[0] > 0) {
- /* Shift lnum[0] items from S[0] to the left neighbor L[0] */
- if (item_pos < tb->lnum[0]) {
- /* new item or it part falls to L[0], shift it too */
- n = B_NR_ITEMS(tb->L[0]);
-
- switch (flag) {
- case M_INSERT: /* insert item into L[0] */
-
- if (item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
- /* part of new item falls into L[0] */
- int new_item_len;
- int version;
-
- ret_val = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
-
- /* Calculate item length to insert to S[0] */
- new_item_len = ih_item_len(ih) - tb->lbytes;
- /* Calculate and check item length to insert to L[0] */
- put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
-
- RFALSE(ih_item_len(ih) <= 0,
- "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d",
- ih_item_len(ih));
-
- /* Insert new item into L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_insert_into_buf(&bi,
- n + item_pos - ret_val, ih, body,
- zeros_num > ih_item_len(ih) ? ih_item_len(ih) : zeros_num);
-
- version = ih_version(ih);
-
- /* Calculate key component, item length and body to insert into S[0] */
- set_le_ih_k_offset(ih, le_ih_k_offset(ih) +
- (tb-> lbytes << (is_indirect_le_ih(ih) ? tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT : 0)));
-
- put_ih_item_len(ih, new_item_len);
- if (tb->lbytes > zeros_num) {
- body += (tb->lbytes - zeros_num);
- zeros_num = 0;
- } else
- zeros_num -= tb->lbytes;
-
- RFALSE(ih_item_len(ih) <= 0,
- "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d",
- ih_item_len(ih));
- } else {
- /* new item in whole falls into L[0] */
- /* Shift lnum[0]-1 items to L[0] */
- ret_val = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
- /* Insert new item into L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_insert_into_buf(&bi, n + item_pos - ret_val, ih, body, zeros_num);
- tb->insert_size[0] = 0;
- zeros_num = 0;
- }
- break;
-
- case M_PASTE: /* append item in L[0] */
-
- if (item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
- /* we must shift the part of the appended item */
- if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) {
-
- RFALSE(zeros_num,
- "PAP-12090: invalid parameter in case of a directory");
- /* directory item */
- if (tb->lbytes > pos_in_item) {
- /* new directory entry falls into L[0] */
- struct item_head *pasted;
- int l_pos_in_item = pos_in_item;
-
- /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */
- ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes-1);
- if (ret_val && !item_pos) {
- pasted = B_N_PITEM_HEAD(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
- l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes -1);
- }
-
- /* Append given directory entry to directory item */
- buffer_info_init_left(tb, &bi);
- leaf_paste_in_buffer(&bi, n + item_pos - ret_val, l_pos_in_item, tb->insert_size[0], body, zeros_num);
-
- /* previous string prepared space for pasting new entry, following string pastes this entry */
-
- /* when we have merge directory item, pos_in_item has been changed too */
-
- /* paste new directory entry. 1 is entry number */
- leaf_paste_entries(&bi, n + item_pos - ret_val, l_pos_in_item,
- 1, (struct reiserfs_de_head *) body,
- body + DEH_SIZE, tb->insert_size[0]);
- tb->insert_size[0] = 0;
- } else {
- /* new directory item doesn't fall into L[0] */
- /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */
- leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
- }
- /* Calculate new position to append in item body */
- pos_in_item -= tb->lbytes;
- } else {
- /* regular object */
- RFALSE(tb->lbytes <= 0, "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", tb->lbytes);
- RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),
- "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d",
- ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),pos_in_item);
-
- if (tb->lbytes >= pos_in_item) {
- /* appended item will be in L[0] in whole */
- int l_n;
-
- /* this bytes number must be appended to the last item of L[h] */
- l_n = tb->lbytes - pos_in_item;
-
- /* Calculate new insert_size[0] */
- tb->insert_size[0] -= l_n;
-
- RFALSE(tb->insert_size[0] <= 0,
- "PAP-12105: there is nothing to paste into L[0]. insert_size=%d",
- tb->insert_size[0]);
- ret_val = leaf_shift_left(tb, tb->lnum[0], ih_item_len
- (B_N_PITEM_HEAD(tbS0, item_pos)));
- /* Append to body of item in L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_paste_in_buffer
- (&bi, n + item_pos - ret_val, ih_item_len
- (B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val)),
- l_n, body,
- zeros_num > l_n ? l_n : zeros_num);
- /* 0-th item in S0 can be only of DIRECT type when l_n != 0 */
- {
- int version;
- int temp_l = l_n;
-
- RFALSE(ih_item_len(B_N_PITEM_HEAD(tbS0, 0)),
- "PAP-12106: item length must be 0");
- RFALSE(comp_short_le_keys(B_N_PKEY(tbS0, 0), B_N_PKEY
- (tb->L[0], n + item_pos - ret_val)),
- "PAP-12107: items must be of the same file");
- if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) {
- temp_l = l_n << (tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT);
- }
- /* update key of first item in S0 */
- version = ih_version(B_N_PITEM_HEAD(tbS0, 0));
- set_le_key_k_offset(version, B_N_PKEY(tbS0, 0),
- le_key_k_offset(version,B_N_PKEY(tbS0, 0)) + temp_l);
- /* update left delimiting key */
- set_le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]),
- le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0])) + temp_l);
- }
-
- /* Calculate new body, position in item and insert_size[0] */
- if (l_n > zeros_num) {
- body += (l_n - zeros_num);
- zeros_num = 0;
- } else
- zeros_num -= l_n;
- pos_in_item = 0;
-
- RFALSE(comp_short_le_keys(B_N_PKEY(tbS0, 0), B_N_PKEY(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1))
- || !op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)
- || !op_is_left_mergeable(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), tbS0->b_size),
- "PAP-12120: item must be merge-able with left neighboring item");
- } else { /* only part of the appended item will be in L[0] */
-
- /* Calculate position in item for append in S[0] */
- pos_in_item -= tb->lbytes;
-
- RFALSE(pos_in_item <= 0, "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item);
-
- /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
- leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
- }
- }
- } else { /* appended item will be in L[0] in whole */
-
- struct item_head *pasted;
-
- if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) { /* if we paste into first item of S[0] and it is left mergable */
- /* then increment pos_in_item by the size of the last item in L[0] */
- pasted = B_N_PITEM_HEAD(tb->L[0], n - 1);
- if (is_direntry_le_ih(pasted))
- pos_in_item += ih_entry_count(pasted);
- else
- pos_in_item += ih_item_len(pasted);
- }
-
- /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
- ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
- /* Append to body of item in L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_paste_in_buffer(&bi, n + item_pos - ret_val,
- pos_in_item,
- tb->insert_size[0],
- body, zeros_num);
-
- /* if appended item is directory, paste entry */
- pasted = B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val);
- if (is_direntry_le_ih(pasted))
- leaf_paste_entries(&bi, n + item_pos - ret_val,
- pos_in_item, 1,
- (struct reiserfs_de_head *) body,
- body + DEH_SIZE,
- tb->insert_size[0]);
- /* if appended item is indirect item, put unformatted node into un list */
- if (is_indirect_le_ih(pasted))
- set_ih_free_space(pasted, 0);
- tb->insert_size[0] = 0;
- zeros_num = 0;
- }
- break;
- default: /* cases d and t */
- reiserfs_panic(tb->tb_sb, "PAP-12130",
- "lnum > 0: unexpected mode: "
- " %s(%d)",
- (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
- }
- } else {
- /* new item doesn't fall into L[0] */
- leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ RFALSE(tb->item_pos - n + tb->rnum[0],
+ "PAP-12165: directory item must be first "
+ "item of node when pasting is in 0th position");
+
+ /* update delimiting keys */
+ replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
}
}
- /* tb->lnum[0] > 0 */
- /* Calculate new item position */
- item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
-
- if (tb->rnum[0] > 0) {
- /* shift rnum[0] items from S[0] to the right neighbor R[0] */
- n = B_NR_ITEMS(tbS0);
- switch (flag) {
-
- case M_INSERT: /* insert item */
- if (n - tb->rnum[0] < item_pos) { /* new item or its part falls to R[0] */
- if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { /* part of new item falls into R[0] */
- loff_t old_key_comp, old_len, r_zeros_number;
- const char *r_body;
- int version;
- loff_t offset;
-
- leaf_shift_right(tb, tb->rnum[0] - 1, -1);
-
- version = ih_version(ih);
- /* Remember key component and item length */
- old_key_comp = le_ih_k_offset(ih);
- old_len = ih_item_len(ih);
-
- /* Calculate key component and item length to insert into R[0] */
- offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << (is_indirect_le_ih(ih) ? tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT : 0));
- set_le_ih_k_offset(ih, offset);
- put_ih_item_len(ih, tb->rbytes);
- /* Insert part of the item into R[0] */
- buffer_info_init_right(tb, &bi);
- if ((old_len - tb->rbytes) > zeros_num) {
- r_zeros_number = 0;
- r_body = body + (old_len - tb->rbytes) - zeros_num;
- } else {
- r_body = body;
- r_zeros_number = zeros_num - (old_len - tb->rbytes);
- zeros_num -= r_zeros_number;
- }
-
- leaf_insert_into_buf(&bi, 0, ih, r_body,
- r_zeros_number);
-
- /* Replace right delimiting key by first key in R[0] */
- replace_key(tb, tb->CFR[0], tb->rkey[0],
- tb->R[0], 0);
-
- /* Calculate key component and item length to insert into S[0] */
- set_le_ih_k_offset(ih, old_key_comp);
- put_ih_item_len(ih, old_len - tb->rbytes);
-
- tb->insert_size[0] -= tb->rbytes;
-
- } else { /* whole new item falls into R[0] */
-
- /* Shift rnum[0]-1 items to R[0] */
- ret_val = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
- /* Insert new item into R[0] */
- buffer_info_init_right(tb, &bi);
- leaf_insert_into_buf(&bi, item_pos - n + tb->rnum[0] - 1,
- ih, body, zeros_num);
-
- if (item_pos - n + tb->rnum[0] - 1 == 0) {
- replace_key(tb, tb->CFR[0],
- tb->rkey[0],
- tb->R[0], 0);
-
- }
- zeros_num = tb->insert_size[0] = 0;
- }
- } else { /* new item or part of it doesn't fall into R[0] */
-
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
- }
- break;
+ if (is_indirect_le_ih(pasted))
+ set_ih_free_space(pasted, 0);
+ tb->zeroes_num = tb->insert_size[0] = 0;
+}
- case M_PASTE: /* append item */
-
- if (n - tb->rnum[0] <= item_pos) { /* pasted item or part of it falls to R[0] */
- if (item_pos == n - tb->rnum[0] && tb->rbytes != -1) { /* we must shift the part of the appended item */
- if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) { /* we append to directory item */
- int entry_count;
-
- RFALSE(zeros_num,
- "PAP-12145: invalid parameter in case of a directory");
- entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD
- (tbS0, item_pos));
- if (entry_count - tb->rbytes <
- pos_in_item)
- /* new directory entry falls into R[0] */
- {
- int paste_entry_position;
-
- RFALSE(tb->rbytes - 1 >= entry_count || !tb-> insert_size[0],
- "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d",
- tb->rbytes, entry_count);
- /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
- /* Paste given directory entry to directory item */
- paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1;
- buffer_info_init_right(tb, &bi);
- leaf_paste_in_buffer(&bi, 0, paste_entry_position, tb->insert_size[0], body, zeros_num);
- /* paste entry */
- leaf_paste_entries(&bi, 0, paste_entry_position, 1,
- (struct reiserfs_de_head *) body,
- body + DEH_SIZE, tb->insert_size[0]);
-
- if (paste_entry_position == 0) {
- /* change delimiting keys */
- replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0],0);
- }
-
- tb->insert_size[0] = 0;
- pos_in_item++;
- } else { /* new directory entry doesn't fall into R[0] */
-
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
- }
- } else { /* regular object */
-
- int n_shift, n_rem, r_zeros_number;
- const char *r_body;
-
- /* Calculate number of bytes which must be shifted from appended item */
- if ((n_shift = tb->rbytes - tb->insert_size[0]) < 0)
- n_shift = 0;
-
- RFALSE(pos_in_item != ih_item_len
- (B_N_PITEM_HEAD(tbS0, item_pos)),
- "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d",
- pos_in_item, ih_item_len
- (B_N_PITEM_HEAD(tbS0, item_pos)));
-
- leaf_shift_right(tb, tb->rnum[0], n_shift);
- /* Calculate number of bytes which must remain in body after appending to R[0] */
- if ((n_rem = tb->insert_size[0] - tb->rbytes) < 0)
- n_rem = 0;
-
- {
- int version;
- unsigned long temp_rem = n_rem;
-
- version = ih_version(B_N_PITEM_HEAD(tb->R[0], 0));
- if (is_indirect_le_key(version, B_N_PKEY(tb->R[0], 0))) {
- temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT);
- }
- set_le_key_k_offset(version, B_N_PKEY(tb->R[0], 0),
- le_key_k_offset(version, B_N_PKEY(tb->R[0], 0)) + temp_rem);
- set_le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]),
- le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])) + temp_rem);
- }
-/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem;
- k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/
- do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
-
- /* Append part of body into R[0] */
- buffer_info_init_right(tb, &bi);
- if (n_rem > zeros_num) {
- r_zeros_number = 0;
- r_body = body + n_rem - zeros_num;
- } else {
- r_body = body;
- r_zeros_number = zeros_num - n_rem;
- zeros_num -= r_zeros_number;
- }
-
- leaf_paste_in_buffer(&bi, 0, n_shift,
- tb->insert_size[0] - n_rem,
- r_body, r_zeros_number);
-
- if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->R[0], 0))) {
-#if 0
- RFALSE(n_rem,
- "PAP-12160: paste more than one unformatted node pointer");
-#endif
- set_ih_free_space(B_N_PITEM_HEAD(tb->R[0], 0), 0);
- }
- tb->insert_size[0] = n_rem;
- if (!n_rem)
- pos_in_item++;
- }
- } else { /* pasted item in whole falls into R[0] */
-
- struct item_head *pasted;
-
- ret_val = leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
- /* append item in R[0] */
- if (pos_in_item >= 0) {
- buffer_info_init_right(tb, &bi);
- leaf_paste_in_buffer(&bi, item_pos - n + tb->rnum[0], pos_in_item,
- tb->insert_size[0], body, zeros_num);
- }
-
- /* paste new entry, if item is directory item */
- pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]);
- if (is_direntry_le_ih(pasted) && pos_in_item >= 0) {
- leaf_paste_entries(&bi, item_pos - n + tb->rnum[0],
- pos_in_item, 1,
- (struct reiserfs_de_head *) body,
- body + DEH_SIZE, tb->insert_size[0]);
- if (!pos_in_item) {
-
- RFALSE(item_pos - n + tb->rnum[0],
- "PAP-12165: directory item must be first item of node when pasting is in 0th position");
-
- /* update delimiting keys */
- replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
- }
- }
-
- if (is_indirect_le_ih(pasted))
- set_ih_free_space(pasted, 0);
- zeros_num = tb->insert_size[0] = 0;
- }
- } else { /* new item doesn't fall into R[0] */
-
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
- }
- break;
- default: /* cases d and t */
- reiserfs_panic(tb->tb_sb, "PAP-12175",
- "rnum > 0: unexpected mode: %s(%d)",
- (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
- }
+/*
+ * Paste (append) step for the right neighbor R[0].
+ *
+ * With n being the number of items in S[0]:
+ * - item_pos < n - rnum[0]: the pasted item is not among the items
+ * that go to R[0], so only leaf_shift_right() is performed;
+ * - item_pos == n - rnum[0] and rbytes != -1: part of the appended
+ * item has to be shifted, see balance_leaf_paste_right_shift();
+ * - otherwise the pasted item falls into R[0] as a whole, see
+ * balance_leaf_paste_right_whole().
+ */
+static void balance_leaf_paste_right(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+ /* pasted item doesn't fall into R[0] */
+ if (n - tb->rnum[0] > tb->item_pos) {
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+ return;
}
- /* tb->rnum[0] > 0 */
- RFALSE(tb->blknum[0] > 3,
- "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
- RFALSE(tb->blknum[0] < 0,
- "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
+ /* pasted item or part of it falls to R[0] */
- /* if while adding to a node we discover that it is possible to split
- it in two, and merge the left part into the left neighbor and the
- right part into the right neighbor, eliminating the node */
- if (tb->blknum[0] == 0) { /* node S[0] is empty now */
+ if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1)
+ /* we must shift the part of the appended item */
+ balance_leaf_paste_right_shift(tb, ih, body);
+ else
+ /* pasted item in whole falls into R[0] */
+ balance_leaf_paste_right_whole(tb, ih, body);
+}
- RFALSE(!tb->lnum[0] || !tb->rnum[0],
- "PAP-12190: lnum and rnum must not be zero");
- /* if insertion was done before 0-th position in R[0], right
- delimiting key of the tb->L[0]'s and left delimiting key are
- not set correctly */
- if (tb->CFL[0]) {
- if (!tb->CFR[0])
- reiserfs_panic(tb->tb_sb, "vs-12195",
- "CFR not initialized");
- copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]),
- B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]));
- do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
+/*
+ * Shift rnum[0] items from S[0] to the right neighbor R[0].
+ *
+ * Nothing is done unless tb->rnum[0] is positive. Only the M_INSERT and
+ * M_PASTE modes are valid here (any other flag trips the BUG_ON); the
+ * work itself is delegated to the mode-specific *_right helper.
+ */
+static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih,
+ const char *body, int flag)
+{
+ if (tb->rnum[0] > 0) {
+ BUG_ON(!(flag == M_INSERT || flag == M_PASTE));
+
+ if (flag != M_INSERT) /* M_PASTE */
+ balance_leaf_paste_right(tb, ih, body);
+ else
+ balance_leaf_insert_right(tb, ih, body);
+ }
+}
+
+/*
+ * Insert-mode handling for one new node S_new[i].
+ *
+ * With n being the number of items in S[0]:
+ * - item_pos <= n - snum[i]: the new item does not reach S_new[i];
+ * only the existing items are moved there;
+ * - item_pos == n - snum[i] + 1 and sbytes[i] != -1: the new item is
+ * split; its last sbytes[i] units are inserted as item 0 of
+ * S_new[i], and ih is restored so that it describes the part that
+ * still has to be inserted (insert_size[0] is reduced accordingly);
+ * - otherwise the whole new item is inserted into S_new[i] and
+ * insert_size[0] / zeroes_num are cleared.
+ *
+ * insert_key and insert_ptr are not used by this function.
+ */
+static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int i)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+ struct buffer_info bi;
+ int shift;
+
+ /* neither the new item nor a part of it falls into S_new[i] */
+ if (n - tb->snum[i] >= tb->item_pos) {
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+ tb->snum[i], tb->sbytes[i], tb->S_new[i]);
+ return;
+ }
+
+ /* the new item or a part of it falls into the new node S_new[i] */
+
+ /* part of new item falls into S_new[i] */
+ if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
+ /*
+ * The key offset is saved and restored below; keep it in a
+ * loff_t so that large offsets are not truncated by an int.
+ */
+ loff_t old_key_comp;
+ int old_len, r_zeroes_number;
+ const char *r_body;
+
+ /* Move snum[i]-1 items from S[0] to S_new[i] */
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
+ tb->S_new[i]);
+
+ /* Remember key component and item length */
+ old_key_comp = le_ih_k_offset(ih);
+ old_len = ih_item_len(ih);
+
+ /*
+ * Calculate key component and item length to insert
+ * into S_new[i]
+ */
+ shift = 0;
+ if (is_indirect_le_ih(ih))
+ shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+ set_le_ih_k_offset(ih,
+ le_ih_k_offset(ih) +
+ ((old_len - tb->sbytes[i]) << shift));
+
+ put_ih_item_len(ih, tb->sbytes[i]);
+
+ /* Insert part of the item into S_new[i] before 0-th item */
+ buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+
+ /*
+ * Split body/zeroes_num between the part that stays behind
+ * (the first old_len - sbytes[i] units) and the part that is
+ * inserted into S_new[i].
+ */
+ if ((old_len - tb->sbytes[i]) > tb->zeroes_num) {
+ r_zeroes_number = 0;
+ r_body = body + (old_len - tb->sbytes[i]) -
+ tb->zeroes_num;
+ } else {
+ r_body = body;
+ r_zeroes_number = tb->zeroes_num - (old_len -
+ tb->sbytes[i]);
+ tb->zeroes_num -= r_zeroes_number;
}
- reiserfs_invalidate_buffer(tb, tbS0);
- return 0;
+ leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
+
+ /*
+ * Restore the key component and set the item length of the
+ * part that has not been inserted yet
+ */
+ set_le_ih_k_offset(ih, old_key_comp);
+ put_ih_item_len(ih, old_len - tb->sbytes[i]);
+ tb->insert_size[0] -= tb->sbytes[i];
+ } else {
+ /* whole new item falls into S_new[i] */
+
+ /*
+ * Shift snum[i] - 1 items to S_new[i]
+ * (sbytes[i] of split item)
+ */
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+ tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]);
+
+ /* Insert new item into S_new[i] */
+ buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+ leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1,
+ ih, body, tb->zeroes_num);
+
+ tb->zeroes_num = tb->insert_size[0] = 0;
}
+}
- /* Fill new nodes that appear in place of S[0] */
+/*
+ * Paste into a directory item that is split between S[0] and S_new[i].
+ *
+ * entry_count is the number of entries of the target item in S[0]. If
+ * pos_in_item lies within the last sbytes[i] entry positions (and is not
+ * past entry_count), the new entry belongs to S_new[i]: the existing
+ * entries are moved first, then the new entry is pasted into item 0 of
+ * S_new[i], insert_size[0] is cleared and pos_in_item is advanced.
+ * Otherwise only the move is performed.
+ *
+ * ih, insert_key and insert_ptr are not used by this function.
+ */
+static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int i)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
+ int entry_count = ih_entry_count(aux_ih);
+ struct buffer_info bi;
+
+ if (entry_count - tb->sbytes[i] < tb->pos_in_item &&
+ tb->pos_in_item <= entry_count) {
+ /* new directory entry falls into S_new[i] */
+
+ RFALSE(!tb->insert_size[0],
+ "PAP-12215: insert_size is already 0");
+ RFALSE(tb->sbytes[i] - 1 >= entry_count,
+ "PAP-12220: there are no so much entries (%d), only %d",
+ tb->sbytes[i] - 1, entry_count);
+
+ /*
+ * Shift snum[i]-1 items in whole.
+ * Shift sbytes[i]-1 directory entries
+ * from directory item number snum[i]
+ * (one less than sbytes[i]: the new entry is pasted below)
+ */
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
+ tb->sbytes[i] - 1, tb->S_new[i]);
+
+ /*
+ * Paste given directory entry to
+ * directory item (item 0 of S_new[i]); the entry position
+ * is pos_in_item rebased to the entries that were moved
+ */
+ buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+ leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count +
+ tb->sbytes[i] - 1, tb->insert_size[0],
+ body, tb->zeroes_num);
+
+ /* paste new directory entry; 1 is the number of entries */
+ leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count +
+ tb->sbytes[i] - 1, 1,
+ (struct reiserfs_de_head *) body,
+ body + DEH_SIZE, tb->insert_size[0]);
+
+ /* nothing is left to paste */
+ tb->insert_size[0] = 0;
+ tb->pos_in_item++;
+ } else {
+ /* new directory entry doesn't fall into S_new[i] */
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
+ tb->sbytes[i], tb->S_new[i]);
+ }
+
+}
- /* I am told that this copying is because we need an array to enable
- the looping code. -Hans */
- snum[0] = tb->s1num, snum[1] = tb->s2num;
- sbytes[0] = tb->s1bytes;
- sbytes[1] = tb->s2bytes;
+/*
+ * Paste case in which the item being appended to is split between S[0]
+ * and S_new[i] (called when item_pos is the first item that goes to
+ * S_new[i] and sbytes[i] != -1).
+ *
+ * Directory items are handed to balance_leaf_new_nodes_paste_dirent().
+ * For other items:
+ * n_shift = max(sbytes[i] - insert_size[0], 0) is passed to
+ * leaf_move_items() as the amount of the existing item to move;
+ * n_rem = max(insert_size[0] - sbytes[i], 0) is the amount of new data
+ * that is NOT pasted into S_new[i] and stays in insert_size[0].
+ *
+ * ih is expected to be NULL (checked by RFALSE PAP-12210).
+ */
+static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int i)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
+ int n_shift, n_rem, r_zeroes_number, shift;
+ const char *r_body;
+ struct item_head *tmp;
+ struct buffer_info bi;
+
+ RFALSE(ih, "PAP-12210: ih must be 0");
+
+ if (is_direntry_le_ih(aux_ih)) {
+ balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key,
+ insert_ptr, i);
+ return;
+ }
+
+ /* regular object */
+
+
+ RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) ||
+ tb->insert_size[0] <= 0,
+ "PAP-12225: item too short or insert_size <= 0");
+
+ /*
+ * Calculate number of bytes which must be shifted from appended item
+ */
+ n_shift = tb->sbytes[i] - tb->insert_size[0];
+ if (n_shift < 0)
+ n_shift = 0;
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift,
+ tb->S_new[i]);
+
+ /*
+ * Calculate number of bytes which must remain in body after
+ * append to S_new[i]
+ */
+ n_rem = tb->insert_size[0] - tb->sbytes[i];
+ if (n_rem < 0)
+ n_rem = 0;
+
+ /*
+ * Append part of body into S_new[i]: skip the first n_rem bytes
+ * of the new data and split zeroes_num between the two parts
+ */
+ buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+ if (n_rem > tb->zeroes_num) {
+ r_zeroes_number = 0;
+ r_body = body + n_rem - tb->zeroes_num;
+ } else {
+ r_body = body;
+ r_zeroes_number = tb->zeroes_num - n_rem;
+ tb->zeroes_num -= r_zeroes_number;
+ }
+
+ leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
+ r_body, r_zeroes_number);
+
+ /*
+ * Item 0 of S_new[i] now starts n_rem units further into the
+ * object; for an indirect item the offset step is scaled by shift
+ */
+ tmp = item_head(tb->S_new[i], 0);
+ shift = 0;
+ if (is_indirect_le_ih(tmp)) {
+ set_ih_free_space(tmp, 0);
+ shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+ }
+ add_le_ih_k_offset(tmp, n_rem << shift);
+
+ /* only n_rem bytes are left to be pasted */
+ tb->insert_size[0] = n_rem;
+ if (!n_rem)
+ tb->pos_in_item++;
+}
+
+/*
+ * Paste case in which the target item falls into S_new[i] as a whole.
+ *
+ * snum[i] items (sbytes[i] of the boundary item) are moved to S_new[i],
+ * then insert_size[0] bytes are pasted at pos_in_item into the item at
+ * index item_pos - n + snum[i] of S_new[i], n being the number of items
+ * S[0] had on entry. For a directory item the new entry is pasted as
+ * well; for an indirect item ih_free_space is reset to 0. On return
+ * insert_size[0] and zeroes_num are both 0.
+ *
+ * ih, insert_key and insert_ptr are not used by this function.
+ */
+static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int i)
+
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+ int leaf_mi;
+ struct item_head *pasted;
+ struct buffer_info bi;
+
+#ifdef CONFIG_REISERFS_CHECK
+ /* debug-only sanity check of a non-directory paste position */
+ struct item_head *ih_check = item_head(tbS0, tb->item_pos);
+
+ if (!is_direntry_le_ih(ih_check) &&
+ (tb->pos_in_item != ih_item_len(ih_check) ||
+ tb->insert_size[0] <= 0))
+ reiserfs_panic(tb->tb_sb,
+ "PAP-12235",
+ "pos_in_item must be equal to ih_item_len");
+#endif
+
+ leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
+ tb->sbytes[i], tb->S_new[i]);
+
+ /* a non-zero return value is not expected here */
+ RFALSE(leaf_mi,
+ "PAP-12240: unexpected value returned by leaf_move_items (%d)",
+ leaf_mi);
+
+ /* paste into item */
+ buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+ leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i],
+ tb->pos_in_item, tb->insert_size[0],
+ body, tb->zeroes_num);
+
+ /* if the item is a directory item, paste the new entry too */
+ pasted = item_head(tb->S_new[i], tb->item_pos - n +
+ tb->snum[i]);
+ if (is_direntry_le_ih(pasted))
+ leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i],
+ tb->pos_in_item, 1,
+ (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]);
+
+ /* if we paste to indirect item update ih_free_space */
+ if (is_indirect_le_ih(pasted))
+ set_ih_free_space(pasted, 0);
+
+ /* everything has been pasted */
+ tb->zeroes_num = tb->insert_size[0] = 0;
+
+}
+/*
+ * Paste-mode handling for one new node S_new[i].
+ *
+ * first_moved is the index in S[0] of the first item that goes to
+ * S_new[i]. Depending on where item_pos lies relative to it, either
+ * just move the items, or hand over to the "shift part of the item" or
+ * the "item falls wholly into S_new[i]" helper.
+ */
+static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int i)
+{
+ struct buffer_head *s0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int first_moved = B_NR_ITEMS(s0) - tb->snum[i];
+
+ if (tb->item_pos < first_moved) {
+ /* pasted item doesn't fall into S_new[i] */
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+ tb->snum[i], tb->sbytes[i], tb->S_new[i]);
+ } else if (tb->item_pos != first_moved || tb->sbytes[i] == -1) {
+ /* item falls wholly into S_new[i] */
+ balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key,
+ insert_ptr, i);
+ } else {
+ /* we must shift part of the appended item */
+ balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key,
+ insert_ptr, i);
+ }
+}
+
+/* Fill new nodes that appear in place of S[0] */
+static void balance_leaf_new_nodes(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int flag)
+{
+ int i;
for (i = tb->blknum[0] - 2; i >= 0; i--) {
+ BUG_ON(flag != M_INSERT && flag != M_PASTE);
- RFALSE(!snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i,
- snum[i]);
+ RFALSE(!tb->snum[i],
+ "PAP-12200: snum[%d] == %d. Must be > 0", i,
+ tb->snum[i]);
/* here we shift from S to S_new nodes */
- S_new[i] = get_FEB(tb);
+ tb->S_new[i] = get_FEB(tb);
/* initialized block type and tree level */
- set_blkh_level(B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL);
-
- n = B_NR_ITEMS(tbS0);
-
- switch (flag) {
- case M_INSERT: /* insert item */
-
- if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */
- if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */
- int old_key_comp, old_len, r_zeros_number;
- const char *r_body;
- int version;
-
- /* Move snum[i]-1 items from S[0] to S_new[i] */
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- snum[i] - 1, -1,
- S_new[i]);
- /* Remember key component and item length */
- version = ih_version(ih);
- old_key_comp = le_ih_k_offset(ih);
- old_len = ih_item_len(ih);
-
- /* Calculate key component and item length to insert into S_new[i] */
- set_le_ih_k_offset(ih, le_ih_k_offset(ih) +
- ((old_len - sbytes[i]) << (is_indirect_le_ih(ih) ? tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT : 0)));
-
- put_ih_item_len(ih, sbytes[i]);
-
- /* Insert part of the item into S_new[i] before 0-th item */
- buffer_info_init_bh(tb, &bi, S_new[i]);
-
- if ((old_len - sbytes[i]) > zeros_num) {
- r_zeros_number = 0;
- r_body = body + (old_len - sbytes[i]) - zeros_num;
- } else {
- r_body = body;
- r_zeros_number = zeros_num - (old_len - sbytes[i]);
- zeros_num -= r_zeros_number;
- }
-
- leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeros_number);
-
- /* Calculate key component and item length to insert into S[i] */
- set_le_ih_k_offset(ih, old_key_comp);
- put_ih_item_len(ih, old_len - sbytes[i]);
- tb->insert_size[0] -= sbytes[i];
- } else { /* whole new item falls into S_new[i] */
-
- /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- snum[i] - 1, sbytes[i], S_new[i]);
-
- /* Insert new item into S_new[i] */
- buffer_info_init_bh(tb, &bi, S_new[i]);
- leaf_insert_into_buf(&bi, item_pos - n + snum[i] - 1,
- ih, body, zeros_num);
-
- zeros_num = tb->insert_size[0] = 0;
- }
- }
-
- else { /* new item or it part don't falls into S_new[i] */
+ set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL);
+
+ if (flag == M_INSERT)
+ balance_leaf_new_nodes_insert(tb, ih, body, insert_key,
+ insert_ptr, i);
+ else /* M_PASTE */
+ balance_leaf_new_nodes_paste(tb, ih, body, insert_key,
+ insert_ptr, i);
+
+ memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE);
+ insert_ptr[i] = tb->S_new[i];
+
+ RFALSE(!buffer_journaled(tb->S_new[i])
+ || buffer_journal_dirty(tb->S_new[i])
+ || buffer_dirty(tb->S_new[i]),
+ "PAP-12247: S_new[%d] : (%b)",
+ i, tb->S_new[i]);
+ }
+}
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- snum[i], sbytes[i], S_new[i]);
- }
- break;
+static void balance_leaf_finish_node_insert(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct buffer_info bi;
+ buffer_info_init_tbS0(tb, &bi);
+ leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num);
- case M_PASTE: /* append item */
-
- if (n - snum[i] <= item_pos) { /* pasted item or part if it falls to S_new[i] */
- if (item_pos == n - snum[i] && sbytes[i] != -1) { /* we must shift part of the appended item */
- struct item_head *aux_ih;
-
- RFALSE(ih, "PAP-12210: ih must be 0");
-
- aux_ih = B_N_PITEM_HEAD(tbS0, item_pos);
- if (is_direntry_le_ih(aux_ih)) {
- /* we append to directory item */
-
- int entry_count;
-
- entry_count = ih_entry_count(aux_ih);
-
- if (entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count) {
- /* new directory entry falls into S_new[i] */
-
- RFALSE(!tb->insert_size[0], "PAP-12215: insert_size is already 0");
- RFALSE(sbytes[i] - 1 >= entry_count,
- "PAP-12220: there are no so much entries (%d), only %d",
- sbytes[i] - 1, entry_count);
-
- /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i] - 1, S_new[i]);
- /* Paste given directory entry to directory item */
- buffer_info_init_bh(tb, &bi, S_new[i]);
- leaf_paste_in_buffer(&bi, 0, pos_in_item - entry_count + sbytes[i] - 1,
- tb->insert_size[0], body, zeros_num);
- /* paste new directory entry */
- leaf_paste_entries(&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, 1,
- (struct reiserfs_de_head *) body,
- body + DEH_SIZE, tb->insert_size[0]);
- tb->insert_size[0] = 0;
- pos_in_item++;
- } else { /* new directory entry doesn't fall into S_new[i] */
- leaf_move_items(LEAF_FROM_S_TO_SNEW,tb, snum[i], sbytes[i], S_new[i]);
- }
- } else { /* regular object */
-
- int n_shift, n_rem, r_zeros_number;
- const char *r_body;
-
- RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)) || tb->insert_size[0] <= 0,
- "PAP-12225: item too short or insert_size <= 0");
-
- /* Calculate number of bytes which must be shifted from appended item */
- n_shift = sbytes[i] - tb->insert_size[0];
- if (n_shift < 0)
- n_shift = 0;
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]);
-
- /* Calculate number of bytes which must remain in body after append to S_new[i] */
- n_rem = tb->insert_size[0] - sbytes[i];
- if (n_rem < 0)
- n_rem = 0;
- /* Append part of body into S_new[0] */
- buffer_info_init_bh(tb, &bi, S_new[i]);
- if (n_rem > zeros_num) {
- r_zeros_number = 0;
- r_body = body + n_rem - zeros_num;
- } else {
- r_body = body;
- r_zeros_number = zeros_num - n_rem;
- zeros_num -= r_zeros_number;
- }
-
- leaf_paste_in_buffer(&bi, 0, n_shift,
- tb->insert_size[0] - n_rem,
- r_body, r_zeros_number);
- {
- struct item_head *tmp;
-
- tmp = B_N_PITEM_HEAD(S_new[i], 0);
- if (is_indirect_le_ih
- (tmp)) {
- set_ih_free_space(tmp, 0);
- set_le_ih_k_offset(tmp, le_ih_k_offset(tmp) + (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT)));
- } else {
- set_le_ih_k_offset(tmp, le_ih_k_offset(tmp) + n_rem);
- }
- }
-
- tb->insert_size[0] = n_rem;
- if (!n_rem)
- pos_in_item++;
- }
- } else
- /* item falls wholly into S_new[i] */
- {
- int leaf_mi;
- struct item_head *pasted;
+ /* If we insert the first key change the delimiting key */
+ if (tb->item_pos == 0) {
+ if (tb->CFL[0]) /* can be 0 in reiserfsck */
+ replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
-#ifdef CONFIG_REISERFS_CHECK
- struct item_head *ih_check = B_N_PITEM_HEAD(tbS0, item_pos);
-
- if (!is_direntry_le_ih(ih_check)
- && (pos_in_item != ih_item_len(ih_check)
- || tb->insert_size[0] <= 0))
- reiserfs_panic(tb->tb_sb,
- "PAP-12235",
- "pos_in_item "
- "must be equal "
- "to ih_item_len");
-#endif /* CONFIG_REISERFS_CHECK */
-
- leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW,
- tb, snum[i],
- sbytes[i],
- S_new[i]);
-
- RFALSE(leaf_mi,
- "PAP-12240: unexpected value returned by leaf_move_items (%d)",
- leaf_mi);
-
- /* paste into item */
- buffer_info_init_bh(tb, &bi, S_new[i]);
- leaf_paste_in_buffer(&bi,
- item_pos - n + snum[i],
- pos_in_item,
- tb->insert_size[0],
- body, zeros_num);
-
- pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]);
- if (is_direntry_le_ih(pasted)) {
- leaf_paste_entries(&bi,
- item_pos - n + snum[i],
- pos_in_item, 1,
- (struct reiserfs_de_head *)body,
- body + DEH_SIZE,
- tb->insert_size[0]
- );
- }
-
- /* if we paste to indirect item update ih_free_space */
- if (is_indirect_le_ih(pasted))
- set_ih_free_space(pasted, 0);
- zeros_num = tb->insert_size[0] = 0;
- }
- }
+ }
+}
- else { /* pasted item doesn't fall into S_new[i] */
+static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct item_head *pasted = item_head(tbS0, tb->item_pos);
+ struct buffer_info bi;
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- snum[i], sbytes[i], S_new[i]);
- }
- break;
- default: /* cases d and t */
- reiserfs_panic(tb->tb_sb, "PAP-12245",
- "blknum > 2: unexpected mode: %s(%d)",
- (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
+ if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) {
+ RFALSE(!tb->insert_size[0],
+ "PAP-12260: insert_size is 0 already");
+
+ /* prepare space */
+ buffer_info_init_tbS0(tb, &bi);
+ leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item,
+ tb->insert_size[0], body, tb->zeroes_num);
+
+ /* paste entry */
+ leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1,
+ (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]);
+
+ if (!tb->item_pos && !tb->pos_in_item) {
+ RFALSE(!tb->CFL[0] || !tb->L[0],
+ "PAP-12270: CFL[0]/L[0] must be specified");
+ if (tb->CFL[0])
+ replace_key(tb, tb->CFL[0], tb->lkey[0],
+ tbS0, 0);
}
- memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE);
- insert_ptr[i] = S_new[i];
+ tb->insert_size[0] = 0;
+ }
+}
+
+static void balance_leaf_finish_node_paste(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct buffer_info bi;
+ struct item_head *pasted = item_head(tbS0, tb->item_pos);
- RFALSE(!buffer_journaled(S_new[i])
- || buffer_journal_dirty(S_new[i])
- || buffer_dirty(S_new[i]), "PAP-12247: S_new[%d] : (%b)",
- i, S_new[i]);
+ /* when directory, may be new entry already pasted */
+ if (is_direntry_le_ih(pasted)) {
+ balance_leaf_finish_node_paste_dirent(tb, ih, body);
+ return;
}
- /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the
- affected item which remains in S */
- if (0 <= item_pos && item_pos < tb->s0num) { /* if we must insert or append into buffer S[0] */
+ /* regular object */
- switch (flag) {
- case M_INSERT: /* insert item into S[0] */
- buffer_info_init_tbS0(tb, &bi);
- leaf_insert_into_buf(&bi, item_pos, ih, body,
- zeros_num);
+ if (tb->pos_in_item == ih_item_len(pasted)) {
+ RFALSE(tb->insert_size[0] <= 0,
+ "PAP-12275: insert size must not be %d",
+ tb->insert_size[0]);
+ buffer_info_init_tbS0(tb, &bi);
+ leaf_paste_in_buffer(&bi, tb->item_pos,
+ tb->pos_in_item, tb->insert_size[0], body,
+ tb->zeroes_num);
- /* If we insert the first key change the delimiting key */
- if (item_pos == 0) {
- if (tb->CFL[0]) /* can be 0 in reiserfsck */
- replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
- }
- break;
+ if (is_indirect_le_ih(pasted))
+ set_ih_free_space(pasted, 0);
- case M_PASTE:{ /* append item in S[0] */
- struct item_head *pasted;
-
- pasted = B_N_PITEM_HEAD(tbS0, item_pos);
- /* when directory, may be new entry already pasted */
- if (is_direntry_le_ih(pasted)) {
- if (pos_in_item >= 0 && pos_in_item <= ih_entry_count(pasted)) {
-
- RFALSE(!tb->insert_size[0],
- "PAP-12260: insert_size is 0 already");
-
- /* prepare space */
- buffer_info_init_tbS0(tb, &bi);
- leaf_paste_in_buffer(&bi, item_pos, pos_in_item,
- tb->insert_size[0], body,
- zeros_num);
-
- /* paste entry */
- leaf_paste_entries(&bi, item_pos, pos_in_item, 1,
- (struct reiserfs_de_head *)body,
- body + DEH_SIZE,
- tb->insert_size[0]);
- if (!item_pos && !pos_in_item) {
- RFALSE(!tb->CFL[0] || !tb->L[0],
- "PAP-12270: CFL[0]/L[0] must be specified");
- if (tb->CFL[0])
- replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
- }
- tb->insert_size[0] = 0;
- }
- } else { /* regular object */
- if (pos_in_item == ih_item_len(pasted)) {
-
- RFALSE(tb->insert_size[0] <= 0,
- "PAP-12275: insert size must not be %d",
- tb->insert_size[0]);
- buffer_info_init_tbS0(tb, &bi);
- leaf_paste_in_buffer(&bi, item_pos, pos_in_item,
- tb->insert_size[0], body, zeros_num);
-
- if (is_indirect_le_ih(pasted)) {
-#if 0
- RFALSE(tb->
- insert_size[0] !=
- UNFM_P_SIZE,
- "PAP-12280: insert_size for indirect item must be %d, not %d",
- UNFM_P_SIZE,
- tb->
- insert_size[0]);
-#endif
- set_ih_free_space(pasted, 0);
- }
- tb->insert_size[0] = 0;
- }
+ tb->insert_size[0] = 0;
+ }
#ifdef CONFIG_REISERFS_CHECK
- else {
- if (tb->insert_size[0]) {
- print_cur_tb("12285");
- reiserfs_panic(tb->tb_sb,
- "PAP-12285",
- "insert_size "
- "must be 0 "
- "(%d)",
- tb->insert_size[0]);
- }
- }
-#endif /* CONFIG_REISERFS_CHECK */
-
- }
- } /* case M_PASTE: */
+ else if (tb->insert_size[0]) {
+ print_cur_tb("12285");
+ reiserfs_panic(tb->tb_sb, "PAP-12285",
+ "insert_size must be 0 (%d)", tb->insert_size[0]);
+ }
+#endif
+}
+
+/*
+ * if the affected item was not wholly shifted then we
+ * perform all necessary operations on that part or whole
+ * of the affected item which remains in S
+ */
+static void balance_leaf_finish_node(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body, int flag)
+{
+ /* if we must insert or append into buffer S[0] */
+ if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
+ if (flag == M_INSERT)
+ balance_leaf_finish_node_insert(tb, ih, body);
+ else /* M_PASTE */
+ balance_leaf_finish_node_paste(tb, ih, body);
+ }
+}
+
+/**
+ * balance_leaf - reiserfs tree balancing algorithm
+ * @tb: tree balance state
+ * @ih: item header of inserted item (little endian)
+ * @body: body of inserted item or bytes to paste
+ * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance)
+ * passed back:
+ * @insert_key: key to insert new nodes
+ * @insert_ptr: array of nodes to insert at the next level
+ *
+ * In our processing of one level we sometimes determine what must be
+ * inserted into the next higher level. This insertion consists of a
+ * key or two keys and their corresponding pointers.
+ */
+static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
+ const char *body, int flag,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+
+ PROC_INFO_INC(tb->tb_sb, balance_at[0]);
+
+ /* Make balance in case insert_size[0] < 0 */
+ if (tb->insert_size[0] < 0)
+ return balance_leaf_when_delete(tb, flag);
+
+ tb->item_pos = PATH_LAST_POSITION(tb->tb_path),
+ tb->pos_in_item = tb->tb_path->pos_in_item,
+ tb->zeroes_num = 0;
+ if (flag == M_INSERT && !body)
+ tb->zeroes_num = ih_item_len(ih);
+
+ /*
+ * for indirect item pos_in_item is measured in unformatted node
+ * pointers. Recalculate to bytes
+ */
+ if (flag != M_INSERT
+ && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
+ tb->pos_in_item *= UNFM_P_SIZE;
+
+ balance_leaf_left(tb, ih, body, flag);
+
+ /* tb->lnum[0] > 0 */
+ /* Calculate new item position */
+ tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
+
+ balance_leaf_right(tb, ih, body, flag);
+
+ /* tb->rnum[0] > 0 */
+ RFALSE(tb->blknum[0] > 3,
+ "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
+ RFALSE(tb->blknum[0] < 0,
+ "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
+
+ /*
+ * if while adding to a node we discover that it is possible to split
+ * it in two, and merge the left part into the left neighbor and the
+ * right part into the right neighbor, eliminating the node
+ */
+ if (tb->blknum[0] == 0) { /* node S[0] is empty now */
+
+ RFALSE(!tb->lnum[0] || !tb->rnum[0],
+ "PAP-12190: lnum and rnum must not be zero");
+ /*
+ * if insertion was done before 0-th position in R[0], right
+ * delimiting key of the tb->L[0]'s and left delimiting key are
+ * not set correctly
+ */
+ if (tb->CFL[0]) {
+ if (!tb->CFR[0])
+ reiserfs_panic(tb->tb_sb, "vs-12195",
+ "CFR not initialized");
+ copy_key(internal_key(tb->CFL[0], tb->lkey[0]),
+ internal_key(tb->CFR[0], tb->rkey[0]));
+ do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
}
+
+ reiserfs_invalidate_buffer(tb, tbS0);
+ return 0;
}
+
+ balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag);
+
+ balance_leaf_finish_node(tb, ih, body, flag);
+
#ifdef CONFIG_REISERFS_CHECK
if (flag == M_PASTE && tb->insert_size[0]) {
print_cur_tb("12290");
@@ -1137,9 +1454,11 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
"PAP-12290", "insert_size is still not 0 (%d)",
tb->insert_size[0]);
}
-#endif /* CONFIG_REISERFS_CHECK */
+#endif
+
+ /* Leaf level of the tree is balanced (end of balance_leaf) */
return 0;
-} /* Leaf level of the tree is balanced (end of balance_leaf) */
+}
/* Make empty node */
void make_empty_node(struct buffer_info *bi)
@@ -1178,9 +1497,7 @@ struct buffer_head *get_FEB(struct tree_balance *tb)
return tb->used[i];
}
-/* This is now used because reiserfs_free_block has to be able to
-** schedule.
-*/
+/* This is now used because reiserfs_free_block has to be able to schedule. */
static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
{
int i;
@@ -1246,10 +1563,10 @@ void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest,
if (B_IS_ITEMS_LEVEL(src))
/* source buffer contains leaf node */
- memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PITEM_HEAD(src, n_src),
+ memcpy(internal_key(dest, n_dest), item_head(src, n_src),
KEY_SIZE);
else
- memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PDELIM_KEY(src, n_src),
+ memcpy(internal_key(dest, n_dest), internal_key(src, n_src),
KEY_SIZE);
do_balance_mark_internal_dirty(tb, dest, 0);
@@ -1335,8 +1652,10 @@ static int check_before_balancing(struct tree_balance *tb)
"mount point.");
}
- /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
- prepped all of these for us). */
+ /*
+ * double check that buffers that we will modify are unlocked.
+ * (fix_nodes should already have prepped all of these for us).
+ */
if (tb->lnum[0]) {
retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]");
retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]");
@@ -1429,49 +1748,51 @@ static void check_internal_levels(struct tree_balance *tb)
#endif
-/* Now we have all of the buffers that must be used in balancing of
- the tree. We rely on the assumption that schedule() will not occur
- while do_balance works. ( Only interrupt handlers are acceptable.)
- We balance the tree according to the analysis made before this,
- using buffers already obtained. For SMP support it will someday be
- necessary to add ordered locking of tb. */
-
-/* Some interesting rules of balancing:
-
- we delete a maximum of two nodes per level per balancing: we never
- delete R, when we delete two of three nodes L, S, R then we move
- them into R.
-
- we only delete L if we are deleting two nodes, if we delete only
- one node we delete S
-
- if we shift leaves then we shift as much as we can: this is a
- deliberate policy of extremism in node packing which results in
- higher average utilization after repeated random balance operations
- at the cost of more memory copies and more balancing as a result of
- small insertions to full nodes.
-
- if we shift internal nodes we try to evenly balance the node
- utilization, with consequent less balancing at the cost of lower
- utilization.
-
- one could argue that the policy for directories in leaves should be
- that of internal nodes, but we will wait until another day to
- evaluate this.... It would be nice to someday measure and prove
- these assumptions as to what is optimal....
+/*
+ * Now we have all of the buffers that must be used in balancing of
+ * the tree. We rely on the assumption that schedule() will not occur
+ * while do_balance works. ( Only interrupt handlers are acceptable.)
+ * We balance the tree according to the analysis made before this,
+ * using buffers already obtained. For SMP support it will someday be
+ * necessary to add ordered locking of tb.
+ */
-*/
+/*
+ * Some interesting rules of balancing:
+ * we delete a maximum of two nodes per level per balancing: we never
+ * delete R, when we delete two of three nodes L, S, R then we move
+ * them into R.
+ *
+ * we only delete L if we are deleting two nodes, if we delete only
+ * one node we delete S
+ *
+ * if we shift leaves then we shift as much as we can: this is a
+ * deliberate policy of extremism in node packing which results in
+ * higher average utilization after repeated random balance operations
+ * at the cost of more memory copies and more balancing as a result of
+ * small insertions to full nodes.
+ *
+ * if we shift internal nodes we try to evenly balance the node
+ * utilization, with consequent less balancing at the cost of lower
+ * utilization.
+ *
+ * one could argue that the policy for directories in leaves should be
+ * that of internal nodes, but we will wait until another day to
+ * evaluate this.... It would be nice to someday measure and prove
+ * these assumptions as to what is optimal....
+ */
static inline void do_balance_starts(struct tree_balance *tb)
{
- /* use print_cur_tb() to see initial state of struct
- tree_balance */
+ /* use print_cur_tb() to see initial state of struct tree_balance */
/* store_print_tb (tb); */
/* do not delete, just comment it out */
-/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb,
- "check");*/
+ /*
+ print_tb(flag, PATH_LAST_POSITION(tb->tb_path),
+ tb->tb_path->pos_in_item, tb, "check");
+ */
RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
#ifdef CONFIG_REISERFS_CHECK
REISERFS_SB(tb->tb_sb)->cur_tb = tb;
@@ -1487,9 +1808,10 @@ static inline void do_balance_completed(struct tree_balance *tb)
REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
#endif
- /* reiserfs_free_block is no longer schedule safe. So, we need to
- ** put the buffers we want freed on the thrown list during do_balance,
- ** and then free them now
+ /*
+ * reiserfs_free_block is no longer schedule safe. So, we need to
+ * put the buffers we want freed on the thrown list during do_balance,
+ * and then free them now
*/
REISERFS_SB(tb->tb_sb)->s_do_balance++;
@@ -1500,36 +1822,40 @@ static inline void do_balance_completed(struct tree_balance *tb)
free_thrown(tb);
}
-void do_balance(struct tree_balance *tb, /* tree_balance structure */
- struct item_head *ih, /* item header of inserted item */
- const char *body, /* body of inserted item or bytes to paste */
- int flag)
-{ /* i - insert, d - delete
- c - cut, p - paste
-
- Cut means delete part of an item
- (includes removing an entry from a
- directory).
-
- Delete means delete whole item.
-
- Insert means add a new item into the
- tree.
-
- Paste means to append to the end of an
- existing file or to insert a directory
- entry. */
- int child_pos, /* position of a child node in its parent */
- h; /* level of the tree being processed */
- struct item_head insert_key[2]; /* in our processing of one level
- we sometimes determine what
- must be inserted into the next
- higher level. This insertion
- consists of a key or two keys
- and their corresponding
- pointers */
- struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next
- level */
+/*
+ * do_balance - balance the tree
+ *
+ * @tb: tree_balance structure
+ * @ih: item header of inserted item
+ * @body: body of inserted item or bytes to paste
+ * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' - paste
+ *
+ * Cut means delete part of an item (includes removing an entry from a
+ * directory).
+ *
+ * Delete means delete whole item.
+ *
+ * Insert means add a new item into the tree.
+ *
+ * Paste means to append to the end of an existing file or to
+ * insert a directory entry.
+ */
+void do_balance(struct tree_balance *tb, struct item_head *ih,
+ const char *body, int flag)
+{
+ int child_pos; /* position of a child node in its parent */
+ int h; /* level of the tree being processed */
+
+ /*
+ * in our processing of one level we sometimes determine what
+ * must be inserted into the next higher level. This insertion
+ * consists of a key or two keys and their corresponding
+ * pointers
+ */
+ struct item_head insert_key[2];
+
+ /* inserted node-ptrs for the next level */
+ struct buffer_head *insert_ptr[2];
tb->tb_mode = flag;
tb->need_balance_dirty = 0;
@@ -1546,12 +1872,14 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */
return;
}
- atomic_inc(&(fs_generation(tb->tb_sb)));
+ atomic_inc(&fs_generation(tb->tb_sb));
do_balance_starts(tb);
- /* balance leaf returns 0 except if combining L R and S into
- one node. see balance_internal() for explanation of this
- line of code. */
+ /*
+ * balance_leaf returns 0 except if combining L R and S into
+ * one node. see balance_internal() for explanation of this
+ * line of code.
+ */
child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) +
balance_leaf(tb, ih, body, flag, insert_key, insert_ptr);
@@ -1561,9 +1889,8 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */
/* Balance internal level of the tree. */
for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++)
- child_pos =
- balance_internal(tb, h, child_pos, insert_key, insert_ptr);
+ child_pos = balance_internal(tb, h, child_pos, insert_key,
+ insert_ptr);
do_balance_completed(tb);
-
}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index ed58d843d578..db9e80ba53a0 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -15,20 +15,20 @@
#include <linux/quotaops.h>
/*
-** We pack the tails of files on file close, not at the time they are written.
-** This implies an unnecessary copy of the tail and an unnecessary indirect item
-** insertion/balancing, for files that are written in one write.
-** It avoids unnecessary tail packings (balances) for files that are written in
-** multiple writes and are small enough to have tails.
-**
-** file_release is called by the VFS layer when the file is closed. If
-** this is the last open file descriptor, and the file
-** small enough to have a tail, and the tail is currently in an
-** unformatted node, the tail is converted back into a direct item.
-**
-** We use reiserfs_truncate_file to pack the tail, since it already has
-** all the conditions coded.
-*/
+ * We pack the tails of files on file close, not at the time they are written.
+ * This implies an unnecessary copy of the tail and an unnecessary indirect item
+ * insertion/balancing, for files that are written in one write.
+ * It avoids unnecessary tail packings (balances) for files that are written in
+ * multiple writes and are small enough to have tails.
+ *
+ * file_release is called by the VFS layer when the file is closed. If
+ * this is the last open file descriptor, and the file is
+ * small enough to have a tail, and the tail is currently in an
+ * unformatted node, the tail is converted back into a direct item.
+ *
+ * We use reiserfs_truncate_file to pack the tail, since it already has
+ * all the conditions coded.
+ */
static int reiserfs_file_release(struct inode *inode, struct file *filp)
{
@@ -41,10 +41,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
return 0;
- mutex_lock(&(REISERFS_I(inode)->tailpack));
+ mutex_lock(&REISERFS_I(inode)->tailpack);
if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
- mutex_unlock(&(REISERFS_I(inode)->tailpack));
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
return 0;
}
@@ -52,31 +52,35 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
!tail_has_to_be_packed(inode)) &&
REISERFS_I(inode)->i_prealloc_count <= 0) {
- mutex_unlock(&(REISERFS_I(inode)->tailpack));
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
return 0;
}
reiserfs_write_lock(inode->i_sb);
- /* freeing preallocation only involves relogging blocks that
+ /*
+ * freeing preallocation only involves relogging blocks that
* are already in the current transaction. preallocation gets
* freed at the end of each transaction, so it is impossible for
* us to log any additional blocks (including quota blocks)
*/
err = journal_begin(&th, inode->i_sb, 1);
if (err) {
- /* uh oh, we can't allow the inode to go away while there
+ /*
+ * uh oh, we can't allow the inode to go away while there
* is still preallocation blocks pending. Try to join the
* aborted transaction
*/
jbegin_failure = err;
- err = journal_join_abort(&th, inode->i_sb, 1);
+ err = journal_join_abort(&th, inode->i_sb);
if (err) {
- /* hmpf, our choices here aren't good. We can pin the inode
- * which will disallow unmount from every happening, we can
- * do nothing, which will corrupt random memory on unmount,
- * or we can forcibly remove the file from the preallocation
- * list, which will leak blocks on disk. Lets pin the inode
+ /*
+ * hmpf, our choices here aren't good. We can pin
+ * the inode which will disallow unmount from ever
+ * happening, we can do nothing, which will corrupt
+ * random memory on unmount, or we can forcibly
+ * remove the file from the preallocation list, which
+ * will leak blocks on disk. Let's pin the inode
* and let the admin know what is going on.
*/
igrab(inode);
@@ -92,7 +96,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
#ifdef REISERFS_PREALLOCATE
reiserfs_discard_prealloc(&th, inode);
#endif
- err = journal_end(&th, inode->i_sb, 1);
+ err = journal_end(&th);
/* copy back the error code from journal_begin */
if (!err)
@@ -102,35 +106,38 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
tail_has_to_be_packed(inode)) {
- /* if regular file is released by last holder and it has been
- appended (we append by unformatted node only) or its direct
- item(s) had to be converted, then it may have to be
- indirect2direct converted */
+ /*
+ * if regular file is released by last holder and it has been
+ * appended (we append by unformatted node only) or its direct
+ * item(s) had to be converted, then it may have to be
+ * indirect2direct converted
+ */
err = reiserfs_truncate_file(inode, 0);
}
- out:
+out:
reiserfs_write_unlock(inode->i_sb);
- mutex_unlock(&(REISERFS_I(inode)->tailpack));
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
return err;
}
static int reiserfs_file_open(struct inode *inode, struct file *file)
{
int err = dquot_file_open(inode, file);
+
+ /* somebody might be tailpacking on final close; wait for it */
if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
- /* somebody might be tailpacking on final close; wait for it */
- mutex_lock(&(REISERFS_I(inode)->tailpack));
+ mutex_lock(&REISERFS_I(inode)->tailpack);
atomic_inc(&REISERFS_I(inode)->openers);
- mutex_unlock(&(REISERFS_I(inode)->tailpack));
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
}
return err;
}
void reiserfs_vfs_truncate_file(struct inode *inode)
{
- mutex_lock(&(REISERFS_I(inode)->tailpack));
+ mutex_lock(&REISERFS_I(inode)->tailpack);
reiserfs_truncate_file(inode, 1);
- mutex_unlock(&(REISERFS_I(inode)->tailpack));
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
}
/* Sync a reiserfs file. */
@@ -205,10 +212,11 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
set_buffer_uptodate(bh);
if (logit) {
reiserfs_prepare_for_journal(s, bh, 1);
- journal_mark_dirty(&th, s, bh);
+ journal_mark_dirty(&th, bh);
} else if (!buffer_dirty(bh)) {
mark_buffer_dirty(bh);
- /* do data=ordered on any page past the end
+ /*
+ * do data=ordered on any page past the end
* of file and any buffer marked BH_New.
*/
if (reiserfs_data_ordered(inode->i_sb) &&
@@ -219,8 +227,8 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
}
}
if (logit) {
- ret = journal_end(&th, s, bh_per_page + 1);
- drop_write_lock:
+ ret = journal_end(&th);
+drop_write_lock:
reiserfs_write_unlock(s);
}
/*
@@ -235,8 +243,8 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
}
const struct file_operations reiserfs_file_operations = {
- .read = do_sync_read,
- .write = do_sync_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
.unlocked_ioctl = reiserfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = reiserfs_compat_ioctl,
@@ -245,10 +253,10 @@ const struct file_operations reiserfs_file_operations = {
.open = reiserfs_file_open,
.release = reiserfs_file_release,
.fsync = reiserfs_sync_file,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index dc4d41530316..6b0ddb2a9091 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2,59 +2,32 @@
* Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
*/
-/**
- ** old_item_num
- ** old_entry_num
- ** set_entry_sizes
- ** create_virtual_node
- ** check_left
- ** check_right
- ** directory_part_size
- ** get_num_ver
- ** set_parameters
- ** is_leaf_removable
- ** are_leaves_removable
- ** get_empty_nodes
- ** get_lfree
- ** get_rfree
- ** is_left_neighbor_in_cache
- ** decrement_key
- ** get_far_parent
- ** get_parents
- ** can_node_be_removed
- ** ip_check_balance
- ** dc_check_balance_internal
- ** dc_check_balance_leaf
- ** dc_check_balance
- ** check_balance
- ** get_direct_parent
- ** get_neighbors
- ** fix_nodes
- **
- **
- **/
-
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/string.h>
#include "reiserfs.h"
#include <linux/buffer_head.h>
-/* To make any changes in the tree we find a node, that contains item
- to be changed/deleted or position in the node we insert a new item
- to. We call this node S. To do balancing we need to decide what we
- will shift to left/right neighbor, or to a new node, where new item
- will be etc. To make this analysis simpler we build virtual
- node. Virtual node is an array of items, that will replace items of
- node S. (For instance if we are going to delete an item, virtual
- node does not contain it). Virtual node keeps information about
- item sizes and types, mergeability of first and last items, sizes
- of all entries in directory item. We use this array of items when
- calculating what we can shift to neighbors and how many nodes we
- have to have if we do not any shiftings, if we shift to left/right
- neighbor or to both. */
-
-/* taking item number in virtual node, returns number of item, that it has in source buffer */
+/*
+ * To make any changes in the tree we find a node that contains item
+ * to be changed/deleted or position in the node we insert a new item
+ * to. We call this node S. To do balancing we need to decide what we
+ * will shift to left/right neighbor, or to a new node, where new item
+ * will be etc. To make this analysis simpler we build virtual
+ * node. Virtual node is an array of items, that will replace items of
+ * node S. (For instance if we are going to delete an item, virtual
+ * node does not contain it). Virtual node keeps information about
+ * item sizes and types, mergeability of first and last items, sizes
+ * of all entries in directory item. We use this array of items when
+ * calculating what we can shift to neighbors and how many nodes we
+ * have to have if we do not do any shifting, if we shift to left/right
+ * neighbor or to both.
+ */
+
+/*
+ * Takes item number in virtual node, returns number of item
+ * that it has in source buffer
+ */
static inline int old_item_num(int new_num, int affected_item_num, int mode)
{
if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
@@ -105,14 +78,17 @@ static void create_virtual_node(struct tree_balance *tb, int h)
vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
/* first item in the node */
- ih = B_N_PITEM_HEAD(Sh, 0);
+ ih = item_head(Sh, 0);
/* define the mergeability for 0-th item (if it is not being deleted) */
- if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size)
+ if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
&& (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
- /* go through all items those remain in the virtual node (except for the new (inserted) one) */
+ /*
+ * go through all items that remain in the virtual
+ * node (except for the new (inserted) one)
+ */
for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
int j;
struct virtual_item *vi = vn->vn_vi + new_num;
@@ -128,11 +104,13 @@ static void create_virtual_node(struct tree_balance *tb, int h)
vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
vi->vi_ih = ih + j;
- vi->vi_item = B_I_PITEM(Sh, ih + j);
+ vi->vi_item = ih_item_body(Sh, ih + j);
vi->vi_uarea = vn->vn_free_ptr;
- // FIXME: there is no check, that item operation did not
- // consume too much memory
+ /*
+ * FIXME: there is no check that item operation did not
+ * consume too much memory
+ */
vn->vn_free_ptr +=
op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
@@ -145,7 +123,8 @@ static void create_virtual_node(struct tree_balance *tb, int h)
if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
- vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted
+ /* pointer to data which is going to be pasted */
+ vi->vi_new_data = vn->vn_data;
}
}
@@ -164,11 +143,14 @@ static void create_virtual_node(struct tree_balance *tb, int h)
tb->insert_size[0]);
}
- /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
+ /*
+ * set right merge flag: we take the right delimiting key and
+ * check whether it is a mergeable item
+ */
if (tb->CFR[0]) {
struct reiserfs_key *key;
- key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]);
+ key = internal_key(tb->CFR[0], tb->rkey[0]);
if (op_is_left_mergeable(key, Sh->b_size)
&& (vn->vn_mode != M_DELETE
|| vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
@@ -179,12 +161,19 @@ static void create_virtual_node(struct tree_balance *tb, int h)
if (op_is_left_mergeable(key, Sh->b_size) &&
!(vn->vn_mode != M_DELETE
|| vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
- /* we delete last item and it could be merged with right neighbor's first item */
+ /*
+ * we delete last item and it could be merged
+ * with right neighbor's first item
+ */
if (!
(B_NR_ITEMS(Sh) == 1
- && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0))
- && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
- /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
+ && is_direntry_le_ih(item_head(Sh, 0))
+ && ih_entry_count(item_head(Sh, 0)) == 1)) {
+ /*
+ * node contains more than 1 item, or item
+ * is not directory item, or this item
+ * contains more than 1 entry
+ */
print_block(Sh, 0, -1, -1);
reiserfs_panic(tb->tb_sb, "vs-8045",
"rdkey %k, affected item==%d "
@@ -198,8 +187,10 @@ static void create_virtual_node(struct tree_balance *tb, int h)
}
}
-/* using virtual node check, how many items can be shifted to left
- neighbor */
+/*
+ * Using virtual node check, how many items can be
+ * shifted to left neighbor
+ */
static void check_left(struct tree_balance *tb, int h, int cur_free)
{
int i;
@@ -259,9 +250,13 @@ static void check_left(struct tree_balance *tb, int h, int cur_free)
}
/* the item cannot be shifted entirely, try to split it */
- /* check whether L[0] can hold ih and at least one byte of the item body */
+ /*
+ * check whether L[0] can hold ih and at least one byte
+ * of the item body
+ */
+
+ /* cannot shift even a part of the current item */
if (cur_free <= ih_size) {
- /* cannot shift even a part of the current item */
tb->lbytes = -1;
return;
}
@@ -278,8 +273,10 @@ static void check_left(struct tree_balance *tb, int h, int cur_free)
return;
}
-/* using virtual node check, how many items can be shifted to right
- neighbor */
+/*
+ * Using virtual node check, how many items can be
+ * shifted to right neighbor
+ */
static void check_right(struct tree_balance *tb, int h, int cur_free)
{
int i;
@@ -338,13 +335,21 @@ static void check_right(struct tree_balance *tb, int h, int cur_free)
continue;
}
- /* check whether R[0] can hold ih and at least one byte of the item body */
- if (cur_free <= ih_size) { /* cannot shift even a part of the current item */
+ /*
+ * check whether R[0] can hold ih and at least one
+ * byte of the item body
+ */
+
+ /* cannot shift even a part of the current item */
+ if (cur_free <= ih_size) {
tb->rbytes = -1;
return;
}
- /* R[0] can hold the header of the item and at least one byte of its body */
+ /*
+ * R[0] can hold the header of the item and at least
+ * one byte of its body
+ */
cur_free -= ih_size; /* cur_free is still > 0 */
tb->rbytes = op_check_right(vi, cur_free);
@@ -361,45 +366,64 @@ static void check_right(struct tree_balance *tb, int h, int cur_free)
/*
* from - number of items, which are shifted to left neighbor entirely
* to - number of item, which are shifted to right neighbor entirely
- * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
- * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
+ * from_bytes - number of bytes of boundary item (or directory entries)
+ * which are shifted to left neighbor
+ * to_bytes - number of bytes of boundary item (or directory entries)
+ * which are shifted to right neighbor
+ */
static int get_num_ver(int mode, struct tree_balance *tb, int h,
int from, int from_bytes,
int to, int to_bytes, short *snum012, int flow)
{
int i;
int cur_free;
- // int bytes;
int units;
struct virtual_node *vn = tb->tb_vn;
- // struct virtual_item * vi;
-
int total_node_size, max_node_size, current_item_size;
int needed_nodes;
- int start_item, /* position of item we start filling node from */
- end_item, /* position of item we finish filling node by */
- start_bytes, /* number of first bytes (entries for directory) of start_item-th item
- we do not include into node that is being filled */
- end_bytes; /* number of last bytes (entries for directory) of end_item-th item
- we do node include into node that is being filled */
- int split_item_positions[2]; /* these are positions in virtual item of
- items, that are split between S[0] and
- S1new and S1new and S2new */
+
+ /* position of item we start filling node from */
+ int start_item;
+
+ /* position of item we finish filling node by */
+ int end_item;
+
+ /*
+ * number of first bytes (entries for directory) of start_item-th item
+ * we do not include into node that is being filled
+ */
+ int start_bytes;
+
+ /*
+ * number of last bytes (entries for directory) of end_item-th item
+ * we do not include into node that is being filled
+ */
+ int end_bytes;
+
+ /*
+ * these are positions in virtual item of items, that are split
+ * between S[0] and S1new and S1new and S2new
+ */
+ int split_item_positions[2];
split_item_positions[0] = -1;
split_item_positions[1] = -1;
- /* We only create additional nodes if we are in insert or paste mode
- or we are in replace mode at the internal level. If h is 0 and
- the mode is M_REPLACE then in fix_nodes we change the mode to
- paste or insert before we get here in the code. */
+ /*
+ * We only create additional nodes if we are in insert or paste mode
+ * or we are in replace mode at the internal level. If h is 0 and
+ * the mode is M_REPLACE then in fix_nodes we change the mode to
+ * paste or insert before we get here in the code.
+ */
RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
"vs-8100: insert_size < 0 in overflow");
max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
- /* snum012 [0-2] - number of items, that lay
- to S[0], first new node and second new node */
+ /*
+ * snum012 [0-2] - number of items, that lay
+ * to S[0], first new node and second new node
+ */
snum012[3] = -1; /* s1bytes */
snum012[4] = -1; /* s2bytes */
@@ -416,20 +440,22 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
total_node_size = 0;
cur_free = max_node_size;
- // start from 'from'-th item
+ /* start from 'from'-th item */
start_item = from;
- // skip its first 'start_bytes' units
+ /* skip its first 'start_bytes' units */
start_bytes = ((from_bytes != -1) ? from_bytes : 0);
- // last included item is the 'end_item'-th one
+ /* last included item is the 'end_item'-th one */
end_item = vn->vn_nr_item - to - 1;
- // do not count last 'end_bytes' units of 'end_item'-th item
+ /* do not count last 'end_bytes' units of 'end_item'-th item */
end_bytes = (to_bytes != -1) ? to_bytes : 0;
- /* go through all item beginning from the start_item-th item and ending by
- the end_item-th item. Do not count first 'start_bytes' units of
- 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
-
+ /*
+ * go through all items beginning from the start_item-th item
+ * and ending by the end_item-th item. Do not count first
+ * 'start_bytes' units of 'start_item'-th item and last
+ * 'end_bytes' of 'end_item'-th item
+ */
for (i = start_item; i <= end_item; i++) {
struct virtual_item *vi = vn->vn_vi + i;
int skip_from_end = ((i == end_item) ? end_bytes : 0);
@@ -439,7 +465,10 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
/* get size of current item */
current_item_size = vi->vi_item_len;
- /* do not take in calculation head part (from_bytes) of from-th item */
+ /*
+ * do not take in calculation head part (from_bytes)
+ * of from-th item
+ */
current_item_size -=
op_part_size(vi, 0 /*from start */ , start_bytes);
@@ -455,9 +484,11 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
continue;
}
+ /*
+ * virtual item length is longer, than max size of item in
+ * a node. It is impossible for direct item
+ */
if (current_item_size > max_node_size) {
- /* virtual item length is longer, than max size of item in
- a node. It is impossible for direct item */
RFALSE(is_direct_le_ih(vi->vi_ih),
"vs-8110: "
"direct item length is %d. It can not be longer than %d",
@@ -466,15 +497,18 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
flow = 1;
}
+ /* as we do not split items, take new node and continue */
if (!flow) {
- /* as we do not split items, take new node and continue */
needed_nodes++;
i--;
total_node_size = 0;
continue;
}
- // calculate number of item units which fit into node being
- // filled
+
+ /*
+ * calculate number of item units which fit into node being
+ * filled
+ */
{
int free_space;
@@ -482,17 +516,17 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
units =
op_check_left(vi, free_space, start_bytes,
skip_from_end);
+ /*
+ * nothing fits into current node, take new
+ * node and continue
+ */
if (units == -1) {
- /* nothing fits into current node, take new node and continue */
needed_nodes++, i--, total_node_size = 0;
continue;
}
}
/* something fits into the current node */
- //if (snum012[3] != -1 || needed_nodes != 1)
- // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
- //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
start_bytes += units;
snum012[needed_nodes - 1 + 3] = units;
@@ -508,9 +542,11 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
total_node_size = 0;
}
- // sum012[4] (if it is not -1) contains number of units of which
- // are to be in S1new, snum012[3] - to be in S0. They are supposed
- // to be S1bytes and S2bytes correspondingly, so recalculate
+ /*
+ * snum012[4] (if it is not -1) contains the number of units which
+ * are to be in S1new, snum012[3] - to be in S0. They are supposed
+ * to be S1bytes and S2bytes correspondingly, so recalculate
+ */
if (snum012[4] > 0) {
int split_item_num;
int bytes_to_r, bytes_to_l;
@@ -527,7 +563,7 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
((split_item_positions[0] ==
split_item_positions[1]) ? snum012[3] : 0);
- // s2bytes
+ /* s2bytes */
snum012[4] =
op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
bytes_to_r - bytes_to_l - bytes_to_S1new;
@@ -555,7 +591,7 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
((split_item_positions[0] == split_item_positions[1]
&& snum012[4] != -1) ? snum012[4] : 0);
- // s1bytes
+ /* s1bytes */
snum012[3] =
op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
bytes_to_r - bytes_to_l - bytes_to_S2new;
@@ -565,7 +601,8 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
}
-/* Set parameters for balancing.
+/*
+ * Set parameters for balancing.
* Performs write of results of analysis of balancing into structure tb,
* where it will later be used by the functions that actually do the balancing.
* Parameters:
@@ -575,11 +612,12 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
* rnum number of items from S[h] that must be shifted to R[h];
* blk_num number of blocks that S[h] will be splitted into;
* s012 number of items that fall into splitted nodes.
- * lbytes number of bytes which flow to the left neighbor from the item that is not
- * not shifted entirely
- * rbytes number of bytes which flow to the right neighbor from the item that is not
- * not shifted entirely
- * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array)
+ * lbytes number of bytes which flow to the left neighbor from the
+ * item that is not shifted entirely
+ * rbytes number of bytes which flow to the right neighbor from the
+ * item that is not shifted entirely
+ * s1bytes number of bytes which flow to the first new node when
+ * S[0] splits (this number is contained in s012 array)
*/
static void set_parameters(struct tree_balance *tb, int h, int lnum,
@@ -590,12 +628,14 @@ static void set_parameters(struct tree_balance *tb, int h, int lnum,
tb->rnum[h] = rnum;
tb->blknum[h] = blk_num;
- if (h == 0) { /* only for leaf level */
+ /* only for leaf level */
+ if (h == 0) {
if (s012 != NULL) {
- tb->s0num = *s012++,
- tb->s1num = *s012++, tb->s2num = *s012++;
- tb->s1bytes = *s012++;
- tb->s2bytes = *s012;
+ tb->s0num = *s012++;
+ tb->snum[0] = *s012++;
+ tb->snum[1] = *s012++;
+ tb->sbytes[0] = *s012++;
+ tb->sbytes[1] = *s012;
}
tb->lbytes = lb;
tb->rbytes = rb;
@@ -607,8 +647,10 @@ static void set_parameters(struct tree_balance *tb, int h, int lnum,
PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
}
-/* check, does node disappear if we shift tb->lnum[0] items to left
- neighbor and tb->rnum[0] to the right one. */
+/*
+ * check if node disappears if we shift tb->lnum[0] items to left
+ * neighbor and tb->rnum[0] to the right one.
+ */
static int is_leaf_removable(struct tree_balance *tb)
{
struct virtual_node *vn = tb->tb_vn;
@@ -616,8 +658,10 @@ static int is_leaf_removable(struct tree_balance *tb)
int size;
int remain_items;
- /* number of items, that will be shifted to left (right) neighbor
- entirely */
+ /*
+ * number of items that will be shifted to left (right) neighbor
+ * entirely
+ */
to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
remain_items = vn->vn_nr_item;
@@ -625,21 +669,21 @@ static int is_leaf_removable(struct tree_balance *tb)
/* how many items remain in S[0] after shiftings to neighbors */
remain_items -= (to_left + to_right);
+ /* all content of node can be shifted to neighbors */
if (remain_items < 1) {
- /* all content of node can be shifted to neighbors */
set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
NULL, -1, -1);
return 1;
}
+ /* S[0] is not removable */
if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
- /* S[0] is not removable */
return 0;
- /* check, whether we can divide 1 remaining item between neighbors */
+ /* check whether we can divide 1 remaining item between neighbors */
/* get size of remaining item (in item units) */
- size = op_unit_num(&(vn->vn_vi[to_left]));
+ size = op_unit_num(&vn->vn_vi[to_left]);
if (tb->lbytes + tb->rbytes >= size) {
set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
@@ -675,23 +719,28 @@ static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
"vs-8125: item number must be 1: it is %d",
B_NR_ITEMS(S0));
- ih = B_N_PITEM_HEAD(S0, 0);
+ ih = item_head(S0, 0);
if (tb->CFR[0]
- && !comp_short_le_keys(&(ih->ih_key),
- B_N_PDELIM_KEY(tb->CFR[0],
+ && !comp_short_le_keys(&ih->ih_key,
+ internal_key(tb->CFR[0],
tb->rkey[0])))
+ /*
+ * Directory must be in correct state here: that is
+ * somewhere at the left side should exist first
+ * directory item. But the item being deleted can
+ * not be that first one because its right neighbor
+ * is item of the same directory. (But first item
+ * always gets deleted in last turn). So, neighbors
+ * of deleted item can be merged, so we can save
+ * ih_size
+ */
if (is_direntry_le_ih(ih)) {
- /* Directory must be in correct state here: that is
- somewhere at the left side should exist first directory
- item. But the item being deleted can not be that first
- one because its right neighbor is item of the same
- directory. (But first item always gets deleted in last
- turn). So, neighbors of deleted item can be merged, so
- we can save ih_size */
ih_size = IH_SIZE;
- /* we might check that left neighbor exists and is of the
- same directory */
+ /*
+ * we might check that left neighbor exists
+ * and is of the same directory
+ */
RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
"vs-8130: first directory item can not be removed until directory is not empty");
}
@@ -770,7 +819,8 @@ static void free_buffers_in_tb(struct tree_balance *tb)
}
}
-/* Get new buffers for storing new nodes that are created while balancing.
+/*
+ * Get new buffers for storing new nodes that are created while balancing.
* Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
* CARRY_ON - schedule didn't occur while the function worked;
* NO_DISK_SPACE - no disk space.
@@ -778,28 +828,33 @@ static void free_buffers_in_tb(struct tree_balance *tb)
/* The function is NOT SCHEDULE-SAFE! */
static int get_empty_nodes(struct tree_balance *tb, int h)
{
- struct buffer_head *new_bh,
- *Sh = PATH_H_PBUFFER(tb->tb_path, h);
+ struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
- int counter, number_of_freeblk, amount_needed, /* number of needed empty blocks */
- retval = CARRY_ON;
+ int counter, number_of_freeblk;
+ int amount_needed; /* number of needed empty blocks */
+ int retval = CARRY_ON;
struct super_block *sb = tb->tb_sb;
- /* number_of_freeblk is the number of empty blocks which have been
- acquired for use by the balancing algorithm minus the number of
- empty blocks used in the previous levels of the analysis,
- number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
- after empty blocks are acquired, and the balancing analysis is
- then restarted, amount_needed is the number needed by this level
- (h) of the balancing analysis.
-
- Note that for systems with many processes writing, it would be
- more layout optimal to calculate the total number needed by all
- levels and then to run reiserfs_new_blocks to get all of them at once. */
-
- /* Initiate number_of_freeblk to the amount acquired prior to the restart of
- the analysis or 0 if not restarted, then subtract the amount needed
- by all of the levels of the tree below h. */
+ /*
+ * number_of_freeblk is the number of empty blocks which have been
+ * acquired for use by the balancing algorithm minus the number of
+ * empty blocks used in the previous levels of the analysis,
+ * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
+ * occurs after empty blocks are acquired, and the balancing analysis
+ * is then restarted, amount_needed is the number needed by this
+ * level (h) of the balancing analysis.
+ *
+ * Note that for systems with many processes writing, it would be
+ * more layout optimal to calculate the total number needed by all
+ * levels and then to run reiserfs_new_blocks to get all of them at
+ * once.
+ */
+
+ /*
+ * Initiate number_of_freeblk to the amount acquired prior to the
+ * restart of the analysis or 0 if not restarted, then subtract the
+ * amount needed by all of the levels of the tree below h.
+ */
/* blknum includes S[h], so we subtract 1 in this calculation */
for (counter = 0, number_of_freeblk = tb->cur_blknum;
counter < h; counter++)
@@ -810,13 +865,19 @@ static int get_empty_nodes(struct tree_balance *tb, int h)
/* Allocate missing empty blocks. */
/* if Sh == 0 then we are getting a new root */
amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
- /* Amount_needed = the amount that we need more than the amount that we have. */
+ /*
+ * Amount_needed = the amount that we need more than the
+ * amount that we have.
+ */
if (amount_needed > number_of_freeblk)
amount_needed -= number_of_freeblk;
- else /* If we have enough already then there is nothing to do. */
+ else /* If we have enough already then there is nothing to do. */
return CARRY_ON;
- /* No need to check quota - is not allocated for blocks used for formatted nodes */
+ /*
+ * No need to check quota - is not allocated for blocks used
+ * for formatted nodes
+ */
if (reiserfs_new_form_blocknrs(tb, blocknrs,
amount_needed) == NO_DISK_SPACE)
return NO_DISK_SPACE;
@@ -849,8 +910,10 @@ static int get_empty_nodes(struct tree_balance *tb, int h)
return retval;
}
-/* Get free space of the left neighbor, which is stored in the parent
- * node of the left neighbor. */
+/*
+ * Get free space of the left neighbor, which is stored in the parent
+ * node of the left neighbor.
+ */
static int get_lfree(struct tree_balance *tb, int h)
{
struct buffer_head *l, *f;
@@ -870,7 +933,8 @@ static int get_lfree(struct tree_balance *tb, int h)
return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
}
-/* Get free space of the right neighbor,
+/*
+ * Get free space of the right neighbor,
* which is stored in the parent node of the right neighbor.
*/
static int get_rfree(struct tree_balance *tb, int h)
@@ -916,7 +980,10 @@ static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
"vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
father, tb->FL[h]);
- /* Get position of the pointer to the left neighbor into the left father. */
+ /*
+ * Get position of the pointer to the left neighbor
+ * into the left father.
+ */
left_neighbor_position = (father == tb->FL[h]) ?
tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
/* Get left neighbor block number. */
@@ -940,17 +1007,20 @@ static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
static void decrement_key(struct cpu_key *key)
{
- // call item specific function for this key
+ /* call item specific function for this key */
item_ops[cpu_key_k_type(key)]->decrement_key(key);
}
-/* Calculate far left/right parent of the left/right neighbor of the current node, that
- * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
+/*
+ * Calculate far left/right parent of the left/right neighbor of the
+ * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
+ * of the parent F[h].
* Calculate left/right common parent of the current node and L[h]/R[h].
* Calculate left/right delimiting key position.
- * Returns: PATH_INCORRECT - path in the tree is not correct;
- SCHEDULE_OCCURRED - schedule occurred while the function worked;
- * CARRY_ON - schedule didn't occur while the function worked;
+ * Returns: PATH_INCORRECT - path in the tree is not correct
+ * SCHEDULE_OCCURRED - schedule occurred while the function worked
+ * CARRY_ON - schedule didn't occur while the function
+ * worked
*/
static int get_far_parent(struct tree_balance *tb,
int h,
@@ -966,8 +1036,10 @@ static int get_far_parent(struct tree_balance *tb,
first_last_position = 0,
path_offset = PATH_H_PATH_OFFSET(path, h);
- /* Starting from F[h] go upwards in the tree, and look for the common
- ancestor of F[h], and its neighbor l/r, that should be obtained. */
+ /*
+ * Starting from F[h] go upwards in the tree, and look for the common
+ * ancestor of F[h], and its neighbor l/r, that should be obtained.
+ */
counter = path_offset;
@@ -975,21 +1047,33 @@ static int get_far_parent(struct tree_balance *tb,
"PAP-8180: invalid path length");
for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
- /* Check whether parent of the current buffer in the path is really parent in the tree. */
+ /*
+ * Check whether parent of the current buffer in the path
+ * is really parent in the tree.
+ */
if (!B_IS_IN_TREE
(parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
return REPEAT_SEARCH;
+
/* Check whether position in the parent is correct. */
if ((position =
PATH_OFFSET_POSITION(path,
counter - 1)) >
B_NR_ITEMS(parent))
return REPEAT_SEARCH;
- /* Check whether parent at the path really points to the child. */
+
+ /*
+ * Check whether parent at the path really points
+ * to the child.
+ */
if (B_N_CHILD_NUM(parent, position) !=
PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
return REPEAT_SEARCH;
- /* Return delimiting key if position in the parent is not equal to first/last one. */
+
+ /*
+ * Return delimiting key if position in the parent is not
+ * equal to first/last one.
+ */
if (c_lr_par == RIGHT_PARENTS)
first_last_position = B_NR_ITEMS(parent);
if (position != first_last_position) {
@@ -1002,7 +1086,10 @@ static int get_far_parent(struct tree_balance *tb,
/* if we are in the root of the tree, then there is no common father */
if (counter == FIRST_PATH_ELEMENT_OFFSET) {
- /* Check whether first buffer in the path is the root of the tree. */
+ /*
+ * Check whether first buffer in the path is the
+ * root of the tree.
+ */
if (PATH_OFFSET_PBUFFER
(tb->tb_path,
FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
@@ -1031,12 +1118,15 @@ static int get_far_parent(struct tree_balance *tb,
}
}
- /* So, we got common parent of the current node and its left/right neighbor.
- Now we are geting the parent of the left/right neighbor. */
+ /*
+ * So, we got common parent of the current node and its
+ * left/right neighbor. Now we are getting the parent of the
+ * left/right neighbor.
+ */
/* Form key to get parent of the left/right neighbor. */
le_key2cpu_key(&s_lr_father_key,
- B_N_PDELIM_KEY(*pcom_father,
+ internal_key(*pcom_father,
(c_lr_par ==
LEFT_PARENTS) ? (tb->lkey[h - 1] =
position -
@@ -1050,7 +1140,7 @@ static int get_far_parent(struct tree_balance *tb,
if (search_by_key
(tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
h + 1) == IO_ERROR)
- // path is released
+ /* path is released */
return IO_ERROR;
if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -1071,12 +1161,15 @@ static int get_far_parent(struct tree_balance *tb,
return CARRY_ON;
}
-/* Get parents of neighbors of node in the path(S[path_offset]) and common parents of
- * S[path_offset] and L[path_offset]/R[path_offset]: F[path_offset], FL[path_offset],
- * FR[path_offset], CFL[path_offset], CFR[path_offset].
- * Calculate numbers of left and right delimiting keys position: lkey[path_offset], rkey[path_offset].
- * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
- * CARRY_ON - schedule didn't occur while the function worked;
+/*
+ * Get parents of neighbors of node in the path(S[path_offset]) and
+ * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
+ * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
+ * CFR[path_offset].
+ * Calculate numbers of left and right delimiting keys position:
+ * lkey[path_offset], rkey[path_offset].
+ * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked
+ * CARRY_ON - schedule didn't occur while the function worked
*/
static int get_parents(struct tree_balance *tb, int h)
{
@@ -1088,8 +1181,11 @@ static int get_parents(struct tree_balance *tb, int h)
/* Current node is the root of the tree or will be root of the tree */
if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
- /* The root can not have parents.
- Release nodes which previously were obtained as parents of the current node neighbors. */
+ /*
+ * The root can not have parents.
+ * Release nodes which previously were obtained as
+ * parents of the current node neighbors.
+ */
brelse(tb->FL[h]);
brelse(tb->CFL[h]);
brelse(tb->FR[h]);
@@ -1111,10 +1207,14 @@ static int get_parents(struct tree_balance *tb, int h)
get_bh(curf);
tb->lkey[h] = position - 1;
} else {
- /* Calculate current parent of L[path_offset], which is the left neighbor of the current node.
- Calculate current common parent of L[path_offset] and the current node. Note that
- CFL[path_offset] not equal FL[path_offset] and CFL[path_offset] not equal F[path_offset].
- Calculate lkey[path_offset]. */
+ /*
+ * Calculate current parent of L[path_offset], which is the
+ * left neighbor of the current node. Calculate current
+ * common parent of L[path_offset] and the current node.
+ * Note that CFL[path_offset] not equal FL[path_offset] and
+ * CFL[path_offset] not equal F[path_offset].
+ * Calculate lkey[path_offset].
+ */
if ((ret = get_far_parent(tb, h + 1, &curf,
&curcf,
LEFT_PARENTS)) != CARRY_ON)
@@ -1130,19 +1230,22 @@ static int get_parents(struct tree_balance *tb, int h)
(curcf && !B_IS_IN_TREE(curcf)),
"PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
-/* Get parent FR[h] of R[h]. */
+ /* Get parent FR[h] of R[h]. */
-/* Current node is the last child of F[h]. FR[h] != F[h]. */
+ /* Current node is the last child of F[h]. FR[h] != F[h]. */
if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
-/* Calculate current parent of R[h], which is the right neighbor of F[h].
- Calculate current common parent of R[h] and current node. Note that CFR[h]
- not equal FR[path_offset] and CFR[h] not equal F[h]. */
+ /*
+ * Calculate current parent of R[h], which is the right
+ * neighbor of F[h]. Calculate current common parent of
+ * R[h] and current node. Note that CFR[h] not equal
+ * FR[path_offset] and CFR[h] not equal F[h].
+ */
if ((ret =
get_far_parent(tb, h + 1, &curf, &curcf,
RIGHT_PARENTS)) != CARRY_ON)
return ret;
} else {
-/* Current node is not the last child of its parent F[h]. */
+ /* Current node is not the last child of its parent F[h]. */
curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
get_bh(curf);
@@ -1165,8 +1268,10 @@ static int get_parents(struct tree_balance *tb, int h)
return CARRY_ON;
}
-/* it is possible to remove node as result of shiftings to
- neighbors even when we insert or paste item. */
+/*
+ * it is possible to remove node as result of shiftings to
+ * neighbors even when we insert or paste item.
+ */
static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
struct tree_balance *tb, int h)
{
@@ -1175,21 +1280,22 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
struct item_head *ih;
struct reiserfs_key *r_key = NULL;
- ih = B_N_PITEM_HEAD(Sh, 0);
+ ih = item_head(Sh, 0);
if (tb->CFR[h])
- r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]);
+ r_key = internal_key(tb->CFR[h], tb->rkey[h]);
if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
/* shifting may merge items which might save space */
-
((!h
- && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0)
+ && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
-
((!h && r_key
&& op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
+ ((h) ? KEY_SIZE : 0)) {
/* node can not be removed */
- if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
+ if (sfree >= levbytes) {
+ /* new item fits into node S[h] without any shifting */
if (!h)
tb->s0num =
B_NR_ITEMS(Sh) +
@@ -1202,7 +1308,8 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
return !NO_BALANCING_NEEDED;
}
-/* Check whether current node S[h] is balanced when increasing its size by
+/*
+ * Check whether current node S[h] is balanced when increasing its size by
* Inserting or Pasting.
* Calculate parameters for balancing for current level h.
* Parameters:
@@ -1219,39 +1326,48 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
static int ip_check_balance(struct tree_balance *tb, int h)
{
struct virtual_node *vn = tb->tb_vn;
- int levbytes, /* Number of bytes that must be inserted into (value
- is negative if bytes are deleted) buffer which
- contains node being balanced. The mnemonic is
- that the attempted change in node space used level
- is levbytes bytes. */
- ret;
+ /*
+ * Number of bytes that must be inserted into (value is negative
+ * if bytes are deleted) buffer which contains node being balanced.
+ * The mnemonic is that the attempted change in node space used
+ * level is levbytes bytes.
+ */
+ int levbytes;
+ int ret;
int lfree, sfree, rfree /* free space in L, S and R */ ;
- /* nver is short for number of vertixes, and lnver is the number if
- we shift to the left, rnver is the number if we shift to the
- right, and lrnver is the number if we shift in both directions.
- The goal is to minimize first the number of vertixes, and second,
- the number of vertixes whose contents are changed by shifting,
- and third the number of uncached vertixes whose contents are
- changed by shifting and must be read from disk. */
+ /*
+ * nver is short for number of vertices, and lnver is the number if
+ * we shift to the left, rnver is the number if we shift to the
+ * right, and lrnver is the number if we shift in both directions.
+ * The goal is to minimize first the number of vertices, and second,
+ * the number of vertices whose contents are changed by shifting,
+ * and third the number of uncached vertices whose contents are
+ * changed by shifting and must be read from disk.
+ */
int nver, lnver, rnver, lrnver;
- /* used at leaf level only, S0 = S[0] is the node being balanced,
- sInum [ I = 0,1,2 ] is the number of items that will
- remain in node SI after balancing. S1 and S2 are new
- nodes that might be created. */
+ /*
+ * used at leaf level only, S0 = S[0] is the node being balanced,
+ * sInum [ I = 0,1,2 ] is the number of items that will
+ * remain in node SI after balancing. S1 and S2 are new
+ * nodes that might be created.
+ */
- /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters.
- where 4th parameter is s1bytes and 5th - s2bytes
+ /*
+ * we perform 8 calls to get_num_ver(). For each call we
+ * calculate five parameters. where 4th parameter is s1bytes
+ * and 5th - s2bytes
+ *
+ * s0num, s1num, s2num for 8 cases
+ * 0,1 - do not shift and do not shift but bottle
+ * 2 - shift only whole item to left
+ * 3 - shift to left and bottle as much as possible
+ * 4,5 - shift to right (whole items and as much as possible)
+ * 6,7 - shift to both directions (whole items and as much as possible)
*/
- short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases
- 0,1 - do not shift and do not shift but bottle
- 2 - shift only whole item to left
- 3 - shift to left and bottle as much as possible
- 4,5 - shift to right (whole items and as much as possible
- 6,7 - shift to both directions (whole items and as much as possible)
- */
+ short snum012[40] = { 0, };
/* Sh is the node whose balance is currently being checked */
struct buffer_head *Sh;
@@ -1265,9 +1381,10 @@ static int ip_check_balance(struct tree_balance *tb, int h)
reiserfs_panic(tb->tb_sb, "vs-8210",
"S[0] can not be 0");
switch (ret = get_empty_nodes(tb, h)) {
+ /* no balancing for higher levels needed */
case CARRY_ON:
set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
+ return NO_BALANCING_NEEDED;
case NO_DISK_SPACE:
case REPEAT_SEARCH:
@@ -1278,7 +1395,9 @@ static int ip_check_balance(struct tree_balance *tb, int h)
}
}
- if ((ret = get_parents(tb, h)) != CARRY_ON) /* get parents of S[h] neighbors. */
+ /* get parents of S[h] neighbors. */
+ ret = get_parents(tb, h);
+ if (ret != CARRY_ON)
return ret;
sfree = B_FREE_SPACE(Sh);
@@ -1287,38 +1406,44 @@ static int ip_check_balance(struct tree_balance *tb, int h)
rfree = get_rfree(tb, h);
lfree = get_lfree(tb, h);
+ /* and new item fits into node S[h] without any shifting */
if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
NO_BALANCING_NEEDED)
- /* and new item fits into node S[h] without any shifting */
return NO_BALANCING_NEEDED;
create_virtual_node(tb, h);
/*
- determine maximal number of items we can shift to the left neighbor (in tb structure)
- and the maximal number of bytes that can flow to the left neighbor
- from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
+ * determine maximal number of items we can shift to the left
+ * neighbor (in tb structure) and the maximal number of bytes
+ * that can flow to the left neighbor from the left most liquid
+ * item that cannot be shifted from S[0] entirely (returned value)
*/
check_left(tb, h, lfree);
/*
- determine maximal number of items we can shift to the right neighbor (in tb structure)
- and the maximal number of bytes that can flow to the right neighbor
- from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
+ * determine maximal number of items we can shift to the right
+ * neighbor (in tb structure) and the maximal number of bytes
+ * that can flow to the right neighbor from the right most liquid
+ * item that cannot be shifted from S[0] entirely (returned value)
*/
check_right(tb, h, rfree);
- /* all contents of internal node S[h] can be moved into its
- neighbors, S[h] will be removed after balancing */
+ /*
+ * all contents of internal node S[h] can be moved into its
+ * neighbors, S[h] will be removed after balancing
+ */
if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
int to_r;
- /* Since we are working on internal nodes, and our internal
- nodes have fixed size entries, then we can balance by the
- number of items rather than the space they consume. In this
- routine we set the left node equal to the right node,
- allowing a difference of less than or equal to 1 child
- pointer. */
+ /*
+ * Since we are working on internal nodes, and our internal
+ * nodes have fixed size entries, then we can balance by the
+ * number of items rather than the space they consume. In this
+ * routine we set the left node equal to the right node,
+ * allowing a difference of less than or equal to 1 child
+ * pointer.
+ */
to_r =
((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
@@ -1328,7 +1453,10 @@ static int ip_check_balance(struct tree_balance *tb, int h)
return CARRY_ON;
}
- /* this checks balance condition, that any two neighboring nodes can not fit in one node */
+ /*
+ * this checks balance condition, that any two neighboring nodes
+ * can not fit in one node
+ */
RFALSE(h &&
(tb->lnum[h] >= vn->vn_nr_item + 1 ||
tb->rnum[h] >= vn->vn_nr_item + 1),
@@ -1337,16 +1465,22 @@ static int ip_check_balance(struct tree_balance *tb, int h)
(tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
"vs-8225: tree is not balanced on leaf level");
- /* all contents of S[0] can be moved into its neighbors
- S[0] will be removed after balancing. */
+ /*
+ * all contents of S[0] can be moved into its neighbors
+ * S[0] will be removed after balancing.
+ */
if (!h && is_leaf_removable(tb))
return CARRY_ON;
- /* why do we perform this check here rather than earlier??
- Answer: we can win 1 node in some cases above. Moreover we
- checked it above, when we checked, that S[0] is not removable
- in principle */
- if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
+ /*
+ * why do we perform this check here rather than earlier??
+ * Answer: we can win 1 node in some cases above. Moreover we
+ * checked it above, when we checked, that S[0] is not removable
+ * in principle
+ */
+
+ /* new item fits into node S[h] without any shifting */
+ if (sfree >= levbytes) {
if (!h)
tb->s0num = vn->vn_nr_item;
set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
@@ -1355,18 +1489,19 @@ static int ip_check_balance(struct tree_balance *tb, int h)
{
int lpar, rpar, nset, lset, rset, lrset;
- /*
- * regular overflowing of the node
- */
+ /* regular overflowing of the node */
- /* get_num_ver works in 2 modes (FLOW & NO_FLOW)
- lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
- nset, lset, rset, lrset - shows, whether flowing items give better packing
+ /*
+ * get_num_ver works in 2 modes (FLOW & NO_FLOW)
+ * lpar, rpar - number of items we can shift to left/right
+ * neighbor (including splitting item)
+ * nset, lset, rset, lrset - shows, whether flowing items
+ * give better packing
*/
#define FLOW 1
#define NO_FLOW 0 /* do not any splitting */
- /* we choose one the following */
+ /* we choose one of the following */
#define NOTHING_SHIFT_NO_FLOW 0
#define NOTHING_SHIFT_FLOW 5
#define LEFT_SHIFT_NO_FLOW 10
@@ -1379,10 +1514,13 @@ static int ip_check_balance(struct tree_balance *tb, int h)
lpar = tb->lnum[h];
rpar = tb->rnum[h];
- /* calculate number of blocks S[h] must be split into when
- nothing is shifted to the neighbors,
- as well as number of items in each part of the split node (s012 numbers),
- and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
+ /*
+ * calculate number of blocks S[h] must be split into when
+ * nothing is shifted to the neighbors, as well as number of
+ * items in each part of the split node (s012 numbers),
+ * and number of bytes (s1bytes) of the shared drop which
+ * flow to S1 if any
+ */
nset = NOTHING_SHIFT_NO_FLOW;
nver = get_num_ver(vn->vn_mode, tb, h,
0, -1, h ? vn->vn_nr_item : 0, -1,
@@ -1391,7 +1529,10 @@ static int ip_check_balance(struct tree_balance *tb, int h)
if (!h) {
int nver1;
- /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
+ /*
+ * note, that in this case we try to bottle
+ * between S[0] and S1 (S1 - the first new node)
+ */
nver1 = get_num_ver(vn->vn_mode, tb, h,
0, -1, 0, -1,
snum012 + NOTHING_SHIFT_FLOW, FLOW);
@@ -1399,11 +1540,13 @@ static int ip_check_balance(struct tree_balance *tb, int h)
nset = NOTHING_SHIFT_FLOW, nver = nver1;
}
- /* calculate number of blocks S[h] must be split into when
- l_shift_num first items and l_shift_bytes of the right most
- liquid item to be shifted are shifted to the left neighbor,
- as well as number of items in each part of the splitted node (s012 numbers),
- and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+ /*
+ * calculate number of blocks S[h] must be split into when
+ * l_shift_num first items and l_shift_bytes of the right
+ * most liquid item to be shifted are shifted to the left
+ * neighbor, as well as number of items in each part of the
+ * splitted node (s012 numbers), and number of bytes
+ * (s1bytes) of the shared drop which flow to S1 if any
*/
lset = LEFT_SHIFT_NO_FLOW;
lnver = get_num_ver(vn->vn_mode, tb, h,
@@ -1422,11 +1565,13 @@ static int ip_check_balance(struct tree_balance *tb, int h)
lset = LEFT_SHIFT_FLOW, lnver = lnver1;
}
- /* calculate number of blocks S[h] must be split into when
- r_shift_num first items and r_shift_bytes of the left most
- liquid item to be shifted are shifted to the right neighbor,
- as well as number of items in each part of the splitted node (s012 numbers),
- and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+ /*
+ * calculate number of blocks S[h] must be split into when
+ * r_shift_num first items and r_shift_bytes of the left most
+ * liquid item to be shifted are shifted to the right neighbor,
+ * as well as number of items in each part of the splitted
+ * node (s012 numbers), and number of bytes (s1bytes) of the
+ * shared drop which flow to S1 if any
*/
rset = RIGHT_SHIFT_NO_FLOW;
rnver = get_num_ver(vn->vn_mode, tb, h,
@@ -1451,10 +1596,12 @@ static int ip_check_balance(struct tree_balance *tb, int h)
rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
}
- /* calculate number of blocks S[h] must be split into when
- items are shifted in both directions,
- as well as number of items in each part of the splitted node (s012 numbers),
- and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+ /*
+ * calculate number of blocks S[h] must be split into when
+ * items are shifted in both directions, as well as number
+ * of items in each part of the splitted node (s012 numbers),
+ * and number of bytes (s1bytes) of the shared drop which
+ * flow to S1 if any
*/
lrset = LR_SHIFT_NO_FLOW;
lrnver = get_num_ver(vn->vn_mode, tb, h,
@@ -1481,10 +1628,12 @@ static int ip_check_balance(struct tree_balance *tb, int h)
lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
}
- /* Our general shifting strategy is:
- 1) to minimized number of new nodes;
- 2) to minimized number of neighbors involved in shifting;
- 3) to minimized number of disk reads; */
+ /*
+ * Our general shifting strategy is:
+ * 1) to minimize the number of new nodes;
+ * 2) to minimize the number of neighbors involved in shifting;
+ * 3) to minimize the number of disk reads;
+ */
/* we can win TWO or ONE nodes by shifting in both directions */
if (lrnver < lnver && lrnver < rnver) {
@@ -1508,42 +1657,59 @@ static int ip_check_balance(struct tree_balance *tb, int h)
return CARRY_ON;
}
- /* if shifting doesn't lead to better packing then don't shift */
+ /*
+ * if shifting doesn't lead to better packing
+ * then don't shift
+ */
if (nver == lrnver) {
set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
-1);
return CARRY_ON;
}
- /* now we know that for better packing shifting in only one
- direction either to the left or to the right is required */
+ /*
+ * now we know that for better packing shifting in only one
+ * direction either to the left or to the right is required
+ */
- /* if shifting to the left is better than shifting to the right */
+ /*
+ * if shifting to the left is better than
+ * shifting to the right
+ */
if (lnver < rnver) {
SET_PAR_SHIFT_LEFT;
return CARRY_ON;
}
- /* if shifting to the right is better than shifting to the left */
+ /*
+ * if shifting to the right is better than
+ * shifting to the left
+ */
if (lnver > rnver) {
SET_PAR_SHIFT_RIGHT;
return CARRY_ON;
}
- /* now shifting in either direction gives the same number
- of nodes and we can make use of the cached neighbors */
+ /*
+ * now shifting in either direction gives the same number
+ * of nodes and we can make use of the cached neighbors
+ */
if (is_left_neighbor_in_cache(tb, h)) {
SET_PAR_SHIFT_LEFT;
return CARRY_ON;
}
- /* shift to the right independently on whether the right neighbor in cache or not */
+ /*
+ * shift to the right regardless of whether the
+ * right neighbor is in cache or not
+ */
SET_PAR_SHIFT_RIGHT;
return CARRY_ON;
}
}
-/* Check whether current node S[h] is balanced when Decreasing its size by
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
* Deleting or Cutting for INTERNAL node of S+tree.
* Calculate parameters for balancing for current level h.
* Parameters:
@@ -1563,8 +1729,10 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
{
struct virtual_node *vn = tb->tb_vn;
- /* Sh is the node whose balance is currently being checked,
- and Fh is its father. */
+ /*
+ * Sh is the node whose balance is currently being checked,
+ * and Fh is its father.
+ */
struct buffer_head *Sh, *Fh;
int maxsize, ret;
int lfree, rfree /* free space in L and R */ ;
@@ -1574,19 +1742,25 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
maxsize = MAX_CHILD_SIZE(Sh);
-/* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
-/* new_nr_item = number of items node would have if operation is */
-/* performed without balancing (new_nr_item); */
+ /*
+ * using tb->insert_size[h], which is negative in this case,
+ * create_virtual_node calculates:
+ * new_nr_item = number of items node would have if operation is
+ * performed without balancing (new_nr_item);
+ */
create_virtual_node(tb, h);
if (!Fh) { /* S[h] is the root. */
+ /* no balancing for higher levels needed */
if (vn->vn_nr_item > 0) {
set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
+ return NO_BALANCING_NEEDED;
}
- /* new_nr_item == 0.
+ /*
+ * new_nr_item == 0.
* Current root will be deleted resulting in
- * decrementing the tree height. */
+ * decrementing the tree height.
+ */
set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
return CARRY_ON;
}
@@ -1602,12 +1776,18 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
check_left(tb, h, lfree);
check_right(tb, h, rfree);
- if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid.
- * In this case we balance only if it leads to better packing. */
- if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors,
- * which is impossible with greater values of new_nr_item. */
+ /*
+ * Balance condition for the internal node is valid.
+ * In this case we balance only if it leads to better packing.
+ */
+ if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
+ /*
+ * Here we join S[h] with one of its neighbors,
+ * which is impossible with greater values of new_nr_item.
+ */
+ if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
+ /* All contents of S[h] can be moved to L[h]. */
if (tb->lnum[h] >= vn->vn_nr_item + 1) {
- /* All contents of S[h] can be moved to L[h]. */
int n;
int order_L;
@@ -1623,8 +1803,8 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
return CARRY_ON;
}
+ /* All contents of S[h] can be moved to R[h]. */
if (tb->rnum[h] >= vn->vn_nr_item + 1) {
- /* All contents of S[h] can be moved to R[h]. */
int n;
int order_R;
@@ -1641,8 +1821,11 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
}
}
+ /*
+ * All contents of S[h] can be moved to the neighbors
+ * (L[h] & R[h]).
+ */
if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
- /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
int to_r;
to_r =
@@ -1659,7 +1842,10 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
return NO_BALANCING_NEEDED;
}
- /* Current node contain insufficient number of items. Balancing is required. */
+ /*
+ * Current node contains an insufficient number of items.
+ * Balancing is required.
+ */
/* Check whether we can merge S[h] with left neighbor. */
if (tb->lnum[h] >= vn->vn_nr_item + 1)
if (is_left_neighbor_in_cache(tb, h)
@@ -1726,7 +1912,8 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
return CARRY_ON;
}
-/* Check whether current node S[h] is balanced when Decreasing its size by
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
* Deleting or Truncating for LEAF node of S+tree.
* Calculate parameters for balancing for current level h.
* Parameters:
@@ -1743,15 +1930,21 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
{
struct virtual_node *vn = tb->tb_vn;
- /* Number of bytes that must be deleted from
- (value is negative if bytes are deleted) buffer which
- contains node being balanced. The mnemonic is that the
- attempted change in node space used level is levbytes bytes. */
+ /*
+ * Number of bytes that must be deleted from
+ * (value is negative if bytes are deleted) buffer which
+ * contains node being balanced. The mnemonic is that the
+ * attempted change in node space used level is levbytes bytes.
+ */
int levbytes;
+
/* the maximal item size */
int maxsize, ret;
- /* S0 is the node whose balance is currently being checked,
- and F0 is its father. */
+
+ /*
+ * S0 is the node whose balance is currently being checked,
+ * and F0 is its father.
+ */
struct buffer_head *S0, *F0;
int lfree, rfree /* free space in L and R */ ;
@@ -1784,9 +1977,11 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
if (are_leaves_removable(tb, lfree, rfree))
return CARRY_ON;
- /* determine maximal number of items we can shift to the left/right neighbor
- and the maximal number of bytes that can flow to the left/right neighbor
- from the left/right most liquid item that cannot be shifted from S[0] entirely
+ /*
+ * determine maximal number of items we can shift to the left/right
+ * neighbor and the maximal number of bytes that can flow to the
+ * left/right neighbor from the left/right most liquid item that
+ * cannot be shifted from S[0] entirely
*/
check_left(tb, h, lfree);
check_right(tb, h, rfree);
@@ -1810,7 +2005,10 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
return CARRY_ON;
}
- /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
+ /*
+ * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
+ * Set parameters and return
+ */
if (is_leaf_removable(tb))
return CARRY_ON;
@@ -1820,7 +2018,8 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
return NO_BALANCING_NEEDED;
}
-/* Check whether current node S[h] is balanced when Decreasing its size by
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
* Deleting or Cutting.
* Calculate parameters for balancing for current level h.
* Parameters:
@@ -1844,15 +2043,16 @@ static int dc_check_balance(struct tree_balance *tb, int h)
return dc_check_balance_leaf(tb, h);
}
-/* Check whether current node S[h] is balanced.
+/*
+ * Check whether current node S[h] is balanced.
* Calculate parameters for balancing for current level h.
* Parameters:
*
* tb tree_balance structure:
*
- * tb is a large structure that must be read about in the header file
- * at the same time as this procedure if the reader is to successfully
- * understand this procedure
+ * tb is a large structure that must be read about in the header
+ * file at the same time as this procedure if the reader is
+ * to successfully understand this procedure
*
* h current level of the node;
* inum item number in S[h];
@@ -1882,8 +2082,8 @@ static int check_balance(int mode,
RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
"vs-8255: ins_ih can not be 0 in insert mode");
+ /* Calculate balance parameters when size of node is increasing. */
if (tb->insert_size[h] > 0)
- /* Calculate balance parameters when size of node is increasing. */
return ip_check_balance(tb, h);
/* Calculate balance parameters when size of node is decreasing. */
@@ -1911,21 +2111,23 @@ static int get_direct_parent(struct tree_balance *tb, int h)
PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
return CARRY_ON;
}
- return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */
+ /* Root is changed and we must recalculate the path. */
+ return REPEAT_SEARCH;
}
+ /* Parent in the path is not in the tree. */
if (!B_IS_IN_TREE
(bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
- return REPEAT_SEARCH; /* Parent in the path is not in the tree. */
+ return REPEAT_SEARCH;
if ((position =
PATH_OFFSET_POSITION(path,
path_offset - 1)) > B_NR_ITEMS(bh))
return REPEAT_SEARCH;
+ /* Parent in the path is not parent of the current node in the tree. */
if (B_N_CHILD_NUM(bh, position) !=
PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
- /* Parent in the path is not parent of the current node in the tree. */
return REPEAT_SEARCH;
if (buffer_locked(bh)) {
@@ -1936,10 +2138,15 @@ static int get_direct_parent(struct tree_balance *tb, int h)
return REPEAT_SEARCH;
}
- return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */
+ /*
+ * Parent in the path is unlocked and really parent
+ * of the current node.
+ */
+ return CARRY_ON;
}
-/* Using lnum[h] and rnum[h] we should determine what neighbors
+/*
+ * Using lnum[h] and rnum[h] we should determine what neighbors
* of S[h] we
* need in order to balance S[h], and get them if necessary.
* Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
@@ -1997,7 +2204,7 @@ static int get_neighbors(struct tree_balance *tb, int h)
}
/* We need right neighbor to balance S[path_offset]. */
- if (tb->rnum[h]) { /* We need right neighbor to balance S[path_offset]. */
+ if (tb->rnum[h]) {
PROC_INFO_INC(sb, need_r_neighbor[h]);
bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
@@ -2053,9 +2260,11 @@ static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
(max_num_of_entries - 1) * sizeof(__u16));
}
-/* maybe we should fail balancing we are going to perform when kmalloc
- fails several times. But now it will loop until kmalloc gets
- required memory */
+/*
+ * maybe we should fail balancing we are going to perform when kmalloc
+ * fails several times. But now it will loop until kmalloc gets
+ * required memory
+ */
static int get_mem_for_virtual_node(struct tree_balance *tb)
{
int check_fs = 0;
@@ -2064,8 +2273,8 @@ static int get_mem_for_virtual_node(struct tree_balance *tb)
size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
+ /* we have to allocate more memory for virtual node */
if (size > tb->vn_buf_size) {
- /* we have to allocate more memory for virtual node */
if (tb->vn_buf) {
/* free memory allocated before */
kfree(tb->vn_buf);
@@ -2079,10 +2288,12 @@ static int get_mem_for_virtual_node(struct tree_balance *tb)
/* get memory for virtual item */
buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
if (!buf) {
- /* getting memory with GFP_KERNEL priority may involve
- balancing now (due to indirect_to_direct conversion on
- dcache shrinking). So, release path and collected
- resources here */
+ /*
+ * getting memory with GFP_KERNEL priority may involve
+ * balancing now (due to indirect_to_direct conversion
+ * on dcache shrinking). So, release path and collected
+ * resources here
+ */
free_buffers_in_tb(tb);
buf = kmalloc(size, GFP_NOFS);
if (!buf) {
@@ -2168,8 +2379,10 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
for (i = tb->tb_path->path_length;
!locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
- /* if I understand correctly, we can only be sure the last buffer
- ** in the path is in the tree --clm
+ /*
+ * if I understand correctly, we can only
+ * be sure the last buffer in the path is
+ * in the tree --clm
*/
#ifdef CONFIG_REISERFS_CHECK
if (PATH_PLAST_BUFFER(tb->tb_path) ==
@@ -2256,13 +2469,15 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
}
}
}
- /* as far as I can tell, this is not required. The FEB list seems
- ** to be full of newly allocated nodes, which will never be locked,
- ** dirty, or anything else.
- ** To be safe, I'm putting in the checks and waits in. For the moment,
- ** they are needed to keep the code in journal.c from complaining
- ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well.
- ** --clm
+
+ /*
+ * as far as I can tell, this is not required. The FEB list
+ * seems to be full of newly allocated nodes, which will
+ * never be locked, dirty, or anything else.
+ * To be safe, I'm putting in the checks and waits in.
+ * For the moment, they are needed to keep the code in
+ * journal.c from complaining about the buffer.
+ * That code is inside CONFIG_REISERFS_CHECK as well. --clm
*/
for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
if (tb->FEB[i]) {
@@ -2300,7 +2515,8 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
return CARRY_ON;
}
-/* Prepare for balancing, that is
+/*
+ * Prepare for balancing, that is
* get all necessary parents, and neighbors;
* analyze what and where should be moved;
* get sufficient number of new nodes;
@@ -2309,13 +2525,14 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
* When ported to SMP kernels, only at the last moment after all needed nodes
* are collected in cache, will the resources be locked using the usual
* textbook ordered lock acquisition algorithms. Note that ensuring that
- * this code neither write locks what it does not need to write lock nor locks out of order
- * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans
+ * this code neither write locks what it does not need to write lock nor locks
+ * out of order will be a pain in the butt that could have been avoided.
+ * Grumble grumble. -Hans
*
* fix is meant in the sense of render unchanging
*
- * Latency might be improved by first gathering a list of what buffers are needed
- * and then getting as many of them in parallel as possible? -Hans
+ * Latency might be improved by first gathering a list of what buffers
+ * are needed and then getting as many of them in parallel as possible? -Hans
*
* Parameters:
* op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
@@ -2335,8 +2552,9 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
int pos_in_item;
- /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
- ** during wait_tb_buffers_run
+ /*
+ * we set wait_tb_buffers_run when we have to restore any dirty
+ * bits cleared during wait_tb_buffers_run
*/
int wait_tb_buffers_run = 0;
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
@@ -2347,14 +2565,15 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
tb->fs_gen = get_generation(tb->tb_sb);
- /* we prepare and log the super here so it will already be in the
- ** transaction when do_balance needs to change it.
- ** This way do_balance won't have to schedule when trying to prepare
- ** the super for logging
+ /*
+ * we prepare and log the super here so it will already be in the
+ * transaction when do_balance needs to change it.
+ * This way do_balance won't have to schedule when trying to prepare
+ * the super for logging
*/
reiserfs_prepare_for_journal(tb->tb_sb,
SB_BUFFER_WITH_SB(tb->tb_sb), 1);
- journal_mark_dirty(tb->transaction_handle, tb->tb_sb,
+ journal_mark_dirty(tb->transaction_handle,
SB_BUFFER_WITH_SB(tb->tb_sb));
if (FILESYSTEM_CHANGED_TB(tb))
return REPEAT_SEARCH;
@@ -2408,7 +2627,7 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
#endif
if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
- // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
+ /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
return REPEAT_SEARCH;
/* Starting from the leaf level; for all levels h of the tree. */
@@ -2427,7 +2646,10 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
goto repeat;
if (h != MAX_HEIGHT - 1)
tb->insert_size[h + 1] = 0;
- /* ok, analysis and resource gathering are complete */
+ /*
+ * ok, analysis and resource gathering
+ * are complete
+ */
break;
}
goto repeat;
@@ -2437,15 +2659,19 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
if (ret != CARRY_ON)
goto repeat;
- /* No disk space, or schedule occurred and analysis may be
- * invalid and needs to be redone. */
+ /*
+ * No disk space, or schedule occurred and analysis may be
+ * invalid and needs to be redone.
+ */
ret = get_empty_nodes(tb, h);
if (ret != CARRY_ON)
goto repeat;
+ /*
+ * We have a positive insert size but no nodes exist on this
+ * level, this means that we are creating a new root.
+ */
if (!PATH_H_PBUFFER(tb->tb_path, h)) {
- /* We have a positive insert size but no nodes exist on this
- level, this means that we are creating a new root. */
RFALSE(tb->blknum[h] != 1,
"PAP-8350: creating new empty root");
@@ -2453,11 +2679,13 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
if (h < MAX_HEIGHT - 1)
tb->insert_size[h + 1] = 0;
} else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
+ /*
+ * The tree needs to be grown, so this node S[h]
+ * which is the root node is split into two nodes,
+ * and a new node (S[h+1]) will be created to
+ * become the root node.
+ */
if (tb->blknum[h] > 1) {
- /* The tree needs to be grown, so this node S[h]
- which is the root node is split into two nodes,
- and a new node (S[h+1]) will be created to
- become the root node. */
RFALSE(h == MAX_HEIGHT - 1,
"PAP-8355: attempt to create too high of a tree");
@@ -2487,12 +2715,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
goto repeat;
}
- repeat:
- // fix_nodes was unable to perform its calculation due to
- // filesystem got changed under us, lack of free disk space or i/o
- // failure. If the first is the case - the search will be
- // repeated. For now - free all resources acquired so far except
- // for the new allocated nodes
+repeat:
+ /*
+ * fix_nodes was unable to perform its calculation due to
+ * filesystem got changed under us, lack of free disk space or i/o
+ * failure. If the first is the case - the search will be
+ * repeated. For now - free all resources acquired so far except
+ * for the new allocated nodes
+ */
{
int i;
@@ -2548,8 +2778,6 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
}
-/* Anatoly will probably forgive me renaming tb to tb. I just
- wanted to make lines shorter */
void unfix_nodes(struct tree_balance *tb)
{
int i;
@@ -2578,8 +2806,10 @@ void unfix_nodes(struct tree_balance *tb)
for (i = 0; i < MAX_FEB_SIZE; i++) {
if (tb->FEB[i]) {
b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
- /* de-allocated block which was not used by balancing and
- bforget about buffer for it */
+ /*
+ * de-allocated block which was not used by
+ * balancing and bforget about buffer for it
+ */
brelse(tb->FEB[i]);
reiserfs_free_block(tb->transaction_handle, NULL,
blocknr, 0);
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
index 91b0cc1242a2..7a26c4fe6c46 100644
--- a/fs/reiserfs/hashes.c
+++ b/fs/reiserfs/hashes.c
@@ -12,12 +12,6 @@
* Yura's function is added (04/07/2000)
*/
-//
-// keyed_hash
-// yura_hash
-// r5_hash
-//
-
#include <linux/kernel.h>
#include "reiserfs.h"
#include <asm/types.h>
@@ -56,7 +50,7 @@ u32 keyed_hash(const signed char *msg, int len)
u32 pad;
int i;
- // assert(len >= 0 && len < 256);
+ /* assert(len >= 0 && len < 256); */
pad = (u32) len | ((u32) len << 8);
pad |= pad << 16;
@@ -127,9 +121,10 @@ u32 keyed_hash(const signed char *msg, int len)
return h0 ^ h1;
}
-/* What follows in this file is copyright 2000 by Hans Reiser, and the
- * licensing of what follows is governed by reiserfs/README */
-
+/*
+ * What follows in this file is copyright 2000 by Hans Reiser, and the
+ * licensing of what follows is governed by reiserfs/README
+ */
u32 yura_hash(const signed char *msg, int len)
{
int j, pow;
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index e1978fd895f5..73231b1ebdbe 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -12,7 +12,10 @@
int balance_internal(struct tree_balance *,
int, int, struct item_head *, struct buffer_head **);
-/* modes of internal_shift_left, internal_shift_right and internal_insert_childs */
+/*
+ * modes of internal_shift_left, internal_shift_right and
+ * internal_insert_childs
+ */
#define INTERNAL_SHIFT_FROM_S_TO_L 0
#define INTERNAL_SHIFT_FROM_R_TO_S 1
#define INTERNAL_SHIFT_FROM_L_TO_S 2
@@ -32,7 +35,9 @@ static void internal_define_dest_src_infos(int shift_mode,
memset(src_bi, 0, sizeof(struct buffer_info));
/* define dest, src, dest parent, dest position */
switch (shift_mode) {
- case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */
+
+ /* used in internal_shift_left */
+ case INTERNAL_SHIFT_FROM_S_TO_L:
src_bi->tb = tb;
src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
@@ -52,12 +57,14 @@ static void internal_define_dest_src_infos(int shift_mode,
dest_bi->tb = tb;
dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
- dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */
+ /* dest position is analog of dest->b_item_order */
+ dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
*d_key = tb->lkey[h];
*cf = tb->CFL[h];
break;
- case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */
+ /* used in internal_shift_left */
+ case INTERNAL_SHIFT_FROM_R_TO_S:
src_bi->tb = tb;
src_bi->bi_bh = tb->R[h];
src_bi->bi_parent = tb->FR[h];
@@ -111,7 +118,8 @@ static void internal_define_dest_src_infos(int shift_mode,
}
}
-/* Insert count node pointers into buffer cur before position to + 1.
+/*
+ * Insert count node pointers into buffer cur before position to + 1.
* Insert count items into buffer cur before position to.
* Items and node pointers are specified by inserted and bh respectively.
*/
@@ -146,14 +154,14 @@ static void internal_insert_childs(struct buffer_info *cur_bi,
/* copy to_be_insert disk children */
for (i = 0; i < count; i++) {
- put_dc_size(&(new_dc[i]),
+ put_dc_size(&new_dc[i],
MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
- put_dc_block_number(&(new_dc[i]), bh[i]->b_blocknr);
+ put_dc_block_number(&new_dc[i], bh[i]->b_blocknr);
}
memcpy(dc, new_dc, DC_SIZE * count);
/* prepare space for count items */
- ih = B_N_PDELIM_KEY(cur, ((to == -1) ? 0 : to));
+ ih = internal_key(cur, ((to == -1) ? 0 : to));
memmove(ih + count, ih,
(nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
@@ -190,8 +198,10 @@ static void internal_insert_childs(struct buffer_info *cur_bi,
}
-/* Delete del_num items and node pointers from buffer cur starting from *
- * the first_i'th item and first_p'th pointers respectively. */
+/*
+ * Delete del_num items and node pointers from buffer cur starting from
+ * the first_i'th item and first_p'th pointers respectively.
+ */
static void internal_delete_pointers_items(struct buffer_info *cur_bi,
int first_p,
int first_i, int del_num)
@@ -233,7 +243,7 @@ static void internal_delete_pointers_items(struct buffer_info *cur_bi,
dc = B_N_CHILD(cur, first_p);
memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
- key = B_N_PDELIM_KEY(cur, first_i);
+ key = internal_key(cur, first_i);
memmove(key, key + del_num,
(nr - first_i - del_num) * KEY_SIZE + (nr + 1 -
del_num) * DC_SIZE);
@@ -270,22 +280,30 @@ static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
i_from = (from == 0) ? from : from - 1;
- /* delete n pointers starting from `from' position in CUR;
- delete n keys starting from 'i_from' position in CUR;
+ /*
+ * delete n pointers starting from `from' position in CUR;
+ * delete n keys starting from 'i_from' position in CUR;
*/
internal_delete_pointers_items(cur_bi, from, i_from, n);
}
-/* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest
-* last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest
- * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest
+/*
+ * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer
+ * dest
+ * last_first == FIRST_TO_LAST means that we copy first items
+ * from src to tail of dest
+ * last_first == LAST_TO_FIRST means that we copy last items
+ * from src to head of dest
*/
static void internal_copy_pointers_items(struct buffer_info *dest_bi,
struct buffer_head *src,
int last_first, int cpy_num)
{
- /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST *
- * as delimiting key have already inserted to buffer dest.*/
+ /*
+ * ATTENTION! Number of node pointers in DEST is equal to number
+ * of items in DEST, as the delimiting key has already been
+ * inserted into buffer dest.
+ */
struct buffer_head *dest = dest_bi->bi_bh;
int nr_dest, nr_src;
int dest_order, src_order;
@@ -330,13 +348,13 @@ static void internal_copy_pointers_items(struct buffer_info *dest_bi,
memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num);
/* prepare space for cpy_num - 1 item headers */
- key = B_N_PDELIM_KEY(dest, dest_order);
+ key = internal_key(dest, dest_order);
memmove(key + cpy_num - 1, key,
KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest +
cpy_num));
/* insert headers */
- memcpy(key, B_N_PDELIM_KEY(src, src_order), KEY_SIZE * (cpy_num - 1));
+ memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1));
/* sizes, item number */
set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1));
@@ -366,7 +384,9 @@ static void internal_copy_pointers_items(struct buffer_info *dest_bi,
}
-/* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest.
+/*
+ * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to
+ * buffer dest.
* Delete cpy_num - del_par items and node pointers from buffer src.
* last_first == FIRST_TO_LAST means, that we copy/delete first items from src.
* last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
@@ -385,8 +405,10 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi,
if (last_first == FIRST_TO_LAST) { /* shift_left occurs */
first_pointer = 0;
first_item = 0;
- /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer,
- for key - with first_item */
+ /*
+ * delete cpy_num - del_par pointers and keys starting for
+ * pointers with first_pointer, for key - with first_item
+ */
internal_delete_pointers_items(src_bi, first_pointer,
first_item, cpy_num - del_par);
} else { /* shift_right occurs */
@@ -404,7 +426,9 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi,
}
/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
-static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_before, /* insert key before key with n_dest number */
+static void internal_insert_key(struct buffer_info *dest_bi,
+ /* insert key before key with n_dest number */
+ int dest_position_before,
struct buffer_head *src, int src_position)
{
struct buffer_head *dest = dest_bi->bi_bh;
@@ -429,12 +453,12 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b
nr = blkh_nr_item(blkh);
/* prepare space for inserting key */
- key = B_N_PDELIM_KEY(dest, dest_position_before);
+ key = internal_key(dest, dest_position_before);
memmove(key + 1, key,
(nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
/* insert key */
- memcpy(key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE);
+ memcpy(key, internal_key(src, src_position), KEY_SIZE);
/* Change dirt, free space, item number fields. */
@@ -453,13 +477,19 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b
}
}
-/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
- * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest.
+/*
+ * Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
+ * Copy pointer_amount node pointers and pointer_amount - 1 items from
+ * buffer src to buffer dest.
* Replace d_key'th key in buffer cfl.
* Delete pointer_amount items and node pointers from buffer src.
*/
/* this can be invoked both to shift from S to L and from R to S */
-static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */
+static void internal_shift_left(
+ /*
+ * INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S
+ */
+ int mode,
struct tree_balance *tb,
int h, int pointer_amount)
{
@@ -473,7 +503,10 @@ static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FRO
/*printk("pointer_amount = %d\n",pointer_amount); */
if (pointer_amount) {
- /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */
+ /*
+ * insert delimiting key from common father of dest and
+ * src to node dest into position B_NR_ITEMS(dest)
+ */
internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
d_key_position);
@@ -492,7 +525,8 @@ static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FRO
}
-/* Insert delimiting key to L[h].
+/*
+ * Insert delimiting key to L[h].
* Copy n node pointers and n - 1 items from buffer S[h] to L[h].
* Delete n - 1 items and node pointers from buffer S[h].
*/
@@ -507,23 +541,27 @@ static void internal_shift1_left(struct tree_balance *tb,
internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
&dest_bi, &src_bi, &d_key_position, &cf);
- if (pointer_amount > 0) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */
+ /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */
+ if (pointer_amount > 0)
internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
d_key_position);
- /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]); */
/* last parameter is del_parameter */
internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
pointer_amount, 1);
- /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */
}
-/* Insert d_key'th (delimiting) key from buffer cfr to head of dest.
+/*
+ * Insert d_key'th (delimiting) key from buffer cfr to head of dest.
* Copy n node pointers and n - 1 items from buffer src to buffer dest.
* Replace d_key'th key in buffer cfr.
* Delete n items and node pointers from buffer src.
*/
-static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */
+static void internal_shift_right(
+ /*
+ * INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S
+ */
+ int mode,
struct tree_balance *tb,
int h, int pointer_amount)
{
@@ -538,7 +576,10 @@ static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FR
nr = B_NR_ITEMS(src_bi.bi_bh);
if (pointer_amount > 0) {
- /* insert delimiting key from common father of dest and src to dest node into position 0 */
+ /*
+ * insert delimiting key from common father of dest
+ * and src to dest node into position 0
+ */
internal_insert_key(&dest_bi, 0, cf, d_key_position);
if (nr == pointer_amount - 1) {
RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ ||
@@ -559,7 +600,8 @@ static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FR
pointer_amount, 0);
}
-/* Insert delimiting key to R[h].
+/*
+ * Insert delimiting key to R[h].
* Copy n node pointers and n - 1 items from buffer S[h] to R[h].
* Delete n - 1 items and node pointers from buffer S[h].
*/
@@ -574,18 +616,19 @@ static void internal_shift1_right(struct tree_balance *tb,
internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
&dest_bi, &src_bi, &d_key_position, &cf);
- if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */
+ /* insert rkey from CFR[h] to right neighbor R[h] */
+ if (pointer_amount > 0)
internal_insert_key(&dest_bi, 0, cf, d_key_position);
- /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]); */
/* last parameter is del_parameter */
internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
pointer_amount, 1);
- /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1); */
}
-/* Delete insert_num node pointers together with their left items
- * and balance current node.*/
+/*
+ * Delete insert_num node pointers together with their left items
+ * and balance current node.
+ */
static void balance_internal_when_delete(struct tree_balance *tb,
int h, int child_pos)
{
@@ -626,9 +669,11 @@ static void balance_internal_when_delete(struct tree_balance *tb,
new_root = tb->R[h - 1];
else
new_root = tb->L[h - 1];
- /* switch super block's tree root block number to the new value */
+ /*
+ * switch super block's tree root block
+ * number to the new value */
PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr);
- //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --;
+ /*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */
PUT_SB_TREE_HEIGHT(tb->tb_sb,
SB_TREE_HEIGHT(tb->tb_sb) - 1);
@@ -636,8 +681,8 @@ static void balance_internal_when_delete(struct tree_balance *tb,
REISERFS_SB(tb->tb_sb)->s_sbh,
1);
/*&&&&&&&&&&&&&&&&&&&&&& */
+ /* use check_internal if new root is an internal node */
if (h > 1)
- /* use check_internal if new root is an internal node */
check_internal(new_root);
/*&&&&&&&&&&&&&&&&&&&&&& */
@@ -648,7 +693,8 @@ static void balance_internal_when_delete(struct tree_balance *tb,
return;
}
- if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { /* join S[h] with L[h] */
+ /* join S[h] with L[h] */
+ if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) {
RFALSE(tb->rnum[h] != 0,
"invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
@@ -660,7 +706,8 @@ static void balance_internal_when_delete(struct tree_balance *tb,
return;
}
- if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { /* join S[h] with R[h] */
+ /* join S[h] with R[h] */
+ if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) {
RFALSE(tb->lnum[h] != 0,
"invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
h, tb->lnum[h]);
@@ -671,17 +718,18 @@ static void balance_internal_when_delete(struct tree_balance *tb,
return;
}
- if (tb->lnum[h] < 0) { /* borrow from left neighbor L[h] */
+ /* borrow from left neighbor L[h] */
+ if (tb->lnum[h] < 0) {
RFALSE(tb->rnum[h] != 0,
"wrong tb->rnum[%d]==%d when borrow from L[h]", h,
tb->rnum[h]);
- /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]); */
internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h,
-tb->lnum[h]);
return;
}
- if (tb->rnum[h] < 0) { /* borrow from right neighbor R[h] */
+ /* borrow from right neighbor R[h] */
+ if (tb->rnum[h] < 0) {
RFALSE(tb->lnum[h] != 0,
"invalid tb->lnum[%d]==%d when borrow from R[h]",
h, tb->lnum[h]);
@@ -689,7 +737,8 @@ static void balance_internal_when_delete(struct tree_balance *tb,
return;
}
- if (tb->lnum[h] > 0) { /* split S[h] into two parts and put them into neighbors */
+ /* split S[h] into two parts and put them into neighbors */
+ if (tb->lnum[h] > 0) {
RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
"invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
h, tb->lnum[h], h, tb->rnum[h], n);
@@ -717,7 +766,7 @@ static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key)
if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
return;
- memcpy(B_N_PDELIM_KEY(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
+ memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
do_balance_mark_internal_dirty(tb, tb->CFL[h], 0);
}
@@ -732,34 +781,41 @@ static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key)
"R[h] can not be empty if it exists (item number=%d)",
B_NR_ITEMS(tb->R[h]));
- memcpy(B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
+ memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
do_balance_mark_internal_dirty(tb, tb->CFR[h], 0);
}
-int balance_internal(struct tree_balance *tb, /* tree_balance structure */
- int h, /* level of the tree */
- int child_pos, struct item_head *insert_key, /* key for insertion on higher level */
- struct buffer_head **insert_ptr /* node for insertion on higher level */
- )
- /* if inserting/pasting
- {
- child_pos is the position of the node-pointer in S[h] that *
- pointed to S[h-1] before balancing of the h-1 level; *
- this means that new pointers and items must be inserted AFTER *
- child_pos
- }
- else
- {
- it is the position of the leftmost pointer that must be deleted (together with
- its corresponding key to the left of the pointer)
- as a result of the previous level's balancing.
- }
- */
+
+/*
+ * if inserting/pasting {
+ * child_pos is the position of the node-pointer in S[h] that
+ * pointed to S[h-1] before balancing of the h-1 level;
+ * this means that new pointers and items must be inserted AFTER
+ * child_pos
+ * } else {
+ * it is the position of the leftmost pointer that must be deleted
+ * (together with its corresponding key to the left of the pointer)
+ * as a result of the previous level's balancing.
+ * }
+ */
+
+int balance_internal(struct tree_balance *tb,
+ int h, /* level of the tree */
+ int child_pos,
+ /* key for insertion on higher level */
+ struct item_head *insert_key,
+ /* node for insertion on higher level */
+ struct buffer_head **insert_ptr)
{
struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
struct buffer_info bi;
- int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */
+
+ /*
+ * we return this: it is 0 if there is no S[h],
+ * else it is tb->S[h]->b_item_order
+ */
+ int order;
int insert_num, n, k;
struct buffer_head *S_new;
struct item_head new_insert_key;
@@ -774,8 +830,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
(tbSh) ? PATH_H_POSITION(tb->tb_path,
h + 1) /*tb->S[h]->b_item_order */ : 0;
- /* Using insert_size[h] calculate the number insert_num of items
- that must be inserted to or deleted from S[h]. */
+ /*
+ * Using insert_size[h] calculate the number insert_num of items
+ * that must be inserted to or deleted from S[h].
+ */
insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE));
/* Check whether insert_num is proper * */
@@ -794,23 +852,21 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
k = 0;
if (tb->lnum[h] > 0) {
- /* shift lnum[h] items from S[h] to the left neighbor L[h].
- check how many of new items fall into L[h] or CFL[h] after
- shifting */
+ /*
+ * shift lnum[h] items from S[h] to the left neighbor L[h].
+ * check how many of new items fall into L[h] or CFL[h] after
+ * shifting
+ */
n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */
if (tb->lnum[h] <= child_pos) {
/* new items don't fall into L[h] or CFL[h] */
internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
tb->lnum[h]);
- /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]); */
child_pos -= tb->lnum[h];
} else if (tb->lnum[h] > child_pos + insert_num) {
/* all new items fall into L[h] */
internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
tb->lnum[h] - insert_num);
- /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,
- tb->lnum[h]-insert_num);
- */
/* insert insert_num keys and node-pointers into L[h] */
bi.tb = tb;
bi.bi_bh = tb->L[h];
@@ -826,7 +882,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
} else {
struct disk_child *dc;
- /* some items fall into L[h] or CFL[h], but some don't fall */
+ /*
+ * some items fall into L[h] or CFL[h],
+ * but some don't fall
+ */
internal_shift1_left(tb, h, child_pos + 1);
/* calculate number of new items that fall into L[h] */
k = tb->lnum[h] - child_pos - 1;
@@ -841,7 +900,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
replace_lkey(tb, h, insert_key + k);
- /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */
+ /*
+ * replace the first node-ptr in S[h] by
+ * node-ptr to insert_ptr[k]
+ */
dc = B_N_CHILD(tbSh, 0);
put_dc_size(dc,
MAX_CHILD_SIZE(insert_ptr[k]) -
@@ -860,17 +922,17 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
/* tb->lnum[h] > 0 */
if (tb->rnum[h] > 0) {
/*shift rnum[h] items from S[h] to the right neighbor R[h] */
- /* check how many of new items fall into R or CFR after shifting */
+ /*
+ * check how many of new items fall into R or CFR
+ * after shifting
+ */
n = B_NR_ITEMS(tbSh); /* number of items in S[h] */
if (n - tb->rnum[h] >= child_pos)
/* new items fall into S[h] */
- /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]); */
internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
tb->rnum[h]);
else if (n + insert_num - tb->rnum[h] < child_pos) {
/* all new items fall into R[h] */
- /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],
- tb->rnum[h] - insert_num); */
internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
tb->rnum[h] - insert_num);
@@ -904,7 +966,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
replace_rkey(tb, h, insert_key + insert_num - k - 1);
- /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1] */
+ /*
+ * replace the first node-ptr in R[h] by
+ * node-ptr insert_ptr[insert_num-k-1]
+ */
dc = B_N_CHILD(tb->R[h], 0);
put_dc_size(dc,
MAX_CHILD_SIZE(insert_ptr
@@ -921,7 +986,7 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
}
}
- /** Fill new node that appears instead of S[h] **/
+ /** Fill new node that appears instead of S[h] **/
RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
RFALSE(tb->blknum[h] < 0, "blknum can not be < 0");
@@ -997,26 +1062,30 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
/* new items don't fall into S_new */
/* store the delimiting key for the next level */
/* new_insert_key = (n - snum)'th key in S[h] */
- memcpy(&new_insert_key, B_N_PDELIM_KEY(tbSh, n - snum),
+ memcpy(&new_insert_key, internal_key(tbSh, n - snum),
KEY_SIZE);
/* last parameter is del_par */
internal_move_pointers_items(&dest_bi, &src_bi,
LAST_TO_FIRST, snum, 0);
- /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0); */
} else if (n + insert_num - snum < child_pos) {
/* all new items fall into S_new */
/* store the delimiting key for the next level */
- /* new_insert_key = (n + insert_item - snum)'th key in S[h] */
+ /*
+ * new_insert_key = (n + insert_num - snum)'th
+ * key in S[h]
+ */
memcpy(&new_insert_key,
- B_N_PDELIM_KEY(tbSh, n + insert_num - snum),
+ internal_key(tbSh, n + insert_num - snum),
KEY_SIZE);
/* last parameter is del_par */
internal_move_pointers_items(&dest_bi, &src_bi,
LAST_TO_FIRST,
snum - insert_num, 0);
- /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0); */
- /* insert insert_num keys and node-pointers into S_new */
+ /*
+ * insert insert_num keys and node-pointers
+ * into S_new
+ */
internal_insert_childs(&dest_bi,
/*S_new,tb->S[h-1]->b_next, */
child_pos - n - insert_num +
@@ -1033,7 +1102,6 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
internal_move_pointers_items(&dest_bi, &src_bi,
LAST_TO_FIRST,
n - child_pos + 1, 1);
- /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1); */
/* calculate number of new items that fall into S_new */
k = snum - n + child_pos - 1;
@@ -1043,7 +1111,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
/* new_insert_key = insert_key[insert_num - k - 1] */
memcpy(&new_insert_key, insert_key + insert_num - k - 1,
KEY_SIZE);
- /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */
+ /*
+ * replace first node-ptr in S_new by node-ptr
+ * to insert_ptr[insert_num-k-1]
+ */
dc = B_N_CHILD(S_new, 0);
put_dc_size(dc,
@@ -1066,7 +1137,7 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
|| buffer_dirty(S_new), "cm-00001: bad S_new (%b)",
S_new);
- // S_new is released in unfix_nodes
+ /* S_new is released in unfix_nodes */
}
n = B_NR_ITEMS(tbSh); /*number of items in S[h] */
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index bc8b8009897d..63b2b0ec49e6 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -25,7 +25,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
void reiserfs_evict_inode(struct inode *inode)
{
- /* We need blocks for transaction + (user+group) quota update (possibly delete) */
+ /*
+ * We need blocks for transaction + (user+group) quota
+ * update (possibly delete)
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 2 +
2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
@@ -39,8 +42,12 @@ void reiserfs_evict_inode(struct inode *inode)
if (inode->i_nlink)
goto no_delete;
- /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
- if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
+ /*
+ * k_objectid == 0 happens when we abort creating a new inode
+ * for some reason like lack of space..
+ * also handles bad_inode case
+ */
+ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {
reiserfs_delete_xattrs(inode);
@@ -54,34 +61,43 @@ void reiserfs_evict_inode(struct inode *inode)
err = reiserfs_delete_object(&th, inode);
- /* Do quota update inside a transaction for journaled quotas. We must do that
- * after delete_object so that quota updates go into the same transaction as
- * stat data deletion */
+ /*
+ * Do quota update inside a transaction for journaled quotas.
+ * We must do that after delete_object so that quota updates
+ * go into the same transaction as stat data deletion
+ */
if (!err) {
int depth = reiserfs_write_unlock_nested(inode->i_sb);
dquot_free_inode(inode);
reiserfs_write_lock_nested(inode->i_sb, depth);
}
- if (journal_end(&th, inode->i_sb, jbegin_count))
+ if (journal_end(&th))
goto out;
- /* check return value from reiserfs_delete_object after
+ /*
+ * check return value from reiserfs_delete_object after
* ending the transaction
*/
if (err)
goto out;
- /* all items of file are deleted, so we can remove "save" link */
- remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything
- * about an error here */
+ /*
+ * all items of file are deleted, so we can remove
+ * "save" link
+ * we can't do anything about an error here
+ */
+ remove_save_link(inode, 0 /* not truncate */);
out:
reiserfs_write_unlock(inode->i_sb);
} else {
/* no object items are in the tree */
;
}
- clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */
+
+ /* note this must go after the journal_end to prevent deadlock */
+ clear_inode(inode);
+
dquot_drop(inode);
inode->i_blocks = 0;
return;
@@ -103,8 +119,10 @@ static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
key->key_length = length;
}
-/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
- offset and type of key */
+/*
+ * take base of inode_key (it comes from inode always) (dirid, objectid)
+ * and version from an inode, set offset and type of key
+ */
void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
int type, int length)
{
@@ -114,9 +132,7 @@ void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
length);
}
-//
-// when key is 0, do not set version and short key
-//
+/* when key is 0, do not set version and short key */
inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
int version,
loff_t offset, int type, int length,
@@ -132,43 +148,47 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
set_le_ih_k_type(ih, type);
put_ih_item_len(ih, length);
/* set_ih_free_space (ih, 0); */
- // for directory items it is entry count, for directs and stat
- // datas - 0xffff, for indirects - 0
+ /*
+ * for directory items it is entry count, for directs and stat
+ * datas - 0xffff, for indirects - 0
+ */
put_ih_entry_count(ih, entry_count);
}
-//
-// FIXME: we might cache recently accessed indirect item
-
-// Ugh. Not too eager for that....
-// I cut the code until such time as I see a convincing argument (benchmark).
-// I don't want a bloated inode struct..., and I don't like code complexity....
-
-/* cutting the code is fine, since it really isn't in use yet and is easy
-** to add back in. But, Vladimir has a really good idea here. Think
-** about what happens for reading a file. For each page,
-** The VFS layer calls reiserfs_readpage, who searches the tree to find
-** an indirect item. This indirect item has X number of pointers, where
-** X is a big number if we've done the block allocation right. But,
-** we only use one or two of these pointers during each call to readpage,
-** needlessly researching again later on.
-**
-** The size of the cache could be dynamic based on the size of the file.
-**
-** I'd also like to see us cache the location the stat data item, since
-** we are needlessly researching for that frequently.
-**
-** --chris
-*/
+/*
+ * FIXME: we might cache recently accessed indirect item
+ * Ugh. Not too eager for that....
+ * I cut the code until such time as I see a convincing argument (benchmark).
+ * I don't want a bloated inode struct..., and I don't like code complexity....
+ */
-/* If this page has a file tail in it, and
-** it was read in by get_block_create_0, the page data is valid,
-** but tail is still sitting in a direct item, and we can't write to
-** it. So, look through this page, and check all the mapped buffers
-** to make sure they have valid block numbers. Any that don't need
-** to be unmapped, so that __block_write_begin will correctly call
-** reiserfs_get_block to convert the tail into an unformatted node
-*/
+/*
+ * cutting the code is fine, since it really isn't in use yet and is easy
+ * to add back in. But, Vladimir has a really good idea here. Think
+ * about what happens for reading a file. For each page,
+ * The VFS layer calls reiserfs_readpage, who searches the tree to find
+ * an indirect item. This indirect item has X number of pointers, where
+ * X is a big number if we've done the block allocation right. But,
+ * we only use one or two of these pointers during each call to readpage,
+ * needlessly researching again later on.
+ *
+ * The size of the cache could be dynamic based on the size of the file.
+ *
+ * I'd also like to see us cache the location of the stat data item, since
+ * we are needlessly researching for that frequently.
+ *
+ * --chris
+ */
+
+/*
+ * If this page has a file tail in it, and
+ * it was read in by get_block_create_0, the page data is valid,
+ * but tail is still sitting in a direct item, and we can't write to
+ * it. So, look through this page, and check all the mapped buffers
+ * to make sure they have valid block numbers. Any that don't must
+ * be unmapped, so that __block_write_begin will correctly call
+ * reiserfs_get_block to convert the tail into an unformatted node
+ */
static inline void fix_tail_page_for_writing(struct page *page)
{
struct buffer_head *head, *next, *bh;
@@ -186,8 +206,10 @@ static inline void fix_tail_page_for_writing(struct page *page)
}
}
-/* reiserfs_get_block does not need to allocate a block only if it has been
- done already or non-hole position has been found in the indirect item */
+/*
+ * reiserfs_get_block does not need to allocate a block only if it has been
+ * done already or non-hole position has been found in the indirect item
+ */
static inline int allocation_needed(int retval, b_blocknr_t allocated,
struct item_head *ih,
__le32 * item, int pos_in_item)
@@ -211,14 +233,16 @@ static inline void set_block_dev_mapped(struct buffer_head *bh,
map_bh(bh, inode->i_sb, block);
}
-//
-// files which were created in the earlier version can not be longer,
-// than 2 gb
-//
+/*
+ * files which were created in the earlier version can not be longer,
+ * than 2 gb
+ */
static int file_capable(struct inode *inode, sector_t block)
{
- if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is new file.
- block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb
+ /* it is new file. */
+ if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||
+ /* old file, but 'block' is inside of 2gb */
+ block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
return 1;
return 0;
@@ -228,7 +252,6 @@ static int restart_transaction(struct reiserfs_transaction_handle *th,
struct inode *inode, struct treepath *path)
{
struct super_block *s = th->t_super;
- int len = th->t_blocks_allocated;
int err;
BUG_ON(!th->t_trans_id);
@@ -241,7 +264,7 @@ static int restart_transaction(struct reiserfs_transaction_handle *th,
return 0;
}
reiserfs_update_sd(th, inode);
- err = journal_end(th, s, len);
+ err = journal_end(th);
if (!err) {
err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
if (!err)
@@ -250,14 +273,14 @@ static int restart_transaction(struct reiserfs_transaction_handle *th,
return err;
}
-// it is called by get_block when create == 0. Returns block number
-// for 'block'-th logical block of file. When it hits direct item it
-// returns 0 (being called from bmap) or read direct item into piece
-// of page (bh_result)
-
-// Please improve the english/clarity in the comment above, as it is
-// hard to understand.
-
+/*
+ * it is called by get_block when create == 0. Returns block number
+ * for 'block'-th logical block of file. When it hits direct item it
+ * returns 0 (being called from bmap) or read direct item into piece
+ * of page (bh_result)
+ * Please improve the english/clarity in the comment above, as it is
+ * hard to understand.
+ */
static int _get_block_create_0(struct inode *inode, sector_t block,
struct buffer_head *bh_result, int args)
{
@@ -273,7 +296,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
int done = 0;
unsigned long offset;
- // prepare the key to look for the 'block'-th block of file
+ /* prepare the key to look for the 'block'-th block of file */
make_cpu_key(&key, inode,
(loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
3);
@@ -285,23 +308,28 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
kunmap(bh_result->b_page);
if (result == IO_ERROR)
return -EIO;
- // We do not return -ENOENT if there is a hole but page is uptodate, because it means
- // That there is some MMAPED data associated with it that is yet to be written to disk.
+ /*
+ * We do not return -ENOENT if there is a hole but page is
+ * uptodate, because it means that there is some MMAPED data
+ * associated with it that is yet to be written to disk.
+ */
if ((args & GET_BLOCK_NO_HOLE)
&& !PageUptodate(bh_result->b_page)) {
return -ENOENT;
}
return 0;
}
- //
+
bh = get_last_bh(&path);
- ih = get_ih(&path);
+ ih = tp_item_head(&path);
if (is_indirect_le_ih(ih)) {
- __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);
+ __le32 *ind_item = (__le32 *) ih_item_body(bh, ih);
- /* FIXME: here we could cache indirect item or part of it in
- the inode to avoid search_by_key in case of subsequent
- access to file */
+ /*
+ * FIXME: here we could cache indirect item or part of it in
+ * the inode to avoid search_by_key in case of subsequent
+ * access to file
+ */
blocknr = get_block_num(ind_item, path.pos_in_item);
ret = 0;
if (blocknr) {
@@ -311,8 +339,12 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
set_buffer_boundary(bh_result);
}
} else
- // We do not return -ENOENT if there is a hole but page is uptodate, because it means
- // That there is some MMAPED data associated with it that is yet to be written to disk.
+ /*
+ * We do not return -ENOENT if there is a hole but
+ * page is uptodate, because it means that there is
+ * some MMAPED data associated with it that is
+ * yet to be written to disk.
+ */
if ((args & GET_BLOCK_NO_HOLE)
&& !PageUptodate(bh_result->b_page)) {
ret = -ENOENT;
@@ -323,41 +355,45 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
kunmap(bh_result->b_page);
return ret;
}
- // requested data are in direct item(s)
+ /* requested data are in direct item(s) */
if (!(args & GET_BLOCK_READ_DIRECT)) {
- // we are called by bmap. FIXME: we can not map block of file
- // when it is stored in direct item(s)
+ /*
+ * we are called by bmap. FIXME: we can not map block of file
+ * when it is stored in direct item(s)
+ */
pathrelse(&path);
if (p)
kunmap(bh_result->b_page);
return -ENOENT;
}
- /* if we've got a direct item, and the buffer or page was uptodate,
- ** we don't want to pull data off disk again. skip to the
- ** end, where we map the buffer and return
+ /*
+ * if we've got a direct item, and the buffer or page was uptodate,
+ * we don't want to pull data off disk again. skip to the
+ * end, where we map the buffer and return
*/
if (buffer_uptodate(bh_result)) {
goto finished;
} else
/*
- ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
- ** pages without any buffers. If the page is up to date, we don't want
- ** read old data off disk. Set the up to date bit on the buffer instead
- ** and jump to the end
+ * grab_tail_page can trigger calls to reiserfs_get_block on
+ * up to date pages without any buffers. If the page is up
+ * to date, we don't want to read old data off disk. Set the up
+ * to date bit on the buffer instead and jump to the end
*/
if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
set_buffer_uptodate(bh_result);
goto finished;
}
- // read file tail into part of page
+ /* read file tail into part of page */
offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
copy_item_head(&tmp_ih, ih);
- /* we only want to kmap if we are reading the tail into the page.
- ** this is not the common case, so we don't kmap until we are
- ** sure we need to. But, this means the item might move if
- ** kmap schedules
+ /*
+ * we only want to kmap if we are reading the tail into the page.
+ * this is not the common case, so we don't kmap until we are
+ * sure we need to. But, this means the item might move if
+ * kmap schedules
*/
if (!p)
p = (char *)kmap(bh_result->b_page);
@@ -368,10 +404,11 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
if (!is_direct_le_ih(ih)) {
BUG();
}
- /* make sure we don't read more bytes than actually exist in
- ** the file. This can happen in odd cases where i_size isn't
- ** correct, and when direct item padding results in a few
- ** extra bytes at the end of the direct item
+ /*
+ * make sure we don't read more bytes than actually exist in
+ * the file. This can happen in odd cases where i_size isn't
+ * correct, and when direct item padding results in a few
+ * extra bytes at the end of the direct item
*/
if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
break;
@@ -383,40 +420,43 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
} else {
chars = ih_item_len(ih) - path.pos_in_item;
}
- memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);
+ memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars);
if (done)
break;
p += chars;
+ /*
+ * we are done if the direct item we read is not the last
+ * item of the node. FIXME: we could try to check right
+ * delimiting key to see whether direct item continues in
+ * the right neighbor or rely on i_size
+ */
if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
- // we done, if read direct item is not the last item of
- // node FIXME: we could try to check right delimiting key
- // to see whether direct item continues in the right
- // neighbor or rely on i_size
break;
- // update key to look for the next piece
+ /* update key to look for the next piece */
set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
result = search_for_position_by_key(inode->i_sb, &key, &path);
if (result != POSITION_FOUND)
- // i/o error most likely
+ /* i/o error most likely */
break;
bh = get_last_bh(&path);
- ih = get_ih(&path);
+ ih = tp_item_head(&path);
} while (1);
flush_dcache_page(bh_result->b_page);
kunmap(bh_result->b_page);
- finished:
+finished:
pathrelse(&path);
if (result == IO_ERROR)
return -EIO;
- /* this buffer has valid data, but isn't valid for io. mapping it to
+ /*
+ * this buffer has valid data, but isn't valid for io. mapping it to
* block #0 tells the rest of reiserfs it just has a tail in it
*/
map_bh(bh_result, inode->i_sb, 0);
@@ -424,8 +464,10 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
return 0;
}
-// this is called to create file map. So, _get_block_create_0 will not
-// read direct item
+/*
+ * this is called to create file map. So, _get_block_create_0 will not
+ * read direct item
+ */
static int reiserfs_bmap(struct inode *inode, sector_t block,
struct buffer_head *bh_result, int create)
{
@@ -439,22 +481,23 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
return 0;
}
-/* special version of get_block that is only used by grab_tail_page right
-** now. It is sent to __block_write_begin, and when you try to get a
-** block past the end of the file (or a block from a hole) it returns
-** -ENOENT instead of a valid buffer. __block_write_begin expects to
-** be able to do i/o on the buffers returned, unless an error value
-** is also returned.
-**
-** So, this allows __block_write_begin to be used for reading a single block
-** in a page. Where it does not produce a valid page for holes, or past the
-** end of the file. This turns out to be exactly what we need for reading
-** tails for conversion.
-**
-** The point of the wrapper is forcing a certain value for create, even
-** though the VFS layer is calling this function with create==1. If you
-** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
-** don't use this function.
+/*
+ * special version of get_block that is only used by grab_tail_page right
+ * now. It is sent to __block_write_begin, and when you try to get a
+ * block past the end of the file (or a block from a hole) it returns
+ * -ENOENT instead of a valid buffer. __block_write_begin expects to
+ * be able to do i/o on the buffers returned, unless an error value
+ * is also returned.
+ *
+ * So, this allows __block_write_begin to be used for reading a single block
+ * in a page. Where it does not produce a valid page for holes, or past the
+ * end of the file. This turns out to be exactly what we need for reading
+ * tails for conversion.
+ *
+ * The point of the wrapper is forcing a certain value for create, even
+ * though the VFS layer is calling this function with create==1. If you
+ * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
+ * don't use this function.
*/
static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
struct buffer_head *bh_result,
@@ -463,8 +506,10 @@ static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
}
-/* This is special helper for reiserfs_get_block in case we are executing
- direct_IO request. */
+/*
+ * This is special helper for reiserfs_get_block in case we are executing
+ * direct_IO request.
+ */
static int reiserfs_get_blocks_direct_io(struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
@@ -474,9 +519,11 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
bh_result->b_page = NULL;
- /* We set the b_size before reiserfs_get_block call since it is
- referenced in convert_tail_for_hole() that may be called from
- reiserfs_get_block() */
+ /*
+ * We set the b_size before reiserfs_get_block call since it is
+ * referenced in convert_tail_for_hole() that may be called from
+ * reiserfs_get_block()
+ */
bh_result->b_size = (1 << inode->i_blkbits);
ret = reiserfs_get_block(inode, iblock, bh_result,
@@ -486,14 +533,18 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
/* don't allow direct io onto tail pages */
if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
- /* make sure future calls to the direct io funcs for this offset
- ** in the file fail by unmapping the buffer
+ /*
+ * make sure future calls to the direct io funcs for this
+ * offset in the file fail by unmapping the buffer
*/
clear_buffer_mapped(bh_result);
ret = -EINVAL;
}
- /* Possible unpacked tail. Flush the data before pages have
- disappeared */
+
+ /*
+ * Possible unpacked tail. Flush the data before pages have
+ * disappeared
+ */
if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
int err;
@@ -507,20 +558,20 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
if (err < 0)
ret = err;
}
- out:
+out:
return ret;
}
/*
-** helper function for when reiserfs_get_block is called for a hole
-** but the file tail is still in a direct item
-** bh_result is the buffer head for the hole
-** tail_offset is the offset of the start of the tail in the file
-**
-** This calls prepare_write, which will start a new transaction
-** you should not be in a transaction, or have any paths held when you
-** call this.
-*/
+ * helper function for when reiserfs_get_block is called for a hole
+ * but the file tail is still in a direct item
+ * bh_result is the buffer head for the hole
+ * tail_offset is the offset of the start of the tail in the file
+ *
+ * This calls prepare_write, which will start a new transaction
+ * you should not be in a transaction, or have any paths held when you
+ * call this.
+ */
static int convert_tail_for_hole(struct inode *inode,
struct buffer_head *bh_result,
loff_t tail_offset)
@@ -540,9 +591,10 @@ static int convert_tail_for_hole(struct inode *inode,
tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
index = tail_offset >> PAGE_CACHE_SHIFT;
- /* hole_page can be zero in case of direct_io, we are sure
- that we cannot get here if we write with O_DIRECT into
- tail page */
+ /*
+ * hole_page can be zero in case of direct_io, we are sure
+ * that we cannot get here if we write with O_DIRECT into tail page
+ */
if (!hole_page || index != hole_page->index) {
tail_page = grab_cache_page(inode->i_mapping, index);
retval = -ENOMEM;
@@ -553,14 +605,15 @@ static int convert_tail_for_hole(struct inode *inode,
tail_page = hole_page;
}
- /* we don't have to make sure the conversion did not happen while
- ** we were locking the page because anyone that could convert
- ** must first take i_mutex.
- **
- ** We must fix the tail page for writing because it might have buffers
- ** that are mapped, but have a block number of 0. This indicates tail
- ** data that has been read directly into the page, and
- ** __block_write_begin won't trigger a get_block in this case.
+ /*
+ * we don't have to make sure the conversion did not happen while
+ * we were locking the page because anyone that could convert
+ * must first take i_mutex.
+ *
+ * We must fix the tail page for writing because it might have buffers
+ * that are mapped, but have a block number of 0. This indicates tail
+ * data that has been read directly into the page, and
+ * __block_write_begin won't trigger a get_block in this case.
*/
fix_tail_page_for_writing(tail_page);
retval = __reiserfs_write_begin(tail_page, tail_start,
@@ -573,12 +626,12 @@ static int convert_tail_for_hole(struct inode *inode,
retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
- unlock:
+unlock:
if (tail_page != hole_page) {
unlock_page(tail_page);
page_cache_release(tail_page);
}
- out:
+out:
return retval;
}
@@ -604,7 +657,8 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
struct buffer_head *bh_result, int create)
{
int repeat, retval = 0;
- b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int
+ /* b_blocknr_t is (unsigned) 32 bit int */
+ b_blocknr_t allocated_block_nr = 0;
INITIALIZE_PATH(path);
int pos_in_item;
struct cpu_key key;
@@ -614,12 +668,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
int done;
int fs_gen;
struct reiserfs_transaction_handle *th = NULL;
- /* space reserved in transaction batch:
- . 3 balancings in direct->indirect conversion
- . 1 block involved into reiserfs_update_sd()
- XXX in practically impossible worst case direct2indirect()
- can incur (much) more than 3 balancings.
- quota update for user, group */
+ /*
+ * space reserved in transaction batch:
+ * . 3 balancings in direct->indirect conversion
+ * . 1 block involved into reiserfs_update_sd()
+ * XXX in practically impossible worst case direct2indirect()
+ * can incur (much) more than 3 balancings.
+ * quota update for user, group
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 + 1 +
2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
@@ -636,8 +692,9 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
return -EFBIG;
}
- /* if !create, we aren't changing the FS, so we don't need to
- ** log anything, so we don't need to start a transaction
+ /*
+ * if !create, we aren't changing the FS, so we don't need to
+ * log anything, so we don't need to start a transaction
*/
if (!(create & GET_BLOCK_CREATE)) {
int ret;
@@ -647,6 +704,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
reiserfs_write_unlock(inode->i_sb);
return ret;
}
+
/*
* if we're already in a transaction, make sure to close
* any new transactions we start in this func
@@ -655,8 +713,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
reiserfs_transaction_running(inode->i_sb))
dangle = 0;
- /* If file is of such a size, that it might have a tail and tails are enabled
- ** we should mark it as possibly needing tail packing on close
+ /*
+ * If file is of such a size, that it might have a tail and
+ * tails are enabled we should mark it as possibly needing
+ * tail packing on close
*/
if ((have_large_tails(inode->i_sb)
&& inode->i_size < i_block_size(inode) * 4)
@@ -667,7 +727,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
/* set the key of the first byte in the 'block'-th block of file */
make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
- start_trans:
+start_trans:
th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
if (!th) {
retval = -ENOMEM;
@@ -675,7 +735,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
}
reiserfs_update_inode_transaction(inode);
}
- research:
+research:
retval = search_for_position_by_key(inode->i_sb, &key, &path);
if (retval == IO_ERROR) {
@@ -684,8 +744,8 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
}
bh = get_last_bh(&path);
- ih = get_ih(&path);
- item = get_item(&path);
+ ih = tp_item_head(&path);
+ item = tp_item_body(&path);
pos_in_item = path.pos_in_item;
fs_gen = get_generation(inode->i_sb);
@@ -703,11 +763,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
_allocate_block(th, block, inode, &allocated_block_nr,
&path, create);
+ /*
+ * restart the transaction to give the journal a chance to free
+ * some blocks. releases the path, so we have to go back to
+ * research if we succeed on the second try
+ */
if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
- /* restart the transaction to give the journal a chance to free
- ** some blocks. releases the path, so we have to go back to
- ** research if we succeed on the second try
- */
SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
retval = restart_transaction(th, inode, &path);
if (retval)
@@ -734,9 +795,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
if (indirect_item_found(retval, ih)) {
b_blocknr_t unfm_ptr;
- /* 'block'-th block is in the file already (there is
- corresponding cell in some indirect item). But it may be
- zero unformatted node pointer (hole) */
+ /*
+ * 'block'-th block is in the file already (there is
+ * corresponding cell in some indirect item). But it may be
+ * zero unformatted node pointer (hole)
+ */
unfm_ptr = get_block_num(item, pos_in_item);
if (unfm_ptr == 0) {
/* use allocated block to plug the hole */
@@ -753,7 +816,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
reiserfs_add_ordered_list(inode, bh_result);
put_block_num(item, pos_in_item, allocated_block_nr);
unfm_ptr = allocated_block_nr;
- journal_mark_dirty(th, inode->i_sb, bh);
+ journal_mark_dirty(th, bh);
reiserfs_update_sd(th, inode);
}
set_block_dev_mapped(bh_result, unfm_ptr, inode);
@@ -764,9 +827,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
reiserfs_write_unlock(inode->i_sb);
- /* the item was found, so new blocks were not added to the file
- ** there is no need to make sure the inode is updated with this
- ** transaction
+ /*
+ * the item was found, so new blocks were not added to the file
+ * there is no need to make sure the inode is updated with this
+ * transaction
*/
return retval;
}
@@ -776,9 +840,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
goto start_trans;
}
- /* desired position is not found or is in the direct item. We have
- to append file with holes up to 'block'-th block converting
- direct items to indirect one if necessary */
+ /*
+ * desired position is not found or is in the direct item. We have
+ * to append file with holes up to 'block'-th block converting
+ * direct items to indirect one if necessary
+ */
done = 0;
do {
if (is_statdata_le_ih(ih)) {
@@ -790,16 +856,18 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
TYPE_INDIRECT, UNFM_P_SIZE,
0 /* free_space */ );
+ /*
+ * we are going to add 'block'-th block to the file.
+ * Use allocated block for that
+ */
if (cpu_key_k_offset(&key) == 1) {
- /* we are going to add 'block'-th block to the file. Use
- allocated block for that */
unp = cpu_to_le32(allocated_block_nr);
set_block_dev_mapped(bh_result,
allocated_block_nr, inode);
set_buffer_new(bh_result);
done = 1;
}
- tmp_key = key; // ;)
+ tmp_key = key; /* ;) */
set_cpu_key_k_offset(&tmp_key, 1);
PATH_LAST_POSITION(&path)++;
@@ -809,9 +877,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
if (retval) {
reiserfs_free_block(th, inode,
allocated_block_nr, 1);
- goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
+ /*
+ * retval == -ENOSPC, -EDQUOT or -EIO
+ * or -EEXIST
+ */
+ goto failure;
}
- //mark_tail_converted (inode);
} else if (is_direct_le_ih(ih)) {
/* direct item has to be converted */
loff_t tail_offset;
@@ -819,18 +890,24 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
tail_offset =
((le_ih_k_offset(ih) -
1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
+
+ /*
+ * direct item we just found fits into block we have
+ * to map. Convert it into unformatted node: use
+ * bh_result for the conversion
+ */
if (tail_offset == cpu_key_k_offset(&key)) {
- /* direct item we just found fits into block we have
- to map. Convert it into unformatted node: use
- bh_result for the conversion */
set_block_dev_mapped(bh_result,
allocated_block_nr, inode);
unbh = bh_result;
done = 1;
} else {
- /* we have to padd file tail stored in direct item(s)
- up to block size and convert it to unformatted
- node. FIXME: this should also get into page cache */
+ /*
+ * we have to pad file tail stored in direct
+ * item(s) up to block size and convert it
+ * to unformatted node. FIXME: this should
+ * also get into page cache
+ */
pathrelse(&path);
/*
@@ -859,7 +936,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
inode->i_ino,
retval);
if (allocated_block_nr) {
- /* the bitmap, the super, and the stat data == 3 */
+ /*
+ * the bitmap, the super,
+ * and the stat data == 3
+ */
if (!th)
th = reiserfs_persistent_transaction(inode->i_sb, 3);
if (th)
@@ -881,43 +961,57 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
allocated_block_nr, 1);
goto failure;
}
- /* it is important the set_buffer_uptodate is done after
- ** the direct2indirect. The buffer might contain valid
- ** data newer than the data on disk (read by readpage, changed,
- ** and then sent here by writepage). direct2indirect needs
- ** to know if unbh was already up to date, so it can decide
- ** if the data in unbh needs to be replaced with data from
- ** the disk
+ /*
+ * it is important the set_buffer_uptodate is done
+ * after the direct2indirect. The buffer might
+ * contain valid data newer than the data on disk
+ * (read by readpage, changed, and then sent here by
+ * writepage). direct2indirect needs to know if unbh
+ * was already up to date, so it can decide if the
+ * data in unbh needs to be replaced with data from
+ * the disk
*/
set_buffer_uptodate(unbh);
- /* unbh->b_page == NULL in case of DIRECT_IO request, this means
- buffer will disappear shortly, so it should not be added to
+ /*
+ * unbh->b_page == NULL in case of DIRECT_IO request,
+ * this means buffer will disappear shortly, so it
+ * should not be added to the tail list
*/
if (unbh->b_page) {
- /* we've converted the tail, so we must
- ** flush unbh before the transaction commits
+ /*
+ * we've converted the tail, so we must
+ * flush unbh before the transaction commits
*/
reiserfs_add_tail_list(inode, unbh);
- /* mark it dirty now to prevent commit_write from adding
- ** this buffer to the inode's dirty buffer list
+ /*
+ * mark it dirty now to prevent commit_write
+ * from adding this buffer to the inode's
+ * dirty buffer list
*/
/*
- * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
- * It's still atomic, but it sets the page dirty too,
- * which makes it eligible for writeback at any time by the
- * VM (which was also the case with __mark_buffer_dirty())
+ * AKPM: changed __mark_buffer_dirty to
+ * mark_buffer_dirty(). It's still atomic,
+ * but it sets the page dirty too, which makes
+ * it eligible for writeback at any time by the
+ * VM (which was also the case with
+ * __mark_buffer_dirty())
*/
mark_buffer_dirty(unbh);
}
} else {
- /* append indirect item with holes if needed, when appending
- pointer to 'block'-th block use block, which is already
- allocated */
+ /*
+ * append indirect item with holes if needed, when
+ * appending pointer to 'block'-th block use block,
+ * which is already allocated
+ */
struct cpu_key tmp_key;
- unp_t unf_single = 0; // We use this in case we need to allocate only
- // one block which is a fastpath
+ /*
+ * We use this in case we need to allocate
+ * only one block which is a fastpath
+ */
+ unp_t unf_single = 0;
unp_t *un;
__u64 max_to_insert =
MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
@@ -926,14 +1020,17 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
"vs-804: invalid position for append");
- /* indirect item has to be appended, set up key of that position */
+ /*
+ * indirect item has to be appended,
+ * set up key of that position
+ * (key type is unimportant)
+ */
make_cpu_key(&tmp_key, inode,
le_key_k_offset(version,
- &(ih->ih_key)) +
+ &ih->ih_key) +
op_bytes_number(ih,
inode->i_sb->s_blocksize),
- //pos_in_item * inode->i_sb->s_blocksize,
- TYPE_INDIRECT, 3); // key type is unimportant
+ TYPE_INDIRECT, 3);
RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
"green-805: invalid offset");
@@ -954,8 +1051,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
}
}
if (blocks_needed <= max_to_insert) {
- /* we are going to add target block to the file. Use allocated
- block for that */
+ /*
+ * we are going to add target block to
+ * the file. Use allocated block for that
+ */
un[blocks_needed - 1] =
cpu_to_le32(allocated_block_nr);
set_block_dev_mapped(bh_result,
@@ -964,8 +1063,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
done = 1;
} else {
/* paste hole to the indirect item */
- /* If kmalloc failed, max_to_insert becomes zero and it means we
- only have space for one block */
+ /*
+ * If kmalloc failed, max_to_insert becomes
+ * zero and it means we only have space for
+ * one block
+ */
blocks_needed =
max_to_insert ? max_to_insert : 1;
}
@@ -984,9 +1086,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
goto failure;
}
if (!done) {
- /* We need to mark new file size in case this function will be
- interrupted/aborted later on. And we may do this only for
- holes. */
+ /*
+ * We need to mark new file size in case
+ * this function will be interrupted/aborted
+ * later on. And we may do this only for
+ * holes.
+ */
inode->i_size +=
inode->i_sb->s_blocksize * blocks_needed;
}
@@ -995,13 +1100,13 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
if (done == 1)
break;
- /* this loop could log more blocks than we had originally asked
- ** for. So, we have to allow the transaction to end if it is
- ** too big or too full. Update the inode so things are
- ** consistent if we crash before the function returns
- **
- ** release the path so that anybody waiting on the path before
- ** ending their transaction will be able to continue.
+ /*
+ * this loop could log more blocks than we had originally
+ * asked for. So, we have to allow the transaction to end
+ * if it is too big or too full. Update the inode so things
+ * are consistent if we crash before the function returns
+ * release the path so that anybody waiting on the path before
+ * ending their transaction will be able to continue.
*/
if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
retval = restart_transaction(th, inode, &path);
@@ -1031,14 +1136,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
goto failure;
}
bh = get_last_bh(&path);
- ih = get_ih(&path);
- item = get_item(&path);
+ ih = tp_item_head(&path);
+ item = tp_item_body(&path);
pos_in_item = path.pos_in_item;
} while (1);
retval = 0;
- failure:
+failure:
if (th && (!dangle || (retval && !th->t_trans_id))) {
int err;
if (th->t_trans_id)
@@ -1060,8 +1165,10 @@ reiserfs_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
}
-/* Compute real number of used bytes by file
- * Following three functions can go away when we'll have enough space in stat item
+/*
+ * Compute real number of used bytes by file
+ * Following three functions can go away when we'll have enough space in
+ * stat item
*/
static int real_space_diff(struct inode *inode, int sd_size)
{
@@ -1071,13 +1178,14 @@ static int real_space_diff(struct inode *inode, int sd_size)
if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
return sd_size;
- /* End of file is also in full block with indirect reference, so round
- ** up to the next block.
- **
- ** there is just no way to know if the tail is actually packed
- ** on the file, so we have to assume it isn't. When we pack the
- ** tail, we add 4 bytes to pretend there really is an unformatted
- ** node pointer
+ /*
+ * End of file is also in full block with indirect reference, so round
+ * up to the next block.
+ *
+ * there is just no way to know if the tail is actually packed
+ * on the file, so we have to assume it isn't. When we pack the
+ * tail, we add 4 bytes to pretend there really is an unformatted
+ * node pointer
*/
bytes =
((inode->i_size +
@@ -1108,36 +1216,36 @@ static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
bytes += (loff_t) 511;
}
- /* files from before the quota patch might i_blocks such that
- ** bytes < real_space. Deal with that here to prevent it from
- ** going negative.
+ /*
+ * files from before the quota patch might have i_blocks such that
+ * bytes < real_space. Deal with that here to prevent it from
+ * going negative.
*/
if (bytes < real_space)
return 0;
return (bytes - real_space) >> 9;
}
-//
-// BAD: new directories have stat data of new type and all other items
-// of old type. Version stored in the inode says about body items, so
-// in update_stat_data we can not rely on inode, but have to check
-// item version directly
-//
+/*
+ * BAD: new directories have stat data of new type and all other items
+ * of old type. Version stored in the inode says about body items, so
+ * in update_stat_data we can not rely on inode, but have to check
+ * item version directly
+ */
-// called by read_locked_inode
+/* called by read_locked_inode */
static void init_inode(struct inode *inode, struct treepath *path)
{
struct buffer_head *bh;
struct item_head *ih;
__u32 rdev;
- //int version = ITEM_VERSION_1;
bh = PATH_PLAST_BUFFER(path);
- ih = PATH_PITEM_HEAD(path);
+ ih = tp_item_head(path);
- copy_key(INODE_PKEY(inode), &(ih->ih_key));
+ copy_key(INODE_PKEY(inode), &ih->ih_key);
- INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
+ INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
REISERFS_I(inode)->i_flags = 0;
REISERFS_I(inode)->i_prealloc_block = 0;
REISERFS_I(inode)->i_prealloc_count = 0;
@@ -1147,7 +1255,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
if (stat_data_v1(ih)) {
struct stat_data_v1 *sd =
- (struct stat_data_v1 *)B_I_PITEM(bh, ih);
+ (struct stat_data_v1 *)ih_item_body(bh, ih);
unsigned long blocks;
set_inode_item_key_version(inode, KEY_FORMAT_3_5);
@@ -1168,20 +1276,26 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
blocks = (inode->i_size + 511) >> 9;
blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
+
+ /*
+ * there was a bug in <=3.5.23 when i_blocks could take
+ * negative values. Starting from 3.5.17 this value could
+ * even be stored in stat data. For such files we set
+ * i_blocks based on file size. Just 2 notes: this can be
+ * wrong for sparse files. On-disk value will be only
+ * updated if file's inode will ever change
+ */
if (inode->i_blocks > blocks) {
- // there was a bug in <=3.5.23 when i_blocks could take negative
- // values. Starting from 3.5.17 this value could even be stored in
- // stat data. For such files we set i_blocks based on file
- // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
- // only updated if file's inode will ever change
inode->i_blocks = blocks;
}
rdev = sd_v1_rdev(sd);
REISERFS_I(inode)->i_first_direct_byte =
sd_v1_first_direct_byte(sd);
- /* an early bug in the quota code can give us an odd number for the
- ** block count. This is incorrect, fix it here.
+
+ /*
+ * an early bug in the quota code can give us an odd
+ * number for the block count. This is incorrect, fix it here.
*/
if (inode->i_blocks & 1) {
inode->i_blocks++;
@@ -1189,13 +1303,17 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode_set_bytes(inode,
to_real_used_space(inode, inode->i_blocks,
SD_V1_SIZE));
- /* nopack is initially zero for v1 objects. For v2 objects,
- nopack is initialised from sd_attrs */
+ /*
+ * nopack is initially zero for v1 objects. For v2 objects,
+ * nopack is initialised from sd_attrs
+ */
REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
} else {
- // new stat data found, but object may have old items
- // (directories and symlinks)
- struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
+ /*
+ * new stat data found, but object may have old items
+ * (directories and symlinks)
+ */
+ struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih);
inode->i_mode = sd_v2_mode(sd);
set_nlink(inode, sd_v2_nlink(sd));
@@ -1225,8 +1343,10 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode_set_bytes(inode,
to_real_used_space(inode, inode->i_blocks,
SD_V2_SIZE));
- /* read persistent inode attributes from sd and initialise
- generic inode flags from them */
+ /*
+ * read persistent inode attributes from sd and initialise
+ * generic inode flags from them
+ */
REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
}
@@ -1249,7 +1369,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
}
}
-// update new stat data with inode fields
+/* update new stat data with inode fields */
static void inode2sd(void *sd, struct inode *inode, loff_t size)
{
struct stat_data *sd_v2 = (struct stat_data *)sd;
@@ -1273,7 +1393,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
set_sd_v2_attrs(sd_v2, flags);
}
-// used to copy inode's fields to old stat data
+/* used to copy inode's fields to old stat data */
static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
{
struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
@@ -1292,14 +1412,15 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
else
set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
- // Sigh. i_first_direct_byte is back
+ /* Sigh. i_first_direct_byte is back */
set_sd_v1_first_direct_byte(sd_v1,
REISERFS_I(inode)->i_first_direct_byte);
}
-/* NOTE, you must prepare the buffer head before sending it here,
-** and then log it after the call
-*/
+/*
+ * NOTE, you must prepare the buffer head before sending it here,
+ * and then log it after the call
+ */
static void update_stat_data(struct treepath *path, struct inode *inode,
loff_t size)
{
@@ -1307,17 +1428,17 @@ static void update_stat_data(struct treepath *path, struct inode *inode,
struct item_head *ih;
bh = PATH_PLAST_BUFFER(path);
- ih = PATH_PITEM_HEAD(path);
+ ih = tp_item_head(path);
if (!is_statdata_le_ih(ih))
reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
INODE_PKEY(inode), ih);
+ /* path points to old stat data */
if (stat_data_v1(ih)) {
- // path points to old stat data
- inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
+ inode2sd_v1(ih_item_body(bh, ih), inode, size);
} else {
- inode2sd(B_I_PITEM(bh, ih), inode, size);
+ inode2sd(ih_item_body(bh, ih), inode, size);
}
return;
@@ -1335,7 +1456,8 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
BUG_ON(!th->t_trans_id);
- make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); //key type is unimportant
+ /* key type is unimportant */
+ make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);
for (;;) {
int pos;
@@ -1363,45 +1485,48 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
return;
}
- /* sigh, prepare_for_journal might schedule. When it schedules the
- ** FS might change. We have to detect that, and loop back to the
- ** search if the stat data item has moved
+ /*
+ * sigh, prepare_for_journal might schedule. When it
+ * schedules the FS might change. We have to detect that,
+ * and loop back to the search if the stat data item has moved
*/
bh = get_last_bh(&path);
- ih = get_ih(&path);
+ ih = tp_item_head(&path);
copy_item_head(&tmp_ih, ih);
fs_gen = get_generation(inode->i_sb);
reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
+
+ /* Stat_data item has been moved after scheduling. */
if (fs_changed(fs_gen, inode->i_sb)
&& item_moved(&tmp_ih, &path)) {
reiserfs_restore_prepared_buffer(inode->i_sb, bh);
- continue; /* Stat_data item has been moved after scheduling. */
+ continue;
}
break;
}
update_stat_data(&path, inode, size);
- journal_mark_dirty(th, th->t_super, bh);
+ journal_mark_dirty(th, bh);
pathrelse(&path);
return;
}
-/* reiserfs_read_locked_inode is called to read the inode off disk, and it
-** does a make_bad_inode when things go wrong. But, we need to make sure
-** and clear the key in the private portion of the inode, otherwise a
-** corresponding iput might try to delete whatever object the inode last
-** represented.
-*/
+/*
+ * reiserfs_read_locked_inode is called to read the inode off disk, and it
+ * does a make_bad_inode when things go wrong. But, we need to make sure
+ * and clear the key in the private portion of the inode, otherwise a
+ * corresponding iput might try to delete whatever object the inode last
+ * represented.
+ */
static void reiserfs_make_bad_inode(struct inode *inode)
{
memset(INODE_PKEY(inode), 0, KEY_SIZE);
make_bad_inode(inode);
}
-//
-// initially this function was derived from minix or ext2's analog and
-// evolved as the prototype did
-//
-
+/*
+ * initially this function was derived from minix or ext2's analog and
+ * evolved as the prototype did
+ */
int reiserfs_init_locked_inode(struct inode *inode, void *p)
{
struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
@@ -1410,8 +1535,10 @@ int reiserfs_init_locked_inode(struct inode *inode, void *p)
return 0;
}
-/* looks for stat data in the tree, and fills up the fields of in-core
- inode stat data fields */
+/*
+ * looks for stat data in the tree, and fills up the fields of in-core
+ * inode stat data fields
+ */
void reiserfs_read_locked_inode(struct inode *inode,
struct reiserfs_iget_args *args)
{
@@ -1422,8 +1549,10 @@ void reiserfs_read_locked_inode(struct inode *inode,
dirino = args->dirid;
- /* set version 1, version 2 could be used too, because stat data
- key is the same in both versions */
+ /*
+ * set version 1, version 2 could be used too, because stat data
+ * key is the same in both versions
+ */
key.version = KEY_FORMAT_3_5;
key.on_disk_key.k_dir_id = dirino;
key.on_disk_key.k_objectid = inode->i_ino;
@@ -1439,8 +1568,9 @@ void reiserfs_read_locked_inode(struct inode *inode,
reiserfs_make_bad_inode(inode);
return;
}
+
+ /* a stale NFS handle can trigger this without it being an error */
if (retval != ITEM_FOUND) {
- /* a stale NFS handle can trigger this without it being an error */
pathrelse(&path_to_sd);
reiserfs_make_bad_inode(inode);
clear_nlink(inode);
@@ -1449,20 +1579,25 @@ void reiserfs_read_locked_inode(struct inode *inode,
init_inode(inode, &path_to_sd);
- /* It is possible that knfsd is trying to access inode of a file
- that is being removed from the disk by some other thread. As we
- update sd on unlink all that is required is to check for nlink
- here. This bug was first found by Sizif when debugging
- SquidNG/Butterfly, forgotten, and found again after Philippe
- Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
-
- More logical fix would require changes in fs/inode.c:iput() to
- remove inode from hash-table _after_ fs cleaned disk stuff up and
- in iget() to return NULL if I_FREEING inode is found in
- hash-table. */
- /* Currently there is one place where it's ok to meet inode with
- nlink==0: processing of open-unlinked and half-truncated files
- during mount (fs/reiserfs/super.c:finish_unfinished()). */
+ /*
+ * It is possible that knfsd is trying to access inode of a file
+ * that is being removed from the disk by some other thread. As we
+ * update sd on unlink all that is required is to check for nlink
+ * here. This bug was first found by Sizif when debugging
+ * SquidNG/Butterfly, forgotten, and found again after Philippe
+ * Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
+ *
+ * More logical fix would require changes in fs/inode.c:iput() to
+ * remove inode from hash-table _after_ fs cleaned disk stuff up and
+ * in iget() to return NULL if I_FREEING inode is found in
+ * hash-table.
+ */
+
+ /*
+ * Currently there is one place where it's ok to meet inode with
+ * nlink==0: processing of open-unlinked and half-truncated files
+ * during mount (fs/reiserfs/super.c:finish_unfinished()).
+ */
if ((inode->i_nlink == 0) &&
!REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
reiserfs_warning(inode->i_sb, "vs-13075",
@@ -1472,7 +1607,8 @@ void reiserfs_read_locked_inode(struct inode *inode,
reiserfs_make_bad_inode(inode);
}
- reiserfs_check_path(&path_to_sd); /* init inode should be relsing */
+ /* init inode should be releasing */
+ reiserfs_check_path(&path_to_sd);
/*
* Stat data v1 doesn't support ACLs.
@@ -1481,7 +1617,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
cache_no_acl(inode);
}
-/**
+/*
* reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
*
* @inode: inode from hash table to check
@@ -1556,7 +1692,8 @@ static struct dentry *reiserfs_get_dentry(struct super_block *sb,
struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
- /* fhtype happens to reflect the number of u32s encoded.
+ /*
+ * fhtype happens to reflect the number of u32s encoded.
* due to a bug in earlier code, fhtype might indicate there
* are more u32s then actually fitted.
* so if fhtype seems to be more than len, reduce fhtype.
@@ -1625,13 +1762,16 @@ int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
return *lenp;
}
-/* looks for stat data, then copies fields to it, marks the buffer
- containing stat data as dirty */
-/* reiserfs inodes are never really dirty, since the dirty inode call
-** always logs them. This call allows the VFS inode marking routines
-** to properly mark inodes for datasync and such, but only actually
-** does something when called for a synchronous update.
-*/
+/*
+ * looks for stat data, then copies fields to it, marks the buffer
+ * containing stat data as dirty
+ */
+/*
+ * reiserfs inodes are never really dirty, since the dirty inode call
+ * always logs them. This call allows the VFS inode marking routines
+ * to properly mark inodes for datasync and such, but only actually
+ * does something when called for a synchronous update.
+ */
int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct reiserfs_transaction_handle th;
@@ -1639,24 +1779,28 @@ int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (inode->i_sb->s_flags & MS_RDONLY)
return -EROFS;
- /* memory pressure can sometimes initiate write_inode calls with sync == 1,
- ** these cases are just when the system needs ram, not when the
- ** inode needs to reach disk for safety, and they can safely be
- ** ignored because the altered inode has already been logged.
+ /*
+ * memory pressure can sometimes initiate write_inode calls with
+ * sync == 1,
+ * these cases are just when the system needs ram, not when the
+ * inode needs to reach disk for safety, and they can safely be
+ * ignored because the altered inode has already been logged.
*/
if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
reiserfs_write_lock(inode->i_sb);
if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
reiserfs_update_sd(&th, inode);
- journal_end_sync(&th, inode->i_sb, jbegin_count);
+ journal_end_sync(&th);
}
reiserfs_write_unlock(inode->i_sb);
}
return 0;
}
-/* stat data of new object is inserted already, this inserts the item
- containing "." and ".." entries */
+/*
+ * stat data of new object is inserted already, this inserts the item
+ * containing "." and ".." entries
+ */
static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
struct inode *inode,
struct item_head *ih, struct treepath *path,
@@ -1674,9 +1818,11 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
TYPE_DIRENTRY, 3 /*key length */ );
- /* compose item head for new item. Directories consist of items of
- old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
- is done by reiserfs_new_inode */
+ /*
+ * compose item head for new item. Directories consist of items of
+ * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
+ * is done by reiserfs_new_inode
+ */
if (old_format_only(sb)) {
make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
@@ -1714,9 +1860,12 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
return reiserfs_insert_item(th, path, &key, ih, inode, body);
}
-/* stat data of object has been inserted, this inserts the item
- containing the body of symlink */
-static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */
+/*
+ * stat data of object has been inserted, this inserts the item
+ * containing the body of symlink
+ */
+static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th,
+ struct inode *inode,
struct item_head *ih,
struct treepath *path, const char *symname,
int item_len)
@@ -1754,15 +1903,26 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i
return reiserfs_insert_item(th, path, &key, ih, inode, symname);
}
-/* inserts the stat data into the tree, and then calls
- reiserfs_new_directory (to insert ".", ".." item if new object is
- directory) or reiserfs_new_symlink (to insert symlink body if new
- object is symlink) or nothing (if new object is regular file)
-
- NOTE! uid and gid must already be set in the inode. If we return
- non-zero due to an error, we have to drop the quota previously allocated
- for the fresh inode. This can only be done outside a transaction, so
- if we return non-zero, we also end the transaction. */
+/*
+ * inserts the stat data into the tree, and then calls
+ * reiserfs_new_directory (to insert ".", ".." item if new object is
+ * directory) or reiserfs_new_symlink (to insert symlink body if new
+ * object is symlink) or nothing (if new object is regular file)
+ *
+ * NOTE! uid and gid must already be set in the inode. If we return
+ * non-zero due to an error, we have to drop the quota previously allocated
+ * for the fresh inode. This can only be done outside a transaction, so
+ * if we return non-zero, we also end the transaction.
+ *
+ * @th: active transaction handle
+ * @dir: parent directory for new inode
+ * @mode: mode of new inode
+ * @symname: symlink contents if inode is symlink
+ * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for
+ * symlinks
+ * @inode: inode to be filled
+ * @security: optional security context to associate with this inode
+ */
int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
struct inode *dir, umode_t mode, const char *symname,
/* 0 for regular, EMTRY_DIR_SIZE for dirs,
@@ -1807,7 +1967,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
else
make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
- memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
+ memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE);
args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
depth = reiserfs_write_unlock_nested(inode->i_sb);
@@ -1820,10 +1980,11 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
}
if (old_format_only(sb))
- /* not a perfect generation count, as object ids can be reused, but
- ** this is as good as reiserfs can do right now.
- ** note that the private part of inode isn't filled in yet, we have
- ** to use the directory.
+ /*
+ * not a perfect generation count, as object ids can be reused,
+ * but this is as good as reiserfs can do right now.
+ * note that the private part of inode isn't filled in yet,
+ * we have to use the directory.
*/
inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
else
@@ -1850,7 +2011,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
- INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
+ INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
REISERFS_I(inode)->i_flags = 0;
REISERFS_I(inode)->i_prealloc_block = 0;
REISERFS_I(inode)->i_prealloc_count = 0;
@@ -1878,9 +2039,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
goto out_bad_inode;
}
if (old_format_only(sb)) {
+ /* i_uid or i_gid is too big to be stored in stat data v3.5 */
if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
pathrelse(&path_to_key);
- /* i_uid or i_gid is too big to be stored in stat data v3.5 */
err = -EINVAL;
goto out_bad_inode;
}
@@ -1888,9 +2049,11 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
} else {
inode2sd(&sd, inode, inode->i_size);
}
- // store in in-core inode the key of stat data and version all
- // object items will have (directory items will have old offset
- // format, other new objects will consist of new items)
+ /*
+ * store in in-core inode the key of stat data and version all
+ * object items will have (directory items will have old offset
+ * format, other new objects will consist of new items)
+ */
if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
set_inode_item_key_version(inode, KEY_FORMAT_3_5);
else
@@ -1934,7 +2097,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
if (retval) {
err = retval;
reiserfs_check_path(&path_to_key);
- journal_end(th, th->t_super, th->t_blocks_allocated);
+ journal_end(th);
goto out_inserted_sd;
}
@@ -1945,7 +2108,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
if (retval) {
err = retval;
reiserfs_check_path(&path_to_key);
- journal_end(th, th->t_super, th->t_blocks_allocated);
+ journal_end(th);
goto out_inserted_sd;
}
} else if (inode->i_sb->s_flags & MS_POSIXACL) {
@@ -1962,8 +2125,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
if (retval) {
err = retval;
reiserfs_check_path(&path_to_key);
- retval = journal_end(th, th->t_super,
- th->t_blocks_allocated);
+ retval = journal_end(th);
if (retval)
err = retval;
goto out_inserted_sd;
@@ -1975,11 +2137,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
return 0;
-/* it looks like you can easily compress these two goto targets into
- * one. Keeping it like this doesn't actually hurt anything, and they
- * are place holders for what the quota code actually needs.
- */
- out_bad_inode:
+out_bad_inode:
/* Invalidate the object, nothing was inserted yet */
INODE_PKEY(inode)->k_objectid = 0;
@@ -1988,16 +2146,19 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
dquot_free_inode(inode);
reiserfs_write_lock_nested(inode->i_sb, depth);
- out_end_trans:
- journal_end(th, th->t_super, th->t_blocks_allocated);
- /* Drop can be outside and it needs more credits so it's better to have it outside */
+out_end_trans:
+ journal_end(th);
+ /*
+ * Drop can be outside and it needs more credits so it's better
+ * to have it outside
+ */
depth = reiserfs_write_unlock_nested(inode->i_sb);
dquot_drop(inode);
reiserfs_write_lock_nested(inode->i_sb, depth);
inode->i_flags |= S_NOQUOTA;
make_bad_inode(inode);
- out_inserted_sd:
+out_inserted_sd:
clear_nlink(inode);
th->t_trans_id = 0; /* so the caller can't use this handle later */
unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
@@ -2006,25 +2167,26 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
}
/*
-** finds the tail page in the page cache,
-** reads the last block in.
-**
-** On success, page_result is set to a locked, pinned page, and bh_result
-** is set to an up to date buffer for the last block in the file. returns 0.
-**
-** tail conversion is not done, so bh_result might not be valid for writing
-** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
-** trying to write the block.
-**
-** on failure, nonzero is returned, page_result and bh_result are untouched.
-*/
+ * finds the tail page in the page cache,
+ * reads the last block in.
+ *
+ * On success, page_result is set to a locked, pinned page, and bh_result
+ * is set to an up to date buffer for the last block in the file. returns 0.
+ *
+ * tail conversion is not done, so bh_result might not be valid for writing
+ * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
+ * trying to write the block.
+ *
+ * on failure, nonzero is returned, page_result and bh_result are untouched.
+ */
static int grab_tail_page(struct inode *inode,
struct page **page_result,
struct buffer_head **bh_result)
{
- /* we want the page with the last byte in the file,
- ** not the page that will hold the next byte for appending
+ /*
+ * we want the page with the last byte in the file,
+ * not the page that will hold the next byte for appending
*/
unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
unsigned long pos = 0;
@@ -2036,10 +2198,11 @@ static int grab_tail_page(struct inode *inode,
struct page *page;
int error;
- /* we know that we are only called with inode->i_size > 0.
- ** we also know that a file tail can never be as big as a block
- ** If i_size % blocksize == 0, our file is currently block aligned
- ** and it won't need converting or zeroing after a truncate.
+ /*
+ * we know that we are only called with inode->i_size > 0.
+ * we also know that a file tail can never be as big as a block
+ * If i_size % blocksize == 0, our file is currently block aligned
+ * and it won't need converting or zeroing after a truncate.
*/
if ((offset & (blocksize - 1)) == 0) {
return -ENOENT;
@@ -2068,10 +2231,11 @@ static int grab_tail_page(struct inode *inode,
} while (bh != head);
if (!buffer_uptodate(bh)) {
- /* note, this should never happen, prepare_write should
- ** be taking care of this for us. If the buffer isn't up to date,
- ** I've screwed up the code to find the buffer, or the code to
- ** call prepare_write
+ /*
+ * note, this should never happen, prepare_write should be
+ * taking care of this for us. If the buffer isn't up to
+ * date, I've screwed up the code to find the buffer, or the
+ * code to call prepare_write
*/
reiserfs_error(inode->i_sb, "clm-6000",
"error reading block %lu", bh->b_blocknr);
@@ -2081,21 +2245,21 @@ static int grab_tail_page(struct inode *inode,
*bh_result = bh;
*page_result = page;
- out:
+out:
return error;
- unlock:
+unlock:
unlock_page(page);
page_cache_release(page);
return error;
}
/*
-** vfs version of truncate file. Must NOT be called with
-** a transaction already started.
-**
-** some code taken from block_truncate_page
-*/
+ * vfs version of truncate file. Must NOT be called with
+ * a transaction already started.
+ *
+ * some code taken from block_truncate_page
+ */
int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
{
struct reiserfs_transaction_handle th;
@@ -2113,9 +2277,11 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
if (inode->i_size > 0) {
error = grab_tail_page(inode, &page, &bh);
if (error) {
- // -ENOENT means we truncated past the end of the file,
- // and get_block_create_0 could not find a block to read in,
- // which is ok.
+ /*
+ * -ENOENT means we truncated past the end of the
+ * file, and get_block_create_0 could not find a
+ * block to read in, which is ok.
+ */
if (error != -ENOENT)
reiserfs_error(inode->i_sb, "clm-6001",
"grab_tail_page failed %d",
@@ -2125,29 +2291,33 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
}
}
- /* so, if page != NULL, we have a buffer head for the offset at
- ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
- ** then we have an unformatted node. Otherwise, we have a direct item,
- ** and no zeroing is required on disk. We zero after the truncate,
- ** because the truncate might pack the item anyway
- ** (it will unmap bh if it packs).
+ /*
+ * so, if page != NULL, we have a buffer head for the offset at
+ * the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
+ * then we have an unformatted node. Otherwise, we have a direct item,
+ * and no zeroing is required on disk. We zero after the truncate,
+ * because the truncate might pack the item anyway
+ * (it will unmap bh if it packs).
+ *
+ * it is enough to reserve space in transaction for 2 balancings:
+ * one for "save" link adding and another for the first
+ * cut_from_item. 1 is for update_sd
*/
- /* it is enough to reserve space in transaction for 2 balancings:
- one for "save" link adding and another for the first
- cut_from_item. 1 is for update_sd */
error = journal_begin(&th, inode->i_sb,
JOURNAL_PER_BALANCE_CNT * 2 + 1);
if (error)
goto out;
reiserfs_update_inode_transaction(inode);
if (update_timestamps)
- /* we are doing real truncate: if the system crashes before the last
- transaction of truncating gets committed - on reboot the file
- either appears truncated properly or not truncated at all */
+ /*
+ * we are doing real truncate: if the system crashes
+ * before the last transaction of truncating gets committed
+ * - on reboot the file either appears truncated properly
+ * or not truncated at all
+ */
add_save_link(&th, inode, 1);
err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
- error =
- journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
+ error = journal_end(&th);
if (error)
goto out;
@@ -2180,7 +2350,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
reiserfs_write_unlock(inode->i_sb);
return 0;
- out:
+out:
if (page) {
unlock_page(page);
page_cache_release(page);
@@ -2212,7 +2382,10 @@ static int map_block_for_writepage(struct inode *inode,
int copy_size;
int trans_running = 0;
- /* catch places below that try to log something without starting a trans */
+ /*
+ * catch places below that try to log something without
+ * starting a trans
+ */
th.t_trans_id = 0;
if (!buffer_uptodate(bh_result)) {
@@ -2220,11 +2393,11 @@ static int map_block_for_writepage(struct inode *inode,
}
kmap(bh_result->b_page);
- start_over:
+start_over:
reiserfs_write_lock(inode->i_sb);
make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
- research:
+research:
retval = search_for_position_by_key(inode->i_sb, &key, &path);
if (retval != POSITION_FOUND) {
use_get_block = 1;
@@ -2232,8 +2405,8 @@ static int map_block_for_writepage(struct inode *inode,
}
bh = get_last_bh(&path);
- ih = get_ih(&path);
- item = get_item(&path);
+ ih = tp_item_head(&path);
+ item = tp_item_body(&path);
pos_in_item = path.pos_in_item;
/* we've found an unformatted node */
@@ -2281,10 +2454,10 @@ static int map_block_for_writepage(struct inode *inode,
goto research;
}
- memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
+ memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied,
copy_size);
- journal_mark_dirty(&th, inode->i_sb, bh);
+ journal_mark_dirty(&th, bh);
bytes_copied += copy_size;
set_block_dev_mapped(bh_result, 0, inode);
@@ -2304,10 +2477,10 @@ static int map_block_for_writepage(struct inode *inode,
}
retval = 0;
- out:
+out:
pathrelse(&path);
if (trans_running) {
- int err = journal_end(&th, inode->i_sb, jbegin_count);
+ int err = journal_end(&th);
if (err)
retval = err;
trans_running = 0;
@@ -2331,7 +2504,8 @@ static int map_block_for_writepage(struct inode *inode,
kunmap(bh_result->b_page);
if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
- /* we've copied data from the page into the direct item, so the
+ /*
+ * we've copied data from the page into the direct item, so the
* buffer in the page is now clean, mark it to reflect that.
*/
lock_buffer(bh_result);
@@ -2370,7 +2544,8 @@ static int reiserfs_write_full_page(struct page *page,
return 0;
}
- /* The page dirty bit is cleared before writepage is called, which
+ /*
+ * The page dirty bit is cleared before writepage is called, which
* means we have to tell create_empty_buffers to make dirty buffers
* The page really should be up to date at this point, so tossing
* in the BH_Uptodate is just a sanity check.
@@ -2381,8 +2556,9 @@ static int reiserfs_write_full_page(struct page *page,
}
head = page_buffers(page);
- /* last page in the file, zero out any contents past the
- ** last byte in the file
+ /*
+ * last page in the file, zero out any contents past the
+ * last byte in the file
*/
if (page->index >= end_index) {
unsigned last_offset;
@@ -2412,7 +2588,8 @@ static int reiserfs_write_full_page(struct page *page,
(!buffer_mapped(bh) || (buffer_mapped(bh)
&& bh->b_blocknr ==
0))) {
- /* not mapped yet, or it points to a direct item, search
+ /*
+ * not mapped yet, or it points to a direct item, search
* the btree for the mapping info, and log any direct
* items found
*/
@@ -2450,10 +2627,11 @@ static int reiserfs_write_full_page(struct page *page,
if (checked) {
reiserfs_prepare_for_journal(s, bh, 1);
- journal_mark_dirty(&th, s, bh);
+ journal_mark_dirty(&th, bh);
continue;
}
- /* from this point on, we know the buffer is mapped to a
+ /*
+ * from this point on, we know the buffer is mapped to a
* real block and not a direct item
*/
if (wbc->sync_mode != WB_SYNC_NONE) {
@@ -2472,7 +2650,7 @@ static int reiserfs_write_full_page(struct page *page,
} while ((bh = bh->b_this_page) != head);
if (checked) {
- error = journal_end(&th, s, bh_per_page + 1);
+ error = journal_end(&th);
reiserfs_write_unlock(s);
if (error)
goto fail;
@@ -2497,7 +2675,7 @@ static int reiserfs_write_full_page(struct page *page,
} while (bh != head);
error = 0;
- done:
+done:
if (nr == 0) {
/*
* if this page only had a direct item, it is very possible for
@@ -2519,8 +2697,9 @@ static int reiserfs_write_full_page(struct page *page,
}
return error;
- fail:
- /* catches various errors, we need to make sure any valid dirty blocks
+fail:
+ /*
+ * catches various errors, we need to make sure any valid dirty blocks
* get to the media. The page is currently locked and not marked for
* writeback
*/
@@ -2533,8 +2712,8 @@ static int reiserfs_write_full_page(struct page *page,
mark_buffer_async_write(bh);
} else {
/*
- * clear any dirty bits that might have come from getting
- * attached to a dirty page
+ * clear any dirty bits that might have come from
+ * getting attached to a dirty page
*/
clear_buffer_dirty(bh);
}
@@ -2614,15 +2793,18 @@ static int reiserfs_write_begin(struct file *file,
ret = __block_write_begin(page, pos, len, reiserfs_get_block);
if (ret && reiserfs_transaction_running(inode->i_sb)) {
struct reiserfs_transaction_handle *th = current->journal_info;
- /* this gets a little ugly. If reiserfs_get_block returned an
- * error and left a transacstion running, we've got to close it,
- * and we've got to free handle if it was a persistent transaction.
+ /*
+ * this gets a little ugly. If reiserfs_get_block returned an
+ * error and left a transaction running, we've got to close
+ * it, and we've got to free handle if it was a persistent
+ * transaction.
*
* But, if we had nested into an existing transaction, we need
* to just drop the ref count on the handle.
*
* If old_ref == 0, the transaction is from reiserfs_get_block,
- * and it was a persistent trans. Otherwise, it was nested above.
+ * and it was a persistent trans. Otherwise, it was nested
+ * above.
*/
if (th->t_refcount > old_ref) {
if (old_ref)
@@ -2671,15 +2853,18 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
ret = __block_write_begin(page, from, len, reiserfs_get_block);
if (ret && reiserfs_transaction_running(inode->i_sb)) {
struct reiserfs_transaction_handle *th = current->journal_info;
- /* this gets a little ugly. If reiserfs_get_block returned an
- * error and left a transacstion running, we've got to close it,
- * and we've got to free handle if it was a persistent transaction.
+ /*
+ * this gets a little ugly. If reiserfs_get_block returned an
+ * error and left a transaction running, we've got to close
+ * it, and we've got to free handle if it was a persistent
+ * transaction.
*
* But, if we had nested into an existing transaction, we need
* to just drop the ref count on the handle.
*
* If old_ref == 0, the transaction is from reiserfs_get_block,
- * and it was a persistent trans. Otherwise, it was nested above.
+ * and it was a persistent trans. Otherwise, it was nested
+ * above.
*/
if (th->t_refcount > old_ref) {
if (old_ref)
@@ -2734,17 +2919,20 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
reiserfs_commit_page(inode, page, start, start + copied);
- /* generic_commit_write does this for us, but does not update the
- ** transaction tracking stuff when the size changes. So, we have
- ** to do the i_size updates here.
+ /*
+ * generic_commit_write does this for us, but does not update the
+ * transaction tracking stuff when the size changes. So, we have
+ * to do the i_size updates here.
*/
if (pos + copied > inode->i_size) {
struct reiserfs_transaction_handle myth;
reiserfs_write_lock(inode->i_sb);
locked = true;
- /* If the file have grown beyond the border where it
- can have a tail, unmark it as needing a tail
- packing */
+ /*
+ * If the file has grown beyond the border where it
+ * can have a tail, unmark it as needing a tail
+ * packing
+ */
if ((have_large_tails(inode->i_sb)
&& inode->i_size > i_block_size(inode) * 4)
|| (have_small_tails(inode->i_sb)
@@ -2759,13 +2947,13 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
inode->i_size = pos + copied;
/*
* this will just nest into our transaction. It's important
- * to use mark_inode_dirty so the inode gets pushed around on the
- * dirty lists, and so that O_SYNC works as expected
+ * to use mark_inode_dirty so the inode gets pushed around on
+ * the dirty lists, and so that O_SYNC works as expected
*/
mark_inode_dirty(inode);
reiserfs_update_sd(&myth, inode);
update_sd = 1;
- ret = journal_end(&myth, inode->i_sb, 1);
+ ret = journal_end(&myth);
if (ret)
goto journal_error;
}
@@ -2781,7 +2969,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
goto out;
}
- out:
+out:
if (locked)
reiserfs_write_unlock(inode->i_sb);
unlock_page(page);
@@ -2792,7 +2980,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
return ret == 0 ? copied : ret;
- journal_error:
+journal_error:
reiserfs_write_unlock(inode->i_sb);
locked = false;
if (th) {
@@ -2822,15 +3010,18 @@ int reiserfs_commit_write(struct file *f, struct page *page,
}
reiserfs_commit_page(inode, page, from, to);
- /* generic_commit_write does this for us, but does not update the
- ** transaction tracking stuff when the size changes. So, we have
- ** to do the i_size updates here.
+ /*
+ * generic_commit_write does this for us, but does not update the
+ * transaction tracking stuff when the size changes. So, we have
+ * to do the i_size updates here.
*/
if (pos > inode->i_size) {
struct reiserfs_transaction_handle myth;
- /* If the file have grown beyond the border where it
- can have a tail, unmark it as needing a tail
- packing */
+ /*
+ * If the file has grown beyond the border where it
+ * can have a tail, unmark it as needing a tail
+ * packing
+ */
if ((have_large_tails(inode->i_sb)
&& inode->i_size > i_block_size(inode) * 4)
|| (have_small_tails(inode->i_sb)
@@ -2845,13 +3036,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
inode->i_size = pos;
/*
* this will just nest into our transaction. It's important
- * to use mark_inode_dirty so the inode gets pushed around on the
- * dirty lists, and so that O_SYNC works as expected
+ * to use mark_inode_dirty so the inode gets pushed around
+ * on the dirty lists, and so that O_SYNC works as expected
*/
mark_inode_dirty(inode);
reiserfs_update_sd(&myth, inode);
update_sd = 1;
- ret = journal_end(&myth, inode->i_sb, 1);
+ ret = journal_end(&myth);
if (ret)
goto journal_error;
}
@@ -2863,10 +3054,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
goto out;
}
- out:
+out:
return ret;
- journal_error:
+journal_error:
if (th) {
if (!update_sd)
reiserfs_update_sd(th, inode);
@@ -2924,9 +3115,10 @@ void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
}
}
-/* decide if this buffer needs to stay around for data logging or ordered
-** write purposes
-*/
+/*
+ * decide if this buffer needs to stay around for data logging or ordered
+ * write purposes
+ */
static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
{
int ret = 1;
@@ -2937,7 +3129,8 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
if (!buffer_mapped(bh)) {
goto free_jh;
}
- /* the page is locked, and the only places that log a data buffer
+ /*
+ * the page is locked, and the only places that log a data buffer
* also lock the page.
*/
if (reiserfs_file_data_log(inode)) {
@@ -2952,7 +3145,8 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
struct reiserfs_journal_list *jl;
struct reiserfs_jh *jh = bh->b_private;
- /* why is this safe?
+ /*
+ * why is this safe?
* reiserfs_setattr updates i_size in the on disk
* stat data before allowing vmtruncate to be called.
*
@@ -2969,7 +3163,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
&& jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
ret = 0;
}
- free_jh:
+free_jh:
if (ret && bh->b_private) {
reiserfs_free_jh(bh);
}
@@ -3028,7 +3222,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
ret = try_to_release_page(page, 0);
/* maybe should BUG_ON(!ret); - neilb */
}
- out:
+out:
return;
}
@@ -3080,18 +3274,20 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
return ret;
}
-/* We thank Mingming Cao for helping us understand in great detail what
- to do in this section of the code. */
+/*
+ * We thank Mingming Cao for helping us understand in great detail what
+ * to do in this section of the code.
+ */
static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- reiserfs_get_blocks_direct_io);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+ reiserfs_get_blocks_direct_io);
/*
* In case of error extending write may have instantiated a few
@@ -3099,7 +3295,7 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
truncate_setsize(inode, isize);
@@ -3127,8 +3323,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
dquot_initialize(inode);
reiserfs_write_lock(inode->i_sb);
if (attr->ia_valid & ATTR_SIZE) {
- /* version 2 items will be caught by the s_maxbytes check
- ** done for us in vmtruncate
+ /*
+ * version 2 items will be caught by the s_maxbytes check
+ * done for us in vmtruncate
*/
if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
attr->ia_size > MAX_NON_LFS) {
@@ -3149,7 +3346,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
err = journal_begin(&th, inode->i_sb, 4);
if (!err) {
reiserfs_discard_prealloc(&th, inode);
- err = journal_end(&th, inode->i_sb, 4);
+ err = journal_end(&th);
}
if (err)
error = err;
@@ -3189,7 +3386,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
+ /*
+ * (user+group)*(old+new) structure - we count quota
+ * info and inode write (sb, inode)
+ */
reiserfs_write_lock(inode->i_sb);
error = journal_begin(&th, inode->i_sb, jbegin_count);
reiserfs_write_unlock(inode->i_sb);
@@ -3198,19 +3398,21 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
error = dquot_transfer(inode, attr);
reiserfs_write_lock(inode->i_sb);
if (error) {
- journal_end(&th, inode->i_sb, jbegin_count);
+ journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
goto out;
}
- /* Update corresponding info in inode so that everything is in
- * one transaction */
+ /*
+ * Update corresponding info in inode so that everything
+ * is in one transaction
+ */
if (attr->ia_valid & ATTR_UID)
inode->i_uid = attr->ia_uid;
if (attr->ia_valid & ATTR_GID)
inode->i_gid = attr->ia_gid;
mark_inode_dirty(inode);
- error = journal_end(&th, inode->i_sb, jbegin_count);
+ error = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
if (error)
goto out;
@@ -3220,8 +3422,14 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_size != i_size_read(inode)) {
error = inode_newsize_ok(inode, attr->ia_size);
if (!error) {
+ /*
+ * Could race against reiserfs_file_release
+ * if called from NFS, so take tailpack mutex.
+ */
+ mutex_lock(&REISERFS_I(inode)->tailpack);
truncate_setsize(inode, attr->ia_size);
- reiserfs_vfs_truncate_file(inode);
+ reiserfs_truncate_file(inode, 1);
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
}
}
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 946ccbf5b5a1..501ed6811a2b 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -15,7 +15,8 @@
* reiserfs_ioctl - handler for ioctl for inode
* supported commands:
* 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
- * and prevent packing file (argument arg has to be non-zero)
+ * and prevent packing file (argument arg has to
+ * be non-zero)
* 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
* 3) That's all for a while ...
*/
@@ -132,7 +133,10 @@ setversion_out:
long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- /* These are just misnamed, they actually get/put from/to user an int */
+ /*
+ * These are just misnamed, they actually
+ * get/put from/to user an int
+ */
switch (cmd) {
case REISERFS_IOC32_UNPACK:
cmd = REISERFS_IOC_UNPACK;
@@ -160,10 +164,10 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
/*
-** reiserfs_unpack
-** Function try to convert tail from direct item into indirect.
-** It set up nopack attribute in the REISERFS_I(inode)->nopack
-*/
+ * reiserfs_unpack
+ * Function try to convert tail from direct item into indirect.
+ * It set up nopack attribute in the REISERFS_I(inode)->nopack
+ */
int reiserfs_unpack(struct inode *inode, struct file *filp)
{
int retval = 0;
@@ -194,9 +198,10 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
goto out;
}
- /* we unpack by finding the page with the tail, and calling
- ** __reiserfs_write_begin on that page. This will force a
- ** reiserfs_get_block to unpack the tail for us.
+ /*
+ * we unpack by finding the page with the tail, and calling
+ * __reiserfs_write_begin on that page. This will force a
+ * reiserfs_get_block to unpack the tail for us.
*/
index = inode->i_size >> PAGE_CACHE_SHIFT;
mapping = inode->i_mapping;
@@ -214,11 +219,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
retval = reiserfs_commit_write(NULL, page, write_from, write_from);
REISERFS_I(inode)->i_flags |= i_nopack_mask;
- out_unlock:
+out_unlock:
unlock_page(page);
page_cache_release(page);
- out:
+out:
mutex_unlock(&inode->i_mutex);
reiserfs_write_unlock(inode->i_sb);
return retval;
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index ee382ef3d300..cfaee912ee09 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -5,15 +5,17 @@
#include <linux/time.h>
#include "reiserfs.h"
-// this contains item handlers for old item types: sd, direct,
-// indirect, directory
+/*
+ * this contains item handlers for old item types: sd, direct,
+ * indirect, directory
+ */
-/* and where are the comments? how about saying where we can find an
- explanation of each item handler method? -Hans */
+/*
+ * and where are the comments? how about saying where we can find an
+ * explanation of each item handler method? -Hans
+ */
-//////////////////////////////////////////////////////////////////////////////
-// stat data functions
-//
+/* stat data functions */
static int sd_bytes_number(struct item_head *ih, int block_size)
{
return 0;
@@ -60,7 +62,7 @@ static void sd_print_item(struct item_head *ih, char *item)
static void sd_check_item(struct item_head *ih, char *item)
{
- // FIXME: type something here!
+ /* unused */
}
static int sd_create_vi(struct virtual_node *vn,
@@ -68,7 +70,6 @@ static int sd_create_vi(struct virtual_node *vn,
int is_affected, int insert_size)
{
vi->vi_index = TYPE_STAT_DATA;
- //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed?
return 0;
}
@@ -117,15 +118,13 @@ static struct item_operations stat_data_ops = {
.print_vi = sd_print_vi
};
-//////////////////////////////////////////////////////////////////////////////
-// direct item functions
-//
+/* direct item functions */
static int direct_bytes_number(struct item_head *ih, int block_size)
{
return ih_item_len(ih);
}
-// FIXME: this should probably switch to indirect as well
+/* FIXME: this should probably switch to indirect as well */
static void direct_decrement_key(struct cpu_key *key)
{
cpu_key_k_offset_dec(key);
@@ -144,7 +143,7 @@ static void direct_print_item(struct item_head *ih, char *item)
{
int j = 0;
-// return;
+/* return; */
printk("\"");
while (j < ih_item_len(ih))
printk("%c", item[j++]);
@@ -153,7 +152,7 @@ static void direct_print_item(struct item_head *ih, char *item)
static void direct_check_item(struct item_head *ih, char *item)
{
- // FIXME: type something here!
+ /* unused */
}
static int direct_create_vi(struct virtual_node *vn,
@@ -161,7 +160,6 @@ static int direct_create_vi(struct virtual_node *vn,
int is_affected, int insert_size)
{
vi->vi_index = TYPE_DIRECT;
- //vi->vi_type |= VI_TYPE_DIRECT;
return 0;
}
@@ -211,16 +209,13 @@ static struct item_operations direct_ops = {
.print_vi = direct_print_vi
};
-//////////////////////////////////////////////////////////////////////////////
-// indirect item functions
-//
-
+/* indirect item functions */
static int indirect_bytes_number(struct item_head *ih, int block_size)
{
- return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih);
+ return ih_item_len(ih) / UNFM_P_SIZE * block_size;
}
-// decrease offset, if it becomes 0, change type to stat data
+/* decrease offset, if it becomes 0, change type to stat data */
static void indirect_decrement_key(struct cpu_key *key)
{
cpu_key_k_offset_dec(key);
@@ -228,7 +223,7 @@ static void indirect_decrement_key(struct cpu_key *key)
set_cpu_key_k_type(key, TYPE_STAT_DATA);
}
-// if it is not first item of the body, then it is mergeable
+/* if it is not first item of the body, then it is mergeable */
static int indirect_is_left_mergeable(struct reiserfs_key *key,
unsigned long bsize)
{
@@ -236,7 +231,7 @@ static int indirect_is_left_mergeable(struct reiserfs_key *key,
return (le_key_k_offset(version, key) != 1);
}
-// printing of indirect item
+/* printing of indirect item */
static void start_new_sequence(__u32 * start, int *len, __u32 new)
{
*start = new;
@@ -295,7 +290,7 @@ static void indirect_print_item(struct item_head *ih, char *item)
static void indirect_check_item(struct item_head *ih, char *item)
{
- // FIXME: type something here!
+ /* unused */
}
static int indirect_create_vi(struct virtual_node *vn,
@@ -303,7 +298,6 @@ static int indirect_create_vi(struct virtual_node *vn,
int is_affected, int insert_size)
{
vi->vi_index = TYPE_INDIRECT;
- //vi->vi_type |= VI_TYPE_INDIRECT;
return 0;
}
@@ -321,16 +315,19 @@ static int indirect_check_right(struct virtual_item *vi, int free)
return indirect_check_left(vi, free, 0, 0);
}
-// return size in bytes of 'units' units. If first == 0 - calculate from the head (left), otherwise - from tail (right)
+/*
+ * return size in bytes of 'units' units. If first == 0 - calculate
+ * from the head (left), otherwise - from tail (right)
+ */
static int indirect_part_size(struct virtual_item *vi, int first, int units)
{
- // unit of indirect item is byte (yet)
+ /* unit of indirect item is byte (yet) */
return units;
}
static int indirect_unit_num(struct virtual_item *vi)
{
- // unit of indirect item is byte (yet)
+ /* unit of indirect item is byte (yet) */
return vi->vi_item_len - IH_SIZE;
}
@@ -356,10 +353,7 @@ static struct item_operations indirect_ops = {
.print_vi = indirect_print_vi
};
-//////////////////////////////////////////////////////////////////////////////
-// direntry functions
-//
-
+/* direntry functions */
static int direntry_bytes_number(struct item_head *ih, int block_size)
{
reiserfs_warning(NULL, "vs-16090",
@@ -396,7 +390,7 @@ static void direntry_print_item(struct item_head *ih, char *item)
deh = (struct reiserfs_de_head *)item;
- for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) {
+ for (i = 0; i < ih_entry_count(ih); i++, deh++) {
namelen =
(i ? (deh_location(deh - 1)) : ih_item_len(ih)) -
deh_location(deh);
@@ -428,9 +422,9 @@ static void direntry_check_item(struct item_head *ih, char *item)
int i;
struct reiserfs_de_head *deh;
- // FIXME: type something here!
+ /* unused */
deh = (struct reiserfs_de_head *)item;
- for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) {
+ for (i = 0; i < ih_entry_count(ih); i++, deh++) {
;
}
}
@@ -439,7 +433,8 @@ static void direntry_check_item(struct item_head *ih, char *item)
/*
* function returns old entry number in directory item in real node
- * using new entry number in virtual item in virtual node */
+ * using new entry number in virtual item in virtual node
+ */
static inline int old_entry_num(int is_affected, int virtual_entry_num,
int pos_in_item, int mode)
{
@@ -463,9 +458,11 @@ static inline int old_entry_num(int is_affected, int virtual_entry_num,
return virtual_entry_num - 1;
}
-/* Create an array of sizes of directory entries for virtual
- item. Return space used by an item. FIXME: no control over
- consuming of space used by this item handler */
+/*
+ * Create an array of sizes of directory entries for virtual
+ * item. Return space used by an item. FIXME: no control over
+ * consuming of space used by this item handler
+ */
static int direntry_create_vi(struct virtual_node *vn,
struct virtual_item *vi,
int is_affected, int insert_size)
@@ -494,8 +491,8 @@ static int direntry_create_vi(struct virtual_node *vn,
j = old_entry_num(is_affected, i, vn->vn_pos_in_item,
vn->vn_mode);
dir_u->entry_sizes[i] =
- (j ? deh_location(&(deh[j - 1])) : ih_item_len(vi->vi_ih)) -
- deh_location(&(deh[j])) + DEH_SIZE;
+ (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) -
+ deh_location(&deh[j]) + DEH_SIZE;
}
size += (dir_u->entry_count * sizeof(short));
@@ -529,10 +526,10 @@ static int direntry_create_vi(struct virtual_node *vn,
}
-//
-// return number of entries which may fit into specified amount of
-// free space, or -1 if free space is not enough even for 1 entry
-//
+/*
+ * return number of entries which may fit into specified amount of
+ * free space, or -1 if free space is not enough even for 1 entry
+ */
static int direntry_check_left(struct virtual_item *vi, int free,
int start_skip, int end_skip)
{
@@ -541,8 +538,8 @@ static int direntry_check_left(struct virtual_item *vi, int free,
struct direntry_uarea *dir_u = vi->vi_uarea;
for (i = start_skip; i < dir_u->entry_count - end_skip; i++) {
+ /* i-th entry doesn't fit into the remaining free space */
if (dir_u->entry_sizes[i] > free)
- /* i-th entry doesn't fit into the remaining free space */
break;
free -= dir_u->entry_sizes[i];
@@ -570,8 +567,8 @@ static int direntry_check_right(struct virtual_item *vi, int free)
struct direntry_uarea *dir_u = vi->vi_uarea;
for (i = dir_u->entry_count - 1; i >= 0; i--) {
+ /* i-th entry doesn't fit into the remaining free space */
if (dir_u->entry_sizes[i] > free)
- /* i-th entry doesn't fit into the remaining free space */
break;
free -= dir_u->entry_sizes[i];
@@ -643,9 +640,7 @@ static struct item_operations direntry_ops = {
.print_vi = direntry_print_vi
};
-//////////////////////////////////////////////////////////////////////////////
-// Error catching functions to catch errors caused by incorrect item types.
-//
+/* Error catching functions to catch errors caused by incorrect item types. */
static int errcatch_bytes_number(struct item_head *ih, int block_size)
{
reiserfs_warning(NULL, "green-16001",
@@ -685,8 +680,12 @@ static int errcatch_create_vi(struct virtual_node *vn,
{
reiserfs_warning(NULL, "green-16006",
"Invalid item type observed, run fsck ASAP");
- return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where
- // this operation is called from is of return type void.
+ /*
+ * We might return -1 here as well, but it won't help as
+ * create_virtual_node() from where this operation is called
+ * from is of return type void.
+ */
+ return 0;
}
static int errcatch_check_left(struct virtual_item *vi, int free,
@@ -739,9 +738,6 @@ static struct item_operations errcatch_ops = {
errcatch_print_vi
};
-//////////////////////////////////////////////////////////////////////////////
-//
-//
#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
#error Item types must use disk-format assigned values.
#endif
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index fd777032c2ba..e8870de4627e 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1,38 +1,38 @@
/*
-** Write ahead logging implementation copyright Chris Mason 2000
-**
-** The background commits make this code very interrelated, and
-** overly complex. I need to rethink things a bit....The major players:
-**
-** journal_begin -- call with the number of blocks you expect to log.
-** If the current transaction is too
-** old, it will block until the current transaction is
-** finished, and then start a new one.
-** Usually, your transaction will get joined in with
-** previous ones for speed.
-**
-** journal_join -- same as journal_begin, but won't block on the current
-** transaction regardless of age. Don't ever call
-** this. Ever. There are only two places it should be
-** called from, and they are both inside this file.
-**
-** journal_mark_dirty -- adds blocks into this transaction. clears any flags
-** that might make them get sent to disk
-** and then marks them BH_JDirty. Puts the buffer head
-** into the current transaction hash.
-**
-** journal_end -- if the current transaction is batchable, it does nothing
-** otherwise, it could do an async/synchronous commit, or
-** a full flush of all log and real blocks in the
-** transaction.
-**
-** flush_old_commits -- if the current transaction is too old, it is ended and
-** commit blocks are sent to disk. Forces commit blocks
-** to disk for all backgrounded commits that have been
-** around too long.
-** -- Note, if you call this as an immediate flush from
-** from within kupdate, it will ignore the immediate flag
-*/
+ * Write ahead logging implementation copyright Chris Mason 2000
+ *
+ * The background commits make this code very interrelated, and
+ * overly complex. I need to rethink things a bit....The major players:
+ *
+ * journal_begin -- call with the number of blocks you expect to log.
+ * If the current transaction is too
+ * old, it will block until the current transaction is
+ * finished, and then start a new one.
+ * Usually, your transaction will get joined in with
+ * previous ones for speed.
+ *
+ * journal_join -- same as journal_begin, but won't block on the current
+ * transaction regardless of age. Don't ever call
+ * this. Ever. There are only two places it should be
+ * called from, and they are both inside this file.
+ *
+ * journal_mark_dirty -- adds blocks into this transaction. clears any flags
+ * that might make them get sent to disk
+ * and then marks them BH_JDirty. Puts the buffer head
+ * into the current transaction hash.
+ *
+ * journal_end -- if the current transaction is batchable, it does nothing
+ * otherwise, it could do an async/synchronous commit, or
+ * a full flush of all log and real blocks in the
+ * transaction.
+ *
+ * flush_old_commits -- if the current transaction is too old, it is ended and
+ * commit blocks are sent to disk. Forces commit blocks
+ * to disk for all backgrounded commits that have been
+ * around too long.
+ * -- Note, if you call this as an immediate flush from
+ * within kupdate, it will ignore the immediate flag
+ */
#include <linux/time.h>
#include <linux/semaphore.h>
@@ -58,23 +58,19 @@
#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
j_working_list))
-/* the number of mounted filesystems. This is used to decide when to
-** start and kill the commit workqueue
-*/
-static int reiserfs_mounted_fs_count;
-
-static struct workqueue_struct *commit_wq;
-
-#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit
- structs at 4k */
+/* must be correct to keep the desc and commit structs at 4k */
+#define JOURNAL_TRANS_HALF 1018
#define BUFNR 64 /*read ahead */
/* cnode stat bits. Move these into reiserfs_fs.h */
-#define BLOCK_FREED 2 /* this block was freed, and can't be written. */
-#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */
+/* this block was freed, and can't be written. */
+#define BLOCK_FREED 2
+/* this block was freed during this transaction, and can't be written */
+#define BLOCK_FREED_HOLDER 3
-#define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */
+/* used in flush_journal_list */
+#define BLOCK_NEEDS_FLUSH 4
#define BLOCK_DIRTIED 5
/* journal list state bits */
@@ -87,16 +83,14 @@ static struct workqueue_struct *commit_wq;
#define COMMIT_NOW 2 /* end and commit this transaction */
#define WAIT 4 /* wait for the log blocks to hit the disk */
-static int do_journal_end(struct reiserfs_transaction_handle *,
- struct super_block *, unsigned long nblocks,
- int flags);
+static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
static int flush_journal_list(struct super_block *s,
struct reiserfs_journal_list *jl, int flushall);
static int flush_commit_list(struct super_block *s,
struct reiserfs_journal_list *jl, int flushall);
static int can_dirty(struct reiserfs_journal_cnode *cn);
static int journal_join(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks);
+ struct super_block *sb);
static void release_journal_dev(struct super_block *super,
struct reiserfs_journal *journal);
static int dirty_one_transaction(struct super_block *s,
@@ -107,8 +101,10 @@ static void queue_log_writer(struct super_block *s);
/* values for join in do_journal_begin_r */
enum {
JBEGIN_REG = 0, /* regular journal begin */
- JBEGIN_JOIN = 1, /* join the running transaction if at all possible */
- JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */
+ /* join the running transaction if at all possible */
+ JBEGIN_JOIN = 1,
+ /* called from cleanup code, ignores aborted flag */
+ JBEGIN_ABORT = 2,
};
static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
@@ -123,10 +119,11 @@ static void init_journal_hash(struct super_block *sb)
}
/*
-** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to
-** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for
-** more details.
-*/
+ * clears BH_Dirty and sticks the buffer on the clean list. Called because
+ * I can't allow refile_buffer to make schedule happen after I've freed a
+ * block. Look at remove_from_transaction and journal_mark_freed for
+ * more details.
+ */
static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
{
if (bh) {
@@ -163,7 +160,7 @@ static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
struct list_head *entry = journal->j_bitmap_nodes.next;
journal->j_used_bitmap_nodes++;
- repeat:
+repeat:
if (entry != &journal->j_bitmap_nodes) {
bn = list_entry(entry, struct reiserfs_bitmap_node, list);
@@ -204,7 +201,8 @@ static void allocate_bitmap_nodes(struct super_block *sb)
list_add(&bn->list, &journal->j_bitmap_nodes);
journal->j_free_bitmap_nodes++;
} else {
- break; /* this is ok, we'll try again when more are needed */
+ /* this is ok, we'll try again when more are needed */
+ break;
}
}
}
@@ -239,8 +237,8 @@ static void cleanup_bitmap_list(struct super_block *sb,
}
/*
-** only call this on FS unmount.
-*/
+ * only call this on FS unmount.
+ */
static int free_list_bitmaps(struct super_block *sb,
struct reiserfs_list_bitmap *jb_array)
{
@@ -275,9 +273,9 @@ static int free_bitmap_nodes(struct super_block *sb)
}
/*
-** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
-** jb_array is the array to be filled in.
-*/
+ * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
+ * jb_array is the array to be filled in.
+ */
int reiserfs_allocate_list_bitmaps(struct super_block *sb,
struct reiserfs_list_bitmap *jb_array,
unsigned int bmap_nr)
@@ -306,9 +304,9 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb,
}
/*
-** find an available list bitmap. If you can't find one, flush a commit list
-** and try again
-*/
+ * find an available list bitmap. If you can't find one, flush a commit list
+ * and try again
+ */
static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
struct reiserfs_journal_list
*jl)
@@ -332,18 +330,18 @@ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
break;
}
}
- if (jb->journal_list) { /* double check to make sure if flushed correctly */
+ /* double check to make sure it flushed correctly */
+ if (jb->journal_list)
return NULL;
- }
jb->journal_list = jl;
return jb;
}
/*
-** allocates a new chunk of X nodes, and links them all together as a list.
-** Uses the cnode->next and cnode->prev pointers
-** returns NULL on failure
-*/
+ * allocates a new chunk of X nodes, and links them all together as a list.
+ * Uses the cnode->next and cnode->prev pointers
+ * returns NULL on failure
+ */
static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
{
struct reiserfs_journal_cnode *head;
@@ -365,9 +363,7 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
return head;
}
-/*
-** pulls a cnode off the free list, or returns NULL on failure
-*/
+/* pulls a cnode off the free list, or returns NULL on failure */
static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
{
struct reiserfs_journal_cnode *cn;
@@ -393,8 +389,8 @@ static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
}
/*
-** returns a cnode to the free list
-*/
+ * returns a cnode to the free list
+ */
static void free_cnode(struct super_block *sb,
struct reiserfs_journal_cnode *cn)
{
@@ -419,7 +415,10 @@ static void clear_prepared_bits(struct buffer_head *bh)
clear_buffer_journal_restore_dirty(bh);
}
-/* return a cnode with same dev, block number and size in table, or null if not found */
+/*
+ * return a cnode with same dev, block number and size in table,
+ * or null if not found
+ */
static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
super_block
*sb,
@@ -439,23 +438,24 @@ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
}
/*
-** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated
-** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever
-** being overwritten by a replay after crashing.
-**
-** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting
-** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make
-** sure you never write the block without logging it.
-**
-** next_zero_bit is a suggestion about the next block to try for find_forward.
-** when bl is rejected because it is set in a journal list bitmap, we search
-** for the next zero bit in the bitmap that rejected bl. Then, we return that
-** through next_zero_bit for find_forward to try.
-**
-** Just because we return something in next_zero_bit does not mean we won't
-** reject it on the next call to reiserfs_in_journal
-**
-*/
+ * this actually means 'can this block be reallocated yet?'. If you set
+ * search_all, a block can only be allocated if it is not in the current
+ * transaction, was not freed by the current transaction, and has no chance
+ * of ever being overwritten by a replay after crashing.
+ *
+ * If you don't set search_all, a block can only be allocated if it is not
+ * in the current transaction. Since deleting a block removes it from the
+ * current transaction, this case should never happen. If you don't set
+ * search_all, make sure you never write the block without logging it.
+ *
+ * next_zero_bit is a suggestion about the next block to try for find_forward.
+ * when bl is rejected because it is set in a journal list bitmap, we search
+ * for the next zero bit in the bitmap that rejected bl. Then, we return
+ * that through next_zero_bit for find_forward to try.
+ *
+ * Just because we return something in next_zero_bit does not mean we won't
+ * reject it on the next call to reiserfs_in_journal
+ */
int reiserfs_in_journal(struct super_block *sb,
unsigned int bmap_nr, int bit_nr, int search_all,
b_blocknr_t * next_zero_bit)
@@ -469,9 +469,11 @@ int reiserfs_in_journal(struct super_block *sb,
*next_zero_bit = 0; /* always start this at zero. */
PROC_INFO_INC(sb, journal.in_journal);
- /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
- ** if we crash before the transaction that freed it commits, this transaction won't
- ** have committed either, and the block will never be written
+ /*
+ * If we aren't doing a search_all, this is a metablock, and it
+ * will be logged before use. if we crash before the transaction
+ * that freed it commits, this transaction won't have committed
+ * either, and the block will never be written
*/
if (search_all) {
for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
@@ -511,8 +513,7 @@ int reiserfs_in_journal(struct super_block *sb,
return 0;
}
-/* insert cn into table
-*/
+/* insert cn into table */
static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
struct reiserfs_journal_cnode *cn)
{
@@ -558,10 +559,10 @@ static inline void put_journal_list(struct super_block *s,
}
/*
-** this used to be much more involved, and I'm keeping it just in case things get ugly again.
-** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a
-** transaction.
-*/
+ * this used to be much more involved, and I'm keeping it just in case
+ * things get ugly again. it gets called by flush_commit_list, and
+ * cleans up any data stored about blocks freed during a transaction.
+ */
static void cleanup_freed_for_journal_list(struct super_block *sb,
struct reiserfs_journal_list *jl)
{
@@ -756,11 +757,12 @@ static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
jh = bh->b_private;
list_del_init(&jh->list);
} else {
- no_jh:
+no_jh:
get_bh(bh);
jh = alloc_jh();
spin_lock(&j->j_dirty_buffers_lock);
- /* buffer must be locked for __add_jh, should be able to have
+ /*
+ * buffer must be locked for __add_jh, should be able to have
* two adds at the same time
*/
BUG_ON(bh->b_private);
@@ -818,7 +820,8 @@ static int write_ordered_buffers(spinlock_t * lock,
spin_lock(lock);
goto loop_next;
}
- /* in theory, dirty non-uptodate buffers should never get here,
+ /*
+ * in theory, dirty non-uptodate buffers should never get here,
* but the upper layer io error paths still have a few quirks.
* Handle them here as gracefully as we can
*/
@@ -833,7 +836,7 @@ static int write_ordered_buffers(spinlock_t * lock,
reiserfs_free_jh(bh);
unlock_buffer(bh);
}
- loop_next:
+loop_next:
put_bh(bh);
cond_resched_lock(lock);
}
@@ -856,13 +859,14 @@ static int write_ordered_buffers(spinlock_t * lock,
if (!buffer_uptodate(bh)) {
ret = -EIO;
}
- /* ugly interaction with invalidatepage here.
- * reiserfs_invalidate_page will pin any buffer that has a valid
- * journal head from an older transaction. If someone else sets
- * our buffer dirty after we write it in the first loop, and
- * then someone truncates the page away, nobody will ever write
- * the buffer. We're safe if we write the page one last time
- * after freeing the journal header.
+ /*
+ * ugly interaction with invalidatepage here.
+ * reiserfs_invalidate_page will pin any buffer that has a
+ * valid journal head from an older transaction. If someone
+ * else sets our buffer dirty after we write it in the first
+ * loop, and then someone truncates the page away, nobody
+ * will ever write the buffer. We're safe if we write the
+ * page one last time after freeing the journal header.
*/
if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
spin_unlock(lock);
@@ -887,7 +891,7 @@ static int flush_older_commits(struct super_block *s,
unsigned int other_trans_id;
unsigned int first_trans_id;
- find_first:
+find_first:
/*
* first we walk backwards to find the oldest uncommitted transaction
*/
@@ -923,9 +927,11 @@ static int flush_older_commits(struct super_block *s,
if (!journal_list_still_alive(s, trans_id))
return 1;
- /* the one we just flushed is gone, this means all
- * older lists are also gone, so first_jl is no longer
- * valid either. Go back to the beginning.
+ /*
+ * the one we just flushed is gone, this means
+ * all older lists are also gone, so first_jl
+ * is no longer valid either. Go back to the
+ * beginning.
*/
if (!journal_list_still_alive
(s, other_trans_id)) {
@@ -958,12 +964,12 @@ static int reiserfs_async_progress_wait(struct super_block *s)
}
/*
-** if this journal list still has commit blocks unflushed, send them to disk.
-**
-** log areas must be flushed in order (transaction 2 can't commit before transaction 1)
-** Before the commit block can by written, every other log block must be safely on disk
-**
-*/
+ * if this journal list still has commit blocks unflushed, send them to disk.
+ *
+ * log areas must be flushed in order (transaction 2 can't commit before
+ * transaction 1). Before the commit block can be written, every other log
+ * block must be safely on disk
+ */
static int flush_commit_list(struct super_block *s,
struct reiserfs_journal_list *jl, int flushall)
{
@@ -982,8 +988,9 @@ static int flush_commit_list(struct super_block *s,
return 0;
}
- /* before we can put our commit blocks on disk, we have to make sure everyone older than
- ** us is on disk too
+ /*
+ * before we can put our commit blocks on disk, we have to make
+ * sure everyone older than us is on disk too
*/
BUG_ON(jl->j_len <= 0);
BUG_ON(trans_id == journal->j_trans_id);
@@ -991,7 +998,10 @@ static int flush_commit_list(struct super_block *s,
get_journal_list(jl);
if (flushall) {
if (flush_older_commits(s, jl) == 1) {
- /* list disappeared during flush_older_commits. return */
+ /*
+ * list disappeared during flush_older_commits.
+ * return
+ */
goto put_jl;
}
}
@@ -1006,9 +1016,9 @@ static int flush_commit_list(struct super_block *s,
BUG_ON(jl->j_trans_id == 0);
/* this commit is done, exit */
- if (atomic_read(&(jl->j_commit_left)) <= 0) {
+ if (atomic_read(&jl->j_commit_left) <= 0) {
if (flushall) {
- atomic_set(&(jl->j_older_commits_done), 1);
+ atomic_set(&jl->j_older_commits_done, 1);
}
mutex_unlock(&jl->j_commit_mutex);
goto put_jl;
@@ -1063,9 +1073,10 @@ static int flush_commit_list(struct super_block *s,
depth = reiserfs_write_unlock_nested(s);
__wait_on_buffer(tbh);
reiserfs_write_lock_nested(s, depth);
- // since we're using ll_rw_blk above, it might have skipped over
- // a locked buffer. Double check here
- //
+ /*
+ * since we're using ll_rw_blk above, it might have skipped
+ * over a locked buffer. Double check here
+ */
/* redundant, sync_dirty_buffer() checks */
if (buffer_dirty(tbh)) {
depth = reiserfs_write_unlock_nested(s);
@@ -1079,17 +1090,21 @@ static int flush_commit_list(struct super_block *s,
#endif
retval = -EIO;
}
- put_bh(tbh); /* once for journal_find_get_block */
- put_bh(tbh); /* once due to original getblk in do_journal_end */
- atomic_dec(&(jl->j_commit_left));
+ /* once for journal_find_get_block */
+ put_bh(tbh);
+ /* once due to original getblk in do_journal_end */
+ put_bh(tbh);
+ atomic_dec(&jl->j_commit_left);
}
- BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
+ BUG_ON(atomic_read(&jl->j_commit_left) != 1);
- /* If there was a write error in the journal - we can't commit
+ /*
+ * If there was a write error in the journal - we can't commit
* this transaction - it will be invalid and, if successful,
* will just end up propagating the write error out to
- * the file system. */
+ * the file system.
+ */
if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
if (buffer_dirty(jl->j_commit_bh))
BUG();
@@ -1102,9 +1117,11 @@ static int flush_commit_list(struct super_block *s,
reiserfs_write_lock_nested(s, depth);
}
- /* If there was a write error in the journal - we can't commit this
+ /*
+ * If there was a write error in the journal - we can't commit this
* transaction - it will be invalid and, if successful, will just end
- * up propagating the write error out to the filesystem. */
+ * up propagating the write error out to the filesystem.
+ */
if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
#ifdef CONFIG_REISERFS_CHECK
reiserfs_warning(s, "journal-615", "buffer write failed");
@@ -1119,7 +1136,10 @@ static int flush_commit_list(struct super_block *s,
}
journal->j_last_commit_id = jl->j_trans_id;
- /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */
+ /*
+ * now, every commit block is on the disk. It is safe to allow
+ * blocks freed during this transaction to be reallocated
+ */
cleanup_freed_for_journal_list(s, jl);
retval = retval ? retval : journal->j_errno;
@@ -1127,13 +1147,13 @@ static int flush_commit_list(struct super_block *s,
/* mark the metadata dirty */
if (!retval)
dirty_one_transaction(s, jl);
- atomic_dec(&(jl->j_commit_left));
+ atomic_dec(&jl->j_commit_left);
if (flushall) {
- atomic_set(&(jl->j_older_commits_done), 1);
+ atomic_set(&jl->j_older_commits_done, 1);
}
mutex_unlock(&jl->j_commit_mutex);
- put_jl:
+put_jl:
put_journal_list(s, jl);
if (retval)
@@ -1143,9 +1163,9 @@ static int flush_commit_list(struct super_block *s,
}
/*
-** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or
-** returns NULL if it can't find anything
-*/
+ * flush_journal_list frequently needs to find a newer transaction for a
+ * given block. This does that, or returns NULL if it can't find anything
+ */
static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
reiserfs_journal_cnode
*cn)
@@ -1169,10 +1189,11 @@ static void remove_journal_hash(struct super_block *,
int);
/*
-** once all the real blocks have been flushed, it is safe to remove them from the
-** journal list for this transaction. Aside from freeing the cnode, this also allows the
-** block to be reallocated for data blocks if it had been deleted.
-*/
+ * once all the real blocks have been flushed, it is safe to remove them
+ * from the journal list for this transaction. Aside from freeing the
+ * cnode, this also allows the block to be reallocated for data blocks
+ * if it had been deleted.
+ */
static void remove_all_from_journal_list(struct super_block *sb,
struct reiserfs_journal_list *jl,
int debug)
@@ -1181,8 +1202,9 @@ static void remove_all_from_journal_list(struct super_block *sb,
struct reiserfs_journal_cnode *cn, *last;
cn = jl->j_realblock;
- /* which is better, to lock once around the whole loop, or
- ** to lock for each call to remove_journal_hash?
+ /*
+ * which is better, to lock once around the whole loop, or
+ * to lock for each call to remove_journal_hash?
*/
while (cn) {
if (cn->blocknr != 0) {
@@ -1204,12 +1226,13 @@ static void remove_all_from_journal_list(struct super_block *sb,
}
/*
-** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block.
-** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start
-** releasing blocks in this transaction for reuse as data blocks.
-** called by flush_journal_list, before it calls remove_all_from_journal_list
-**
-*/
+ * if this timestamp is greater than the timestamp we wrote last to the
+ * header block, write it to the header block. once this is done, I can
+ * safely say the log area for this transaction won't ever be replayed,
+ * and I can start releasing blocks in this transaction for reuse as data
+ * blocks. called by flush_journal_list, before it calls
+ * remove_all_from_journal_list
+ */
static int _update_journal_header_block(struct super_block *sb,
unsigned long offset,
unsigned int trans_id)
@@ -1279,10 +1302,11 @@ static int flush_older_journal_lists(struct super_block *sb,
struct reiserfs_journal *journal = SB_JOURNAL(sb);
unsigned int trans_id = jl->j_trans_id;
- /* we know we are the only ones flushing things, no extra race
+ /*
+ * we know we are the only ones flushing things, no extra race
* protection is required.
*/
- restart:
+restart:
entry = journal->j_journal_list.next;
/* Did we wrap? */
if (entry == &journal->j_journal_list)
@@ -1309,15 +1333,16 @@ static void del_from_work_list(struct super_block *s,
}
}
-/* flush a journal list, both commit and real blocks
-**
-** always set flushall to 1, unless you are calling from inside
-** flush_journal_list
-**
-** IMPORTANT. This can only be called while there are no journal writers,
-** and the journal is locked. That means it can only be called from
-** do_journal_end, or by journal_release
-*/
+/*
+ * flush a journal list, both commit and real blocks
+ *
+ * always set flushall to 1, unless you are calling from inside
+ * flush_journal_list
+ *
+ * IMPORTANT. This can only be called while there are no journal writers,
+ * and the journal is locked. That means it can only be called from
+ * do_journal_end, or by journal_release
+ */
static int flush_journal_list(struct super_block *s,
struct reiserfs_journal_list *jl, int flushall)
{
@@ -1354,13 +1379,14 @@ static int flush_journal_list(struct super_block *s,
}
/* if all the work is already done, get out of here */
- if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
- atomic_read(&(jl->j_commit_left)) <= 0) {
+ if (atomic_read(&jl->j_nonzerolen) <= 0 &&
+ atomic_read(&jl->j_commit_left) <= 0) {
goto flush_older_and_return;
}
- /* start by putting the commit list on disk. This will also flush
- ** the commit lists of any olders transactions
+ /*
+ * start by putting the commit list on disk. This will also flush
+ * the commit lists of any older transactions
*/
flush_commit_list(s, jl, 1);
@@ -1369,15 +1395,16 @@ static int flush_journal_list(struct super_block *s,
BUG();
/* are we done now? */
- if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
- atomic_read(&(jl->j_commit_left)) <= 0) {
+ if (atomic_read(&jl->j_nonzerolen) <= 0 &&
+ atomic_read(&jl->j_commit_left) <= 0) {
goto flush_older_and_return;
}
- /* loop through each cnode, see if we need to write it,
- ** or wait on a more recent transaction, or just ignore it
+ /*
+ * loop through each cnode, see if we need to write it,
+ * or wait on a more recent transaction, or just ignore it
*/
- if (atomic_read(&(journal->j_wcount)) != 0) {
+ if (atomic_read(&journal->j_wcount) != 0) {
reiserfs_panic(s, "journal-844", "journal list is flushing, "
"wcount is not 0");
}
@@ -1391,20 +1418,25 @@ static int flush_journal_list(struct super_block *s,
goto free_cnode;
}
- /* This transaction failed commit. Don't write out to the disk */
+ /*
+ * This transaction failed commit.
+ * Don't write out to the disk
+ */
if (!(jl->j_state & LIST_DIRTY))
goto free_cnode;
pjl = find_newer_jl_for_cn(cn);
- /* the order is important here. We check pjl to make sure we
- ** don't clear BH_JDirty_wait if we aren't the one writing this
- ** block to disk
+ /*
+ * the order is important here. We check pjl to make sure we
+ * don't clear BH_JDirty_wait if we aren't the one writing this
+ * block to disk
*/
if (!pjl && cn->bh) {
saved_bh = cn->bh;
- /* we do this to make sure nobody releases the buffer while
- ** we are working with it
+ /*
+ * we do this to make sure nobody releases the
+ * buffer while we are working with it
*/
get_bh(saved_bh);
@@ -1413,13 +1445,17 @@ static int flush_journal_list(struct super_block *s,
was_jwait = 1;
was_dirty = 1;
} else if (can_dirty(cn)) {
- /* everything with !pjl && jwait should be writable */
+ /*
+ * everything with !pjl && jwait
+ * should be writable
+ */
BUG();
}
}
- /* if someone has this block in a newer transaction, just make
- ** sure they are committed, and don't try writing it to disk
+ /*
+ * if someone has this block in a newer transaction, just make
+ * sure they are committed, and don't try writing it to disk
*/
if (pjl) {
if (atomic_read(&pjl->j_commit_left))
@@ -1427,16 +1463,18 @@ static int flush_journal_list(struct super_block *s,
goto free_cnode;
}
- /* bh == NULL when the block got to disk on its own, OR,
- ** the block got freed in a future transaction
+ /*
+ * bh == NULL when the block got to disk on its own, OR,
+ * the block got freed in a future transaction
*/
if (saved_bh == NULL) {
goto free_cnode;
}
- /* this should never happen. kupdate_one_transaction has this list
- ** locked while it works, so we should never see a buffer here that
- ** is not marked JDirty_wait
+ /*
+ * this should never happen. kupdate_one_transaction has
+ * this list locked while it works, so we should never see a
+ * buffer here that is not marked JDirty_wait
*/
if ((!was_jwait) && !buffer_locked(saved_bh)) {
reiserfs_warning(s, "journal-813",
@@ -1447,7 +1485,10 @@ static int flush_journal_list(struct super_block *s,
was_jwait ? ' ' : '!');
}
if (was_dirty) {
- /* we inc again because saved_bh gets decremented at free_cnode */
+ /*
+ * we inc again because saved_bh gets decremented
+ * at free_cnode
+ */
get_bh(saved_bh);
set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
lock_buffer(saved_bh);
@@ -1463,13 +1504,16 @@ static int flush_journal_list(struct super_block *s,
(unsigned long long)saved_bh->
b_blocknr, __func__);
}
- free_cnode:
+free_cnode:
last = cn;
cn = cn->next;
if (saved_bh) {
- /* we incremented this to keep others from taking the buffer head away */
+ /*
+ * we incremented this to keep others from
+ * taking the buffer head away
+ */
put_bh(saved_bh);
- if (atomic_read(&(saved_bh->b_count)) < 0) {
+ if (atomic_read(&saved_bh->b_count) < 0) {
reiserfs_warning(s, "journal-945",
"saved_bh->b_count < 0");
}
@@ -1499,8 +1543,10 @@ static int flush_journal_list(struct super_block *s,
#endif
err = -EIO;
}
- /* note, we must clear the JDirty_wait bit after the up to date
- ** check, otherwise we race against our flushpage routine
+ /*
+ * note, we must clear the JDirty_wait bit
+ * after the up to date check, otherwise we
+ * race against our flushpage routine
*/
BUG_ON(!test_clear_buffer_journal_dirty
(cn->bh));
@@ -1518,25 +1564,27 @@ static int flush_journal_list(struct super_block *s,
reiserfs_abort(s, -EIO,
"Write error while pushing transaction to disk in %s",
__func__);
- flush_older_and_return:
+flush_older_and_return:
- /* before we can update the journal header block, we _must_ flush all
- ** real blocks from all older transactions to disk. This is because
- ** once the header block is updated, this transaction will not be
- ** replayed after a crash
+ /*
+ * before we can update the journal header block, we _must_ flush all
+ * real blocks from all older transactions to disk. This is because
+ * once the header block is updated, this transaction will not be
+ * replayed after a crash
*/
if (flushall) {
flush_older_journal_lists(s, jl);
}
err = journal->j_errno;
- /* before we can remove everything from the hash tables for this
- ** transaction, we must make sure it can never be replayed
- **
- ** since we are only called from do_journal_end, we know for sure there
- ** are no allocations going on while we are flushing journal lists. So,
- ** we only need to update the journal header block for the last list
- ** being flushed
+ /*
+ * before we can remove everything from the hash tables for this
+ * transaction, we must make sure it can never be replayed
+ *
+ * since we are only called from do_journal_end, we know for sure there
+ * are no allocations going on while we are flushing journal lists. So,
+ * we only need to update the journal header block for the last list
+ * being flushed
*/
if (!err && flushall) {
err =
@@ -1561,11 +1609,12 @@ static int flush_journal_list(struct super_block *s,
}
journal->j_last_flush_id = jl->j_trans_id;
- /* not strictly required since we are freeing the list, but it should
+ /*
+ * not strictly required since we are freeing the list, but it should
* help find code using dead lists later on
*/
jl->j_len = 0;
- atomic_set(&(jl->j_nonzerolen), 0);
+ atomic_set(&jl->j_nonzerolen, 0);
jl->j_start = 0;
jl->j_realblock = NULL;
jl->j_commit_bh = NULL;
@@ -1592,15 +1641,17 @@ static int write_one_transaction(struct super_block *s,
cn = jl->j_realblock;
while (cn) {
- /* if the blocknr == 0, this has been cleared from the hash,
- ** skip it
+ /*
+ * if the blocknr == 0, this has been cleared from the hash,
+ * skip it
*/
if (cn->blocknr == 0) {
goto next;
}
if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
struct buffer_head *tmp_bh;
- /* we can race against journal_mark_freed when we try
+ /*
+ * we can race against journal_mark_freed when we try
* to lock_buffer(cn->bh), so we have to inc the buffer
* count, and recheck things after locking
*/
@@ -1619,7 +1670,7 @@ static int write_one_transaction(struct super_block *s,
}
put_bh(tmp_bh);
}
- next:
+next:
cn = cn->next;
cond_resched();
}
@@ -1637,15 +1688,17 @@ static int dirty_one_transaction(struct super_block *s,
jl->j_state |= LIST_DIRTY;
cn = jl->j_realblock;
while (cn) {
- /* look for a more recent transaction that logged this
- ** buffer. Only the most recent transaction with a buffer in
- ** it is allowed to send that buffer to disk
+ /*
+ * look for a more recent transaction that logged this
+ * buffer. Only the most recent transaction with a buffer in
+ * it is allowed to send that buffer to disk
*/
pjl = find_newer_jl_for_cn(cn);
if (!pjl && cn->blocknr && cn->bh
&& buffer_journal_dirty(cn->bh)) {
BUG_ON(!can_dirty(cn));
- /* if the buffer is prepared, it will either be logged
+ /*
+ * if the buffer is prepared, it will either be logged
* or restored. If restored, we need to make sure
* it actually gets marked dirty
*/
@@ -1682,7 +1735,8 @@ static int kupdate_transactions(struct super_block *s,
goto done;
}
- /* we've got j_flush_mutex held, nobody is going to delete any
+ /*
+ * we've got j_flush_mutex held, nobody is going to delete any
* of these lists out from underneath us
*/
while ((num_trans && transactions_flushed < num_trans) ||
@@ -1716,20 +1770,21 @@ static int kupdate_transactions(struct super_block *s,
write_chunk(&chunk);
}
- done:
+done:
mutex_unlock(&journal->j_flush_mutex);
return ret;
}
-/* for o_sync and fsync heavy applications, they tend to use
-** all the journa list slots with tiny transactions. These
-** trigger lots and lots of calls to update the header block, which
-** adds seeks and slows things down.
-**
-** This function tries to clear out a large chunk of the journal lists
-** at once, which makes everything faster since only the newest journal
-** list updates the header block
-*/
+/*
+ * for o_sync and fsync heavy applications, they tend to use
+ * all the journal list slots with tiny transactions. These
+ * trigger lots and lots of calls to update the header block, which
+ * adds seeks and slows things down.
+ *
+ * This function tries to clear out a large chunk of the journal lists
+ * at once, which makes everything faster since only the newest journal
+ * list updates the header block
+ */
static int flush_used_journal_lists(struct super_block *s,
struct reiserfs_journal_list *jl)
{
@@ -1766,9 +1821,11 @@ static int flush_used_journal_lists(struct super_block *s,
}
get_journal_list(jl);
get_journal_list(flush_jl);
- /* try to find a group of blocks we can flush across all the
- ** transactions, but only bother if we've actually spanned
- ** across multiple lists
+
+ /*
+ * try to find a group of blocks we can flush across all the
+ * transactions, but only bother if we've actually spanned
+ * across multiple lists
*/
if (flush_jl != jl) {
ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
@@ -1780,9 +1837,9 @@ static int flush_used_journal_lists(struct super_block *s,
}
/*
-** removes any nodes in table with name block and dev as bh.
-** only touchs the hnext and hprev pointers.
-*/
+ * removes any nodes in table with name block and dev as bh.
+ * only touches the hnext and hprev pointers.
+ */
void remove_journal_hash(struct super_block *sb,
struct reiserfs_journal_cnode **table,
struct reiserfs_journal_list *jl,
@@ -1811,8 +1868,12 @@ void remove_journal_hash(struct super_block *sb,
cur->blocknr = 0;
cur->sb = NULL;
cur->state = 0;
- if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */
- atomic_dec(&(cur->jlist->j_nonzerolen));
+ /*
+ * anybody who clears the cur->bh will also
+ * dec the nonzerolen
+ */
+ if (cur->bh && cur->jlist)
+ atomic_dec(&cur->jlist->j_nonzerolen);
cur->bh = NULL;
cur->jlist = NULL;
}
@@ -1832,17 +1893,18 @@ static void free_journal_ram(struct super_block *sb)
if (journal->j_header_bh) {
brelse(journal->j_header_bh);
}
- /* j_header_bh is on the journal dev, make sure not to release the journal
- * dev until we brelse j_header_bh
+ /*
+ * j_header_bh is on the journal dev, make sure
+ * not to release the journal dev until we brelse j_header_bh
*/
release_journal_dev(sb, journal);
vfree(journal);
}
/*
-** call on unmount. Only set error to 1 if you haven't made your way out
-** of read_super() yet. Any other caller must keep error at 0.
-*/
+ * call on unmount. Only set error to 1 if you haven't made your way out
+ * of read_super() yet. Any other caller must keep error at 0.
+ */
static int do_journal_release(struct reiserfs_transaction_handle *th,
struct super_block *sb, int error)
{
@@ -1850,21 +1912,25 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
int flushed = 0;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
- /* we only want to flush out transactions if we were called with error == 0
+ /*
+ * we only want to flush out transactions if we were
+ * called with error == 0
*/
if (!error && !(sb->s_flags & MS_RDONLY)) {
/* end the current trans */
BUG_ON(!th->t_trans_id);
- do_journal_end(th, sb, 10, FLUSH_ALL);
+ do_journal_end(th, FLUSH_ALL);
- /* make sure something gets logged to force our way into the flush code */
- if (!journal_join(&myth, sb, 1)) {
+ /*
+ * make sure something gets logged to force
+ * our way into the flush code
+ */
+ if (!journal_join(&myth, sb)) {
reiserfs_prepare_for_journal(sb,
SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(&myth, sb,
- SB_BUFFER_WITH_SB(sb));
- do_journal_end(&myth, sb, 1, FLUSH_ALL);
+ journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
+ do_journal_end(&myth, FLUSH_ALL);
flushed = 1;
}
}
@@ -1872,17 +1938,15 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
/* this also catches errors during the do_journal_end above */
if (!error && reiserfs_is_journal_aborted(journal)) {
memset(&myth, 0, sizeof(myth));
- if (!journal_join_abort(&myth, sb, 1)) {
+ if (!journal_join_abort(&myth, sb)) {
reiserfs_prepare_for_journal(sb,
SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(&myth, sb,
- SB_BUFFER_WITH_SB(sb));
- do_journal_end(&myth, sb, 1, FLUSH_ALL);
+ journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
+ do_journal_end(&myth, FLUSH_ALL);
}
}
- reiserfs_mounted_fs_count--;
/* wait for all commits to finish */
cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
@@ -1893,12 +1957,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
reiserfs_write_unlock(sb);
cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
- flush_workqueue(commit_wq);
-
- if (!reiserfs_mounted_fs_count) {
- destroy_workqueue(commit_wq);
- commit_wq = NULL;
- }
+ flush_workqueue(REISERFS_SB(sb)->commit_wq);
free_journal_ram(sb);
@@ -1907,25 +1966,24 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
return 0;
}
-/*
-** call on unmount. flush all journal trans, release all alloc'd ram
-*/
+/* call on unmount. flush all journal trans, release all alloc'd ram */
int journal_release(struct reiserfs_transaction_handle *th,
struct super_block *sb)
{
return do_journal_release(th, sb, 0);
}
-/*
-** only call from an error condition inside reiserfs_read_super!
-*/
+/* only call from an error condition inside reiserfs_read_super! */
int journal_release_error(struct reiserfs_transaction_handle *th,
struct super_block *sb)
{
return do_journal_release(th, sb, 1);
}
-/* compares description block with commit block. returns 1 if they differ, 0 if they are the same */
+/*
+ * compares description block with commit block.
+ * returns 1 if they differ, 0 if they are the same
+ */
static int journal_compare_desc_commit(struct super_block *sb,
struct reiserfs_journal_desc *desc,
struct reiserfs_journal_commit *commit)
@@ -1939,11 +1997,12 @@ static int journal_compare_desc_commit(struct super_block *sb,
return 0;
}
-/* returns 0 if it did not find a description block
-** returns -1 if it found a corrupt commit block
-** returns 1 if both desc and commit were valid
-** NOTE: only called during fs mount
-*/
+/*
+ * returns 0 if it did not find a description block
+ * returns -1 if it found a corrupt commit block
+ * returns 1 if both desc and commit were valid
+ * NOTE: only called during fs mount
+ */
static int journal_transaction_is_valid(struct super_block *sb,
struct buffer_head *d_bh,
unsigned int *oldest_invalid_trans_id,
@@ -1989,7 +2048,10 @@ static int journal_transaction_is_valid(struct super_block *sb,
}
offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
- /* ok, we have a journal description block, lets see if the transaction was valid */
+ /*
+ * ok, we have a journal description block,
+ * let's see if the transaction was valid
+ */
c_bh =
journal_bread(sb,
SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
@@ -2041,11 +2103,11 @@ static void brelse_array(struct buffer_head **heads, int num)
}
/*
-** given the start, and values for the oldest acceptable transactions,
-** this either reads in a replays a transaction, or returns because the
-** transaction is invalid, or too old.
-** NOTE: only called during fs mount
-*/
+ * given the start, and values for the oldest acceptable transactions,
+ * this either reads in and replays a transaction, or returns because the
+ * transaction is invalid, or too old.
+ * NOTE: only called during fs mount
+ */
static int journal_read_transaction(struct super_block *sb,
unsigned long cur_dblock,
unsigned long oldest_start,
@@ -2119,7 +2181,10 @@ static int journal_read_transaction(struct super_block *sb,
}
trans_id = get_desc_trans_id(desc);
- /* now we know we've got a good transaction, and it was inside the valid time ranges */
+ /*
+ * now we know we've got a good transaction, and it was
+ * inside the valid time ranges
+ */
log_blocks = kmalloc(get_desc_trans_len(desc) *
sizeof(struct buffer_head *), GFP_NOFS);
real_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2164,7 +2229,7 @@ static int journal_read_transaction(struct super_block *sb,
reiserfs_warning(sb, "journal-1204",
"REPLAY FAILURE fsck required! "
"Trying to replay onto a log block");
- abort_replay:
+abort_replay:
brelse_array(log_blocks, i);
brelse_array(real_blocks, i);
brelse(c_bh);
@@ -2226,7 +2291,10 @@ static int journal_read_transaction(struct super_block *sb,
"journal-1095: setting journal " "start to offset %ld",
cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
- /* init starting values for the first transaction, in case this is the last transaction to be replayed. */
+ /*
+ * init starting values for the first transaction, in case
+ * this is the last transaction to be replayed.
+ */
journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
journal->j_last_flush_trans_id = trans_id;
journal->j_trans_id = trans_id + 1;
@@ -2240,12 +2308,14 @@ static int journal_read_transaction(struct super_block *sb,
return 0;
}
-/* This function reads blocks starting from block and to max_block of bufsize
- size (but no more than BUFNR blocks at a time). This proved to improve
- mounting speed on self-rebuilding raid5 arrays at least.
- Right now it is only used from journal code. But later we might use it
- from other places.
- Note: Do not use journal_getblk/sb_getblk functions here! */
+/*
+ * This function reads blocks starting from block and to max_block of bufsize
+ * size (but no more than BUFNR blocks at a time). This proved to improve
+ * mounting speed on self-rebuilding raid5 arrays at least.
+ * Right now it is only used from journal code. But later we might use it
+ * from other places.
+ * Note: Do not use journal_getblk/sb_getblk functions here!
+ */
static struct buffer_head *reiserfs_breada(struct block_device *dev,
b_blocknr_t block, int bufsize,
b_blocknr_t max_block)
@@ -2284,15 +2354,17 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
}
/*
-** read and replay the log
-** on a clean unmount, the journal header's next unflushed pointer will
-** be to an invalid transaction. This tests that before finding all the
-** transactions in the log, which makes normal mount times fast.
-** After a crash, this starts with the next unflushed transaction, and
-** replays until it finds one too old, or invalid.
-** On exit, it sets things up so the first transaction will work correctly.
-** NOTE: only called during fs mount
-*/
+ * read and replay the log
+ * on a clean unmount, the journal header's next unflushed pointer will be
+ * to an invalid transaction. This tests that before finding all the
+ * transactions in the log, which makes normal mount times fast.
+ *
+ * After a crash, this starts with the next unflushed transaction, and
+ * replays until it finds one too old, or invalid.
+ *
+ * On exit, it sets things up so the first transaction will work correctly.
+ * NOTE: only called during fs mount
+ */
static int journal_read(struct super_block *sb)
{
struct reiserfs_journal *journal = SB_JOURNAL(sb);
@@ -2316,9 +2388,10 @@ static int journal_read(struct super_block *sb)
bdevname(journal->j_dev_bd, b));
start = get_seconds();
- /* step 1, read in the journal header block. Check the transaction it says
- ** is the first unflushed, and if that transaction is not valid,
- ** replay is done
+ /*
+ * step 1, read in the journal header block. Check the transaction
+ * it says is the first unflushed, and if that transaction is not
+ * valid, replay is done
*/
journal->j_header_bh = journal_bread(sb,
SB_ONDISK_JOURNAL_1st_BLOCK(sb)
@@ -2342,9 +2415,10 @@ static int journal_read(struct super_block *sb)
le32_to_cpu(jh->j_last_flush_trans_id));
valid_journal_header = 1;
- /* now, we try to read the first unflushed offset. If it is not valid,
- ** there is nothing more we can do, and it makes no sense to read
- ** through the whole log.
+ /*
+ * now, we try to read the first unflushed offset. If it
+ * is not valid, there is nothing more we can do, and it
+ * makes no sense to read through the whole log.
*/
d_bh =
journal_bread(sb,
@@ -2358,15 +2432,19 @@ static int journal_read(struct super_block *sb)
goto start_log_replay;
}
- /* ok, there are transactions that need to be replayed. start with the first log block, find
- ** all the valid transactions, and pick out the oldest.
+ /*
+ * ok, there are transactions that need to be replayed. start
+ * with the first log block, find all the valid transactions, and
+ * pick out the oldest.
*/
while (continue_replay
&& cur_dblock <
(SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
SB_ONDISK_JOURNAL_SIZE(sb))) {
- /* Note that it is required for blocksize of primary fs device and journal
- device to be the same */
+ /*
+ * Note that it is required for blocksize of primary fs
+ * device and journal device to be the same
+ */
d_bh =
reiserfs_breada(journal->j_dev_bd, cur_dblock,
sb->s_blocksize,
@@ -2413,7 +2491,7 @@ static int journal_read(struct super_block *sb)
brelse(d_bh);
}
- start_log_replay:
+start_log_replay:
cur_dblock = oldest_start;
if (oldest_trans_id) {
reiserfs_debug(sb, REISERFS_DEBUG_CODE,
@@ -2444,9 +2522,11 @@ static int journal_read(struct super_block *sb)
reiserfs_debug(sb, REISERFS_DEBUG_CODE,
"journal-1225: No valid " "transactions found");
}
- /* j_start does not get set correctly if we don't replay any transactions.
- ** if we had a valid journal_header, set j_start to the first unflushed transaction value,
- ** copy the trans_id from the header
+ /*
+ * j_start does not get set correctly if we don't replay any
+ * transactions. if we had a valid journal_header, set j_start
+ * to the first unflushed transaction value, copy the trans_id
+ * from the header
*/
if (valid_journal_header && replay_count == 0) {
journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
@@ -2475,8 +2555,9 @@ static int journal_read(struct super_block *sb)
_update_journal_header_block(sb, journal->j_start,
journal->j_last_flush_trans_id)) {
reiserfs_write_unlock(sb);
- /* replay failed, caller must call free_journal_ram and abort
- ** the mount
+ /*
+ * replay failed, caller must call free_journal_ram and abort
+ * the mount
*/
return -1;
}
@@ -2569,7 +2650,7 @@ static int journal_init_dev(struct super_block *super,
return 0;
}
-/**
+/*
* When creating/tuning a file system user can assign some
* journal params within boundaries which depend on the ratio
* blocksize/standard_blocksize.
@@ -2587,8 +2668,7 @@ static int check_advise_trans_params(struct super_block *sb,
struct reiserfs_journal *journal)
{
if (journal->j_trans_max) {
- /* Non-default journal params.
- Do sanity check for them. */
+ /* Non-default journal params. Do sanity check for them. */
int ratio = 1;
if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
@@ -2610,10 +2690,12 @@ static int check_advise_trans_params(struct super_block *sb,
return 1;
}
} else {
- /* Default journal params.
- The file system was created by old version
- of mkreiserfs, so some fields contain zeros,
- and we need to advise proper values for them */
+ /*
+ * Default journal params.
+ * The file system was created by old version
+ * of mkreiserfs, so some fields contain zeros,
+ * and we need to advise proper values for them
+ */
if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
sb->s_blocksize);
@@ -2626,9 +2708,7 @@ static int check_advise_trans_params(struct super_block *sb,
return 0;
}
-/*
-** must be called once on fs mount. calls journal_read for you
-*/
+/* must be called once on fs mount. calls journal_read for you */
int journal_init(struct super_block *sb, const char *j_dev_name,
int old_format, unsigned int commit_max_age)
{
@@ -2667,8 +2747,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
REISERFS_DISK_OFFSET_IN_BYTES /
sb->s_blocksize + 2);
- /* Sanity check to see is the standard journal fitting within first bitmap
- (actual for small blocksizes) */
+ /*
+ * Sanity check to see if the standard journal fits
+ * within the first bitmap (relevant for small blocksizes)
+ */
if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
(SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
@@ -2754,20 +2836,20 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
journal->j_start = 0;
journal->j_len = 0;
journal->j_len_alloc = 0;
- atomic_set(&(journal->j_wcount), 0);
- atomic_set(&(journal->j_async_throttle), 0);
+ atomic_set(&journal->j_wcount, 0);
+ atomic_set(&journal->j_async_throttle, 0);
journal->j_bcount = 0;
journal->j_trans_start_time = 0;
journal->j_last = NULL;
journal->j_first = NULL;
- init_waitqueue_head(&(journal->j_join_wait));
+ init_waitqueue_head(&journal->j_join_wait);
mutex_init(&journal->j_mutex);
mutex_init(&journal->j_flush_mutex);
journal->j_trans_id = 10;
journal->j_mount_id = 10;
journal->j_state = 0;
- atomic_set(&(journal->j_jlock), 0);
+ atomic_set(&journal->j_jlock, 0);
journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
journal->j_cnode_free_orig = journal->j_cnode_free_list;
journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
@@ -2807,23 +2889,19 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
goto free_and_return;
}
- reiserfs_mounted_fs_count++;
- if (reiserfs_mounted_fs_count <= 1)
- commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
-
INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
journal->j_work_sb = sb;
return 0;
- free_and_return:
+free_and_return:
free_journal_ram(sb);
return 1;
}
/*
-** test for a polite end of the current transaction. Used by file_write, and should
-** be used by delete to make sure they don't write more than can fit inside a single
-** transaction
-*/
+ * test for a polite end of the current transaction. Used by file_write,
+ * and should be used by delete to make sure they don't write more than
+ * can fit inside a single transaction
+ */
int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
int new_alloc)
{
@@ -2835,7 +2913,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
return 0;
if (journal->j_must_wait > 0 ||
(journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
- atomic_read(&(journal->j_jlock)) ||
+ atomic_read(&journal->j_jlock) ||
(now - journal->j_trans_start_time) > journal->j_max_trans_age ||
journal->j_cnode_free < (journal->j_trans_max * 3)) {
return 1;
@@ -2846,8 +2924,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
return 0;
}
-/* this must be called inside a transaction
-*/
+/* this must be called inside a transaction */
void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
{
struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
@@ -2857,8 +2934,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
return;
}
-/* this must be called without a transaction started
-*/
+/* this must be called without a transaction started */
void reiserfs_allow_writes(struct super_block *s)
{
struct reiserfs_journal *journal = SB_JOURNAL(s);
@@ -2866,8 +2942,7 @@ void reiserfs_allow_writes(struct super_block *s)
wake_up(&journal->j_join_wait);
}
-/* this must be called without a transaction started
-*/
+/* this must be called without a transaction started */
void reiserfs_wait_on_write_block(struct super_block *s)
{
struct reiserfs_journal *journal = SB_JOURNAL(s);
@@ -2929,11 +3004,12 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
}
}
-/* join == true if you must join an existing transaction.
-** join == false if you can deal with waiting for others to finish
-**
-** this will block until the transaction is joinable. send the number of blocks you
-** expect to use in nblocks.
+/*
+ * join == true if you must join an existing transaction.
+ * join == false if you can deal with waiting for others to finish
+ *
+ * this will block until the transaction is joinable. send the number of
+ * blocks you expect to use in nblocks.
*/
static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
struct super_block *sb, unsigned long nblocks,
@@ -2955,7 +3031,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
th->t_refcount = 1;
th->t_super = sb;
- relock:
+relock:
lock_journal(sb);
if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
unlock_journal(sb);
@@ -2974,9 +3050,11 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
}
now = get_seconds();
- /* if there is no room in the journal OR
- ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning
- ** we don't sleep if there aren't other writers
+ /*
+ * if there is no room in the journal OR
+ * if this transaction is too old, and we weren't called joinable,
+ * wait for it to finish before beginning. we don't sleep if there
+ * aren't other writers
*/
if ((!join && journal->j_must_wait > 0) ||
@@ -2990,7 +3068,8 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
|| (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
old_trans_id = journal->j_trans_id;
- unlock_journal(sb); /* allow others to finish this transaction */
+ /* allow others to finish this transaction */
+ unlock_journal(sb);
if (!join && (journal->j_len_alloc + nblocks + 2) >=
journal->j_max_batch &&
@@ -3002,8 +3081,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
goto relock;
}
}
- /* don't mess with joining the transaction if all we have to do is
- * wait for someone else to do a commit
+ /*
+ * don't mess with joining the transaction if all we
+ * have to do is wait for someone else to do a commit
*/
if (atomic_read(&journal->j_jlock)) {
while (journal->j_trans_id == old_trans_id &&
@@ -3012,15 +3092,15 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
}
goto relock;
}
- retval = journal_join(&myth, sb, 1);
+ retval = journal_join(&myth, sb);
if (retval)
goto out_fail;
/* someone might have ended the transaction while we joined */
if (old_trans_id != journal->j_trans_id) {
- retval = do_journal_end(&myth, sb, 1, 0);
+ retval = do_journal_end(&myth, 0);
} else {
- retval = do_journal_end(&myth, sb, 1, COMMIT_NOW);
+ retval = do_journal_end(&myth, COMMIT_NOW);
}
if (retval)
@@ -3033,7 +3113,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
if (journal->j_trans_start_time == 0) {
journal->j_trans_start_time = get_seconds();
}
- atomic_inc(&(journal->j_wcount));
+ atomic_inc(&journal->j_wcount);
journal->j_len_alloc += nblocks;
th->t_blocks_logged = 0;
th->t_blocks_allocated = nblocks;
@@ -3042,11 +3122,13 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
INIT_LIST_HEAD(&th->t_list);
return 0;
- out_fail:
+out_fail:
memset(th, 0, sizeof(*th));
- /* Re-set th->t_super, so we can properly keep track of how many
+ /*
+ * Re-set th->t_super, so we can properly keep track of how many
* persistent transactions there are. We need to do this so if this
- * call is part of a failed restart_transaction, we can free it later */
+ * call is part of a failed restart_transaction, we can free it later
+ */
th->t_super = sb;
return retval;
}
@@ -3059,14 +3141,15 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
int ret;
struct reiserfs_transaction_handle *th;
- /* if we're nesting into an existing transaction. It will be
- ** persistent on its own
+ /*
+ * if we're nesting into an existing transaction, it will be
+ * persistent on its own
*/
if (reiserfs_transaction_running(s)) {
th = current->journal_info;
th->t_refcount++;
BUG_ON(th->t_refcount < 2);
-
+
return th;
}
th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
@@ -3087,7 +3170,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
struct super_block *s = th->t_super;
int ret = 0;
if (th->t_trans_id)
- ret = journal_end(th, th->t_super, th->t_blocks_allocated);
+ ret = journal_end(th);
else
ret = -EIO;
if (th->t_refcount == 0) {
@@ -3098,29 +3181,31 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
}
static int journal_join(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks)
+ struct super_block *sb)
{
struct reiserfs_transaction_handle *cur_th = current->journal_info;
- /* this keeps do_journal_end from NULLing out the current->journal_info
- ** pointer
+ /*
+ * this keeps do_journal_end from NULLing out the
+ * current->journal_info pointer
*/
th->t_handle_save = cur_th;
BUG_ON(cur_th && cur_th->t_refcount > 1);
- return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN);
+ return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
}
int journal_join_abort(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks)
+ struct super_block *sb)
{
struct reiserfs_transaction_handle *cur_th = current->journal_info;
- /* this keeps do_journal_end from NULLing out the current->journal_info
- ** pointer
+ /*
+ * this keeps do_journal_end from NULLing out the
+ * current->journal_info pointer
*/
th->t_handle_save = cur_th;
BUG_ON(cur_th && cur_th->t_refcount > 1);
- return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT);
+ return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
}
int journal_begin(struct reiserfs_transaction_handle *th,
@@ -3142,9 +3227,10 @@ int journal_begin(struct reiserfs_transaction_handle *th,
"journal_info != 0");
return 0;
} else {
- /* we've ended up with a handle from a different filesystem.
- ** save it and restore on journal_end. This should never
- ** really happen...
+ /*
+ * we've ended up with a handle from a different
+ * filesystem. save it and restore on journal_end.
+ * This should never really happen...
*/
reiserfs_warning(sb, "clm-2100",
"nesting info a different FS");
@@ -3157,9 +3243,10 @@ int journal_begin(struct reiserfs_transaction_handle *th,
ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
BUG_ON(current->journal_info != th);
- /* I guess this boils down to being the reciprocal of clm-2100 above.
- * If do_journal_begin_r fails, we need to put it back, since journal_end
- * won't be called to do it. */
+ /*
+ * I guess this boils down to being the reciprocal of clm-2100 above.
+ * If do_journal_begin_r fails, we need to put it back, since
+ * journal_end won't be called to do it. */
if (ret)
current->journal_info = th->t_handle_save;
else
@@ -3169,17 +3256,19 @@ int journal_begin(struct reiserfs_transaction_handle *th,
}
/*
-** puts bh into the current transaction. If it was already there, reorders removes the
-** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order).
-**
-** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the
-** transaction is committed.
-**
-** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
-*/
+ * puts bh into the current transaction. If it was already there, reorders
+ * it: removes the old pointers from the hash, and puts new ones in (to make
+ * sure replay happens in the right order).
+ *
+ * if it was dirty, cleans and files onto the clean list. I can't let it
+ * be dirty again until the transaction is committed.
+ *
+ * if j_len is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
+ */
int journal_mark_dirty(struct reiserfs_transaction_handle *th,
- struct super_block *sb, struct buffer_head *bh)
+ struct buffer_head *bh)
{
+ struct super_block *sb = th->t_super;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
struct reiserfs_journal_cnode *cn = NULL;
int count_already_incd = 0;
@@ -3201,9 +3290,10 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
return 0;
}
- /* this must be turned into a panic instead of a warning. We can't allow
- ** a dirty or journal_dirty or locked buffer to be logged, as some changes
- ** could get to disk too early. NOT GOOD.
+ /*
+ * this must be turned into a panic instead of a warning. We can't
+ * allow a dirty or journal_dirty or locked buffer to be logged, as
+ * some changes could get to disk too early. NOT GOOD.
*/
if (!prepared || buffer_dirty(bh)) {
reiserfs_warning(sb, "journal-1777",
@@ -3216,14 +3306,16 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
buffer_journal_dirty(bh) ? ' ' : '!');
}
- if (atomic_read(&(journal->j_wcount)) <= 0) {
+ if (atomic_read(&journal->j_wcount) <= 0) {
reiserfs_warning(sb, "journal-1409",
"returning because j_wcount was %d",
- atomic_read(&(journal->j_wcount)));
+ atomic_read(&journal->j_wcount));
return 1;
}
- /* this error means I've screwed up, and we've overflowed the transaction.
- ** Nothing can be done here, except make the FS readonly or panic.
+ /*
+ * this error means I've screwed up, and we've overflowed
+ * the transaction. Nothing can be done here, except make the
+ * FS readonly or panic.
*/
if (journal->j_len >= journal->j_trans_max) {
reiserfs_panic(th->t_super, "journal-1413",
@@ -3280,9 +3372,9 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
return 0;
}
-int journal_end(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks)
+int journal_end(struct reiserfs_transaction_handle *th)
{
+ struct super_block *sb = th->t_super;
if (!current->journal_info && th->t_refcount > 1)
reiserfs_warning(sb, "REISER-NESTING",
"th NULL, refcount %d", th->t_refcount);
@@ -3297,8 +3389,9 @@ int journal_end(struct reiserfs_transaction_handle *th,
struct reiserfs_transaction_handle *cur_th =
current->journal_info;
- /* we aren't allowed to close a nested transaction on a different
- ** filesystem from the one in the task struct
+ /*
+ * we aren't allowed to close a nested transaction on a
+ * different filesystem from the one in the task struct
*/
BUG_ON(cur_th->t_super != th->t_super);
@@ -3308,17 +3401,18 @@ int journal_end(struct reiserfs_transaction_handle *th,
}
return 0;
} else {
- return do_journal_end(th, sb, nblocks, 0);
+ return do_journal_end(th, 0);
}
}
-/* removes from the current transaction, relsing and descrementing any counters.
-** also files the removed buffer directly onto the clean list
-**
-** called by journal_mark_freed when a block has been deleted
-**
-** returns 1 if it cleaned and relsed the buffer. 0 otherwise
-*/
+/*
+ * removes from the current transaction, releasing and decrementing counters.
+ * also files the removed buffer directly onto the clean list
+ *
+ * called by journal_mark_freed when a block has been deleted
+ *
+ * returns 1 if it cleaned and released the buffer. 0 otherwise
+ */
static int remove_from_transaction(struct super_block *sb,
b_blocknr_t blocknr, int already_cleaned)
{
@@ -3354,7 +3448,7 @@ static int remove_from_transaction(struct super_block *sb,
clear_buffer_dirty(bh);
clear_buffer_journal_test(bh);
put_bh(bh);
- if (atomic_read(&(bh->b_count)) < 0) {
+ if (atomic_read(&bh->b_count) < 0) {
reiserfs_warning(sb, "journal-1752",
"b_count < 0");
}
@@ -3367,15 +3461,16 @@ static int remove_from_transaction(struct super_block *sb,
}
/*
-** for any cnode in a journal list, it can only be dirtied of all the
-** transactions that include it are committed to disk.
-** this checks through each transaction, and returns 1 if you are allowed to dirty,
-** and 0 if you aren't
-**
-** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log
-** blocks for a given transaction on disk
-**
-*/
+ * for any cnode in a journal list, it can only be dirtied if all the
+ * transactions that include it are committed to disk.
+ * this checks through each transaction, and returns 1 if you are allowed
+ * to dirty, and 0 if you aren't
+ *
+ * it is called by dirty_journal_list, which is called after
+ * flush_commit_list has gotten all the log blocks for a given
+ * transaction on disk
+ *
+ */
static int can_dirty(struct reiserfs_journal_cnode *cn)
{
struct super_block *sb = cn->sb;
@@ -3383,9 +3478,10 @@ static int can_dirty(struct reiserfs_journal_cnode *cn)
struct reiserfs_journal_cnode *cur = cn->hprev;
int can_dirty = 1;
- /* first test hprev. These are all newer than cn, so any node here
- ** with the same block number and dev means this node can't be sent
- ** to disk right now.
+ /*
+ * first test hprev. These are all newer than cn, so any node here
+ * with the same block number and dev means this node can't be sent
+ * to disk right now.
*/
while (cur && can_dirty) {
if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
@@ -3394,13 +3490,14 @@ static int can_dirty(struct reiserfs_journal_cnode *cn)
}
cur = cur->hprev;
}
- /* then test hnext. These are all older than cn. As long as they
- ** are committed to the log, it is safe to write cn to disk
+ /*
+ * then test hnext. These are all older than cn. As long as they
+ * are committed to the log, it is safe to write cn to disk
*/
cur = cn->hnext;
while (cur && can_dirty) {
if (cur->jlist && cur->jlist->j_len > 0 &&
- atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh &&
+ atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
can_dirty = 0;
}
@@ -3409,12 +3506,13 @@ static int can_dirty(struct reiserfs_journal_cnode *cn)
return can_dirty;
}
-/* syncs the commit blocks, but does not force the real buffers to disk
-** will wait until the current transaction is done/committed before returning
-*/
-int journal_end_sync(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks)
+/*
+ * syncs the commit blocks, but does not force the real buffers to disk
+ * will wait until the current transaction is done/committed before returning
+ */
+int journal_end_sync(struct reiserfs_transaction_handle *th)
{
+ struct super_block *sb = th->t_super;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
BUG_ON(!th->t_trans_id);
@@ -3423,14 +3521,12 @@ int journal_end_sync(struct reiserfs_transaction_handle *th,
if (journal->j_len == 0) {
reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
+ journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
}
- return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT);
+ return do_journal_end(th, COMMIT_NOW | WAIT);
}
-/*
-** writeback the pending async commits to disk
-*/
+/* writeback the pending async commits to disk */
static void flush_async_commits(struct work_struct *work)
{
struct reiserfs_journal *journal =
@@ -3450,9 +3546,9 @@ static void flush_async_commits(struct work_struct *work)
}
/*
-** flushes any old transactions to disk
-** ends the current transaction if it is too old
-*/
+ * flushes any old transactions to disk
+ * ends the current transaction if it is too old
+ */
void reiserfs_flush_old_commits(struct super_block *sb)
{
time_t now;
@@ -3460,48 +3556,53 @@ void reiserfs_flush_old_commits(struct super_block *sb)
struct reiserfs_journal *journal = SB_JOURNAL(sb);
now = get_seconds();
- /* safety check so we don't flush while we are replaying the log during
+ /*
+ * safety check so we don't flush while we are replaying the log during
* mount
*/
if (list_empty(&journal->j_journal_list))
return;
- /* check the current transaction. If there are no writers, and it is
+ /*
+ * check the current transaction. If there are no writers, and it is
* too old, finish it, and force the commit blocks to disk
*/
if (atomic_read(&journal->j_wcount) <= 0 &&
journal->j_trans_start_time > 0 &&
journal->j_len > 0 &&
(now - journal->j_trans_start_time) > journal->j_max_trans_age) {
- if (!journal_join(&th, sb, 1)) {
+ if (!journal_join(&th, sb)) {
reiserfs_prepare_for_journal(sb,
SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(&th, sb,
- SB_BUFFER_WITH_SB(sb));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
- /* we're only being called from kreiserfsd, it makes no sense to do
- ** an async commit so that kreiserfsd can do it later
+ /*
+ * we're only being called from kreiserfsd, it makes
+ * no sense to do an async commit so that kreiserfsd
+ * can do it later
*/
- do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT);
+ do_journal_end(&th, COMMIT_NOW | WAIT);
}
}
}
/*
-** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit
-**
-** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all
-** the writers are done. By the time it wakes up, the transaction it was called has already ended, so it just
-** flushes the commit list and returns 0.
-**
-** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait.
-**
-** Note, we can't allow the journal_end to proceed while there are still writers in the log.
-*/
-static int check_journal_end(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks,
- int flags)
+ * returns 0 if do_journal_end should return right away, returns 1 if
+ * do_journal_end should finish the commit
+ *
+ * if the current transaction is too old, but still has writers, this will
+ * wait on j_join_wait until all the writers are done. By the time it
+ * wakes up, the transaction it was called for has already ended, so it just
+ * flushes the commit list and returns 0.
+ *
+ * Won't batch when flush or commit_now is set. Also won't batch when
+ * others are waiting on j_join_wait.
+ *
+ * Note, we can't allow the journal_end to proceed while there are still
+ * writers in the log.
+ */
+static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
{
time_t now;
@@ -3509,6 +3610,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
int commit_now = flags & COMMIT_NOW;
int wait_on_commit = flags & WAIT;
struct reiserfs_journal_list *jl;
+ struct super_block *sb = th->t_super;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
BUG_ON(!th->t_trans_id);
@@ -3520,23 +3622,27 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
}
journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
- if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */
- atomic_dec(&(journal->j_wcount));
- }
+ /* <= 0 is allowed. unmounting might not call begin */
+ if (atomic_read(&journal->j_wcount) > 0)
+ atomic_dec(&journal->j_wcount);
- /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released
- ** will be dealt with by next transaction that actually writes something, but should be taken
- ** care of in this trans
+ /*
+ * BUG, deal with case where j_len is 0, but people previously
+ * freed blocks need to be released will be dealt with by next
+ * transaction that actually writes something, but should be taken
+ * care of in this trans
*/
BUG_ON(journal->j_len == 0);
- /* if wcount > 0, and we are called to with flush or commit_now,
- ** we wait on j_join_wait. We will wake up when the last writer has
- ** finished the transaction, and started it on its way to the disk.
- ** Then, we flush the commit or journal list, and just return 0
- ** because the rest of journal end was already done for this transaction.
+ /*
+ * if wcount > 0, and we are called with flush or commit_now,
+ * we wait on j_join_wait. We will wake up when the last writer has
+ * finished the transaction, and started it on its way to the disk.
+ * Then, we flush the commit or journal list, and just return 0
+ * because the rest of journal end was already done for this
+ * transaction.
*/
- if (atomic_read(&(journal->j_wcount)) > 0) {
+ if (atomic_read(&journal->j_wcount) > 0) {
if (flush || commit_now) {
unsigned trans_id;
@@ -3544,27 +3650,30 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
trans_id = jl->j_trans_id;
if (wait_on_commit)
jl->j_state |= LIST_COMMIT_PENDING;
- atomic_set(&(journal->j_jlock), 1);
+ atomic_set(&journal->j_jlock, 1);
if (flush) {
journal->j_next_full_flush = 1;
}
unlock_journal(sb);
- /* sleep while the current transaction is still j_jlocked */
+ /*
+ * sleep while the current transaction is
+ * still j_jlocked
+ */
while (journal->j_trans_id == trans_id) {
if (atomic_read(&journal->j_jlock)) {
queue_log_writer(sb);
} else {
lock_journal(sb);
if (journal->j_trans_id == trans_id) {
- atomic_set(&(journal->j_jlock),
+ atomic_set(&journal->j_jlock,
1);
}
unlock_journal(sb);
}
}
BUG_ON(journal->j_trans_id == trans_id);
-
+
if (commit_now
&& journal_list_still_alive(sb, trans_id)
&& wait_on_commit) {
@@ -3584,7 +3693,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
}
/* don't batch when someone is waiting on j_join_wait */
/* don't batch when syncing the commit or flushing the whole trans */
- if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock)))
+ if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
&& !flush && !commit_now && (journal->j_len < journal->j_max_batch)
&& journal->j_len_alloc < journal->j_max_batch
&& journal->j_cnode_free > (journal->j_trans_max * 3)) {
@@ -3602,19 +3711,22 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
}
/*
-** Does all the work that makes deleting blocks safe.
-** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on.
-**
-** otherwise:
-** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes
-** before this transaction has finished.
-**
-** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with
-** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash,
-** the block can't be reallocated yet.
-**
-** Then remove it from the current transaction, decrementing any counters and filing it on the clean list.
-*/
+ * Does all the work that makes deleting blocks safe.
+ * when deleting a block mark BH_JNew, just remove it from the current
+ * transaction, clean its buffer_head and move on.
+ *
+ * otherwise:
+ * set a bit for the block in the journal bitmap. That will prevent it from
+ * being allocated for unformatted nodes before this transaction has finished.
+ *
+ * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
+ * That will prevent any old transactions with this block from trying to flush
+ * to the real location. Since we aren't removing the cnode from the
+ * journal_list_hash, the block can't be reallocated yet.
+ *
+ * Then remove it from the current transaction, decrementing any counters and
+ * filing it on the clean list.
+ */
int journal_mark_freed(struct reiserfs_transaction_handle *th,
struct super_block *sb, b_blocknr_t blocknr)
{
@@ -3637,7 +3749,10 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
reiserfs_clean_and_file_buffer(bh);
cleaned = remove_from_transaction(sb, blocknr, cleaned);
} else {
- /* set the bit for this block in the journal bitmap for this transaction */
+ /*
+ * set the bit for this block in the journal bitmap
+ * for this transaction
+ */
jb = journal->j_current_jl->j_list_bitmap;
if (!jb) {
reiserfs_panic(sb, "journal-1702",
@@ -3653,17 +3768,22 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
}
cleaned = remove_from_transaction(sb, blocknr, cleaned);
- /* find all older transactions with this block, make sure they don't try to write it out */
+ /*
+ * find all older transactions with this block,
+ * make sure they don't try to write it out
+ */
cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
blocknr);
while (cn) {
if (sb == cn->sb && blocknr == cn->blocknr) {
set_bit(BLOCK_FREED, &cn->state);
if (cn->bh) {
+ /*
+ * remove_from_transaction will brelse
+ * the buffer if it was in the current
+ * trans
+ */
if (!cleaned) {
- /* remove_from_transaction will brelse the buffer if it was
- ** in the current trans
- */
clear_buffer_journal_dirty(cn->
bh);
clear_buffer_dirty(cn->bh);
@@ -3672,16 +3792,19 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
cleaned = 1;
put_bh(cn->bh);
if (atomic_read
- (&(cn->bh->b_count)) < 0) {
+ (&cn->bh->b_count) < 0) {
reiserfs_warning(sb,
"journal-2138",
"cn->bh->b_count < 0");
}
}
- if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */
- atomic_dec(&
- (cn->jlist->
- j_nonzerolen));
+ /*
+ * since we are clearing the bh,
+ * we MUST dec nonzerolen
+ */
+ if (cn->jlist) {
+ atomic_dec(&cn->jlist->
+ j_nonzerolen);
}
cn->bh = NULL;
}
@@ -3714,10 +3837,16 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id,
struct reiserfs_journal *journal = SB_JOURNAL(sb);
int ret = 0;
- /* is it from the current transaction, or from an unknown transaction? */
+ /*
+ * is it from the current transaction,
+ * or from an unknown transaction?
+ */
if (id == journal->j_trans_id) {
jl = journal->j_current_jl;
- /* try to let other writers come in and grow this transaction */
+ /*
+ * try to let other writers come in and
+ * grow this transaction
+ */
let_transaction_grow(sb, id);
if (journal->j_trans_id != id) {
goto flush_commit_only;
@@ -3731,21 +3860,22 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id,
if (journal->j_trans_id != id) {
reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb));
- ret = journal_end(&th, sb, 1);
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
+ ret = journal_end(&th);
goto flush_commit_only;
}
- ret = journal_end_sync(&th, sb, 1);
+ ret = journal_end_sync(&th);
if (!ret)
ret = 1;
} else {
- /* this gets tricky, we have to make sure the journal list in
+ /*
+ * this gets tricky, we have to make sure the journal list in
* the inode still exists. We know the list is still around
* if we've got a larger transaction id than the oldest list
*/
- flush_commit_only:
+flush_commit_only:
if (journal_list_still_alive(inode->i_sb, id)) {
/*
* we only set ret to 1 when we know for sure
@@ -3768,7 +3898,8 @@ int reiserfs_commit_for_inode(struct inode *inode)
unsigned int id = REISERFS_I(inode)->i_trans_id;
struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
- /* for the whole inode, assume unset id means it was
+ /*
+ * for the whole inode, assume unset id means it was
* changed in the current transaction. More conservative
*/
if (!id || !jl) {
@@ -3806,12 +3937,11 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb,
extern struct tree_balance *cur_tb;
/*
-** before we can change a metadata block, we have to make sure it won't
-** be written to disk while we are altering it. So, we must:
-** clean it
-** wait on it.
-**
-*/
+ * before we can change a metadata block, we have to make sure it won't
+ * be written to disk while we are altering it. So, we must:
+ * clean it
+ * wait on it.
+ */
int reiserfs_prepare_for_journal(struct super_block *sb,
struct buffer_head *bh, int wait)
{
@@ -3832,19 +3962,18 @@ int reiserfs_prepare_for_journal(struct super_block *sb,
}
/*
-** long and ugly. If flush, will not return until all commit
-** blocks and all real buffers in the trans are on disk.
-** If no_async, won't return until all commit blocks are on disk.
-**
-** keep reading, there are comments as you go along
-**
-** If the journal is aborted, we just clean up. Things like flushing
-** journal lists, etc just won't happen.
-*/
-static int do_journal_end(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks,
- int flags)
+ * long and ugly. If flush, will not return until all commit
+ * blocks and all real buffers in the trans are on disk.
+ * If no_async, won't return until all commit blocks are on disk.
+ *
+ * keep reading, there are comments as you go along
+ *
+ * If the journal is aborted, we just clean up. Things like flushing
+ * journal lists, etc just won't happen.
+ */
+static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
{
+ struct super_block *sb = th->t_super;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
struct reiserfs_journal_cnode *cn, *next, *jl_cn;
struct reiserfs_journal_cnode *last_cn = NULL;
@@ -3866,9 +3995,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
BUG_ON(th->t_refcount > 1);
BUG_ON(!th->t_trans_id);
+ BUG_ON(!th->t_super);
- /* protect flush_older_commits from doing mistakes if the
- transaction ID counter gets overflowed. */
+ /*
+ * protect flush_older_commits from doing mistakes if the
+ * transaction ID counter gets overflowed.
+ */
if (th->t_trans_id == ~0U)
flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
flush = flags & FLUSH_ALL;
@@ -3879,7 +4011,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
if (journal->j_len == 0) {
reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
+ journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
}
lock_journal(sb);
@@ -3892,10 +4024,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
wait_on_commit = 1;
}
- /* check_journal_end locks the journal, and unlocks if it does not return 1
- ** it tells us if we should continue with the journal_end, or just return
+ /*
+ * check_journal_end locks the journal, and unlocks if it does
+ * not return 1. It tells us if we should continue with the
+ * journal_end, or just return
*/
- if (!check_journal_end(th, sb, nblocks, flags)) {
+ if (!check_journal_end(th, flags)) {
reiserfs_schedule_old_flush(sb);
wake_queued_writers(sb);
reiserfs_async_progress_wait(sb);
@@ -3908,19 +4042,23 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
}
/*
- ** j must wait means we have to flush the log blocks, and the real blocks for
- ** this transaction
+ * j must wait means we have to flush the log blocks, and the
+ * real blocks for this transaction
*/
if (journal->j_must_wait > 0) {
flush = 1;
}
#ifdef REISERFS_PREALLOCATE
- /* quota ops might need to nest, setup the journal_info pointer for them
- * and raise the refcount so that it is > 0. */
+ /*
+ * quota ops might need to nest, setup the journal_info pointer
+ * for them and raise the refcount so that it is > 0.
+ */
current->journal_info = th;
th->t_refcount++;
- reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
- * the transaction */
+
+ /* it should not involve new blocks into the transaction */
+ reiserfs_discard_all_prealloc(th);
+
th->t_refcount--;
current->journal_info = th->t_handle_save;
#endif
@@ -3936,7 +4074,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
set_desc_trans_id(desc, journal->j_trans_id);
- /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */
+ /*
+ * setup commit block. Don't write (keep it clean too) this one
+ * until after everyone else is written
+ */
c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
((journal->j_start + journal->j_len +
1) % SB_ONDISK_JOURNAL_SIZE(sb)));
@@ -3948,7 +4089,8 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
/* init this journal list */
jl = journal->j_current_jl;
- /* we lock the commit before doing anything because
+ /*
+ * we lock the commit before doing anything because
* we want to make sure nobody tries to run flush_commit_list until
* the new transaction is fully setup, and we've already flushed the
* ordered bh list
@@ -3968,9 +4110,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
atomic_set(&jl->j_commit_left, journal->j_len + 2);
jl->j_realblock = NULL;
- /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
- ** for each real block, add it to the journal list hash,
- ** copy into real block index array in the commit or desc block
+ /*
+ * The ENTIRE FOR LOOP MUST not cause schedule to occur.
+ * for each real block, add it to the journal list hash,
+ * copy into real block index array in the commit or desc block
*/
trans_half = journal_trans_half(sb->s_blocksize);
for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
@@ -3989,9 +4132,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
last_cn->next = jl_cn;
}
last_cn = jl_cn;
- /* make sure the block we are trying to log is not a block
- of journal or reserved area */
-
+ /*
+ * make sure the block we are trying to log
+ * is not a block of journal or reserved area
+ */
if (is_block_in_log_or_reserved_area
(sb, cn->bh->b_blocknr)) {
reiserfs_panic(sb, "journal-2332",
@@ -4021,19 +4165,26 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
set_desc_trans_id(desc, journal->j_trans_id);
set_commit_trans_len(commit, journal->j_len);
- /* special check in case all buffers in the journal were marked for not logging */
+ /*
+ * special check in case all buffers in the journal
+ * were marked for not logging
+ */
BUG_ON(journal->j_len == 0);
- /* we're about to dirty all the log blocks, mark the description block
+ /*
+ * we're about to dirty all the log blocks, mark the description block
* dirty now too. Don't mark the commit block dirty until all the
* others are on disk
*/
mark_buffer_dirty(d_bh);
- /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
+ /*
+ * first data block is j_start + 1, so add one to
+ * cur_write_start wherever you use it
+ */
cur_write_start = journal->j_start;
cn = journal->j_first;
- jindex = 1; /* start at one so we don't get the desc again */
+ jindex = 1; /* start at one so we don't get the desc again */
while (cn) {
clear_buffer_journal_new(cn->bh);
/* copy all the real blocks into log area. dirty log blocks */
@@ -4059,7 +4210,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
set_buffer_journal_dirty(cn->bh);
clear_buffer_journaled(cn->bh);
} else {
- /* JDirty cleared sometime during transaction. don't log this one */
+ /*
+ * JDirty cleared sometime during transaction.
+ * don't log this one
+ */
reiserfs_warning(sb, "journal-2048",
"BAD, buffer in journal hash, "
"but not JDirty!");
@@ -4071,9 +4225,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
reiserfs_cond_resched(sb);
}
- /* we are done with both the c_bh and d_bh, but
- ** c_bh must be written after all other commit blocks,
- ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
+ /*
+ * we are done with both the c_bh and d_bh, but
+ * c_bh must be written after all other commit blocks,
+ * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
*/
journal->j_current_jl = alloc_journal_list(sb);
@@ -4088,7 +4243,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
journal->j_start =
(journal->j_start + journal->j_len +
2) % SB_ONDISK_JOURNAL_SIZE(sb);
- atomic_set(&(journal->j_wcount), 0);
+ atomic_set(&journal->j_wcount, 0);
journal->j_bcount = 0;
journal->j_last = NULL;
journal->j_first = NULL;
@@ -4104,15 +4259,18 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
journal->j_next_async_flush = 0;
init_journal_hash(sb);
- // make sure reiserfs_add_jh sees the new current_jl before we
- // write out the tails
+ /*
+ * make sure reiserfs_add_jh sees the new current_jl before we
+ * write out the tails
+ */
smp_mb();
- /* tail conversion targets have to hit the disk before we end the
+ /*
+ * tail conversion targets have to hit the disk before we end the
* transaction. Otherwise a later transaction might repack the tail
- * before this transaction commits, leaving the data block unflushed and
- * clean, if we crash before the later transaction commits, the data block
- * is lost.
+ * before this transaction commits, leaving the data block unflushed
+ * and clean, if we crash before the later transaction commits, the
+ * data block is lost.
*/
if (!list_empty(&jl->j_tail_bh_list)) {
depth = reiserfs_write_unlock_nested(sb);
@@ -4123,24 +4281,27 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
BUG_ON(!list_empty(&jl->j_tail_bh_list));
mutex_unlock(&jl->j_commit_mutex);
- /* honor the flush wishes from the caller, simple commits can
- ** be done outside the journal lock, they are done below
- **
- ** if we don't flush the commit list right now, we put it into
- ** the work queue so the people waiting on the async progress work
- ** queue don't wait for this proc to flush journal lists and such.
+ /*
+ * honor the flush wishes from the caller, simple commits can
+ * be done outside the journal lock, they are done below
+ *
+ * if we don't flush the commit list right now, we put it into
+ * the work queue so the people waiting on the async progress work
+ * queue don't wait for this proc to flush journal lists and such.
*/
if (flush) {
flush_commit_list(sb, jl, 1);
flush_journal_list(sb, jl, 1);
} else if (!(jl->j_state & LIST_COMMIT_PENDING))
- queue_delayed_work(commit_wq, &journal->j_work, HZ / 10);
+ queue_delayed_work(REISERFS_SB(sb)->commit_wq,
+ &journal->j_work, HZ / 10);
- /* if the next transaction has any chance of wrapping, flush
- ** transactions that might get overwritten. If any journal lists are very
- ** old flush them as well.
+ /*
+ * if the next transaction has any chance of wrapping, flush
+ * transactions that might get overwritten. If any journal lists
+ * are very old flush them as well.
*/
- first_jl:
+first_jl:
list_for_each_safe(entry, safe, &journal->j_journal_list) {
temp_jl = JOURNAL_LIST_ENTRY(entry);
if (journal->j_start <= temp_jl->j_start) {
@@ -4151,8 +4312,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
} else if ((journal->j_start +
journal->j_trans_max + 1) <
SB_ONDISK_JOURNAL_SIZE(sb)) {
- /* if we don't cross into the next transaction and we don't
- * wrap, there is no way we can overlap any later transactions
+ /*
+ * if we don't cross into the next
+ * transaction and we don't wrap, there is
+ * no way we can overlap any later transactions
* break now
*/
break;
@@ -4166,10 +4329,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
flush_used_journal_lists(sb, temp_jl);
goto first_jl;
} else {
- /* we don't overlap anything from out start to the end of the
- * log, and our wrapped portion doesn't overlap anything at
- * the start of the log. We can break
- */
+ /*
+ * we don't overlap anything from our start
+ * to the end of the log, and our wrapped
+ * portion doesn't overlap anything at
+ * the start of the log. We can break
+ */
break;
}
}
@@ -4183,23 +4348,25 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
"could not get a list bitmap");
}
- atomic_set(&(journal->j_jlock), 0);
+ atomic_set(&journal->j_jlock, 0);
unlock_journal(sb);
/* wake up any body waiting to join. */
clear_bit(J_WRITERS_QUEUED, &journal->j_state);
- wake_up(&(journal->j_join_wait));
+ wake_up(&journal->j_join_wait);
if (!flush && wait_on_commit &&
journal_list_still_alive(sb, commit_trans_id)) {
flush_commit_list(sb, jl, 1);
}
- out:
+out:
reiserfs_check_lock_depth(sb, "journal end2");
memset(th, 0, sizeof(*th));
- /* Re-set th->t_super, so we can properly keep track of how many
+ /*
+ * Re-set th->t_super, so we can properly keep track of how many
* persistent transactions there are. We need to do this so if this
- * call is part of a failed restart_transaction, we can free it later */
+ * call is part of a failed restart_transaction, we can free it later
+ */
th->t_super = sb;
return journal->j_errno;
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 79e5a8b4c226..d6744c8b24e1 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -8,46 +8,42 @@
#include "reiserfs.h"
#include <linux/buffer_head.h>
-/* these are used in do_balance.c */
-
-/* leaf_move_items
- leaf_shift_left
- leaf_shift_right
- leaf_delete_items
- leaf_insert_into_buf
- leaf_paste_in_buffer
- leaf_cut_from_buffer
- leaf_paste_entries
- */
-
-/* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */
+/*
+ * copy copy_count entries from source directory item to dest buffer
+ * (creating new item if needed)
+ */
static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
struct buffer_head *source, int last_first,
int item_num, int from, int copy_count)
{
struct buffer_head *dest = dest_bi->bi_bh;
- int item_num_in_dest; /* either the number of target item,
- or if we must create a new item,
- the number of the item we will
- create it next to */
+ /*
+ * either the number of target item, or if we must create a
+ * new item, the number of the item we will create it next to
+ */
+ int item_num_in_dest;
+
struct item_head *ih;
struct reiserfs_de_head *deh;
int copy_records_len; /* length of all records in item to be copied */
char *records;
- ih = B_N_PITEM_HEAD(source, item_num);
+ ih = item_head(source, item_num);
RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item");
- /* length of all record to be copied and first byte of the last of them */
+ /*
+ * length of all records to be copied and first byte of
+ * the last of them
+ */
deh = B_I_DEH(source, ih);
if (copy_count) {
- copy_records_len = (from ? deh_location(&(deh[from - 1])) :
+ copy_records_len = (from ? deh_location(&deh[from - 1]) :
ih_item_len(ih)) -
- deh_location(&(deh[from + copy_count - 1]));
+ deh_location(&deh[from + copy_count - 1]);
records =
source->b_data + ih_location(ih) +
- deh_location(&(deh[from + copy_count - 1]));
+ deh_location(&deh[from + copy_count - 1]);
} else {
copy_records_len = 0;
records = NULL;
@@ -59,12 +55,15 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest)
- 1);
- /* if there are no items in dest or the first/last item in dest is not item of the same directory */
+ /*
+ * if there are no items in dest or the first/last item in
+ * dest is not item of the same directory
+ */
if ((item_num_in_dest == -1) ||
(last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) ||
(last_first == LAST_TO_FIRST
&& comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key,
- B_N_PKEY(dest,
+ leaf_key(dest,
item_num_in_dest))))
{
/* create new item in dest */
@@ -80,16 +79,22 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
if (last_first == LAST_TO_FIRST) {
/* form key by the following way */
- if (from < I_ENTRY_COUNT(ih)) {
+ if (from < ih_entry_count(ih)) {
set_le_ih_k_offset(&new_ih,
- deh_offset(&(deh[from])));
- /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE); */
+ deh_offset(&deh[from]));
} else {
- /* no entries will be copied to this item in this function */
+ /*
+ * no entries will be copied to this
+ * item in this function
+ */
set_le_ih_k_offset(&new_ih, U32_MAX);
- /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */
+ /*
+ * this item is not yet valid, but we
+ * want I_IS_DIRECTORY_ITEM to return 1
+ * for it, so we -1
+ */
}
- set_le_key_k_type(KEY_FORMAT_3_5, &(new_ih.ih_key),
+ set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key,
TYPE_DIRENTRY);
}
@@ -113,36 +118,44 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
leaf_paste_entries(dest_bi, item_num_in_dest,
(last_first ==
- FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest,
+ FIRST_TO_LAST) ? ih_entry_count(item_head(dest,
item_num_in_dest))
: 0, copy_count, deh + from, records,
DEH_SIZE * copy_count + copy_records_len);
}
-/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or
- part of it or nothing (see the return 0 below) from SOURCE to the end
- (if last_first) or beginning (!last_first) of the DEST */
+/*
+ * Copy the first (if last_first == FIRST_TO_LAST) or last
+ * (last_first == LAST_TO_FIRST) item or part of it or nothing
+ * (see the return 0 below) from SOURCE to the end (if last_first)
+ * or beginning (!last_first) of the DEST
+ */
/* returns 1 if anything was copied, else 0 */
static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
struct buffer_head *src, int last_first,
int bytes_or_entries)
{
struct buffer_head *dest = dest_bi->bi_bh;
- int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */
+ /* number of items in the source and destination buffers */
+ int dest_nr_item, src_nr_item;
struct item_head *ih;
struct item_head *dih;
dest_nr_item = B_NR_ITEMS(dest);
+ /*
+ * if ( DEST is empty or first item of SOURCE and last item of
+ * DEST are the items of different objects or of different types )
+ * then there is no need to treat this item differently from the
+ * other items that we copy, so we return
+ */
if (last_first == FIRST_TO_LAST) {
- /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects
- or of different types ) then there is no need to treat this item differently from the other items
- that we copy, so we return */
- ih = B_N_PITEM_HEAD(src, 0);
- dih = B_N_PITEM_HEAD(dest, dest_nr_item - 1);
+ ih = item_head(src, 0);
+ dih = item_head(dest, dest_nr_item - 1);
+
+ /* there is nothing to merge */
if (!dest_nr_item
- || (!op_is_left_mergeable(&(ih->ih_key), src->b_size)))
- /* there is nothing to merge */
+ || (!op_is_left_mergeable(&ih->ih_key, src->b_size)))
return 0;
RFALSE(!ih_item_len(ih),
@@ -157,8 +170,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
return 1;
}
- /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST
- part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header
+ /*
+ * copy part of the body of the first item of SOURCE
+ * to the end of the body of the last item of the DEST
+ * part defined by 'bytes_or_entries'; if bytes_or_entries
+ * == -1 copy whole body; don't create new item header
*/
if (bytes_or_entries == -1)
bytes_or_entries = ih_item_len(ih);
@@ -176,11 +192,13 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
}
#endif
- /* merge first item (or its part) of src buffer with the last
- item of dest buffer. Both are of the same file */
+ /*
+ * merge first item (or its part) of src buffer with the last
+ * item of dest buffer. Both are of the same file
+ */
leaf_paste_in_buffer(dest_bi,
dest_nr_item - 1, ih_item_len(dih),
- bytes_or_entries, B_I_PITEM(src, ih), 0);
+ bytes_or_entries, ih_item_body(src, ih), 0);
if (is_indirect_le_ih(dih)) {
RFALSE(get_ih_free_space(dih),
@@ -195,19 +213,23 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
/* copy boundary item to right (last_first == LAST_TO_FIRST) */
- /* ( DEST is empty or last item of SOURCE and first item of DEST
- are the items of different object or of different types )
+ /*
+ * (DEST is empty or last item of SOURCE and first item of DEST
+ * are the items of different object or of different types)
*/
src_nr_item = B_NR_ITEMS(src);
- ih = B_N_PITEM_HEAD(src, src_nr_item - 1);
- dih = B_N_PITEM_HEAD(dest, 0);
+ ih = item_head(src, src_nr_item - 1);
+ dih = item_head(dest, 0);
- if (!dest_nr_item || !op_is_left_mergeable(&(dih->ih_key), src->b_size))
+ if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size))
return 0;
if (is_direntry_le_ih(ih)) {
+ /*
+ * bytes_or_entries = entries number in last
+ * item body of SOURCE
+ */
if (bytes_or_entries == -1)
- /* bytes_or_entries = entries number in last item body of SOURCE */
bytes_or_entries = ih_entry_count(ih);
leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
@@ -217,9 +239,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
return 1;
}
- /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST;
- part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST;
- don't create new item header
+ /*
+ * copy part of the body of the last item of SOURCE to the
+ * begin of the body of the first item of the DEST; part defined
+ * by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body;
+ * change first item key of the DEST; don't create new item header
*/
RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih),
@@ -270,15 +294,18 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
}
leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries,
- B_I_PITEM(src,
+ ih_item_body(src,
ih) + ih_item_len(ih) - bytes_or_entries,
0);
return 1;
}
-/* copy cpy_mun items from buffer src to buffer dest
- * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest
- * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest
+/*
+ * copy cpy_num items from buffer src to buffer dest
+ * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning
+ * from first-th item in src to tail of dest
+ * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning
+ * from first-th item in src to head of dest
*/
static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
struct buffer_head *src, int last_first,
@@ -311,11 +338,14 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
nr = blkh_nr_item(blkh);
free_space = blkh_free_space(blkh);
- /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */
+ /*
+ * we will insert items before 0-th or nr-th item in dest buffer.
+ * It depends of last_first parameter
+ */
dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
/* location of head of first new item */
- ih = B_N_PITEM_HEAD(dest, dest_before);
+ ih = item_head(dest, dest_before);
RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE,
"vs-10140: not enough free space for headers %d (needed %d)",
@@ -325,7 +355,7 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE);
/* copy item headers */
- memcpy(ih, B_N_PITEM_HEAD(src, first), cpy_num * IH_SIZE);
+ memcpy(ih, item_head(src, first), cpy_num * IH_SIZE);
free_space -= (IH_SIZE * cpy_num);
set_blkh_free_space(blkh, free_space);
@@ -338,8 +368,8 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
}
/* prepare space for items */
- last_loc = ih_location(&(ih[nr + cpy_num - 1 - dest_before]));
- last_inserted_loc = ih_location(&(ih[cpy_num - 1]));
+ last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]);
+ last_inserted_loc = ih_location(&ih[cpy_num - 1]);
/* check free space */
RFALSE(free_space < j - last_inserted_loc,
@@ -352,7 +382,8 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
/* copy items */
memcpy(dest->b_data + last_inserted_loc,
- B_N_PITEM(src, (first + cpy_num - 1)), j - last_inserted_loc);
+ item_body(src, (first + cpy_num - 1)),
+ j - last_inserted_loc);
/* sizes, item number */
set_blkh_nr_item(blkh, nr + cpy_num);
@@ -376,8 +407,10 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
}
}
-/* This function splits the (liquid) item into two items (useful when
- shifting part of an item into another node.) */
+/*
+ * This function splits the (liquid) item into two items (useful when
+ * shifting part of an item into another node.)
+ */
static void leaf_item_bottle(struct buffer_info *dest_bi,
struct buffer_head *src, int last_first,
int item_num, int cpy_bytes)
@@ -389,17 +422,22 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
"vs-10170: bytes == - 1 means: do not split item");
if (last_first == FIRST_TO_LAST) {
- /* if ( if item in position item_num in buffer SOURCE is directory item ) */
- ih = B_N_PITEM_HEAD(src, item_num);
+ /*
+ * if ( if item in position item_num in buffer SOURCE
+ * is directory item )
+ */
+ ih = item_head(src, item_num);
if (is_direntry_le_ih(ih))
leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
item_num, 0, cpy_bytes);
else {
struct item_head n_ih;
- /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST
- part defined by 'cpy_bytes'; create new item header; change old item_header (????);
- n_ih = new item_header;
+ /*
+ * copy part of the body of the item number 'item_num'
+ * of SOURCE to the end of the DEST part defined by
+ * 'cpy_bytes'; create new item header; change old
+ * item_header (????); n_ih = new item_header;
*/
memcpy(&n_ih, ih, IH_SIZE);
put_ih_item_len(&n_ih, cpy_bytes);
@@ -411,30 +449,36 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
set_ih_free_space(&n_ih, 0);
}
- RFALSE(op_is_left_mergeable(&(ih->ih_key), src->b_size),
+ RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size),
"vs-10190: bad mergeability of item %h", ih);
n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih,
- B_N_PITEM(src, item_num), 0);
+ item_body(src, item_num), 0);
}
} else {
- /* if ( if item in position item_num in buffer SOURCE is directory item ) */
- ih = B_N_PITEM_HEAD(src, item_num);
+ /*
+ * if ( if item in position item_num in buffer
+ * SOURCE is directory item )
+ */
+ ih = item_head(src, item_num);
if (is_direntry_le_ih(ih))
leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
item_num,
- I_ENTRY_COUNT(ih) - cpy_bytes,
+ ih_entry_count(ih) - cpy_bytes,
cpy_bytes);
else {
struct item_head n_ih;
- /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST
- part defined by 'cpy_bytes'; create new item header;
- n_ih = new item_header;
+ /*
+ * copy part of the body of the item number 'item_num'
+ * of SOURCE to the begin of the DEST part defined by
+ * 'cpy_bytes'; create new item header;
+ * n_ih = new item_header;
*/
memcpy(&n_ih, ih, SHORT_KEY_SIZE);
- n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
+ /* Endian safe, both le */
+ n_ih.ih_version = ih->ih_version;
if (is_direct_le_ih(ih)) {
set_le_ih_k_offset(&n_ih,
@@ -458,20 +502,22 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
/* set item length */
put_ih_item_len(&n_ih, cpy_bytes);
- n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
+ /* Endian safe, both le */
+ n_ih.ih_version = ih->ih_version;
leaf_insert_into_buf(dest_bi, 0, &n_ih,
- B_N_PITEM(src,
- item_num) +
- ih_item_len(ih) - cpy_bytes, 0);
+ item_body(src, item_num) +
+ ih_item_len(ih) - cpy_bytes, 0);
}
}
}
-/* If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE to DEST.
- If cpy_bytes not equal to minus one than copy cpy_num-1 whole items from SOURCE to DEST.
- From last item copy cpy_num bytes for regular item and cpy_num directory entries for
- directory item. */
+/*
+ * If cpy_bytes equals minus one then copy cpy_num whole items from SOURCE
+ * to DEST. If cpy_bytes is not equal to minus one then copy cpy_num-1 whole
+ * items from SOURCE to DEST. From the last item copy cpy_bytes bytes for a
+ * regular item and cpy_bytes directory entries for a directory item.
+ */
static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
int last_first, int cpy_num, int cpy_bytes)
{
@@ -498,22 +544,34 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
else
bytes = -1;
- /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */
+ /*
+ * copy the first item, or part of it, or nothing to the end of
+ * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes))
+ */
i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes);
cpy_num -= i;
if (cpy_num == 0)
return i;
pos += i;
if (cpy_bytes == -1)
- /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */
+ /*
+ * copy first cpy_num items starting from position
+ * 'pos' of SOURCE to end of DEST
+ */
leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
pos, cpy_num);
else {
- /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */
+ /*
+ * copy first cpy_num-1 items starting from position
+ * 'pos-1' of the SOURCE to the end of the DEST
+ */
leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
pos, cpy_num - 1);
- /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */
+ /*
+ * copy part of the item which number is
+ * cpy_num+pos-1 to the end of the DEST
+ */
leaf_item_bottle(dest_bi, src, FIRST_TO_LAST,
cpy_num + pos - 1, cpy_bytes);
}
@@ -525,7 +583,11 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
else
bytes = -1;
- /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */
+ /*
+ * copy the last item, or part of it, or nothing to the
+ * begin of the DEST
+ * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes));
+ */
i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes);
cpy_num -= i;
@@ -534,15 +596,24 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
pos = src_nr_item - cpy_num - i;
if (cpy_bytes == -1) {
- /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */
+ /*
+ * starting from position 'pos' copy last cpy_num
+ * items of SOURCE to begin of DEST
+ */
leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
pos, cpy_num);
} else {
- /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */
+ /*
+ * copy last cpy_num-1 items starting from position
+ * 'pos+1' of the SOURCE to the begin of the DEST;
+ */
leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
pos + 1, cpy_num - 1);
- /* copy part of the item which number is pos to the begin of the DEST */
+ /*
+ * copy part of the item which number is pos to
+ * the begin of the DEST
+ */
leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos,
cpy_bytes);
}
@@ -550,9 +621,11 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
return i;
}
-/* there are types of coping: from S[0] to L[0], from S[0] to R[0],
- from R[0] to L[0]. for each of these we have to define parent and
- positions of destination and source buffers */
+/*
+ * there are types of copying: from S[0] to L[0], from S[0] to R[0],
+ * from R[0] to L[0]. for each of these we have to define parent and
+ * positions of destination and source buffers
+ */
static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
struct buffer_info *dest_bi,
struct buffer_info *src_bi,
@@ -568,7 +641,9 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
src_bi->tb = tb;
src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
- src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); /* src->b_item_order */
+
+ /* src->b_item_order */
+ src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
dest_bi->tb = tb;
dest_bi->bi_bh = tb->L[0];
dest_bi->bi_parent = tb->FL[0];
@@ -633,8 +708,10 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
}
-/* copy mov_num items and mov_bytes of the (mov_num-1)th item to
- neighbor. Delete them from source */
+/*
+ * copy mov_num items and mov_bytes of the (mov_num-1)th item to
+ * neighbor. Delete them from source
+ */
int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
int mov_bytes, struct buffer_head *Snew)
{
@@ -657,18 +734,24 @@ int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
return ret_value;
}
-/* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1)
- from S[0] to L[0] and replace the delimiting key */
+/*
+ * Shift shift_num items (and shift_bytes of last shifted item if
+ * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key
+ */
int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
{
struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path);
int i;
- /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */
+ /*
+ * move shift_num (and shift_bytes bytes) items from S[0]
+ * to left neighbor L[0]
+ */
i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
if (shift_num) {
- if (B_NR_ITEMS(S0) == 0) { /* number of items in S[0] == 0 */
+ /* number of items in S[0] == 0 */
+ if (B_NR_ITEMS(S0) == 0) {
RFALSE(shift_bytes != -1,
"vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
@@ -691,10 +774,10 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0);
RFALSE((shift_bytes != -1 &&
- !(is_direntry_le_ih(B_N_PITEM_HEAD(S0, 0))
- && !I_ENTRY_COUNT(B_N_PITEM_HEAD(S0, 0)))) &&
+ !(is_direntry_le_ih(item_head(S0, 0))
+ && !ih_entry_count(item_head(S0, 0)))) &&
(!op_is_left_mergeable
- (B_N_PKEY(S0, 0), S0->b_size)),
+ (leaf_key(S0, 0), S0->b_size)),
"vs-10280: item must be mergeable");
}
}
@@ -704,13 +787,18 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
/* CLEANING STOPPED HERE */
-/* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */
+/*
+ * Shift shift_num (shift_bytes) items from S[0] to the right neighbor,
+ * and replace the delimiting key
+ */
int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
{
- // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path);
int ret_value;
- /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */
+ /*
+ * move shift_num (and shift_bytes) items from S[0] to
+ * right neighbor R[0]
+ */
ret_value =
leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
@@ -725,12 +813,16 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
static void leaf_delete_items_entirely(struct buffer_info *bi,
int first, int del_num);
-/* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR.
- If not.
- If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of
- the first item. Part defined by del_bytes. Don't delete first item header
- If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of
- the last item . Part defined by del_bytes. Don't delete last item header.
+/*
+ * If del_bytes == -1, starting from position 'first' delete del_num
+ * items in whole in buffer CUR.
+ * If not.
+ * If last_first == 0. Starting from position 'first' delete del_num-1
+ * items in whole. Delete part of body of the first item. Part defined by
+ * del_bytes. Don't delete first item header
+ * If last_first == 1. Starting from position 'first+1' delete del_num-1
+ * items in whole. Delete part of body of the last item . Part defined by
+ * del_bytes. Don't delete last item header.
*/
void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
int first, int del_num, int del_bytes)
@@ -761,32 +853,43 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
leaf_delete_items_entirely(cur_bi, first, del_num);
else {
if (last_first == FIRST_TO_LAST) {
- /* delete del_num-1 items beginning from item in position first */
+ /*
+ * delete del_num-1 items beginning from
+ * item in position first
+ */
leaf_delete_items_entirely(cur_bi, first, del_num - 1);
- /* delete the part of the first item of the bh
- do not delete item header
+ /*
+ * delete the part of the first item of the bh
+ * do not delete item header
*/
leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes);
} else {
struct item_head *ih;
int len;
- /* delete del_num-1 items beginning from item in position first+1 */
+ /*
+ * delete del_num-1 items beginning from
+ * item in position first+1
+ */
leaf_delete_items_entirely(cur_bi, first + 1,
del_num - 1);
- ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1);
+ ih = item_head(bh, B_NR_ITEMS(bh) - 1);
if (is_direntry_le_ih(ih))
/* the last item is directory */
- /* len = numbers of directory entries in this item */
+ /*
+ * len = numbers of directory entries
+ * in this item
+ */
len = ih_entry_count(ih);
else
/* len = body len of item */
len = ih_item_len(ih);
- /* delete the part of the last item of the bh
- do not delete item header
+ /*
+ * delete the part of the last item of the bh
+ * do not delete item header
*/
leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
len - del_bytes, del_bytes);
@@ -820,10 +923,10 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before,
zeros_number, ih_item_len(inserted_item_ih));
/* get item new item must be inserted before */
- ih = B_N_PITEM_HEAD(bh, before);
+ ih = item_head(bh, before);
/* prepare space for the body of new item */
- last_loc = nr ? ih_location(&(ih[nr - before - 1])) : bh->b_size;
+ last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size;
unmoved_loc = before ? ih_location(ih - 1) : bh->b_size;
memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih),
@@ -846,8 +949,8 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before,
/* change locations */
for (i = before; i < nr + 1; i++) {
- unmoved_loc -= ih_item_len(&(ih[i - before]));
- put_ih_location(&(ih[i - before]), unmoved_loc);
+ unmoved_loc -= ih_item_len(&ih[i - before]);
+ put_ih_location(&ih[i - before], unmoved_loc);
}
/* sizes, free space, item number */
@@ -867,8 +970,10 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before,
}
}
-/* paste paste_size bytes to affected_item_num-th item.
- When item is a directory, this only prepare space for new entries */
+/*
+ * paste paste_size bytes to affected_item_num-th item.
+ * When item is a directory, this only prepare space for new entries
+ */
void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
int pos_in_item, int paste_size,
const char *body, int zeros_number)
@@ -902,9 +1007,9 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
#endif /* CONFIG_REISERFS_CHECK */
/* item to be appended */
- ih = B_N_PITEM_HEAD(bh, affected_item_num);
+ ih = item_head(bh, affected_item_num);
- last_loc = ih_location(&(ih[nr - affected_item_num - 1]));
+ last_loc = ih_location(&ih[nr - affected_item_num - 1]);
unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size;
/* prepare space */
@@ -913,8 +1018,8 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
/* change locations */
for (i = affected_item_num; i < nr; i++)
- put_ih_location(&(ih[i - affected_item_num]),
- ih_location(&(ih[i - affected_item_num])) -
+ put_ih_location(&ih[i - affected_item_num],
+ ih_location(&ih[i - affected_item_num]) -
paste_size);
if (body) {
@@ -957,10 +1062,12 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
}
}
-/* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
- does not have free space, so it moves DEHs and remaining records as
- necessary. Return value is size of removed part of directory item
- in bytes. */
+/*
+ * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
+ * does not have free space, so it moves DEHs and remaining records as
+ * necessary. Return value is size of removed part of directory item
+ * in bytes.
+ */
static int leaf_cut_entries(struct buffer_head *bh,
struct item_head *ih, int from, int del_count)
{
@@ -971,12 +1078,14 @@ static int leaf_cut_entries(struct buffer_head *bh,
int cut_records_len; /* length of all removed records */
int i;
- /* make sure, that item is directory and there are enough entries to
- remove */
+ /*
+ * make sure that item is directory and there are enough entries to
+ * remove
+ */
RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");
- RFALSE(I_ENTRY_COUNT(ih) < from + del_count,
+ RFALSE(ih_entry_count(ih) < from + del_count,
"10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d",
- I_ENTRY_COUNT(ih), from, del_count);
+ ih_entry_count(ih), from, del_count);
if (del_count == 0)
return 0;
@@ -987,22 +1096,24 @@ static int leaf_cut_entries(struct buffer_head *bh,
/* entry head array */
deh = B_I_DEH(bh, ih);
- /* first byte of remaining entries, those are BEFORE cut entries
- (prev_record) and length of all removed records (cut_records_len) */
+ /*
+ * first byte of remaining entries, those are BEFORE cut entries
+ * (prev_record) and length of all removed records (cut_records_len)
+ */
prev_record_offset =
- (from ? deh_location(&(deh[from - 1])) : ih_item_len(ih));
+ (from ? deh_location(&deh[from - 1]) : ih_item_len(ih));
cut_records_len = prev_record_offset /*from_record */ -
- deh_location(&(deh[from + del_count - 1]));
+ deh_location(&deh[from + del_count - 1]);
prev_record = item + prev_record_offset;
/* adjust locations of remaining entries */
- for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i--)
- put_deh_location(&(deh[i]),
+ for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--)
+ put_deh_location(&deh[i],
deh_location(&deh[i]) -
(DEH_SIZE * del_count));
for (i = 0; i < from; i++)
- put_deh_location(&(deh[i]),
+ put_deh_location(&deh[i],
deh_location(&deh[i]) - (DEH_SIZE * del_count +
cut_records_len));
@@ -1021,14 +1132,15 @@ static int leaf_cut_entries(struct buffer_head *bh,
return DEH_SIZE * del_count + cut_records_len;
}
-/* when cut item is part of regular file
- pos_in_item - first byte that must be cut
- cut_size - number of bytes to be cut beginning from pos_in_item
-
- when cut item is part of directory
- pos_in_item - number of first deleted entry
- cut_size - count of deleted entries
- */
+/*
+ * when cut item is part of regular file
+ * pos_in_item - first byte that must be cut
+ * cut_size - number of bytes to be cut beginning from pos_in_item
+ *
+ * when cut item is part of directory
+ * pos_in_item - number of first deleted entry
+ * cut_size - count of deleted entries
+ */
void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
int pos_in_item, int cut_size)
{
@@ -1043,7 +1155,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
nr = blkh_nr_item(blkh);
/* item head of truncated item */
- ih = B_N_PITEM_HEAD(bh, cut_item_num);
+ ih = item_head(bh, cut_item_num);
if (is_direntry_le_ih(ih)) {
/* first cut entry () */
@@ -1055,7 +1167,6 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
cut_item_num);
/* change item key by key of first entry in the item */
set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih)));
- /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE); */
}
} else {
/* item is direct or indirect */
@@ -1089,7 +1200,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
}
/* location of the last item */
- last_loc = ih_location(&(ih[nr - cut_item_num - 1]));
+ last_loc = ih_location(&ih[nr - cut_item_num - 1]);
/* location of the item, which is remaining at the same place */
unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size;
@@ -1108,7 +1219,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
/* change locations */
for (i = cut_item_num; i < nr; i++)
- put_ih_location(&(ih[i - cut_item_num]),
+ put_ih_location(&ih[i - cut_item_num],
ih_location(&ih[i - cut_item_num]) + cut_size);
/* size, free space */
@@ -1156,14 +1267,14 @@ static void leaf_delete_items_entirely(struct buffer_info *bi,
return;
}
- ih = B_N_PITEM_HEAD(bh, first);
+ ih = item_head(bh, first);
/* location of unmovable item */
j = (first == 0) ? bh->b_size : ih_location(ih - 1);
/* delete items */
- last_loc = ih_location(&(ih[nr - 1 - first]));
- last_removed_loc = ih_location(&(ih[del_num - 1]));
+ last_loc = ih_location(&ih[nr - 1 - first]);
+ last_removed_loc = ih_location(&ih[del_num - 1]);
memmove(bh->b_data + last_loc + j - last_removed_loc,
bh->b_data + last_loc, last_removed_loc - last_loc);
@@ -1173,8 +1284,8 @@ static void leaf_delete_items_entirely(struct buffer_info *bi,
/* change item location */
for (i = first; i < nr - del_num; i++)
- put_ih_location(&(ih[i - first]),
- ih_location(&(ih[i - first])) + (j -
+ put_ih_location(&ih[i - first],
+ ih_location(&ih[i - first]) + (j -
last_removed_loc));
/* sizes, item number */
@@ -1195,7 +1306,10 @@ static void leaf_delete_items_entirely(struct buffer_info *bi,
}
}
-/* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */
+/*
+ * paste new_entry_count entries (new_dehs, records) into position
+ * before to item_num-th item
+ */
void leaf_paste_entries(struct buffer_info *bi,
int item_num,
int before,
@@ -1213,13 +1327,16 @@ void leaf_paste_entries(struct buffer_info *bi,
if (new_entry_count == 0)
return;
- ih = B_N_PITEM_HEAD(bh, item_num);
+ ih = item_head(bh, item_num);
- /* make sure, that item is directory, and there are enough records in it */
+ /*
+ * make sure, that item is directory, and there are enough
+ * records in it
+ */
RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item");
- RFALSE(I_ENTRY_COUNT(ih) < before,
+ RFALSE(ih_entry_count(ih) < before,
"10230: there are no entry we paste entries before. entry_count = %d, before = %d",
- I_ENTRY_COUNT(ih), before);
+ ih_entry_count(ih), before);
/* first byte of dest item */
item = bh->b_data + ih_location(ih);
@@ -1230,21 +1347,21 @@ void leaf_paste_entries(struct buffer_info *bi,
/* new records will be pasted at this point */
insert_point =
item +
- (before ? deh_location(&(deh[before - 1]))
+ (before ? deh_location(&deh[before - 1])
: (ih_item_len(ih) - paste_size));
/* adjust locations of records that will be AFTER new records */
- for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i--)
- put_deh_location(&(deh[i]),
- deh_location(&(deh[i])) +
+ for (i = ih_entry_count(ih) - 1; i >= before; i--)
+ put_deh_location(&deh[i],
+ deh_location(&deh[i]) +
(DEH_SIZE * new_entry_count));
/* adjust locations of records that will be BEFORE new records */
for (i = 0; i < before; i++)
- put_deh_location(&(deh[i]),
- deh_location(&(deh[i])) + paste_size);
+ put_deh_location(&deh[i],
+ deh_location(&deh[i]) + paste_size);
- old_entry_num = I_ENTRY_COUNT(ih);
+ old_entry_num = ih_entry_count(ih);
put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
/* prepare space for pasted records */
@@ -1266,10 +1383,10 @@ void leaf_paste_entries(struct buffer_info *bi,
/* set locations of new records */
for (i = 0; i < new_entry_count; i++) {
- put_deh_location(&(deh[i]),
- deh_location(&(deh[i])) +
+ put_deh_location(&deh[i],
+ deh_location(&deh[i]) +
(-deh_location
- (&(new_dehs[new_entry_count - 1])) +
+ (&new_dehs[new_entry_count - 1]) +
insert_point + DEH_SIZE * new_entry_count -
item));
}
@@ -1277,28 +1394,26 @@ void leaf_paste_entries(struct buffer_info *bi,
/* change item key if necessary (when we paste before 0-th entry */
if (!before) {
set_le_ih_k_offset(ih, deh_offset(new_dehs));
-/* memcpy (&ih->ih_key.k_offset,
- &new_dehs->deh_offset, SHORT_KEY_SIZE);*/
}
#ifdef CONFIG_REISERFS_CHECK
{
int prev, next;
/* check record locations */
deh = B_I_DEH(bh, ih);
- for (i = 0; i < I_ENTRY_COUNT(ih); i++) {
+ for (i = 0; i < ih_entry_count(ih); i++) {
next =
(i <
- I_ENTRY_COUNT(ih) -
- 1) ? deh_location(&(deh[i + 1])) : 0;
- prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0;
+ ih_entry_count(ih) -
+ 1) ? deh_location(&deh[i + 1]) : 0;
+ prev = (i != 0) ? deh_location(&deh[i - 1]) : 0;
- if (prev && prev <= deh_location(&(deh[i])))
+ if (prev && prev <= deh_location(&deh[i]))
reiserfs_error(sb_from_bi(bi), "vs-10240",
"directory item (%h) "
"corrupted (prev %a, "
"cur(%d) %a)",
ih, deh + i - 1, i, deh + i);
- if (next && next >= deh_location(&(deh[i])))
+ if (next && next >= deh_location(&deh[i]))
reiserfs_error(sb_from_bi(bi), "vs-10250",
"directory item (%h) "
"corrupted (cur(%d) %a, "
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index e825f8b63e6b..cd11358b10c7 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -22,8 +22,10 @@
#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
-// directory item contains array of entry headers. This performs
-// binary search through that array
+/*
+ * directory item contains array of entry headers. This performs
+ * binary search through that array
+ */
static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
{
struct item_head *ih = de->de_ih;
@@ -31,7 +33,7 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
int rbound, lbound, j;
lbound = 0;
- rbound = I_ENTRY_COUNT(ih) - 1;
+ rbound = ih_entry_count(ih) - 1;
for (j = (rbound + lbound) / 2; lbound <= rbound;
j = (rbound + lbound) / 2) {
@@ -43,7 +45,7 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
lbound = j + 1;
continue;
}
- // this is not name found, but matched third key component
+ /* this is not name found, but matched third key component */
de->de_entry_num = j;
return NAME_FOUND;
}
@@ -52,17 +54,21 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
return NAME_NOT_FOUND;
}
-// comment? maybe something like set de to point to what the path points to?
+/*
+ * comment? maybe something like set de to point to what the path points to?
+ */
static inline void set_de_item_location(struct reiserfs_dir_entry *de,
struct treepath *path)
{
de->de_bh = get_last_bh(path);
- de->de_ih = get_ih(path);
+ de->de_ih = tp_item_head(path);
de->de_deh = B_I_DEH(de->de_bh, de->de_ih);
de->de_item_num = PATH_LAST_POSITION(path);
}
-// de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
+/*
+ * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
+ */
inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
{
struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
@@ -71,17 +77,17 @@ inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num);
de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
- de->de_name = B_I_PITEM(de->de_bh, de->de_ih) + deh_location(deh);
+ de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh);
if (de->de_name[de->de_namelen - 1] == 0)
de->de_namelen = strlen(de->de_name);
}
-// what entry points to
+/* what entry points to */
static inline void set_de_object_key(struct reiserfs_dir_entry *de)
{
BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
- de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num]));
- de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num]));
+ de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]);
+ de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]);
}
static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
@@ -96,21 +102,20 @@ static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
le32_to_cpu(de->de_ih->ih_key.k_dir_id);
de->de_entry_key.on_disk_key.k_objectid =
le32_to_cpu(de->de_ih->ih_key.k_objectid);
- set_cpu_key_k_offset(&(de->de_entry_key), deh_offset(deh));
- set_cpu_key_k_type(&(de->de_entry_key), TYPE_DIRENTRY);
+ set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh));
+ set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY);
}
-/* We assign a key to each directory item, and place multiple entries
-in a single directory item. A directory item has a key equal to the
-key of the first directory entry in it.
-
-This function first calls search_by_key, then, if item whose first
-entry matches is not found it looks for the entry inside directory
-item found by search_by_key. Fills the path to the entry, and to the
-entry position in the item
-
-*/
-
+/*
+ * We assign a key to each directory item, and place multiple entries in a
+ * single directory item. A directory item has a key equal to the key of
+ * the first directory entry in it.
+ *
+ * This function first calls search_by_key, then, if item whose first entry
+ * matches is not found it looks for the entry inside directory item found
+ * by search_by_key. Fills the path to the entry, and to the entry position
+ * in the item
+ */
/* The function is NOT SCHEDULE-SAFE! */
int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
struct treepath *path, struct reiserfs_dir_entry *de)
@@ -144,7 +149,7 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
#ifdef CONFIG_REISERFS_CHECK
if (!is_direntry_le_ih(de->de_ih) ||
- COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) {
+ COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) {
print_block(de->de_bh, 0, -1, -1);
reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
"item or does not belong to the same directory "
@@ -152,12 +157,17 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
}
#endif /* CONFIG_REISERFS_CHECK */
- /* binary search in directory item by third componen t of the
- key. sets de->de_entry_num of de */
+ /*
+ * binary search in directory item by third component of the
+ * key. sets de->de_entry_num of de
+ */
retval = bin_search_in_dir_item(de, cpu_key_k_offset(key));
path->pos_in_item = de->de_entry_num;
if (retval != NAME_NOT_FOUND) {
- // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set
+ /*
+ * ugly, but rename needs de_bh, de_deh, de_name,
+ * de_namelen, de_objectid set
+ */
set_de_name_and_namelen(de);
set_de_object_key(de);
}
@@ -166,11 +176,12 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */
-/* The third component is hashed, and you can choose from more than
- one hash function. Per directory hashes are not yet implemented
- but are thought about. This function should be moved to hashes.c
- Jedi, please do so. -Hans */
-
+/*
+ * The third component is hashed, and you can choose from more than
+ * one hash function. Per directory hashes are not yet implemented
+ * but are thought about. This function should be moved to hashes.c
+ * Jedi, please do so. -Hans
+ */
static __u32 get_third_component(struct super_block *s,
const char *name, int len)
{
@@ -183,11 +194,13 @@ static __u32 get_third_component(struct super_block *s,
res = REISERFS_SB(s)->s_hash_function(name, len);
- // take bits from 7-th to 30-th including both bounds
+ /* take bits from 7-th to 30-th including both bounds */
res = GET_HASH_VALUE(res);
if (res == 0)
- // needed to have no names before "." and ".." those have hash
- // value == 0 and generation conters 1 and 2 accordingly
+ /*
+ * needed to have no names before "." and ".." those have hash
+ * value == 0 and generation counters 1 and 2 accordingly
+ */
res = 128;
return res + MAX_GENERATION_NUMBER;
}
@@ -208,7 +221,7 @@ static int reiserfs_match(struct reiserfs_dir_entry *de,
/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
- /* used when hash collisions exist */
+/* used when hash collisions exist */
static int linear_search_in_dir_item(struct cpu_key *key,
struct reiserfs_dir_entry *de,
@@ -220,7 +233,7 @@ static int linear_search_in_dir_item(struct cpu_key *key,
i = de->de_entry_num;
- if (i == I_ENTRY_COUNT(de->de_ih) ||
+ if (i == ih_entry_count(de->de_ih) ||
GET_HASH_VALUE(deh_offset(deh + i)) !=
GET_HASH_VALUE(cpu_key_k_offset(key))) {
i--;
@@ -232,43 +245,50 @@ static int linear_search_in_dir_item(struct cpu_key *key,
deh += i;
for (; i >= 0; i--, deh--) {
+ /* hash value does not match, no need to check whole name */
if (GET_HASH_VALUE(deh_offset(deh)) !=
GET_HASH_VALUE(cpu_key_k_offset(key))) {
- // hash value does not match, no need to check whole name
return NAME_NOT_FOUND;
}
- /* mark, that this generation number is used */
+ /* mark that this generation number is used */
if (de->de_gen_number_bit_string)
set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
de->de_gen_number_bit_string);
- // calculate pointer to name and namelen
+ /* calculate pointer to name and namelen */
de->de_entry_num = i;
set_de_name_and_namelen(de);
+ /*
+ * de's de_name, de_namelen, de_recordlen are set.
+ * Fill the rest.
+ */
if ((retval =
reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) {
- // de's de_name, de_namelen, de_recordlen are set. Fill the rest:
- // key of pointed object
+ /* key of pointed object */
set_de_object_key(de);
store_de_entry_key(de);
- // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE
+ /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */
return retval;
}
}
if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0)
- /* we have reached left most entry in the node. In common we
- have to go to the left neighbor, but if generation counter
- is 0 already, we know for sure, that there is no name with
- the same hash value */
- // FIXME: this work correctly only because hash value can not
- // be 0. Btw, in case of Yura's hash it is probably possible,
- // so, this is a bug
+ /*
+ * we have reached the leftmost entry in the node. In general we
+ * have to go to the left neighbor, but if generation counter
+ * is 0 already, we know for sure, that there is no name with
+ * the same hash value
+ */
+ /*
+ * FIXME: this works correctly only because hash value can not
+ * be 0. Btw, in case of Yura's hash it is probably possible,
+ * so, this is a bug
+ */
return NAME_NOT_FOUND;
RFALSE(de->de_item_num,
@@ -277,8 +297,10 @@ static int linear_search_in_dir_item(struct cpu_key *key,
return GOTO_PREVIOUS_ITEM;
}
-// may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
-// FIXME: should add something like IOERROR
+/*
+ * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
+ * FIXME: should add something like IOERROR
+ */
static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
struct treepath *path_to_entry,
struct reiserfs_dir_entry *de)
@@ -307,13 +329,19 @@ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
retval =
linear_search_in_dir_item(&key_to_search, de, name,
namelen);
+ /*
+ * there is no need to scan directory anymore.
+ * Given entry found or does not exist
+ */
if (retval != GOTO_PREVIOUS_ITEM) {
- /* there is no need to scan directory anymore. Given entry found or does not exist */
path_to_entry->pos_in_item = de->de_entry_num;
return retval;
}
- /* there is left neighboring item of this directory and given entry can be there */
+ /*
+ * there is left neighboring item of this directory
+ * and given entry can be there
+ */
set_cpu_key_k_offset(&key_to_search,
le_ih_k_offset(de->de_ih) - 1);
pathrelse(path_to_entry);
@@ -341,14 +369,16 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
pathrelse(&path_to_entry);
if (retval == NAME_FOUND) {
inode = reiserfs_iget(dir->i_sb,
- (struct cpu_key *)&(de.de_dir_id));
+ (struct cpu_key *)&de.de_dir_id);
if (!inode || IS_ERR(inode)) {
reiserfs_write_unlock(dir->i_sb);
return ERR_PTR(-EACCES);
}
- /* Propagate the private flag so we know we're
- * in the priv tree */
+ /*
+ * Propagate the private flag so we know we're
+ * in the priv tree
+ */
if (IS_PRIVATE(dir))
inode->i_flags |= S_PRIVATE;
}
@@ -361,9 +391,9 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
}
/*
-** looks up the dentry of the parent directory for child.
-** taken from ext2_get_parent
-*/
+ * looks up the dentry of the parent directory for child.
+ * taken from ext2_get_parent
+ */
struct dentry *reiserfs_get_parent(struct dentry *child)
{
int retval;
@@ -384,7 +414,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child)
reiserfs_write_unlock(dir->i_sb);
return ERR_PTR(-ENOENT);
}
- inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
+ inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id);
reiserfs_write_unlock(dir->i_sb);
return d_obtain_alias(inode);
@@ -406,8 +436,13 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
struct reiserfs_dir_entry de;
DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1);
int gen_number;
- char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc
- if we create file with short name */
+
+ /*
+ * 48 bytes now and we avoid kmalloc if we
+ * create file with short name
+ */
+ char small_buf[32 + DEH_SIZE];
+
char *buffer;
int buflen, paste_size;
int retval;
@@ -439,21 +474,30 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
(get_inode_sd_version(dir) ==
STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
- /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */
+ /*
+ * fill buffer : directory entry head, name[, dir objectid | ,
+ * stat data | ,stat data, dir objectid ]
+ */
deh = (struct reiserfs_de_head *)buffer;
deh->deh_location = 0; /* JDM Endian safe if 0 */
put_deh_offset(deh, cpu_key_k_offset(&entry_key));
deh->deh_state = 0; /* JDM Endian safe if 0 */
/* put key (ino analog) to de */
- deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; /* safe: k_dir_id is le */
- deh->deh_objectid = INODE_PKEY(inode)->k_objectid; /* safe: k_objectid is le */
+
+ /* safe: k_dir_id is le */
+ deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id;
+ /* safe: k_objectid is le */
+ deh->deh_objectid = INODE_PKEY(inode)->k_objectid;
/* copy name */
memcpy((char *)(deh + 1), name, namelen);
/* padd by 0s to the 4 byte boundary */
padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen);
- /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */
+ /*
+ * entry is ready to be pasted into tree, set 'visibility'
+ * and 'stat data in entry' attributes
+ */
mark_de_without_sd(deh);
visible ? mark_de_visible(deh) : mark_de_hidden(deh);
@@ -499,7 +543,8 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
/* update max-hash-collisions counter in reiserfs_sb_info */
PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number);
- if (gen_number != 0) { /* we need to re-search for the insertion point */
+ /* we need to re-search for the insertion point */
+ if (gen_number != 0) {
if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
NAME_NOT_FOUND) {
reiserfs_warning(dir->i_sb, "vs-7032",
@@ -527,18 +572,19 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
dir->i_size += paste_size;
dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
if (!S_ISDIR(inode->i_mode) && visible)
- // reiserfs_mkdir or reiserfs_rename will do that by itself
+ /* reiserfs_mkdir or reiserfs_rename will do that by itself */
reiserfs_update_sd(th, dir);
reiserfs_check_path(&path);
return 0;
}
-/* quota utility function, call if you've had to abort after calling
-** new_inode_init, and have not called reiserfs_new_inode yet.
-** This should only be called on inodes that do not have stat data
-** inserted into the tree yet.
-*/
+/*
+ * quota utility function, call if you've had to abort after calling
+ * new_inode_init, and have not called reiserfs_new_inode yet.
+ * This should only be called on inodes that do not have stat data
+ * inserted into the tree yet.
+ */
static int drop_new_inode(struct inode *inode)
{
dquot_drop(inode);
@@ -548,18 +594,23 @@ static int drop_new_inode(struct inode *inode)
return 0;
}
-/* utility function that does setup for reiserfs_new_inode.
-** dquot_initialize needs lots of credits so it's better to have it
-** outside of a transaction, so we had to pull some bits of
-** reiserfs_new_inode out into this func.
-*/
+/*
+ * utility function that does setup for reiserfs_new_inode.
+ * dquot_initialize needs lots of credits so it's better to have it
+ * outside of a transaction, so we had to pull some bits of
+ * reiserfs_new_inode out into this func.
+ */
static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
{
- /* Make inode invalid - just in case we are going to drop it before
- * the initialization happens */
+ /*
+ * Make inode invalid - just in case we are going to drop it before
+ * the initialization happens
+ */
INODE_PKEY(inode)->k_objectid = 0;
- /* the quota init calls have to know who to charge the quota to, so
- ** we have to set uid and gid here
+
+ /*
+ * the quota init calls have to know who to charge the quota to, so
+ * we have to set uid and gid here
*/
inode_init_owner(inode, dir, mode);
dquot_initialize(inode);
@@ -571,7 +622,10 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
{
int retval;
struct inode *inode;
- /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+ /*
+ * We need blocks for transaction + (user+group)*(quotas
+ * for new inode + update of quota for directory owner)
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 2 +
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
@@ -618,7 +672,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
int err;
drop_nlink(inode);
reiserfs_update_sd(&th, inode);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
if (err)
retval = err;
unlock_new_inode(inode);
@@ -630,9 +684,9 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
unlock_new_inode(inode);
d_instantiate(dentry, inode);
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
- out_failed:
+out_failed:
reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -644,7 +698,10 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
struct inode *inode;
struct reiserfs_transaction_handle th;
struct reiserfs_security_handle security;
- /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+ /*
+ * We need blocks for transaction + (user+group)*(quotas
+ * for new inode + update of quota for directory owner)
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 +
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
@@ -685,7 +742,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
inode->i_op = &reiserfs_special_inode_operations;
init_special_inode(inode, inode->i_mode, rdev);
- //FIXME: needed for block and char devices only
+ /* FIXME: needed for block and char devices only */
reiserfs_update_sd(&th, inode);
reiserfs_update_inode_transaction(inode);
@@ -698,7 +755,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
int err;
drop_nlink(inode);
reiserfs_update_sd(&th, inode);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
if (err)
retval = err;
unlock_new_inode(inode);
@@ -708,9 +765,9 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
unlock_new_inode(inode);
d_instantiate(dentry, inode);
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
- out_failed:
+out_failed:
reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -721,7 +778,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
struct inode *inode;
struct reiserfs_transaction_handle th;
struct reiserfs_security_handle security;
- /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+ /*
+ * We need blocks for transaction + (user+group)*(quotas
+ * for new inode + update of quota for directory owner)
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 +
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
@@ -730,7 +790,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
dquot_initialize(dir);
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */
+ /*
+ * set flag that new packing locality created and new blocks
+ * for the content of that directory are not displaced yet
+ */
REISERFS_I(dir)->new_packing_locality = 1;
#endif
mode = S_IFDIR | mode;
@@ -754,8 +817,9 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
goto out_failed;
}
- /* inc the link count now, so another writer doesn't overflow it while
- ** we sleep later on.
+ /*
+ * inc the link count now, so another writer doesn't overflow
+ * it while we sleep later on.
*/
INC_DIR_INODE_NLINK(dir)
@@ -774,7 +838,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
inode->i_op = &reiserfs_dir_inode_operations;
inode->i_fop = &reiserfs_dir_operations;
- // note, _this_ add_entry will not update dir's stat data
+ /* note, _this_ add_entry will not update dir's stat data */
retval =
reiserfs_add_entry(&th, dir, dentry->d_name.name,
dentry->d_name.len, inode, 1 /*visible */ );
@@ -783,19 +847,19 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
clear_nlink(inode);
DEC_DIR_INODE_NLINK(dir);
reiserfs_update_sd(&th, inode);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
if (err)
retval = err;
unlock_new_inode(inode);
iput(inode);
goto out_failed;
}
- // the above add_entry did not update dir's stat data
+ /* the above add_entry did not update dir's stat data */
reiserfs_update_sd(&th, dir);
unlock_new_inode(inode);
d_instantiate(dentry, inode);
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
out_failed:
reiserfs_write_unlock(dir->i_sb);
return retval;
@@ -803,10 +867,11 @@ out_failed:
static inline int reiserfs_empty_dir(struct inode *inode)
{
- /* we can cheat because an old format dir cannot have
- ** EMPTY_DIR_SIZE, and a new format dir cannot have
- ** EMPTY_DIR_SIZE_V1. So, if the inode is either size,
- ** regardless of disk format version, the directory is empty.
+ /*
+ * we can cheat because an old format dir cannot have
+ * EMPTY_DIR_SIZE, and a new format dir cannot have
+ * EMPTY_DIR_SIZE_V1. So, if the inode is either size,
+ * regardless of disk format version, the directory is empty.
*/
if (inode->i_size != EMPTY_DIR_SIZE &&
inode->i_size != EMPTY_DIR_SIZE_V1) {
@@ -824,10 +889,12 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
INITIALIZE_PATH(path);
struct reiserfs_dir_entry de;
- /* we will be doing 2 balancings and update 2 stat data, we change quotas
- * of the owner of the directory and of the owner of the parent directory.
- * The quota structure is possibly deleted only on last iput => outside
- * of this transaction */
+ /*
+ * we will be doing 2 balancings and update 2 stat data, we
+ * change quotas of the owner of the directory and of the owner
+ * of the parent directory. The quota structure is possibly
+ * deleted only on last iput => outside of this transaction
+ */
jbegin_count =
JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
@@ -856,8 +923,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
reiserfs_update_inode_transaction(dir);
if (de.de_objectid != inode->i_ino) {
- // FIXME: compare key of an object and a key found in the
- // entry
+ /*
+ * FIXME: compare key of an object and a key found in the entry
+ */
retval = -EIO;
goto end_rmdir;
}
@@ -867,7 +935,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
}
/* cut entry from dir directory */
- retval = reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, /* page */
+ retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key,
+ dir, NULL, /* page */
0 /*new file size - not used here */ );
if (retval < 0)
goto end_rmdir;
@@ -888,18 +957,20 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
/* prevent empty directory from getting lost */
add_save_link(&th, inode, 0 /* not truncate */ );
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
reiserfs_check_path(&path);
- out_rmdir:
+out_rmdir:
reiserfs_write_unlock(dir->i_sb);
return retval;
- end_rmdir:
- /* we must release path, because we did not call
- reiserfs_cut_from_item, or reiserfs_cut_from_item does not
- release path if operation was not complete */
+end_rmdir:
+ /*
+ * we must release path, because we did not call
+ * reiserfs_cut_from_item, or reiserfs_cut_from_item does not
+ * release path if operation was not complete
+ */
pathrelse(&path);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
reiserfs_write_unlock(dir->i_sb);
return err ? err : retval;
}
@@ -918,10 +989,13 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
inode = dentry->d_inode;
- /* in this transaction we can be doing at max two balancings and update
- * two stat datas, we change quotas of the owner of the directory and of
- * the owner of the parent directory. The quota structure is possibly
- * deleted only on iput => outside of this transaction */
+ /*
+ * in this transaction we can be doing at max two balancings and
+ * update two stat datas, we change quotas of the owner of the
+ * directory and of the owner of the parent directory. The quota
+ * structure is possibly deleted only on iput => outside of
+ * this transaction
+ */
jbegin_count =
JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
@@ -946,8 +1020,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
reiserfs_update_inode_transaction(dir);
if (de.de_objectid != inode->i_ino) {
- // FIXME: compare key of an object and a key found in the
- // entry
+ /*
+ * FIXME: compare key of an object and a key found in the entry
+ */
retval = -EIO;
goto end_unlink;
}
@@ -968,7 +1043,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
savelink = inode->i_nlink;
retval =
- reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL,
+ reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL,
0);
if (retval < 0) {
inc_nlink(inode);
@@ -985,18 +1060,18 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
/* prevent file from getting lost */
add_save_link(&th, inode, 0 /* not truncate */ );
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
reiserfs_check_path(&path);
reiserfs_write_unlock(dir->i_sb);
return retval;
- end_unlink:
+end_unlink:
pathrelse(&path);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
reiserfs_check_path(&path);
if (err)
retval = err;
- out_unlink:
+out_unlink:
reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -1011,7 +1086,10 @@ static int reiserfs_symlink(struct inode *parent_dir,
struct reiserfs_transaction_handle th;
struct reiserfs_security_handle security;
int mode = S_IFLNK | S_IRWXUGO;
- /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+ /*
+ * We need blocks for transaction + (user+group)*(quotas for
+ * new inode + update of quota for directory owner)
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 +
2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
@@ -1070,17 +1148,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
inode->i_op = &reiserfs_symlink_inode_operations;
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
- // must be sure this inode is written with this transaction
- //
- //reiserfs_update_sd (&th, inode, READ_BLOCKS);
-
retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
dentry->d_name.len, inode, 1 /*visible */ );
if (retval) {
int err;
drop_nlink(inode);
reiserfs_update_sd(&th, inode);
- err = journal_end(&th, parent_dir->i_sb, jbegin_count);
+ err = journal_end(&th);
if (err)
retval = err;
unlock_new_inode(inode);
@@ -1090,8 +1164,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
unlock_new_inode(inode);
d_instantiate(dentry, inode);
- retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
- out_failed:
+ retval = journal_end(&th);
+out_failed:
reiserfs_write_unlock(parent_dir->i_sb);
return retval;
}
@@ -1102,7 +1176,10 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
int retval;
struct inode *inode = old_dentry->d_inode;
struct reiserfs_transaction_handle th;
- /* We need blocks for transaction + update of quotas for the owners of the directory */
+ /*
+ * We need blocks for transaction + update of quotas for
+ * the owners of the directory
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 +
2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
@@ -1111,7 +1188,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
reiserfs_write_lock(dir->i_sb);
if (inode->i_nlink >= REISERFS_LINK_MAX) {
- //FIXME: sd_nlink is 32 bit for new files
+ /* FIXME: sd_nlink is 32 bit for new files */
reiserfs_write_unlock(dir->i_sb);
return -EMLINK;
}
@@ -1137,7 +1214,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
if (retval) {
int err;
drop_nlink(inode);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
reiserfs_write_unlock(dir->i_sb);
return err ? err : retval;
}
@@ -1147,7 +1224,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
ihold(inode);
d_instantiate(dentry, inode);
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -1158,9 +1235,9 @@ static int de_still_valid(const char *name, int len,
{
struct reiserfs_dir_entry tmp = *de;
- // recalculate pointer to name and name length
+ /* recalculate pointer to name and name length */
set_de_name_and_namelen(&tmp);
- // FIXME: could check more
+ /* FIXME: could check more */
if (tmp.de_namelen != len || memcmp(name, de->de_name, len))
return 0;
return 1;
@@ -1217,14 +1294,16 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
unsigned long savelink = 1;
struct timespec ctime;
- /* three balancings: (1) old name removal, (2) new name insertion
- and (3) maybe "save" link insertion
- stat data updates: (1) old directory,
- (2) new directory and (3) maybe old object stat data (when it is
- directory) and (4) maybe stat data of object to which new entry
- pointed initially and (5) maybe block containing ".." of
- renamed directory
- quota updates: two parent directories */
+ /*
+ * three balancings: (1) old name removal, (2) new name insertion
+ * and (3) maybe "save" link insertion
+ * stat data updates: (1) old directory,
+ * (2) new directory and (3) maybe old object stat data (when it is
+ * directory) and (4) maybe stat data of object to which new entry
+ * pointed initially and (5) maybe block containing ".." of
+ * renamed directory
+ * quota updates: two parent directories
+ */
jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 + 5 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
@@ -1235,8 +1314,10 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_inode = old_dentry->d_inode;
new_dentry_inode = new_dentry->d_inode;
- // make sure, that oldname still exists and points to an object we
- // are going to rename
+ /*
+ * make sure that oldname still exists and points to an object we
+ * are going to rename
+ */
old_de.de_gen_number_bit_string = NULL;
reiserfs_write_lock(old_dir->i_sb);
retval =
@@ -1256,10 +1337,11 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_inode_mode = old_inode->i_mode;
if (S_ISDIR(old_inode_mode)) {
- // make sure, that directory being renamed has correct ".."
- // and that its new parent directory has not too many links
- // already
-
+ /*
+ * make sure that directory being renamed has correct ".."
+ * and that its new parent directory has not too many links
+ * already
+ */
if (new_dentry_inode) {
if (!reiserfs_empty_dir(new_dentry_inode)) {
reiserfs_write_unlock(old_dir->i_sb);
@@ -1267,8 +1349,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
- /* directory is renamed, its parent directory will be changed,
- ** so find ".." entry
+ /*
+ * directory is renamed, its parent directory will be changed,
+ * so find ".." entry
*/
dot_dot_de.de_gen_number_bit_string = NULL;
retval =
@@ -1303,7 +1386,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
"new entry is found, new inode == 0");
}
} else if (retval) {
- int err = journal_end(&th, old_dir->i_sb, jbegin_count);
+ int err = journal_end(&th);
reiserfs_write_unlock(old_dir->i_sb);
return err ? err : retval;
}
@@ -1311,8 +1394,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
reiserfs_update_inode_transaction(old_dir);
reiserfs_update_inode_transaction(new_dir);
- /* this makes it so an fsync on an open fd for the old name will
- ** commit the rename operation
+ /*
+ * this makes it so an fsync on an open fd for the old name will
+ * commit the rename operation
*/
reiserfs_update_inode_transaction(old_inode);
@@ -1320,38 +1404,45 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
reiserfs_update_inode_transaction(new_dentry_inode);
while (1) {
- // look for old name using corresponding entry key (found by reiserfs_find_entry)
+ /*
+ * look for old name using corresponding entry key
+ * (found by reiserfs_find_entry)
+ */
if ((retval =
search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key,
&old_entry_path,
&old_de)) != NAME_FOUND) {
pathrelse(&old_entry_path);
- journal_end(&th, old_dir->i_sb, jbegin_count);
+ journal_end(&th);
reiserfs_write_unlock(old_dir->i_sb);
return -EIO;
}
- copy_item_head(&old_entry_ih, get_ih(&old_entry_path));
+ copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path));
reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1);
- // look for new name by reiserfs_find_entry
+ /* look for new name by reiserfs_find_entry */
new_de.de_gen_number_bit_string = NULL;
retval =
reiserfs_find_entry(new_dir, new_dentry->d_name.name,
new_dentry->d_name.len, &new_entry_path,
&new_de);
- // reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from
- // reiserfs_add_entry above, and we'll catch any i/o errors before we get here.
+ /*
+ * reiserfs_add_entry should not return IO_ERROR,
+ * because it is called with essentially same parameters from
+ * reiserfs_add_entry above, and we'll catch any i/o errors
+ * before we get here.
+ */
if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
pathrelse(&new_entry_path);
pathrelse(&old_entry_path);
- journal_end(&th, old_dir->i_sb, jbegin_count);
+ journal_end(&th);
reiserfs_write_unlock(old_dir->i_sb);
return -EIO;
}
- copy_item_head(&new_entry_ih, get_ih(&new_entry_path));
+ copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path));
reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
@@ -1364,28 +1455,32 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
pathrelse(&dot_dot_entry_path);
pathrelse(&new_entry_path);
pathrelse(&old_entry_path);
- journal_end(&th, old_dir->i_sb, jbegin_count);
+ journal_end(&th);
reiserfs_write_unlock(old_dir->i_sb);
return -EIO;
}
copy_item_head(&dot_dot_ih,
- get_ih(&dot_dot_entry_path));
- // node containing ".." gets into transaction
+ tp_item_head(&dot_dot_entry_path));
+ /* node containing ".." gets into transaction */
reiserfs_prepare_for_journal(old_inode->i_sb,
dot_dot_de.de_bh, 1);
}
- /* we should check seals here, not do
- this stuff, yes? Then, having
- gathered everything into RAM we
- should lock the buffers, yes? -Hans */
- /* probably. our rename needs to hold more
- ** than one path at once. The seals would
- ** have to be written to deal with multi-path
- ** issues -chris
+ /*
+ * we should check seals here, not do
+ * this stuff, yes? Then, having
+ * gathered everything into RAM we
+ * should lock the buffers, yes? -Hans
+ */
+ /*
+ * probably. our rename needs to hold more
+ * than one path at once. The seals would
+ * have to be written to deal with multi-path
+ * issues -chris
*/
- /* sanity checking before doing the rename - avoid races many
- ** of the above checks could have scheduled. We have to be
- ** sure our items haven't been shifted by another process.
+ /*
+ * sanity checking before doing the rename - avoid races many
+ * of the above checks could have scheduled. We have to be
+ * sure our items haven't been shifted by another process.
*/
if (item_moved(&new_entry_ih, &new_entry_path) ||
!entry_points_to_object(new_dentry->d_name.name,
@@ -1430,24 +1525,28 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
break;
}
- /* ok, all the changes can be done in one fell swoop when we
- have claimed all the buffers needed. */
+ /*
+ * ok, all the changes can be done in one fell swoop when we
+ * have claimed all the buffers needed.
+ */
mark_de_visible(new_de.de_deh + new_de.de_entry_num);
set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode));
- journal_mark_dirty(&th, old_dir->i_sb, new_de.de_bh);
+ journal_mark_dirty(&th, new_de.de_bh);
mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
- journal_mark_dirty(&th, old_dir->i_sb, old_de.de_bh);
+ journal_mark_dirty(&th, old_de.de_bh);
ctime = CURRENT_TIME_SEC;
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
- /* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch which adds ctime update of
- renamed object */
+ /*
+ * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch
+ * which adds ctime update of renamed object
+ */
old_inode->i_ctime = ctime;
if (new_dentry_inode) {
- // adjust link number of the victim
+ /* adjust link number of the victim */
if (S_ISDIR(new_dentry_inode->i_mode)) {
clear_nlink(new_dentry_inode);
} else {
@@ -1460,25 +1559,32 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(old_inode_mode)) {
/* adjust ".." of renamed directory */
set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
- journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh);
+ journal_mark_dirty(&th, dot_dot_de.de_bh);
+ /*
+ * there (in new_dir) was no directory, so it got new link
+ * (".." of renamed directory)
+ */
if (!new_dentry_inode)
- /* there (in new_dir) was no directory, so it got new link
- (".." of renamed directory) */
INC_DIR_INODE_NLINK(new_dir);
/* old directory lost one link - ".. " of renamed directory */
DEC_DIR_INODE_NLINK(old_dir);
}
- // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse
+ /*
+ * looks like in 2.3.99pre3 brelse is atomic.
+ * so we can use pathrelse
+ */
pathrelse(&new_entry_path);
pathrelse(&dot_dot_entry_path);
- // FIXME: this reiserfs_cut_from_item's return value may screw up
- // anybody, but it will panic if will not be able to find the
- // entry. This needs one more clean up
+ /*
+ * FIXME: this reiserfs_cut_from_item's return value may screw up
+ * anybody, but it will panic if it will not be able to find the
+ * entry. This needs one more clean up
+ */
if (reiserfs_cut_from_item
- (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL,
+ (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL,
0) < 0)
reiserfs_error(old_dir->i_sb, "vs-7060",
"couldn't not cut old name. Fsck later?");
@@ -1496,16 +1602,13 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
reiserfs_update_sd(&th, new_dentry_inode);
}
- retval = journal_end(&th, old_dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
reiserfs_write_unlock(old_dir->i_sb);
return retval;
}
-/*
- * directories can handle most operations...
- */
+/* directories can handle most operations... */
const struct inode_operations reiserfs_dir_inode_operations = {
- //&reiserfs_dir_operations, /* default_file_ops */
.create = reiserfs_create,
.lookup = reiserfs_lookup,
.link = reiserfs_link,
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index f732d6a5251d..99a5d5dae46a 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -7,7 +7,7 @@
#include <linux/time.h>
#include "reiserfs.h"
-// find where objectid map starts
+/* find where objectid map starts */
#define objectid_map(s,rs) (old_format_only (s) ? \
(__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
(__le32 *)((rs) + 1))
@@ -20,7 +20,7 @@ static void check_objectid_map(struct super_block *s, __le32 * map)
reiserfs_panic(s, "vs-15010", "map corrupted: %lx",
(long unsigned int)le32_to_cpu(map[0]));
- // FIXME: add something else here
+ /* FIXME: add something else here */
}
#else
@@ -29,19 +29,21 @@ static void check_objectid_map(struct super_block *s, __le32 * map)
}
#endif
-/* When we allocate objectids we allocate the first unused objectid.
- Each sequence of objectids in use (the odd sequences) is followed
- by a sequence of objectids not in use (the even sequences). We
- only need to record the last objectid in each of these sequences
- (both the odd and even sequences) in order to fully define the
- boundaries of the sequences. A consequence of allocating the first
- objectid not in use is that under most conditions this scheme is
- extremely compact. The exception is immediately after a sequence
- of operations which deletes a large number of objects of
- non-sequential objectids, and even then it will become compact
- again as soon as more objects are created. Note that many
- interesting optimizations of layout could result from complicating
- objectid assignment, but we have deferred making them for now. */
+/*
+ * When we allocate objectids we allocate the first unused objectid.
+ * Each sequence of objectids in use (the odd sequences) is followed
+ * by a sequence of objectids not in use (the even sequences). We
+ * only need to record the last objectid in each of these sequences
+ * (both the odd and even sequences) in order to fully define the
+ * boundaries of the sequences. A consequence of allocating the first
+ * objectid not in use is that under most conditions this scheme is
+ * extremely compact. The exception is immediately after a sequence
+ * of operations which deletes a large number of objects of
+ * non-sequential objectids, and even then it will become compact
+ * again as soon as more objects are created. Note that many
+ * interesting optimizations of layout could result from complicating
+ * objectid assignment, but we have deferred making them for now.
+ */
/* get unique object identifier */
__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
@@ -64,26 +66,30 @@ __u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
return 0;
}
- /* This incrementation allocates the first unused objectid. That
- is to say, the first entry on the objectid map is the first
- unused objectid, and by incrementing it we use it. See below
- where we check to see if we eliminated a sequence of unused
- objectids.... */
+ /*
+ * This incrementation allocates the first unused objectid. That
+ * is to say, the first entry on the objectid map is the first
+ * unused objectid, and by incrementing it we use it. See below
+ * where we check to see if we eliminated a sequence of unused
+ * objectids....
+ */
map[1] = cpu_to_le32(unused_objectid + 1);
- /* Now we check to see if we eliminated the last remaining member of
- the first even sequence (and can eliminate the sequence by
- eliminating its last objectid from oids), and can collapse the
- first two odd sequences into one sequence. If so, then the net
- result is to eliminate a pair of objectids from oids. We do this
- by shifting the entire map to the left. */
+ /*
+ * Now we check to see if we eliminated the last remaining member of
+ * the first even sequence (and can eliminate the sequence by
+ * eliminating its last objectid from oids), and can collapse the
+ * first two odd sequences into one sequence. If so, then the net
+ * result is to eliminate a pair of objectids from oids. We do this
+ * by shifting the entire map to the left.
+ */
if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
memmove(map + 1, map + 3,
(sb_oid_cursize(rs) - 3) * sizeof(__u32));
set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
}
- journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
return unused_objectid;
}
@@ -97,30 +103,33 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
int i = 0;
BUG_ON(!th->t_trans_id);
- //return;
+ /*return; */
check_objectid_map(s, map);
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
- journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));
-
- /* start at the beginning of the objectid map (i = 0) and go to
- the end of it (i = disk_sb->s_oid_cursize). Linear search is
- what we use, though it is possible that binary search would be
- more efficient after performing lots of deletions (which is
- when oids is large.) We only check even i's. */
+ journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
+
+ /*
+ * start at the beginning of the objectid map (i = 0) and go to
+ * the end of it (i = disk_sb->s_oid_cursize). Linear search is
+ * what we use, though it is possible that binary search would be
+ * more efficient after performing lots of deletions (which is
+ * when oids is large.) We only check even i's.
+ */
while (i < sb_oid_cursize(rs)) {
if (objectid_to_release == le32_to_cpu(map[i])) {
/* This incrementation unallocates the objectid. */
- //map[i]++;
le32_add_cpu(&map[i], 1);
- /* Did we unallocate the last member of an odd sequence, and can shrink oids? */
+ /*
+ * Did we unallocate the last member of an
+ * odd sequence, and can shrink oids?
+ */
if (map[i] == map[i + 1]) {
/* shrink objectid map */
memmove(map + i, map + i + 2,
(sb_oid_cursize(rs) - i -
2) * sizeof(__u32));
- //disk_sb->s_oid_cursize -= 2;
set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
RFALSE(sb_oid_cursize(rs) < 2 ||
@@ -135,14 +144,19 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
objectid_to_release < le32_to_cpu(map[i + 1])) {
/* size of objectid map is not changed */
if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) {
- //objectid_map[i+1]--;
le32_add_cpu(&map[i + 1], -1);
return;
}
- /* JDM comparing two little-endian values for equality -- safe */
+ /*
+ * JDM comparing two little-endian values for
+ * equality -- safe
+ */
+ /*
+ * objectid map must be expanded, but
+ * there is no space
+ */
if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
- /* objectid map must be expanded, but there is no space */
PROC_INFO_INC(s, leaked_oid);
return;
}
@@ -178,8 +192,9 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s)
new_objectid_map = (__le32 *) (disk_sb + 1);
if (cur_size > new_size) {
- /* mark everyone used that was listed as free at the end of the objectid
- ** map
+ /*
+ * mark everyone used that was listed as free at
+ * the end of the objectid map
*/
objectid_map[new_size - 1] = objectid_map[cur_size - 1];
set_sb_oid_cursize(disk_sb, new_size);
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 54944d5a4a6e..c9b47e91baf8 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -172,18 +172,19 @@ static char *is_there_reiserfs_struct(char *fmt, int *what)
return k;
}
-/* debugging reiserfs we used to print out a lot of different
- variables, like keys, item headers, buffer heads etc. Values of
- most fields matter. So it took a long time just to write
- appropriative printk. With this reiserfs_warning you can use format
- specification for complex structures like you used to do with
- printfs for integers, doubles and pointers. For instance, to print
- out key structure you have to write just:
- reiserfs_warning ("bad key %k", key);
- instead of
- printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
- key->k_offset, key->k_uniqueness);
-*/
+/*
+ * debugging reiserfs we used to print out a lot of different
+ * variables, like keys, item headers, buffer heads etc. Values of
+ * most fields matter. So it took a long time just to write
+ * appropriative printk. With this reiserfs_warning you can use format
+ * specification for complex structures like you used to do with
+ * printfs for integers, doubles and pointers. For instance, to print
+ * out key structure you have to write just:
+ * reiserfs_warning ("bad key %k", key);
+ * instead of
+ * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
+ * key->k_offset, key->k_uniqueness);
+ */
static DEFINE_SPINLOCK(error_lock);
static void prepare_error_buf(const char *fmt, va_list args)
{
@@ -243,15 +244,16 @@ static void prepare_error_buf(const char *fmt, va_list args)
}
-/* in addition to usual conversion specifiers this accepts reiserfs
- specific conversion specifiers:
- %k to print little endian key,
- %K to print cpu key,
- %h to print item_head,
- %t to print directory entry
- %z to print block head (arg must be struct buffer_head *
- %b to print buffer_head
-*/
+/*
+ * in addition to usual conversion specifiers this accepts reiserfs
+ * specific conversion specifiers:
+ * %k to print little endian key,
+ * %K to print cpu key,
+ * %h to print item_head,
+ * %t to print directory entry
+ * %z to print block head (arg must be struct buffer_head *)
+ * %b to print buffer_head
+ */
#define do_reiserfs_warning(fmt)\
{\
@@ -304,50 +306,52 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
#endif
}
-/* The format:
-
- maintainer-errorid: [function-name:] message
-
- where errorid is unique to the maintainer and function-name is
- optional, is recommended, so that anyone can easily find the bug
- with a simple grep for the short to type string
- maintainer-errorid. Don't bother with reusing errorids, there are
- lots of numbers out there.
-
- Example:
-
- reiserfs_panic(
- p_sb, "reiser-29: reiserfs_new_blocknrs: "
- "one of search_start or rn(%d) is equal to MAX_B_NUM,"
- "which means that we are optimizing location based on the bogus location of a temp buffer (%p).",
- rn, bh
- );
-
- Regular panic()s sometimes clear the screen before the message can
- be read, thus the need for the while loop.
-
- Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it
- pointless complexity):
-
- panics in reiserfs.h have numbers from 1000 to 1999
- super.c 2000 to 2999
- preserve.c (unused) 3000 to 3999
- bitmap.c 4000 to 4999
- stree.c 5000 to 5999
- prints.c 6000 to 6999
- namei.c 7000 to 7999
- fix_nodes.c 8000 to 8999
- dir.c 9000 to 9999
- lbalance.c 10000 to 10999
- ibalance.c 11000 to 11999 not ready
- do_balan.c 12000 to 12999
- inode.c 13000 to 13999
- file.c 14000 to 14999
- objectid.c 15000 - 15999
- buffer.c 16000 - 16999
- symlink.c 17000 - 17999
-
- . */
+/*
+ * The format:
+ *
+ * maintainer-errorid: [function-name:] message
+ *
+ * where errorid is unique to the maintainer and function-name is
+ * optional, is recommended, so that anyone can easily find the bug
+ * with a simple grep for the short to type string
+ * maintainer-errorid. Don't bother with reusing errorids, there are
+ * lots of numbers out there.
+ *
+ * Example:
+ *
+ * reiserfs_panic(
+ * p_sb, "reiser-29: reiserfs_new_blocknrs: "
+ * "one of search_start or rn(%d) is equal to MAX_B_NUM,"
+ * "which means that we are optimizing location based on the "
+ * "bogus location of a temp buffer (%p).",
+ * rn, bh
+ * );
+ *
+ * Regular panic()s sometimes clear the screen before the message can
+ * be read, thus the need for the while loop.
+ *
+ * Numbering scheme for panic used by Vladimir and Anatoly( Hans completely
+ * ignores this scheme, and considers it pointless complexity):
+ *
+ * panics in reiserfs_fs.h have numbers from 1000 to 1999
+ * super.c 2000 to 2999
+ * preserve.c (unused) 3000 to 3999
+ * bitmap.c 4000 to 4999
+ * stree.c 5000 to 5999
+ * prints.c 6000 to 6999
+ * namei.c 7000 to 7999
+ * fix_nodes.c 8000 to 8999
+ * dir.c 9000 to 9999
+ * lbalance.c 10000 to 10999
+ * ibalance.c 11000 to 11999 not ready
+ * do_balan.c 12000 to 12999
+ * inode.c 13000 to 13999
+ * file.c 14000 to 14999
+ * objectid.c 15000 - 15999
+ * buffer.c 16000 - 16999
+ * symlink.c 17000 - 17999
+ *
+ * . */
void __reiserfs_panic(struct super_block *sb, const char *id,
const char *function, const char *fmt, ...)
@@ -411,9 +415,11 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
reiserfs_abort_journal(sb, errno);
}
-/* this prints internal nodes (4 keys/items in line) (dc_number,
- dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
- dc_size)...*/
+/*
+ * this prints internal nodes (4 keys/items in line) (dc_number,
+ * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
+ * dc_size)...
+ */
static int print_internal(struct buffer_head *bh, int first, int last)
{
struct reiserfs_key *key;
@@ -439,7 +445,7 @@ static int print_internal(struct buffer_head *bh, int first, int last)
dc = B_N_CHILD(bh, from);
reiserfs_printk("PTR %d: %y ", from, dc);
- for (i = from, key = B_N_PDELIM_KEY(bh, from), dc++; i < to;
+ for (i = from, key = internal_key(bh, from), dc++; i < to;
i++, key++, dc++) {
reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
if (i && i % 4 == 0)
@@ -463,7 +469,7 @@ static int print_leaf(struct buffer_head *bh, int print_mode, int first,
check_leaf(bh);
blkh = B_BLK_HEAD(bh);
- ih = B_N_PITEM_HEAD(bh, 0);
+ ih = item_head(bh, 0);
nr = blkh_nr_item(blkh);
printk
@@ -496,7 +502,7 @@ static int print_leaf(struct buffer_head *bh, int print_mode, int first,
("-------------------------------------------------------------------------------\n");
reiserfs_printk("|%2d| %h |\n", i, ih);
if (print_mode & PRINT_LEAF_ITEMS)
- op_print_item(ih, B_I_PITEM(bh, ih));
+ op_print_item(ih, ih_item_body(bh, ih));
}
printk
@@ -543,9 +549,11 @@ static int print_super_block(struct buffer_head *bh)
printk("Block count %u\n", sb_block_count(rs));
printk("Blocksize %d\n", sb_blocksize(rs));
printk("Free blocks %u\n", sb_free_blocks(rs));
- // FIXME: this would be confusing if
- // someone stores reiserfs super block in some data block ;)
+ /*
+ * FIXME: this would be confusing if
+ * someone stores reiserfs super block in some data block ;)
// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs);
+ */
skipped = bh->b_blocknr;
data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
(!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) +
@@ -581,8 +589,8 @@ static int print_desc_block(struct buffer_head *bh)
return 0;
}
-
-void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int last)
+/* ..., int print_mode, int first, int last) */
+void print_block(struct buffer_head *bh, ...)
{
va_list args;
int mode, first, last;
@@ -644,11 +652,11 @@ void store_print_tb(struct tree_balance *tb)
"* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
h,
(tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL),
- (tbSh) ? atomic_read(&(tbSh->b_count)) : -1,
+ (tbSh) ? atomic_read(&tbSh->b_count) : -1,
(tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL),
- (tb->L[h]) ? atomic_read(&(tb->L[h]->b_count)) : -1,
+ (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1,
(tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL),
- (tb->R[h]) ? atomic_read(&(tb->R[h]->b_count)) : -1,
+ (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1,
(tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL),
(tb->FL[h]) ? (long long)(tb->FL[h]->
b_blocknr) : (-1LL),
@@ -665,9 +673,9 @@ void store_print_tb(struct tree_balance *tb)
"* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
"* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],
- tb->rbytes, tb->blknum[0], tb->s0num, tb->s1num, tb->s1bytes,
- tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0],
- tb->rkey[0]);
+ tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0],
+ tb->sbytes[0], tb->snum[1], tb->sbytes[1],
+ tb->cur_blknum, tb->lkey[0], tb->rkey[0]);
/* this prints balance parameters for non-leaf levels */
h = 0;
@@ -690,7 +698,7 @@ void store_print_tb(struct tree_balance *tb)
"%p (%llu %d)%s", tb->FEB[i],
tb->FEB[i] ? (unsigned long long)tb->FEB[i]->
b_blocknr : 0ULL,
- tb->FEB[i] ? atomic_read(&(tb->FEB[i]->b_count)) : 0,
+ tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0,
(i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", ");
sprintf(print_tb_buf + strlen(print_tb_buf),
@@ -744,8 +752,8 @@ void check_leaf(struct buffer_head *bh)
if (!bh)
return;
check_leaf_block_head(bh);
- for (i = 0, ih = B_N_PITEM_HEAD(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
- op_check_item(ih, B_I_PITEM(bh, ih));
+ for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
+ op_check_item(ih, ih_item_body(bh, ih));
}
void check_internal(struct buffer_head *bh)
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 83d4eac8059a..bf53888c7f59 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1,5 +1,6 @@
/*
- * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details
+ * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for
+ * licensing and copyright details
*/
#include <linux/reiserfs_fs.h>
@@ -23,52 +24,73 @@
struct reiserfs_journal_list;
-/** bitmasks for i_flags field in reiserfs-specific part of inode */
+/* bitmasks for i_flags field in reiserfs-specific part of inode */
typedef enum {
- /** this says what format of key do all items (but stat data) of
- an object have. If this is set, that format is 3.6 otherwise
- - 3.5 */
+ /*
+ * this says what format of key do all items (but stat data) of
+ * an object have. If this is set, that format is 3.6 otherwise - 3.5
+ */
i_item_key_version_mask = 0x0001,
- /** If this is unset, object has 3.5 stat data, otherwise, it has
- 3.6 stat data with 64bit size, 32bit nlink etc. */
+
+ /*
+ * If this is unset, object has 3.5 stat data, otherwise,
+ * it has 3.6 stat data with 64bit size, 32bit nlink etc.
+ */
i_stat_data_version_mask = 0x0002,
- /** file might need tail packing on close */
+
+ /* file might need tail packing on close */
i_pack_on_close_mask = 0x0004,
- /** don't pack tail of file */
+
+ /* don't pack tail of file */
i_nopack_mask = 0x0008,
- /** If those is set, "safe link" was created for this file during
- truncate or unlink. Safe link is used to avoid leakage of disk
- space on crash with some files open, but unlinked. */
+
+ /*
+ * If either of these are set, "safe link" was created for this
+ * file during truncate or unlink. Safe link is used to avoid
+ * leakage of disk space on crash with some files open, but unlinked.
+ */
i_link_saved_unlink_mask = 0x0010,
i_link_saved_truncate_mask = 0x0020,
+
i_has_xattr_dir = 0x0040,
i_data_log = 0x0080,
} reiserfs_inode_flags;
struct reiserfs_inode_info {
__u32 i_key[4]; /* key is still 4 32 bit integers */
- /** transient inode flags that are never stored on disk. Bitmasks
- for this field are defined above. */
+
+ /*
+ * transient inode flags that are never stored on disk. Bitmasks
+ * for this field are defined above.
+ */
__u32 i_flags;
- __u32 i_first_direct_byte; // offset of first byte stored in direct item.
+ /* offset of first byte stored in direct item. */
+ __u32 i_first_direct_byte;
/* copy of persistent inode flags read from sd_attrs. */
__u32 i_attrs;
- int i_prealloc_block; /* first unused block of a sequence of unused blocks */
+ /* first unused block of a sequence of unused blocks */
+ int i_prealloc_block;
int i_prealloc_count; /* length of that sequence */
- struct list_head i_prealloc_list; /* per-transaction list of inodes which
- * have preallocated blocks */
- unsigned new_packing_locality:1; /* new_packig_locality is created; new blocks
- * for the contents of this directory should be
- * displaced */
+ /* per-transaction list of inodes which have preallocated blocks */
+ struct list_head i_prealloc_list;
- /* we use these for fsync or O_SYNC to decide which transaction
- ** needs to be committed in order for this inode to be properly
- ** flushed */
+ /*
+ * new_packing_locality is created; new blocks for the contents
+ * of this directory should be displaced
+ */
+ unsigned new_packing_locality:1;
+
+ /*
+ * we use these for fsync or O_SYNC to decide which transaction
+ * needs to be committed in order for this inode to be properly
+ * flushed
+ */
unsigned int i_trans_id;
+
struct reiserfs_journal_list *i_jl;
atomic_t openers;
struct mutex tailpack;
@@ -82,9 +104,10 @@ typedef enum {
reiserfs_attrs_cleared = 0x00000001,
} reiserfs_super_block_flags;
-/* struct reiserfs_super_block accessors/mutators
- * since this is a disk structure, it will always be in
- * little endian format. */
+/*
+ * struct reiserfs_super_block accessors/mutators since this is a disk
+ * structure, it will always be in little endian format.
+ */
#define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count))
#define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v))
#define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks))
@@ -152,48 +175,61 @@ typedef enum {
/* LOGGING -- */
-/* These all interelate for performance.
-**
-** If the journal block count is smaller than n transactions, you lose speed.
-** I don't know what n is yet, I'm guessing 8-16.
-**
-** typical transaction size depends on the application, how often fsync is
-** called, and how many metadata blocks you dirty in a 30 second period.
-** The more small files (<16k) you use, the larger your transactions will
-** be.
-**
-** If your journal fills faster than dirty buffers get flushed to disk, it must flush them before allowing the journal
-** to wrap, which slows things down. If you need high speed meta data updates, the journal should be big enough
-** to prevent wrapping before dirty meta blocks get to disk.
-**
-** If the batch max is smaller than the transaction max, you'll waste space at the end of the journal
-** because journal_end sets the next transaction to start at 0 if the next transaction has any chance of wrapping.
-**
-** The large the batch max age, the better the speed, and the more meta data changes you'll lose after a crash.
-**
-*/
+/*
+ * These all interrelate for performance.
+ *
+ * If the journal block count is smaller than n transactions, you lose speed.
+ * I don't know what n is yet, I'm guessing 8-16.
+ *
+ * typical transaction size depends on the application, how often fsync is
+ * called, and how many metadata blocks you dirty in a 30 second period.
+ * The more small files (<16k) you use, the larger your transactions will
+ * be.
+ *
+ * If your journal fills faster than dirty buffers get flushed to disk, it
+ * must flush them before allowing the journal to wrap, which slows things
+ * down. If you need high speed meta data updates, the journal should be
+ * big enough to prevent wrapping before dirty meta blocks get to disk.
+ *
+ * If the batch max is smaller than the transaction max, you'll waste space
+ * at the end of the journal because journal_end sets the next transaction
+ * to start at 0 if the next transaction has any chance of wrapping.
+ *
+ * The larger the batch max age, the better the speed, and the more meta
+ * data changes you'll lose after a crash.
+ */
/* don't mess with these for a while */
- /* we have a node size define somewhere in reiserfs_fs.h. -Hans */
+/* we have a node size define somewhere in reiserfs_fs.h. -Hans */
#define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */
#define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */
#define JOURNAL_HASH_SIZE 8192
-#define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating. Must be >= 2 */
-
-/* One of these for every block in every transaction
-** Each one is in two hash tables. First, a hash of the current transaction, and after journal_end, a
-** hash of all the in memory transactions.
-** next and prev are used by the current transaction (journal_hash).
-** hnext and hprev are used by journal_list_hash. If a block is in more than one transaction, the journal_list_hash
-** links it in multiple times. This allows flush_journal_list to remove just the cnode belonging
-** to a given transaction.
-*/
+
+/* number of copies of the bitmaps to have floating. Must be >= 2 */
+#define JOURNAL_NUM_BITMAPS 5
+
+/*
+ * One of these for every block in every transaction
+ * Each one is in two hash tables. First, a hash of the current transaction,
+ * and after journal_end, a hash of all the in memory transactions.
+ * next and prev are used by the current transaction (journal_hash).
+ * hnext and hprev are used by journal_list_hash. If a block is in more
+ * than one transaction, the journal_list_hash links it in multiple times.
+ * This allows flush_journal_list to remove just the cnode belonging to a
+ * given transaction.
+ */
struct reiserfs_journal_cnode {
struct buffer_head *bh; /* real buffer head */
struct super_block *sb; /* dev of real buffer head */
- __u32 blocknr; /* block number of real buffer head, == 0 when buffer on disk */
+
+ /* block number of real buffer head, == 0 when buffer on disk */
+ __u32 blocknr;
+
unsigned long state;
- struct reiserfs_journal_list *jlist; /* journal list this cnode lives in */
+
+ /* journal list this cnode lives in */
+ struct reiserfs_journal_list *jlist;
+
struct reiserfs_journal_cnode *next; /* next in transaction list */
struct reiserfs_journal_cnode *prev; /* prev in transaction list */
struct reiserfs_journal_cnode *hprev; /* prev in hash list */
@@ -212,18 +248,22 @@ struct reiserfs_list_bitmap {
};
/*
-** one of these for each transaction. The most important part here is the j_realblock.
-** this list of cnodes is used to hash all the blocks in all the commits, to mark all the
-** real buffer heads dirty once all the commits hit the disk,
-** and to make sure every real block in a transaction is on disk before allowing the log area
-** to be overwritten */
+ * one of these for each transaction. The most important part here is the
+ * j_realblock. this list of cnodes is used to hash all the blocks in all
+ * the commits, to mark all the real buffer heads dirty once all the commits
+ * hit the disk, and to make sure every real block in a transaction is on
+ * disk before allowing the log area to be overwritten
+ */
struct reiserfs_journal_list {
unsigned long j_start;
unsigned long j_state;
unsigned long j_len;
atomic_t j_nonzerolen;
atomic_t j_commit_left;
- atomic_t j_older_commits_done; /* all commits older than this on disk */
+
+ /* all commits older than this on disk */
+ atomic_t j_older_commits_done;
+
struct mutex j_commit_mutex;
unsigned int j_trans_id;
time_t j_timestamp;
@@ -234,11 +274,15 @@ struct reiserfs_journal_list {
/* time ordered list of all active transactions */
struct list_head j_list;
- /* time ordered list of all transactions we haven't tried to flush yet */
+ /*
+ * time ordered list of all transactions we haven't tried
+ * to flush yet
+ */
struct list_head j_working_list;
/* list of tail conversion targets in need of flush before commit */
struct list_head j_tail_bh_list;
+
/* list of data=ordered buffers in need of flush before commit */
struct list_head j_bh_list;
int j_refcount;
@@ -246,46 +290,83 @@ struct reiserfs_journal_list {
struct reiserfs_journal {
struct buffer_head **j_ap_blocks; /* journal blocks on disk */
- struct reiserfs_journal_cnode *j_last; /* newest journal block */
- struct reiserfs_journal_cnode *j_first; /* oldest journal block. start here for traverse */
+ /* newest journal block */
+ struct reiserfs_journal_cnode *j_last;
+
+ /* oldest journal block. start here for traverse */
+ struct reiserfs_journal_cnode *j_first;
struct block_device *j_dev_bd;
fmode_t j_dev_mode;
- int j_1st_reserved_block; /* first block on s_dev of reserved area journal */
+
+ /* first block on s_dev of reserved area journal */
+ int j_1st_reserved_block;
unsigned long j_state;
unsigned int j_trans_id;
unsigned long j_mount_id;
- unsigned long j_start; /* start of current waiting commit (index into j_ap_blocks) */
+
+ /* start of current waiting commit (index into j_ap_blocks) */
+ unsigned long j_start;
unsigned long j_len; /* length of current waiting commit */
- unsigned long j_len_alloc; /* number of buffers requested by journal_begin() */
+
+ /* number of buffers requested by journal_begin() */
+ unsigned long j_len_alloc;
+
atomic_t j_wcount; /* count of writers for current commit */
- unsigned long j_bcount; /* batch count. allows turning X transactions into 1 */
- unsigned long j_first_unflushed_offset; /* first unflushed transactions offset */
- unsigned j_last_flush_trans_id; /* last fully flushed journal timestamp */
+
+ /* batch count. allows turning X transactions into 1 */
+ unsigned long j_bcount;
+
+ /* first unflushed transactions offset */
+ unsigned long j_first_unflushed_offset;
+
+ /* last fully flushed journal timestamp */
+ unsigned j_last_flush_trans_id;
+
struct buffer_head *j_header_bh;
time_t j_trans_start_time; /* time this transaction started */
struct mutex j_mutex;
struct mutex j_flush_mutex;
- wait_queue_head_t j_join_wait; /* wait for current transaction to finish before starting new one */
- atomic_t j_jlock; /* lock for j_join_wait */
+
+ /* wait for current transaction to finish before starting new one */
+ wait_queue_head_t j_join_wait;
+
+ atomic_t j_jlock; /* lock for j_join_wait */
int j_list_bitmap_index; /* number of next list bitmap to use */
- int j_must_wait; /* no more journal begins allowed. MUST sleep on j_join_wait */
- int j_next_full_flush; /* next journal_end will flush all journal list */
- int j_next_async_flush; /* next journal_end will flush all async commits */
+
+ /* no more journal begins allowed. MUST sleep on j_join_wait */
+ int j_must_wait;
+
+ /* next journal_end will flush all journal list */
+ int j_next_full_flush;
+
+ /* next journal_end will flush all async commits */
+ int j_next_async_flush;
int j_cnode_used; /* number of cnodes on the used list */
int j_cnode_free; /* number of cnodes on the free list */
- unsigned int j_trans_max; /* max number of blocks in a transaction. */
- unsigned int j_max_batch; /* max number of blocks to batch into a trans */
- unsigned int j_max_commit_age; /* in seconds, how old can an async commit be */
- unsigned int j_max_trans_age; /* in seconds, how old can a transaction be */
- unsigned int j_default_max_commit_age; /* the default for the max commit age */
+ /* max number of blocks in a transaction. */
+ unsigned int j_trans_max;
+
+ /* max number of blocks to batch into a trans */
+ unsigned int j_max_batch;
+
+ /* in seconds, how old can an async commit be */
+ unsigned int j_max_commit_age;
+
+ /* in seconds, how old can a transaction be */
+ unsigned int j_max_trans_age;
+
+ /* the default for the max commit age */
+ unsigned int j_default_max_commit_age;
struct reiserfs_journal_cnode *j_cnode_free_list;
- struct reiserfs_journal_cnode *j_cnode_free_orig; /* orig pointer returned from vmalloc */
+
+ /* orig pointer returned from vmalloc */
+ struct reiserfs_journal_cnode *j_cnode_free_orig;
struct reiserfs_journal_list *j_current_jl;
int j_free_bitmap_nodes;
@@ -306,14 +387,21 @@ struct reiserfs_journal {
/* list of all active transactions */
struct list_head j_journal_list;
+
/* lists that haven't been touched by writeback attempts */
struct list_head j_working_list;
- struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS]; /* array of bitmaps to record the deleted blocks */
- struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; /* hash table for real buffer heads in current trans */
- struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; /* hash table for all the real buffer heads in all
- the transactions */
- struct list_head j_prealloc_list; /* list of inodes which have preallocated blocks */
+ /* hash table for real buffer heads in current trans */
+ struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE];
+
+ /* hash table for all the real buffer heads in all the transactions */
+ struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE];
+
+ /* array of bitmaps to record the deleted blocks */
+ struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS];
+
+ /* list of inodes which have preallocated blocks */
+ struct list_head j_prealloc_list;
int j_persistent_trans;
unsigned long j_max_trans_size;
unsigned long j_max_batch_size;
@@ -328,11 +416,12 @@ struct reiserfs_journal {
enum journal_state_bits {
J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */
- J_WRITERS_QUEUED, /* set when log is full due to too many writers */
- J_ABORTED, /* set when log is aborted */
+ J_WRITERS_QUEUED, /* set when log is full due to too many writers */
+ J_ABORTED, /* set when log is aborted */
};
-#define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick. magic string to find desc blocks in the journal */
+/* ick. magic string to find desc blocks in the journal */
+#define JOURNAL_DESC_MAGIC "ReIsErLB"
typedef __u32(*hashf_t) (const signed char *, int);
@@ -364,7 +453,10 @@ typedef struct reiserfs_proc_info_data {
stat_cnt_t leaked_oid;
stat_cnt_t leaves_removable;
- /* balances per level. Use explicit 5 as MAX_HEIGHT is not visible yet. */
+ /*
+ * balances per level.
+ * Use explicit 5 as MAX_HEIGHT is not visible yet.
+ */
stat_cnt_t balance_at[5]; /* XXX */
/* sbk == search_by_key */
stat_cnt_t sbk_read_at[5]; /* XXX */
@@ -416,47 +508,75 @@ typedef struct reiserfs_proc_info_data {
/* reiserfs union of in-core super block data */
struct reiserfs_sb_info {
- struct buffer_head *s_sbh; /* Buffer containing the super block */
- /* both the comment and the choice of
- name are unclear for s_rs -Hans */
- struct reiserfs_super_block *s_rs; /* Pointer to the super block in the buffer */
+ /* Buffer containing the super block */
+ struct buffer_head *s_sbh;
+
+ /* Pointer to the on-disk super block in the buffer */
+ struct reiserfs_super_block *s_rs;
struct reiserfs_bitmap_info *s_ap_bitmap;
- struct reiserfs_journal *s_journal; /* pointer to journal information */
+
+ /* pointer to journal information */
+ struct reiserfs_journal *s_journal;
+
unsigned short s_mount_state; /* reiserfs state (valid, invalid) */
/* Serialize writers access, replace the old bkl */
struct mutex lock;
+
/* Owner of the lock (can be recursive) */
struct task_struct *lock_owner;
+
/* Depth of the lock, start from -1 like the bkl */
int lock_depth;
+ struct workqueue_struct *commit_wq;
+
/* Comment? -Hans */
void (*end_io_handler) (struct buffer_head *, int);
- hashf_t s_hash_function; /* pointer to function which is used
- to sort names in directory. Set on
- mount */
- unsigned long s_mount_opt; /* reiserfs's mount options are set
- here (currently - NOTAIL, NOLOG,
- REPLAYONLY) */
-
- struct { /* This is a structure that describes block allocator options */
- unsigned long bits; /* Bitfield for enable/disable kind of options */
- unsigned long large_file_size; /* size started from which we consider file to be a large one(in blocks) */
+
+ /*
+ * pointer to function which is used to sort names in directory.
+ * Set on mount
+ */
+ hashf_t s_hash_function;
+
+ /* reiserfs's mount options are set here */
+ unsigned long s_mount_opt;
+
+ /* This is a structure that describes block allocator options */
+ struct {
+ /* Bitfield for enable/disable kind of options */
+ unsigned long bits;
+
+ /*
+ * size started from which we consider file
+ * to be a large one (in blocks)
+ */
+ unsigned long large_file_size;
+
int border; /* percentage of disk, border takes */
- int preallocmin; /* Minimal file size (in blocks) starting from which we do preallocations */
- int preallocsize; /* Number of blocks we try to prealloc when file
- reaches preallocmin size (in blocks) or
- prealloc_list is empty. */
+
+ /*
+ * Minimal file size (in blocks) starting
+ * from which we do preallocations
+ */
+ int preallocmin;
+
+ /*
+ * Number of blocks we try to prealloc when file
+ * reaches preallocmin size (in blocks) or prealloc_list
+ * is empty.
+ */
+ int preallocsize;
} s_alloc_options;
/* Comment? -Hans */
wait_queue_head_t s_wait;
- /* To be obsoleted soon by per buffer seals.. -Hans */
- atomic_t s_generation_counter; // increased by one every time the
- // tree gets re-balanced
- unsigned long s_properties; /* File system properties. Currently holds
- on-disk FS format */
+ /* increased by one every time the tree gets re-balanced */
+ atomic_t s_generation_counter;
+
+ /* File system properties. Currently holds on-disk FS format */
+ unsigned long s_properties;
/* session statistics */
int s_disk_reads;
@@ -469,14 +589,23 @@ struct reiserfs_sb_info {
int s_bmaps_without_search;
int s_direct2indirect;
int s_indirect2direct;
- /* set up when it's ok for reiserfs_read_inode2() to read from
- disk inode with nlink==0. Currently this is only used during
- finish_unfinished() processing at mount time */
+
+ /*
+ * set up when it's ok for reiserfs_read_inode2() to read from
+ * disk inode with nlink==0. Currently this is only used during
+ * finish_unfinished() processing at mount time
+ */
int s_is_unlinked_ok;
+
reiserfs_proc_info_data_t s_proc_info_data;
struct proc_dir_entry *procdir;
- int reserved_blocks; /* amount of blocks reserved for further allocations */
- spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */
+
+ /* amount of blocks reserved for further allocations */
+ int reserved_blocks;
+
+
+ /* this lock is now only used to protect reserved_blocks variable */
+ spinlock_t bitmap_lock;
struct dentry *priv_root; /* root of /.reiserfs_priv */
struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */
int j_errno;
@@ -492,14 +621,13 @@ struct reiserfs_sb_info {
char *s_jdev; /* Stored jdev for mount option showing */
#ifdef CONFIG_REISERFS_CHECK
- struct tree_balance *cur_tb; /*
- * Detects whether more than one
- * copy of tb exists per superblock
- * as a means of checking whether
- * do_balance is executing concurrently
- * against another tree reader/writer
- * on a same mount point.
- */
+ /*
+ * Detects whether more than one copy of tb exists per superblock
+ * as a means of checking whether do_balance is executing
+ * concurrently against another tree reader/writer on the same
+ * mount point.
+ */
+ struct tree_balance *cur_tb;
#endif
};
@@ -508,25 +636,36 @@ struct reiserfs_sb_info {
#define REISERFS_3_6 1
#define REISERFS_OLD_FORMAT 2
-enum reiserfs_mount_options {
/* Mount options */
- REISERFS_LARGETAIL, /* large tails will be created in a session */
- REISERFS_SMALLTAIL, /* small (for files less than block size) tails will be created in a session */
- REPLAYONLY, /* replay journal and return 0. Use by fsck */
- REISERFS_CONVERT, /* -o conv: causes conversion of old
- format super block to the new
- format. If not specified - old
- partition will be dealt with in a
- manner of 3.5.x */
-
-/* -o hash={tea, rupasov, r5, detect} is meant for properly mounting
-** reiserfs disks from 3.5.19 or earlier. 99% of the time, this option
-** is not required. If the normal autodection code can't determine which
-** hash to use (because both hashes had the same value for a file)
-** use this option to force a specific hash. It won't allow you to override
-** the existing hash on the FS, so if you have a tea hash disk, and mount
-** with -o hash=rupasov, the mount will fail.
-*/
+enum reiserfs_mount_options {
+ /* large tails will be created in a session */
+ REISERFS_LARGETAIL,
+ /*
+ * small (for files less than block size) tails will
+ * be created in a session
+ */
+ REISERFS_SMALLTAIL,
+
+ /* replay journal and return 0. Used by fsck */
+ REPLAYONLY,
+
+ /*
+ * -o conv: causes conversion of old format super block to the
+ * new format. If not specified - old partition will be dealt
+ * with in a manner of 3.5.x
+ */
+ REISERFS_CONVERT,
+
+ /*
+ * -o hash={tea, rupasov, r5, detect} is meant for properly mounting
+ * reiserfs disks from 3.5.19 or earlier. 99% of the time, this
+ * option is not required. If the normal autodetection code can't
+ * determine which hash to use (because both hashes had the same
+ * value for a file) use this option to force a specific hash.
+ * It won't allow you to override the existing hash on the FS, so
+ * if you have a tea hash disk, and mount with -o hash=rupasov,
+ * the mount will fail.
+ */
FORCE_TEA_HASH, /* try to force tea hash on mount */
FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */
FORCE_R5_HASH, /* try to force r5 hash on mount */
@@ -536,9 +675,11 @@ enum reiserfs_mount_options {
REISERFS_DATA_ORDERED,
REISERFS_DATA_WRITEBACK,
-/* used for testing experimental features, makes benchmarking new
- features with and without more convenient, should never be used by
- users in any code shipped to users (ideally) */
+ /*
+ * used for testing experimental features, makes benchmarking new
+ * features with and without more convenient, should never be used by
+ * users in any code shipped to users (ideally)
+ */
REISERFS_NO_BORDER,
REISERFS_NO_UNHASHED_RELOCATION,
@@ -705,28 +846,28 @@ static inline void reiserfs_cond_resched(struct super_block *s)
struct fid;
-/* in reading the #defines, it may help to understand that they employ
- the following abbreviations:
-
- B = Buffer
- I = Item header
- H = Height within the tree (should be changed to LEV)
- N = Number of the item in the node
- STAT = stat data
- DEH = Directory Entry Header
- EC = Entry Count
- E = Entry number
- UL = Unsigned Long
- BLKH = BLocK Header
- UNFM = UNForMatted node
- DC = Disk Child
- P = Path
-
- These #defines are named by concatenating these abbreviations,
- where first comes the arguments, and last comes the return value,
- of the macro.
-
-*/
+/*
+ * in reading the #defines, it may help to understand that they employ
+ * the following abbreviations:
+ *
+ * B = Buffer
+ * I = Item header
+ * H = Height within the tree (should be changed to LEV)
+ * N = Number of the item in the node
+ * STAT = stat data
+ * DEH = Directory Entry Header
+ * EC = Entry Count
+ * E = Entry number
+ * UL = Unsigned Long
+ * BLKH = BLocK Header
+ * UNFM = UNForMatted node
+ * DC = Disk Child
+ * P = Path
+ *
+ * These #defines are named by concatenating these abbreviations,
+ * where first comes the arguments, and last comes the return value,
+ * of the macro.
+ */
#define USE_INODE_GENERATION_COUNTER
@@ -737,14 +878,17 @@ struct fid;
/* n must be power of 2 */
#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u))
-// to be ok for alpha and others we have to align structures to 8 byte
-// boundary.
-// FIXME: do not change 4 by anything else: there is code which relies on that
+/*
+ * to be ok for alpha and others we have to align structures to 8 byte
+ * boundary.
+ * FIXME: do not change 4 by anything else: there is code which relies on that
+ */
#define ROUND_UP(x) _ROUND_UP(x,8LL)
-/* debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug
-** messages.
-*/
+/*
+ * debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug
+ * messages.
+ */
#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */
void __reiserfs_warning(struct super_block *s, const char *id,
@@ -753,7 +897,7 @@ void __reiserfs_warning(struct super_block *s, const char *id,
__reiserfs_warning(s, id, __func__, fmt, ##args)
/* assertions handling */
-/** always check a condition and panic if it's false. */
+/* always check a condition and panic if it's false. */
#define __RASSERT(cond, scond, format, args...) \
do { \
if (!(cond)) \
@@ -776,35 +920,48 @@ do { \
* Disk Data Structures
*/
-/***************************************************************************/
-/* SUPER BLOCK */
-/***************************************************************************/
+/***************************************************************************
+ * SUPER BLOCK *
+ ***************************************************************************/
/*
- * Structure of super block on disk, a version of which in RAM is often accessed as REISERFS_SB(s)->s_rs
- * the version in RAM is part of a larger structure containing fields never written to disk.
+ * Structure of super block on disk, a version of which in RAM is often
+ * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger
+ * structure containing fields never written to disk.
*/
-#define UNSET_HASH 0 // read_super will guess about, what hash names
- // in directories were sorted with
+#define UNSET_HASH 0 /* Detect hash on disk */
#define TEA_HASH 1
#define YURA_HASH 2
#define R5_HASH 3
#define DEFAULT_HASH R5_HASH
struct journal_params {
- __le32 jp_journal_1st_block; /* where does journal start from on its
- * device */
- __le32 jp_journal_dev; /* journal device st_rdev */
- __le32 jp_journal_size; /* size of the journal */
- __le32 jp_journal_trans_max; /* max number of blocks in a transaction. */
- __le32 jp_journal_magic; /* random value made on fs creation (this
- * was sb_journal_block_count) */
- __le32 jp_journal_max_batch; /* max number of blocks to batch into a
- * trans */
- __le32 jp_journal_max_commit_age; /* in seconds, how old can an async
- * commit be */
- __le32 jp_journal_max_trans_age; /* in seconds, how old can a transaction
- * be */
+ /* where does journal start from on its device */
+ __le32 jp_journal_1st_block;
+
+ /* journal device st_rdev */
+ __le32 jp_journal_dev;
+
+ /* size of the journal */
+ __le32 jp_journal_size;
+
+ /* max number of blocks in a transaction. */
+ __le32 jp_journal_trans_max;
+
+ /*
+ * random value made on fs creation
+ * (this was sb_journal_block_count)
+ */
+ __le32 jp_journal_magic;
+
+ /* max number of blocks to batch into a trans */
+ __le32 jp_journal_max_batch;
+
+ /* in seconds, how old can an async commit be */
+ __le32 jp_journal_max_commit_age;
+
+ /* in seconds, how old can a transaction be */
+ __le32 jp_journal_max_trans_age;
};
/* this is the super from 3.5.X, where X >= 10 */
@@ -814,26 +971,48 @@ struct reiserfs_super_block_v1 {
__le32 s_root_block; /* root block number */
struct journal_params s_journal;
__le16 s_blocksize; /* block size */
- __le16 s_oid_maxsize; /* max size of object id array, see
- * get_objectid() commentary */
+
+ /* max size of object id array, see get_objectid() commentary */
+ __le16 s_oid_maxsize;
__le16 s_oid_cursize; /* current size of object id array */
- __le16 s_umount_state; /* this is set to 1 when filesystem was
- * umounted, to 2 - when not */
- char s_magic[10]; /* reiserfs magic string indicates that
- * file system is reiserfs:
- * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" */
- __le16 s_fs_state; /* it is set to used by fsck to mark which
- * phase of rebuilding is done */
- __le32 s_hash_function_code; /* indicate, what hash function is being use
- * to sort names in a directory*/
+
+ /* this is set to 1 when filesystem was umounted, to 2 - when not */
+ __le16 s_umount_state;
+
+ /*
+ * reiserfs magic string indicates that file system is reiserfs:
+ * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs"
+ */
+ char s_magic[10];
+
+ /*
+ * used by fsck to mark which
+ * phase of rebuilding is done
+ */
+ __le16 s_fs_state;
+ /*
+ * indicates which hash function is being used
+ * to sort names in a directory
+ */
+ __le32 s_hash_function_code;
__le16 s_tree_height; /* height of disk tree */
- __le16 s_bmap_nr; /* amount of bitmap blocks needed to address
- * each block of file system */
- __le16 s_version; /* this field is only reliable on filesystem
- * with non-standard journal */
- __le16 s_reserved_for_journal; /* size in blocks of journal area on main
- * device, we need to keep after
- * making fs with non-standard journal */
+
+ /*
+ * amount of bitmap blocks needed to address
+ * each block of file system
+ */
+ __le16 s_bmap_nr;
+
+ /*
+ * this field is only reliable on filesystem with non-standard journal
+ */
+ __le16 s_version;
+
+ /*
+ * size in blocks of journal area on main device, we need to
+ * keep after making fs with non-standard journal
+ */
+ __le16 s_reserved_for_journal;
} __attribute__ ((__packed__));
#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1))
@@ -842,17 +1021,21 @@ struct reiserfs_super_block_v1 {
struct reiserfs_super_block {
struct reiserfs_super_block_v1 s_v1;
__le32 s_inode_generation;
- __le32 s_flags; /* Right now used only by inode-attributes, if enabled */
+
+ /* Right now used only by inode-attributes, if enabled */
+ __le32 s_flags;
+
unsigned char s_uuid[16]; /* filesystem unique identifier */
unsigned char s_label[16]; /* filesystem volume label */
__le16 s_mnt_count; /* Count of mounts since last fsck */
__le16 s_max_mnt_count; /* Maximum mounts before check */
__le32 s_lastcheck; /* Timestamp of last fsck */
__le32 s_check_interval; /* Interval between checks */
- char s_unused[76]; /* zero filled by mkreiserfs and
- * reiserfs_convert_objectid_map_v1()
- * so any additions must be updated
- * there as well. */
+
+ /*
+ * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1()
+ * so any additions must be updated there as well. */
+ char s_unused[76];
} __attribute__ ((__packed__));
#define SB_SIZE (sizeof(struct reiserfs_super_block))
@@ -860,7 +1043,7 @@ struct reiserfs_super_block {
#define REISERFS_VERSION_1 0
#define REISERFS_VERSION_2 2
-// on-disk super block fields converted to cpu form
+/* on-disk super block fields converted to cpu form */
#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs)
#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1))
#define SB_BLOCKSIZE(s) \
@@ -915,11 +1098,13 @@ int is_reiserfs_3_5(struct reiserfs_super_block *rs);
int is_reiserfs_3_6(struct reiserfs_super_block *rs);
int is_reiserfs_jr(struct reiserfs_super_block *rs);
-/* ReiserFS leaves the first 64k unused, so that partition labels have
- enough space. If someone wants to write a fancy bootloader that
- needs more than 64k, let us know, and this will be increased in size.
- This number must be larger than than the largest block size on any
- platform, or code will break. -Hans */
+/*
+ * ReiserFS leaves the first 64k unused, so that partition labels have
+ * enough space. If someone wants to write a fancy bootloader that
+ * needs more than 64k, let us know, and this will be increased in size.
+ * This number must be larger than the largest block size on any
+ * platform, or code will break. -Hans
+ */
#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024)
#define REISERFS_FIRST_BLOCK unused_define
#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES
@@ -944,8 +1129,7 @@ struct unfm_nodeinfo {
unsigned short unfm_freespace;
};
-/* there are two formats of keys: 3.5 and 3.6
- */
+/* there are two formats of keys: 3.5 and 3.6 */
#define KEY_FORMAT_3_5 0
#define KEY_FORMAT_3_6 1
@@ -963,8 +1147,10 @@ static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
return sb->s_fs_info;
}
-/* Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
- * which overflows on large file systems. */
+/*
+ * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
+ * which overflows on large file systems.
+ */
static inline __u32 reiserfs_bmap_count(struct super_block *sb)
{
return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1;
@@ -975,8 +1161,10 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
return bmap_nr > ((1LL << 16) - 1);
}
-/** this says about version of key of all items (but stat data) the
- object consists of */
+/*
+ * this gives the key version of all items (but stat data) the
+ * object consists of
+ */
#define get_inode_item_key_version( inode ) \
((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5)
@@ -995,16 +1183,18 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
else \
REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; })
-/* This is an aggressive tail suppression policy, I am hoping it
- improves our benchmarks. The principle behind it is that percentage
- space saving is what matters, not absolute space saving. This is
- non-intuitive, but it helps to understand it if you consider that the
- cost to access 4 blocks is not much more than the cost to access 1
- block, if you have to do a seek and rotate. A tail risks a
- non-linear disk access that is significant as a percentage of total
- time cost for a 4 block file and saves an amount of space that is
- less significant as a percentage of space, or so goes the hypothesis.
- -Hans */
+/*
+ * This is an aggressive tail suppression policy, I am hoping it
+ * improves our benchmarks. The principle behind it is that percentage
+ * space saving is what matters, not absolute space saving. This is
+ * non-intuitive, but it helps to understand it if you consider that the
+ * cost to access 4 blocks is not much more than the cost to access 1
+ * block, if you have to do a seek and rotate. A tail risks a
+ * non-linear disk access that is significant as a percentage of total
+ * time cost for a 4 block file and saves an amount of space that is
+ * less significant as a percentage of space, or so goes the hypothesis.
+ * -Hans
+ */
#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \
(\
(!(n_tail_size)) || \
@@ -1018,10 +1208,11 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \
)
-/* Another strategy for tails, this one means only create a tail if all the
- file would fit into one DIRECT item.
- Primary intention for this one is to increase performance by decreasing
- seeking.
+/*
+ * Another strategy for tails, this one means only create a tail if all the
+ * file would fit into one DIRECT item.
+ * Primary intention for this one is to increase performance by decreasing
+ * seeking.
*/
#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \
(\
@@ -1035,23 +1226,21 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
#define REISERFS_VALID_FS 1
#define REISERFS_ERROR_FS 2
-//
-// there are 5 item types currently
-//
+/*
+ * there are 5 item types currently
+ */
#define TYPE_STAT_DATA 0
#define TYPE_INDIRECT 1
#define TYPE_DIRECT 2
#define TYPE_DIRENTRY 3
#define TYPE_MAXTYPE 3
-#define TYPE_ANY 15 // FIXME: comment is required
+#define TYPE_ANY 15 /* FIXME: comment is required */
-/***************************************************************************/
-/* KEY & ITEM HEAD */
-/***************************************************************************/
+/***************************************************************************
+ * KEY & ITEM HEAD *
+ ***************************************************************************/
-//
-// directories use this key as well as old files
-//
+/* directories use this key as well as old files */
struct offset_v1 {
__le32 k_offset;
__le32 k_uniqueness;
@@ -1084,11 +1273,14 @@ static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset)
v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset);
}
-/* Key of an item determines its location in the S+tree, and
- is composed of 4 components */
+/*
+ * Key of an item determines its location in the S+tree, and
+ * is composed of 4 components
+ */
struct reiserfs_key {
- __le32 k_dir_id; /* packing locality: by default parent
- directory object id */
+ /* packing locality: by default parent directory object id */
+ __le32 k_dir_id;
+
__le32 k_objectid; /* object identifier */
union {
struct offset_v1 k_offset_v1;
@@ -1097,8 +1289,8 @@ struct reiserfs_key {
} __attribute__ ((__packed__));
struct in_core_key {
- __u32 k_dir_id; /* packing locality: by default parent
- directory object id */
+ /* packing locality: by default parent directory object id */
+ __u32 k_dir_id;
__u32 k_objectid; /* object identifier */
__u64 k_offset;
__u8 k_type;
@@ -1107,14 +1299,16 @@ struct in_core_key {
struct cpu_key {
struct in_core_key on_disk_key;
int version;
- int key_length; /* 3 in all cases but direct2indirect and
- indirect2direct conversion */
+ /* 3 in all cases but direct2indirect and indirect2direct conversion */
+ int key_length;
};
-/* Our function for comparing keys can compare keys of different
- lengths. It takes as a parameter the length of the keys it is to
- compare. These defines are used in determining what is to be passed
- to it as that parameter. */
+/*
+ * Our function for comparing keys can compare keys of different
+ * lengths. It takes as a parameter the length of the keys it is to
+ * compare. These defines are used in determining what is to be passed
+ * to it as that parameter.
+ */
#define REISERFS_FULL_KEY_LEN 4
#define REISERFS_SHORT_KEY_LEN 2
@@ -1143,40 +1337,52 @@ struct cpu_key {
#define POSITION_FOUND 1
#define POSITION_NOT_FOUND 0
-// return values for reiserfs_find_entry and search_by_entry_key
+/* return values for reiserfs_find_entry and search_by_entry_key */
#define NAME_FOUND 1
#define NAME_NOT_FOUND 0
#define GOTO_PREVIOUS_ITEM 2
#define NAME_FOUND_INVISIBLE 3
-/* Everything in the filesystem is stored as a set of items. The
- item head contains the key of the item, its free space (for
- indirect items) and specifies the location of the item itself
- within the block. */
+/*
+ * Everything in the filesystem is stored as a set of items. The
+ * item head contains the key of the item, its free space (for
+ * indirect items) and specifies the location of the item itself
+ * within the block.
+ */
struct item_head {
- /* Everything in the tree is found by searching for it based on
- * its key.*/
+ /*
+ * Everything in the tree is found by searching for it based on
+ * its key.
+ */
struct reiserfs_key ih_key;
union {
- /* The free space in the last unformatted node of an
- indirect item if this is an indirect item. This
- equals 0xFFFF iff this is a direct item or stat data
- item. Note that the key, not this field, is used to
- determine the item type, and thus which field this
- union contains. */
+ /*
+ * The free space in the last unformatted node of an
+ * indirect item if this is an indirect item. This
+ * equals 0xFFFF iff this is a direct item or stat data
+ * item. Note that the key, not this field, is used to
+ * determine the item type, and thus which field this
+ * union contains.
+ */
__le16 ih_free_space_reserved;
- /* Iff this is a directory item, this field equals the
- number of directory entries in the directory item. */
+
+ /*
+ * Iff this is a directory item, this field equals the
+ * number of directory entries in the directory item.
+ */
__le16 ih_entry_count;
} __attribute__ ((__packed__)) u;
__le16 ih_item_len; /* total size of the item body */
- __le16 ih_item_location; /* an offset to the item body
- * within the block */
- __le16 ih_version; /* 0 for all old items, 2 for new
- ones. Highest bit is set by fsck
- temporary, cleaned after all
- done */
+
+ /* an offset to the item body within the block */
+ __le16 ih_item_location;
+
+ /*
+ * 0 for all old items, 2 for new ones. Highest bit is set by fsck
+ * temporarily, cleaned after all done
+ */
+ __le16 ih_version;
} __attribute__ ((__packed__));
/* size of item header */
#define IH_SIZE (sizeof(struct item_head))
@@ -1198,27 +1404,24 @@ struct item_head {
#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih))
#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val)))
-/* these operate on indirect items, where you've got an array of ints
-** at a possibly unaligned location. These are a noop on ia32
-**
-** p is the array of __u32, i is the index into the array, v is the value
-** to store there.
-*/
+/*
+ * these operate on indirect items, where you've got an array of ints
+ * at a possibly unaligned location. These are a noop on ia32
+ *
+ * p is the array of __u32, i is the index into the array, v is the value
+ * to store there.
+ */
#define get_block_num(p, i) get_unaligned_le32((p) + (i))
#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i))
-//
-// in old version uniqueness field shows key type
-//
+/* in old version uniqueness field shows key type */
#define V1_SD_UNIQUENESS 0
#define V1_INDIRECT_UNIQUENESS 0xfffffffe
#define V1_DIRECT_UNIQUENESS 0xffffffff
#define V1_DIRENTRY_UNIQUENESS 500
-#define V1_ANY_UNIQUENESS 555 // FIXME: comment is required
+#define V1_ANY_UNIQUENESS 555 /* FIXME: comment is required */
-//
-// here are conversion routines
-//
+/* here are conversion routines */
static inline int uniqueness2type(__u32 uniqueness) CONSTF;
static inline int uniqueness2type(__u32 uniqueness)
{
@@ -1255,11 +1458,11 @@ static inline __u32 type2uniqueness(int type)
}
}
-//
-// key is pointer to on disk key which is stored in le, result is cpu,
-// there is no way to get version of object from key, so, provide
-// version to these defines
-//
+/*
+ * key is pointer to on disk key which is stored in le, result is cpu,
+ * there is no way to get version of object from key, so, provide
+ * version to these defines
+ */
static inline loff_t le_key_k_offset(int version,
const struct reiserfs_key *key)
{
@@ -1275,9 +1478,11 @@ static inline loff_t le_ih_k_offset(const struct item_head *ih)
static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key)
{
- return (version == KEY_FORMAT_3_5) ?
- uniqueness2type(le32_to_cpu(key->u.k_offset_v1.k_uniqueness)) :
- offset_v2_k_type(&(key->u.k_offset_v2));
+ if (version == KEY_FORMAT_3_5) {
+ loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness);
+ return uniqueness2type(val);
+ } else
+ return offset_v2_k_type(&(key->u.k_offset_v2));
}
static inline loff_t le_ih_k_type(const struct item_head *ih)
@@ -1288,8 +1493,22 @@ static inline loff_t le_ih_k_type(const struct item_head *ih)
static inline void set_le_key_k_offset(int version, struct reiserfs_key *key,
loff_t offset)
{
- (version == KEY_FORMAT_3_5) ? (void)(key->u.k_offset_v1.k_offset = cpu_to_le32(offset)) : /* jdm check */
- (void)(set_offset_v2_k_offset(&(key->u.k_offset_v2), offset));
+ if (version == KEY_FORMAT_3_5)
+ key->u.k_offset_v1.k_offset = cpu_to_le32(offset);
+ else
+ set_offset_v2_k_offset(&key->u.k_offset_v2, offset);
+}
+
+static inline void add_le_key_k_offset(int version, struct reiserfs_key *key,
+ loff_t offset)
+{
+ set_le_key_k_offset(version, key,
+ le_key_k_offset(version, key) + offset);
+}
+
+static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset)
+{
+ add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
}
static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
@@ -1300,10 +1519,11 @@ static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
int type)
{
- (version == KEY_FORMAT_3_5) ?
- (void)(key->u.k_offset_v1.k_uniqueness =
- cpu_to_le32(type2uniqueness(type)))
- : (void)(set_offset_v2_k_type(&(key->u.k_offset_v2), type));
+ if (version == KEY_FORMAT_3_5) {
+ type = type2uniqueness(type);
+ key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type);
+ } else
+ set_offset_v2_k_type(&key->u.k_offset_v2, type);
}
static inline void set_le_ih_k_type(struct item_head *ih, int type)
@@ -1331,9 +1551,7 @@ static inline int is_statdata_le_key(int version, struct reiserfs_key *key)
return le_key_k_type(version, key) == TYPE_STAT_DATA;
}
-//
-// item header has version.
-//
+/* item header has version. */
static inline int is_direntry_le_ih(struct item_head *ih)
{
return is_direntry_le_key(ih_version(ih), &ih->ih_key);
@@ -1354,9 +1572,7 @@ static inline int is_statdata_le_ih(struct item_head *ih)
return is_statdata_le_key(ih_version(ih), &ih->ih_key);
}
-//
-// key is pointer to cpu key, result is cpu
-//
+/* key is pointer to cpu key, result is cpu */
static inline loff_t cpu_key_k_offset(const struct cpu_key *key)
{
return key->on_disk_key.k_offset;
@@ -1407,7 +1623,7 @@ static inline void cpu_key_k_offset_dec(struct cpu_key *key)
extern struct reiserfs_key root_key;
-/*
+/*
* Picture represents a leaf of the S+tree
* ______________________________________________________
* | | Array of | | |
@@ -1416,15 +1632,19 @@ extern struct reiserfs_key root_key;
* |______|_______________|___________________|___________|
*/
-/* Header of a disk block. More precisely, header of a formatted leaf
- or internal node, and not the header of an unformatted node. */
+/*
+ * Header of a disk block. More precisely, header of a formatted leaf
+ * or internal node, and not the header of an unformatted node.
+ */
struct block_head {
__le16 blk_level; /* Level of a block in the tree. */
__le16 blk_nr_item; /* Number of keys/items in a block. */
__le16 blk_free_space; /* Block free space in bytes. */
__le16 blk_reserved;
/* dump this in v4/planA */
- struct reiserfs_key blk_right_delim_key; /* kept only for compatibility */
+
+ /* kept only for compatibility */
+ struct reiserfs_key blk_right_delim_key;
};
#define BLKH_SIZE (sizeof(struct block_head))
@@ -1439,18 +1659,20 @@ struct block_head {
#define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key)
#define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val)
+/* values for blk_level field of the struct block_head */
+
/*
- * values for blk_level field of the struct block_head
+ * When node gets removed from the tree its blk_level is set to FREE_LEVEL.
+ * It is then used to see whether the node is still in the tree
*/
-
-#define FREE_LEVEL 0 /* when node gets removed from the tree its
- blk_level is set to FREE_LEVEL. It is then
- used to see whether the node is still in the
- tree */
+#define FREE_LEVEL 0
#define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */
-/* Given the buffer head of a formatted node, resolve to the block head of that node. */
+/*
+ * Given the buffer head of a formatted node, resolve to the
+ * block head of that node.
+ */
#define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data))
/* Number of items that are in buffer. */
#define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh)))
@@ -1471,14 +1693,14 @@ struct block_head {
#define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \
&& B_LEVEL(bh) <= MAX_HEIGHT)
-/***************************************************************************/
-/* STAT DATA */
-/***************************************************************************/
+/***************************************************************************
+ * STAT DATA *
+ ***************************************************************************/
-//
-// old stat data is 32 bytes long. We are going to distinguish new one by
-// different size
-//
+/*
+ * old stat data is 32 bytes long. We are going to distinguish new one by
+ * different size
+ */
struct stat_data_v1 {
__le16 sd_mode; /* file type, permissions */
__le16 sd_nlink; /* number of hard links */
@@ -1487,20 +1709,25 @@ struct stat_data_v1 {
__le32 sd_size; /* file size */
__le32 sd_atime; /* time of last access */
__le32 sd_mtime; /* time file was last modified */
- __le32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */
+
+ /*
+ * time inode (stat data) was last changed
+ * (except changes to sd_atime and sd_mtime)
+ */
+ __le32 sd_ctime;
union {
__le32 sd_rdev;
__le32 sd_blocks; /* number of blocks file uses */
} __attribute__ ((__packed__)) u;
- __le32 sd_first_direct_byte; /* first byte of file which is stored
- in a direct item: except that if it
- equals 1 it is a symlink and if it
- equals ~(__u32)0 there is no
- direct item. The existence of this
- field really grates on me. Let's
- replace it with a macro based on
- sd_size and our tail suppression
- policy. Someday. -Hans */
+
+ /*
+ * first byte of file which is stored in a direct item: except that if
+ * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no
+ * direct item. The existence of this field really grates on me.
+ * Let's replace it with a macro based on sd_size and our tail
+ * suppression policy. Someday. -Hans
+ */
+ __le32 sd_first_direct_byte;
} __attribute__ ((__packed__));
#define SD_V1_SIZE (sizeof(struct stat_data_v1))
@@ -1532,8 +1759,10 @@ struct stat_data_v1 {
/* inode flags stored in sd_attrs (nee sd_reserved) */
-/* we want common flags to have the same values as in ext2,
- so chattr(1) will work without problems */
+/*
+ * we want common flags to have the same values as in ext2,
+ * so chattr(1) will work without problems
+ */
#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL
#define REISERFS_APPEND_FL FS_APPEND_FL
#define REISERFS_SYNC_FL FS_SYNC_FL
@@ -1553,8 +1782,10 @@ struct stat_data_v1 {
REISERFS_COMPR_FL | \
REISERFS_NOTAIL_FL )
-/* Stat Data on disk (reiserfs version of UFS disk inode minus the
- address blocks) */
+/*
+ * Stat Data on disk (reiserfs version of UFS disk inode minus the
+ * address blocks)
+ */
struct stat_data {
__le16 sd_mode; /* file type, permissions */
__le16 sd_attrs; /* persistent inode flags */
@@ -1564,25 +1795,20 @@ struct stat_data {
__le32 sd_gid; /* group */
__le32 sd_atime; /* time of last access */
__le32 sd_mtime; /* time file was last modified */
- __le32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */
+
+ /*
+ * time inode (stat data) was last changed
+ * (except changes to sd_atime and sd_mtime)
+ */
+ __le32 sd_ctime;
__le32 sd_blocks;
union {
__le32 sd_rdev;
__le32 sd_generation;
- //__le32 sd_first_direct_byte;
- /* first byte of file which is stored in a
- direct item: except that if it equals 1
- it is a symlink and if it equals
- ~(__u32)0 there is no direct item. The
- existence of this field really grates
- on me. Let's replace it with a macro
- based on sd_size and our tail
- suppression policy? */
} __attribute__ ((__packed__)) u;
} __attribute__ ((__packed__));
-//
-// this is 44 bytes long
-//
+
+/* this is 44 bytes long */
#define SD_SIZE (sizeof(struct stat_data))
#define SD_V2_SIZE SD_SIZE
#define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6)
@@ -1613,48 +1839,61 @@ struct stat_data {
#define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs))
#define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v))
-/***************************************************************************/
-/* DIRECTORY STRUCTURE */
-/***************************************************************************/
-/*
- Picture represents the structure of directory items
- ________________________________________________
- | Array of | | | | | |
- | directory |N-1| N-2 | .... | 1st |0th|
- | entry headers | | | | | |
- |_______________|___|_____|________|_______|___|
- <---- directory entries ------>
-
- First directory item has k_offset component 1. We store "." and ".."
- in one item, always, we never split "." and ".." into differing
- items. This makes, among other things, the code for removing
- directories simpler. */
+/***************************************************************************
+ * DIRECTORY STRUCTURE *
+ ***************************************************************************/
+/*
+ * Picture represents the structure of directory items
+ * ________________________________________________
+ * | Array of | | | | | |
+ * | directory |N-1| N-2 | .... | 1st |0th|
+ * | entry headers | | | | | |
+ * |_______________|___|_____|________|_______|___|
+ * <---- directory entries ------>
+ *
+ * First directory item has k_offset component 1. We store "." and ".."
+ * in one item, always, we never split "." and ".." into differing
+ * items. This makes, among other things, the code for removing
+ * directories simpler.
+ */
#define SD_OFFSET 0
#define SD_UNIQUENESS 0
#define DOT_OFFSET 1
#define DOT_DOT_OFFSET 2
#define DIRENTRY_UNIQUENESS 500
-/* */
#define FIRST_ITEM_OFFSET 1
/*
- Q: How to get key of object pointed to by entry from entry?
-
- A: Each directory entry has its header. This header has deh_dir_id and deh_objectid fields, those are key
- of object, entry points to */
+ * Q: How to get key of object pointed to by entry from entry?
+ *
+ * A: Each directory entry has its header. This header has deh_dir_id
+ * and deh_objectid fields: the key of the object the entry points to
+ */
-/* NOT IMPLEMENTED:
- Directory will someday contain stat data of object */
+/*
+ * NOT IMPLEMENTED:
+ * Directory will someday contain stat data of object
+ */
struct reiserfs_de_head {
__le32 deh_offset; /* third component of the directory entry key */
- __le32 deh_dir_id; /* objectid of the parent directory of the object, that is referenced
- by directory entry */
- __le32 deh_objectid; /* objectid of the object, that is referenced by directory entry */
+
+ /*
+ * objectid of the parent directory of the object, that is referenced
+ * by directory entry
+ */
+ __le32 deh_dir_id;
+
+ /* objectid of the object, that is referenced by directory entry */
+ __le32 deh_objectid;
__le16 deh_location; /* offset of name in the whole item */
- __le16 deh_state; /* whether 1) entry contains stat data (for future), and 2) whether
- entry is hidden (unlinked) */
+
+ /*
+ * whether 1) entry contains stat data (for future), and
+ * 2) whether entry is hidden (unlinked)
+ */
+ __le16 deh_state;
} __attribute__ ((__packed__));
#define DEH_SIZE sizeof(struct reiserfs_de_head)
#define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset))
@@ -1684,9 +1923,11 @@ struct reiserfs_de_head {
# define ADDR_UNALIGNED_BITS (3)
#endif
-/* These are only used to manipulate deh_state.
+/*
+ * These are only used to manipulate deh_state.
* Because of this, we'll use the ext2_ bit routines,
- * since they are little endian */
+ * since they are little endian
+ */
#ifdef ADDR_UNALIGNED_BITS
# define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1)))
@@ -1721,46 +1962,16 @@ extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
__le32 par_dirid, __le32 par_objid);
-/* array of the entry headers */
- /* get item body */
-#define B_I_PITEM(bh,ih) ( (bh)->b_data + ih_location(ih) )
-#define B_I_DEH(bh,ih) ((struct reiserfs_de_head *)(B_I_PITEM(bh,ih)))
-
-/* length of the directory entry in directory item. This define
- calculates length of i-th directory entry using directory entry
- locations from dir entry head. When it calculates length of 0-th
- directory entry, it uses length of whole item in place of entry
- location of the non-existent following entry in the calculation.
- See picture above.*/
-/*
-#define I_DEH_N_ENTRY_LENGTH(ih,deh,i) \
-((i) ? (deh_location((deh)-1) - deh_location((deh))) : (ih_item_len((ih)) - deh_location((deh))))
-*/
-static inline int entry_length(const struct buffer_head *bh,
- const struct item_head *ih, int pos_in_item)
-{
- struct reiserfs_de_head *deh;
-
- deh = B_I_DEH(bh, ih) + pos_in_item;
- if (pos_in_item)
- return deh_location(deh - 1) - deh_location(deh);
-
- return ih_item_len(ih) - deh_location(deh);
-}
-
-/* number of entries in the directory item, depends on ENTRY_COUNT being at the start of directory dynamic data. */
-#define I_ENTRY_COUNT(ih) (ih_entry_count((ih)))
-
-/* name by bh, ih and entry_num */
-#define B_I_E_NAME(bh,ih,entry_num) ((char *)(bh->b_data + ih_location(ih) + deh_location(B_I_DEH(bh,ih)+(entry_num))))
-
-// two entries per block (at least)
+/* two entries per block (at least) */
#define REISERFS_MAX_NAME(block_size) 255
-/* this structure is used for operations on directory entries. It is
- not a disk structure. */
-/* When reiserfs_find_entry or search_by_entry_key find directory
- entry, they return filled reiserfs_dir_entry structure */
+/*
+ * this structure is used for operations on directory entries. It is
+ * not a disk structure.
+ *
+ * When reiserfs_find_entry or search_by_entry_key find directory
+ * entry, they return filled reiserfs_dir_entry structure
+ */
struct reiserfs_dir_entry {
struct buffer_head *de_bh;
int de_item_num;
@@ -1778,10 +1989,14 @@ struct reiserfs_dir_entry {
struct cpu_key de_entry_key;
};
-/* these defines are useful when a particular member of a reiserfs_dir_entry is needed */
+/*
+ * these defines are useful when a particular member of
+ * a reiserfs_dir_entry is needed
+ */
/* pointer to file name, stored in entry */
-#define B_I_DEH_ENTRY_FILE_NAME(bh,ih,deh) (B_I_PITEM (bh, ih) + deh_location(deh))
+#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \
+ (ih_item_body(bh, ih) + deh_location(deh))
/* length of name */
#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \
@@ -1804,11 +2019,13 @@ struct reiserfs_dir_entry {
* |______|_______________|___________________|___________|
*/
-/***************************************************************************/
-/* DISK CHILD */
-/***************************************************************************/
-/* Disk child pointer: The pointer from an internal node of the tree
- to a node that is on disk. */
+/***************************************************************************
+ * DISK CHILD *
+ ***************************************************************************/
+/*
+ * Disk child pointer:
+ * The pointer from an internal node of the tree to a node that is on disk.
+ */
struct disk_child {
__le32 dc_block_number; /* Disk child's block number. */
__le16 dc_size; /* Disk child's used space. */
@@ -1841,47 +2058,66 @@ struct disk_child {
#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) )
#define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2)
-/***************************************************************************/
-/* PATH STRUCTURES AND DEFINES */
-/***************************************************************************/
+/***************************************************************************
+ * PATH STRUCTURES AND DEFINES *
+ ***************************************************************************/
-/* Search_by_key fills up the path from the root to the leaf as it descends the tree looking for the
- key. It uses reiserfs_bread to try to find buffers in the cache given their block number. If it
- does not find them in the cache it reads them from disk. For each node search_by_key finds using
- reiserfs_bread it then uses bin_search to look through that node. bin_search will find the
- position of the block_number of the next node if it is looking through an internal node. If it
- is looking through a leaf node bin_search will find the position of the item which has key either
- equal to given key, or which is the maximal key less than the given key. */
+/*
+ * search_by_key fills up the path from the root to the leaf as it descends
+ * the tree looking for the key. It uses reiserfs_bread to try to find
+ * buffers in the cache given their block number. If it does not find
+ * them in the cache it reads them from disk. For each node search_by_key
+ * finds using reiserfs_bread it then uses bin_search to look through that
+ * node. bin_search will find the position of the block_number of the next
+ * node if it is looking through an internal node. If it is looking through
+ * a leaf node bin_search will find the position of the item which has key
+ * either equal to given key, or which is the maximal key less than the
+ * given key.
+ */
struct path_element {
- struct buffer_head *pe_buffer; /* Pointer to the buffer at the path in the tree. */
- int pe_position; /* Position in the tree node which is placed in the */
- /* buffer above. */
+ /* Pointer to the buffer at the path in the tree. */
+ struct buffer_head *pe_buffer;
+ /* Position in the tree node which is placed in the buffer above. */
+ int pe_position;
};
-#define MAX_HEIGHT 5 /* maximal height of a tree. don't change this without changing JOURNAL_PER_BALANCE_CNT */
-#define EXTENDED_MAX_HEIGHT 7 /* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
-#define FIRST_PATH_ELEMENT_OFFSET 2 /* Must be equal to at least 2. */
-
-#define ILLEGAL_PATH_ELEMENT_OFFSET 1 /* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
-#define MAX_FEB_SIZE 6 /* this MUST be MAX_HEIGHT + 1. See about FEB below */
-
-/* We need to keep track of who the ancestors of nodes are. When we
- perform a search we record which nodes were visited while
- descending the tree looking for the node we searched for. This list
- of nodes is called the path. This information is used while
- performing balancing. Note that this path information may become
- invalid, and this means we must check it when using it to see if it
- is still valid. You'll need to read search_by_key and the comments
- in it, especially about decrement_counters_in_path(), to understand
- this structure.
-
-Paths make the code so much harder to work with and debug.... An
-enormous number of bugs are due to them, and trying to write or modify
-code that uses them just makes my head hurt. They are based on an
-excessive effort to avoid disturbing the precious VFS code.:-( The
-gods only know how we are going to SMP the code that uses them.
-znodes are the way! */
+/*
+ * maximal height of a tree. don't change this without
+ * changing JOURNAL_PER_BALANCE_CNT
+ */
+#define MAX_HEIGHT 5
+
+/* Must be equal to MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
+#define EXTENDED_MAX_HEIGHT 7
+
+/* Must be equal to at least 2. */
+#define FIRST_PATH_ELEMENT_OFFSET 2
+
+/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
+#define ILLEGAL_PATH_ELEMENT_OFFSET 1
+
+/* this MUST be MAX_HEIGHT + 1. See about FEB below */
+#define MAX_FEB_SIZE 6
+
+/*
+ * We need to keep track of who the ancestors of nodes are. When we
+ * perform a search we record which nodes were visited while
+ * descending the tree looking for the node we searched for. This list
+ * of nodes is called the path. This information is used while
+ * performing balancing. Note that this path information may become
+ * invalid, and this means we must check it when using it to see if it
+ * is still valid. You'll need to read search_by_key and the comments
+ * in it, especially about decrement_counters_in_path(), to understand
+ * this structure.
+ *
+ * Paths make the code so much harder to work with and debug.... An
+ * enormous number of bugs are due to them, and trying to write or modify
+ * code that uses them just makes my head hurt. They are based on an
+ * excessive effort to avoid disturbing the precious VFS code.:-( The
+ * gods only know how we are going to SMP the code that uses them.
+ * znodes are the way!
+ */
#define PATH_READA 0x1 /* do read ahead */
#define PATH_READA_BACK 0x2 /* read backwards */
@@ -1889,7 +2125,8 @@ znodes are the way! */
struct treepath {
int path_length; /* Length of the array above. */
int reada;
- struct path_element path_elements[EXTENDED_MAX_HEIGHT]; /* Array of the path elements. */
+ /* Array of the path elements. */
+ struct path_element path_elements[EXTENDED_MAX_HEIGHT];
int pos_in_item;
};
@@ -1908,41 +2145,124 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position)
#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length))
- /* you know, to the person who didn't
- write this the macro name does not
- at first suggest what it does.
- Maybe POSITION_FROM_PATH_END? Or
- maybe we should just focus on
- dumping paths... -Hans */
+
+/*
+ * you know, to the person who didn't write this the macro name does not
+ * at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or
+ * maybe we should just focus on dumping paths... -Hans
+ */
#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length))
-#define PATH_PITEM_HEAD(path) B_N_PITEM_HEAD(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path))
+/*
+ * in do_balance leaf has h == 0 in contrast with path structure,
+ * where root has level == 0. That is why we need these defines
+ */
+
+/* tb->S[h] */
+#define PATH_H_PBUFFER(path, h) \
+ PATH_OFFSET_PBUFFER(path, path->path_length - (h))
+
+/* tb->F[h] or tb->S[0]->b_parent */
+#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1)
+
+#define PATH_H_POSITION(path, h) \
+ PATH_OFFSET_POSITION(path, path->path_length - (h))
-/* in do_balance leaf has h == 0 in contrast with path structure,
- where root has level == 0. That is why we need these defines */
-#define PATH_H_PBUFFER(path, h) PATH_OFFSET_PBUFFER (path, path->path_length - (h)) /* tb->S[h] */
-#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER (path, (h) + 1) /* tb->F[h] or tb->S[0]->b_parent */
-#define PATH_H_POSITION(path, h) PATH_OFFSET_POSITION (path, path->path_length - (h))
-#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) /* tb->S[h]->b_item_order */
+/* tb->S[h]->b_item_order */
+#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1)
#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h))
+static inline void *reiserfs_node_data(const struct buffer_head *bh)
+{
+ return bh->b_data + sizeof(struct block_head);
+}
+
+/* get key from internal node */
+static inline struct reiserfs_key *internal_key(struct buffer_head *bh,
+ int item_num)
+{
+ struct reiserfs_key *key = reiserfs_node_data(bh);
+
+ return &key[item_num];
+}
+
+/* get the item header from leaf node */
+static inline struct item_head *item_head(const struct buffer_head *bh,
+ int item_num)
+{
+ struct item_head *ih = reiserfs_node_data(bh);
+
+ return &ih[item_num];
+}
+
+/* get the key from leaf node */
+static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh,
+ int item_num)
+{
+ return &item_head(bh, item_num)->ih_key;
+}
+
+static inline void *ih_item_body(const struct buffer_head *bh,
+ const struct item_head *ih)
+{
+ return bh->b_data + ih_location(ih);
+}
+
+/* get item body from leaf node */
+static inline void *item_body(const struct buffer_head *bh, int item_num)
+{
+ return ih_item_body(bh, item_head(bh, item_num));
+}
+
+static inline struct item_head *tp_item_head(const struct treepath *path)
+{
+ return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
+}
+
+static inline void *tp_item_body(const struct treepath *path)
+{
+ return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
+}
+
#define get_last_bh(path) PATH_PLAST_BUFFER(path)
-#define get_ih(path) PATH_PITEM_HEAD(path)
#define get_item_pos(path) PATH_LAST_POSITION(path)
-#define get_item(path) ((void *)B_N_PITEM(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION (path)))
#define item_moved(ih,path) comp_items(ih, path)
#define path_changed(ih,path) comp_items (ih, path)
-/***************************************************************************/
-/* MISC */
-/***************************************************************************/
+/* array of the entry headers */
+ /* (it starts at the beginning of the item body) */
+#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih)))
+
+/*
+ * length of the directory entry in directory item. This define
+ * calculates length of i-th directory entry using directory entry
+ * locations from dir entry head. When it calculates length of 0-th
+ * directory entry, it uses length of whole item in place of entry
+ * location of the non-existent following entry in the calculation.
+ * See picture above.
+ */
+static inline int entry_length(const struct buffer_head *bh,
+ const struct item_head *ih, int pos_in_item)
+{
+ struct reiserfs_de_head *deh;
+
+ deh = B_I_DEH(bh, ih) + pos_in_item;
+ if (pos_in_item)
+ return deh_location(deh - 1) - deh_location(deh);
+
+ return ih_item_len(ih) - deh_location(deh);
+}
+
+/***************************************************************************
+ * MISC *
+ ***************************************************************************/
/* Size of pointer to the unformatted node. */
#define UNFM_P_SIZE (sizeof(unp_t))
#define UNFM_P_SHIFT 2
-// in in-core inode key is stored on le form
+/* in the in-core inode, the key is stored in le form */
#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key))
#define MAX_UL_INT 0xffffffff
@@ -1958,7 +2278,6 @@ static inline loff_t max_reiserfs_offset(struct inode *inode)
return (loff_t) ((~(__u64) 0) >> 4);
}
-/*#define MAX_KEY_UNIQUENESS MAX_UL_INT*/
#define MAX_KEY_OBJECTID MAX_UL_INT
#define MAX_B_NUM MAX_UL_INT
@@ -1967,9 +2286,12 @@ static inline loff_t max_reiserfs_offset(struct inode *inode)
/* the purpose is to detect overflow of an unsigned short */
#define REISERFS_LINK_MAX (MAX_US_INT - 1000)
-/* The following defines are used in reiserfs_insert_item and reiserfs_append_item */
-#define REISERFS_KERNEL_MEM 0 /* reiserfs kernel memory mode */
-#define REISERFS_USER_MEM 1 /* reiserfs user memory mode */
+/*
+ * The following defines are used in reiserfs_insert_item
+ * and reiserfs_append_item
+ */
+#define REISERFS_KERNEL_MEM 0 /* kernel memory mode */
+#define REISERFS_USER_MEM 1 /* user memory mode */
#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter)
#define get_generation(s) atomic_read (&fs_generation(s))
@@ -1981,46 +2303,65 @@ static inline loff_t max_reiserfs_offset(struct inode *inode)
__fs_changed(gen, s); \
})
-/***************************************************************************/
-/* FIXATE NODES */
-/***************************************************************************/
+/***************************************************************************
+ * FIXATE NODES *
+ ***************************************************************************/
#define VI_TYPE_LEFT_MERGEABLE 1
#define VI_TYPE_RIGHT_MERGEABLE 2
-/* To make any changes in the tree we always first find node, that
- contains item to be changed/deleted or place to insert a new
- item. We call this node S. To do balancing we need to decide what
- we will shift to left/right neighbor, or to a new node, where new
- item will be etc. To make this analysis simpler we build virtual
- node. Virtual node is an array of items, that will replace items of
- node S. (For instance if we are going to delete an item, virtual
- node does not contain it). Virtual node keeps information about
- item sizes and types, mergeability of first and last items, sizes
- of all entries in directory item. We use this array of items when
- calculating what we can shift to neighbors and how many nodes we
- have to have if we do not any shiftings, if we shift to left/right
- neighbor or to both. */
+/*
+ * To make any changes in the tree we always first find node, that
+ * contains item to be changed/deleted or place to insert a new
+ * item. We call this node S. To do balancing we need to decide what
+ * we will shift to left/right neighbor, or to a new node, where new
+ * item will be etc. To make this analysis simpler we build virtual
+ * node. Virtual node is an array of items, that will replace items of
+ * node S. (For instance if we are going to delete an item, virtual
+ * node does not contain it). Virtual node keeps information about
+ * item sizes and types, mergeability of first and last items, sizes
+ * of all entries in directory item. We use this array of items when
+ * calculating what we can shift to neighbors and how many nodes we
+ * have to have if we do not do any shifting, if we shift to left/right
+ * neighbor or to both.
+ */
struct virtual_item {
- int vi_index; // index in the array of item operations
- unsigned short vi_type; // left/right mergeability
- unsigned short vi_item_len; /* length of item that it will have after balancing */
+ int vi_index; /* index in the array of item operations */
+ unsigned short vi_type; /* left/right mergeability */
+
+ /* length of item that it will have after balancing */
+ unsigned short vi_item_len;
+
struct item_head *vi_ih;
- const char *vi_item; // body of item (old or new)
- const void *vi_new_data; // 0 always but paste mode
- void *vi_uarea; // item specific area
+ const char *vi_item; /* body of item (old or new) */
+ const void *vi_new_data; /* 0 always but paste mode */
+ void *vi_uarea; /* item specific area */
};
struct virtual_node {
- char *vn_free_ptr; /* this is a pointer to the free space in the buffer */
+ /* this is a pointer to the free space in the buffer */
+ char *vn_free_ptr;
+
unsigned short vn_nr_item; /* number of items in virtual node */
- short vn_size; /* size of node , that node would have if it has unlimited size and no balancing is performed */
- short vn_mode; /* mode of balancing (paste, insert, delete, cut) */
+
+ /*
+ * size the node would have if it had
+ * unlimited size and no balancing were performed
+ */
+ short vn_size;
+
+ /* mode of balancing (paste, insert, delete, cut) */
+ short vn_mode;
+
short vn_affected_item_num;
short vn_pos_in_item;
- struct item_head *vn_ins_ih; /* item header of inserted item, 0 for other modes */
+
+ /* item header of inserted item, 0 for other modes */
+ struct item_head *vn_ins_ih;
const void *vn_data;
- struct virtual_item *vn_vi; /* array of items (including a new one, excluding item to be deleted) */
+
+ /* array of items (including a new one, excluding item to be deleted) */
+ struct virtual_item *vn_vi;
};
/* used by directory items when creating virtual nodes */
@@ -2030,22 +2371,25 @@ struct direntry_uarea {
__u16 entry_sizes[1];
} __attribute__ ((__packed__));
-/***************************************************************************/
-/* TREE BALANCE */
-/***************************************************************************/
+/***************************************************************************
+ * TREE BALANCE *
+ ***************************************************************************/
-/* This temporary structure is used in tree balance algorithms, and
- constructed as we go to the extent that its various parts are
- needed. It contains arrays of nodes that can potentially be
- involved in the balancing of node S, and parameters that define how
- each of the nodes must be balanced. Note that in these algorithms
- for balancing the worst case is to need to balance the current node
- S and the left and right neighbors and all of their parents plus
- create a new node. We implement S1 balancing for the leaf nodes
- and S0 balancing for the internal nodes (S1 and S0 are defined in
- our papers.)*/
+/*
+ * This temporary structure is used in tree balance algorithms, and
+ * constructed as we go to the extent that its various parts are
+ * needed. It contains arrays of nodes that can potentially be
+ * involved in the balancing of node S, and parameters that define how
+ * each of the nodes must be balanced. Note that in these algorithms
+ * for balancing the worst case is to need to balance the current node
+ * S and the left and right neighbors and all of their parents plus
+ * create a new node. We implement S1 balancing for the leaf nodes
+ * and S0 balancing for the internal nodes (S1 and S0 are defined in
+ * our papers.)
+ */
-#define MAX_FREE_BLOCK 7 /* size of the array of buffers to free at end of do_balance */
+/* size of the array of buffers to free at end of do_balance */
+#define MAX_FREE_BLOCK 7
/* maximum number of FEB blocknrs on a single level */
#define MAX_AMOUNT_NEEDED 2
@@ -2057,64 +2401,144 @@ struct tree_balance {
struct super_block *tb_sb;
struct reiserfs_transaction_handle *transaction_handle;
struct treepath *tb_path;
- struct buffer_head *L[MAX_HEIGHT]; /* array of left neighbors of nodes in the path */
- struct buffer_head *R[MAX_HEIGHT]; /* array of right neighbors of nodes in the path */
- struct buffer_head *FL[MAX_HEIGHT]; /* array of fathers of the left neighbors */
- struct buffer_head *FR[MAX_HEIGHT]; /* array of fathers of the right neighbors */
- struct buffer_head *CFL[MAX_HEIGHT]; /* array of common parents of center node and its left neighbor */
- struct buffer_head *CFR[MAX_HEIGHT]; /* array of common parents of center node and its right neighbor */
-
- struct buffer_head *FEB[MAX_FEB_SIZE]; /* array of empty buffers. Number of buffers in array equals
- cur_blknum. */
+
+ /* array of left neighbors of nodes in the path */
+ struct buffer_head *L[MAX_HEIGHT];
+
+ /* array of right neighbors of nodes in the path */
+ struct buffer_head *R[MAX_HEIGHT];
+
+ /* array of fathers of the left neighbors */
+ struct buffer_head *FL[MAX_HEIGHT];
+
+ /* array of fathers of the right neighbors */
+ struct buffer_head *FR[MAX_HEIGHT];
+ /* array of common parents of center node and its left neighbor */
+ struct buffer_head *CFL[MAX_HEIGHT];
+
+ /* array of common parents of center node and its right neighbor */
+ struct buffer_head *CFR[MAX_HEIGHT];
+
+ /*
+ * array of empty buffers. Number of buffers in array equals
+ * cur_blknum.
+ */
+ struct buffer_head *FEB[MAX_FEB_SIZE];
struct buffer_head *used[MAX_FEB_SIZE];
struct buffer_head *thrown[MAX_FEB_SIZE];
- int lnum[MAX_HEIGHT]; /* array of number of items which must be
- shifted to the left in order to balance the
- current node; for leaves includes item that
- will be partially shifted; for internal
- nodes, it is the number of child pointers
- rather than items. It includes the new item
- being created. The code sometimes subtracts
- one to get the number of wholly shifted
- items for other purposes. */
- int rnum[MAX_HEIGHT]; /* substitute right for left in comment above */
- int lkey[MAX_HEIGHT]; /* array indexed by height h mapping the key delimiting L[h] and
- S[h] to its item number within the node CFL[h] */
- int rkey[MAX_HEIGHT]; /* substitute r for l in comment above */
- int insert_size[MAX_HEIGHT]; /* the number of bytes by we are trying to add or remove from
- S[h]. A negative value means removing. */
- int blknum[MAX_HEIGHT]; /* number of nodes that will replace node S[h] after
- balancing on the level h of the tree. If 0 then S is
- being deleted, if 1 then S is remaining and no new nodes
- are being created, if 2 or 3 then 1 or 2 new nodes is
- being created */
+
+ /*
+ * array of number of items which must be shifted to the left in
+ * order to balance the current node; for leaves includes item that
+ * will be partially shifted; for internal nodes, it is the number
+ * of child pointers rather than items. It includes the new item
+ * being created. The code sometimes subtracts one to get the
+ * number of wholly shifted items for other purposes.
+ */
+ int lnum[MAX_HEIGHT];
+
+ /* substitute right for left in comment above */
+ int rnum[MAX_HEIGHT];
+
+ /*
+ * array indexed by height h mapping the key delimiting L[h] and
+ * S[h] to its item number within the node CFL[h]
+ */
+ int lkey[MAX_HEIGHT];
+
+ /* substitute r for l in comment above */
+ int rkey[MAX_HEIGHT];
+
+ /*
+ * the number of bytes we are trying to add to or remove from
+ * S[h]. A negative value means removing.
+ */
+ int insert_size[MAX_HEIGHT];
+
+ /*
+ * number of nodes that will replace node S[h] after balancing
+ * on the level h of the tree. If 0 then S is being deleted,
+ * if 1 then S is remaining and no new nodes are being created,
+ * if 2 or 3 then 1 or 2 new nodes are being created
+ */
+ int blknum[MAX_HEIGHT];
/* fields that are used only for balancing leaves of the tree */
- int cur_blknum; /* number of empty blocks having been already allocated */
- int s0num; /* number of items that fall into left most node when S[0] splits */
- int s1num; /* number of items that fall into first new node when S[0] splits */
- int s2num; /* number of items that fall into second new node when S[0] splits */
- int lbytes; /* number of bytes which can flow to the left neighbor from the left */
- /* most liquid item that cannot be shifted from S[0] entirely */
- /* if -1 then nothing will be partially shifted */
- int rbytes; /* number of bytes which will flow to the right neighbor from the right */
- /* most liquid item that cannot be shifted from S[0] entirely */
- /* if -1 then nothing will be partially shifted */
- int s1bytes; /* number of bytes which flow to the first new node when S[0] splits */
- /* note: if S[0] splits into 3 nodes, then items do not need to be cut */
- int s2bytes;
- struct buffer_head *buf_to_free[MAX_FREE_BLOCK]; /* buffers which are to be freed after do_balance finishes by unfix_nodes */
- char *vn_buf; /* kmalloced memory. Used to create
- virtual node and keep map of
- dirtied bitmap blocks */
+
+ /* number of empty blocks having been already allocated */
+ int cur_blknum;
+
+ /* number of items that fall into left most node when S[0] splits */
+ int s0num;
+
+ /*
+ * number of bytes which can flow to the left neighbor from the left
+ * most liquid item that cannot be shifted from S[0] entirely
+ * if -1 then nothing will be partially shifted
+ */
+ int lbytes;
+
+ /*
+ * number of bytes which will flow to the right neighbor from the right
+ * most liquid item that cannot be shifted from S[0] entirely
+ * if -1 then nothing will be partially shifted
+ */
+ int rbytes;
+
+
+ /*
+ * index into the array of item headers in
+ * S[0] of the affected item
+ */
+ int item_pos;
+
+ /* new nodes allocated to hold what could not fit into S */
+ struct buffer_head *S_new[2];
+
+ /*
+ * number of items that will be placed into nodes in S_new
+ * when S[0] splits
+ */
+ int snum[2];
+
+ /*
+ * number of bytes which flow to nodes in S_new when S[0] splits
+ * note: if S[0] splits into 3 nodes, then items do not need to be cut
+ */
+ int sbytes[2];
+
+ int pos_in_item;
+ int zeroes_num;
+
+ /*
+ * buffers which are to be freed after do_balance finishes
+ * by unfix_nodes
+ */
+ struct buffer_head *buf_to_free[MAX_FREE_BLOCK];
+
+ /*
+ * kmalloced memory. Used to create virtual node and keep
+ * map of dirtied bitmap blocks
+ */
+ char *vn_buf;
+
int vn_buf_size; /* size of the vn_buf */
- struct virtual_node *tb_vn; /* VN starts after bitmap of bitmap blocks */
- int fs_gen; /* saved value of `reiserfs_generation' counter
- see FILESYSTEM_CHANGED() macro in reiserfs_fs.h */
+ /* VN starts after bitmap of bitmap blocks */
+ struct virtual_node *tb_vn;
+
+ /*
+ * saved value of `reiserfs_generation' counter; see
+ * FILESYSTEM_CHANGED() macro in reiserfs_fs.h
+ */
+ int fs_gen;
+
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- struct in_core_key key; /* key pointer, to pass to block allocator or
- another low-level subsystem */
+ /*
+ * key pointer, to pass to block allocator or
+ * another low-level subsystem
+ */
+ struct in_core_key key;
#endif
};
@@ -2122,20 +2546,24 @@ struct tree_balance {
/* When inserting an item. */
#define M_INSERT 'i'
-/* When inserting into (directories only) or appending onto an already
- existent item. */
+/*
+ * When inserting into (directories only) or appending onto an already
+ * existent item.
+ */
#define M_PASTE 'p'
/* When deleting an item. */
#define M_DELETE 'd'
/* When truncating an item or removing an entry from a (directory) item. */
-#define M_CUT 'c'
+#define M_CUT 'c'
/* used when balancing on leaf level skipped (in reiserfsck) */
#define M_INTERNAL 'n'
-/* When further balancing is not needed, then do_balance does not need
- to be called. */
-#define M_SKIP_BALANCING 's'
+/*
+ * When further balancing is not needed, then do_balance does not need
+ * to be called.
+ */
+#define M_SKIP_BALANCING 's'
#define M_CONVERT 'v'
/* modes of leaf_move_items */
@@ -2148,8 +2576,10 @@ struct tree_balance {
#define FIRST_TO_LAST 0
#define LAST_TO_FIRST 1
-/* used in do_balance for passing parent of node information that has
- been gotten from tb struct */
+/*
+ * used in do_balance for passing parent of node information that has
+ * been gotten from tb struct
+ */
struct buffer_info {
struct tree_balance *tb;
struct buffer_head *bi_bh;
@@ -2167,20 +2597,24 @@ static inline struct super_block *sb_from_bi(struct buffer_info *bi)
return bi ? sb_from_tb(bi->tb) : NULL;
}
-/* there are 4 types of items: stat data, directory item, indirect, direct.
-+-------------------+------------+--------------+------------+
-| | k_offset | k_uniqueness | mergeable? |
-+-------------------+------------+--------------+------------+
-| stat data | 0 | 0 | no |
-+-------------------+------------+--------------+------------+
-| 1st directory item| DOT_OFFSET |DIRENTRY_UNIQUENESS| no |
-| non 1st directory | hash value | | yes |
-| item | | | |
-+-------------------+------------+--------------+------------+
-| indirect item | offset + 1 |TYPE_INDIRECT | if this is not the first indirect item of the object
-+-------------------+------------+--------------+------------+
-| direct item | offset + 1 |TYPE_DIRECT | if not this is not the first direct item of the object
-+-------------------+------------+--------------+------------+
+/*
+ * there are 4 types of items: stat data, directory item, indirect, direct.
+ * +-------------------+------------+--------------+------------+
+ * | | k_offset | k_uniqueness | mergeable? |
+ * +-------------------+------------+--------------+------------+
+ * | stat data | 0 | 0 | no |
+ * +-------------------+------------+--------------+------------+
+ * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. | no |
+ * | non 1st directory | hash value | UNIQUENESS | yes |
+ * | item | | | |
+ * +-------------------+------------+--------------+------------+
+ * | indirect item | offset + 1 |TYPE_INDIRECT | [1] |
+ * +-------------------+------------+--------------+------------+
+ * | direct item | offset + 1 |TYPE_DIRECT | [2] |
+ * +-------------------+------------+--------------+------------+
+ *
+ * [1] if this is not the first indirect item of the object
+ * [2] if this is not the first direct item of the object
*/
struct item_operations {
@@ -2219,49 +2653,43 @@ extern struct item_operations *item_ops[TYPE_ANY + 1];
/* number of blocks pointed to by the indirect item */
#define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE)
-/* the used space within the unformatted node corresponding to pos within the item pointed to by ih */
+/*
+ * the used space within the unformatted node corresponding
+ * to pos within the item pointed to by ih
+ */
#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size))
-/* number of bytes contained by the direct item or the unformatted nodes the indirect item points to */
-
-/* get the item header */
-#define B_N_PITEM_HEAD(bh,item_num) ( (struct item_head * )((bh)->b_data + BLKH_SIZE) + (item_num) )
-
-/* get key */
-#define B_N_PDELIM_KEY(bh,item_num) ( (struct reiserfs_key * )((bh)->b_data + BLKH_SIZE) + (item_num) )
-
-/* get the key */
-#define B_N_PKEY(bh,item_num) ( &(B_N_PITEM_HEAD(bh,item_num)->ih_key) )
-
-/* get item body */
-#define B_N_PITEM(bh,item_num) ( (bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(item_num))))
-
-/* get the stat data by the buffer header and the item order */
-#define B_N_STAT_DATA(bh,nr) \
-( (struct stat_data *)((bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(nr))) ) )
+/*
+ * number of bytes contained by the direct item or the
+ * unformatted nodes the indirect item points to
+ */
- /* following defines use reiserfs buffer header and item header */
+/* following defines use reiserfs buffer header and item header */
/* get stat-data */
#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) )
-// this is 3976 for size==4096
+/* this is 3976 for size==4096 */
#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE)
-/* indirect items consist of entries which contain blocknrs, pos
- indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
- blocknr contained by the entry pos points to */
-#define B_I_POS_UNFM_POINTER(bh,ih,pos) le32_to_cpu(*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)))
-#define PUT_B_I_POS_UNFM_POINTER(bh,ih,pos, val) do {*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)) = cpu_to_le32(val); } while (0)
+/*
+ * indirect items consist of entries which contain blocknrs, pos
+ * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
+ * blocknr contained by the entry pos points to
+ */
+#define B_I_POS_UNFM_POINTER(bh, ih, pos) \
+ le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos)))
+#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val) \
+ (*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val))
struct reiserfs_iget_args {
__u32 objectid;
__u32 dirid;
};
-/***************************************************************************/
-/* FUNCTION DECLARATIONS */
-/***************************************************************************/
+/***************************************************************************
+ * FUNCTION DECLARATIONS *
+ ***************************************************************************/
#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
@@ -2273,7 +2701,10 @@ struct reiserfs_iget_args {
/* first block written in a commit. */
struct reiserfs_journal_desc {
__le32 j_trans_id; /* id of commit */
- __le32 j_len; /* length of commit. len +1 is the commit block */
+
+ /* length of commit. len +1 is the commit block */
+ __le32 j_len;
+
__le32 j_mount_id; /* mount id of this trans */
__le32 j_realblock[1]; /* real locations for each block */
};
@@ -2300,22 +2731,35 @@ struct reiserfs_journal_commit {
#define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0)
#define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0)
-/* this header block gets written whenever a transaction is considered fully flushed, and is more recent than the
-** last fully flushed transaction. fully flushed means all the log blocks and all the real blocks are on disk,
-** and this transaction does not need to be replayed.
-*/
+/*
+ * this header block gets written whenever a transaction is considered
+ * fully flushed, and is more recent than the last fully flushed transaction.
+ * fully flushed means all the log blocks and all the real blocks are on
+ * disk, and this transaction does not need to be replayed.
+ */
struct reiserfs_journal_header {
- __le32 j_last_flush_trans_id; /* id of last fully flushed transaction */
- __le32 j_first_unflushed_offset; /* offset in the log of where to start replay after a crash */
+ /* id of last fully flushed transaction */
+ __le32 j_last_flush_trans_id;
+
+ /* offset in the log of where to start replay after a crash */
+ __le32 j_first_unflushed_offset;
+
__le32 j_mount_id;
/* 12 */ struct journal_params jh_journal;
};
/* biggest tunable defines are right here */
#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */
-#define JOURNAL_TRANS_MAX_DEFAULT 1024 /* biggest possible single transaction, don't change for now (8/3/99) */
+
+/* biggest possible single transaction, don't change for now (8/3/99) */
+#define JOURNAL_TRANS_MAX_DEFAULT 1024
#define JOURNAL_TRANS_MIN_DEFAULT 256
-#define JOURNAL_MAX_BATCH_DEFAULT 900 /* max blocks to batch into one transaction, don't make this any bigger than 900 */
+
+/*
+ * max blocks to batch into one transaction,
+ * don't make this any bigger than 900
+ */
+#define JOURNAL_MAX_BATCH_DEFAULT 900
#define JOURNAL_MIN_RATIO 2
#define JOURNAL_MAX_COMMIT_AGE 30
#define JOURNAL_MAX_TRANS_AGE 30
@@ -2340,16 +2784,18 @@ struct reiserfs_journal_header {
#define REISERFS_QUOTA_DEL_BLOCKS(s) 0
#endif
-/* both of these can be as low as 1, or as high as you want. The min is the
-** number of 4k bitmap nodes preallocated on mount. New nodes are allocated
-** as needed, and released when transactions are committed. On release, if
-** the current number of nodes is > max, the node is freed, otherwise,
-** it is put on a free list for faster use later.
+/*
+ * both of these can be as low as 1, or as high as you want. The min is the
+ * number of 4k bitmap nodes preallocated on mount. New nodes are allocated
+ * as needed, and released when transactions are committed. On release, if
+ * the current number of nodes is > max, the node is freed, otherwise,
+ * it is put on a free list for faster use later.
*/
#define REISERFS_MIN_BITMAP_NODES 10
#define REISERFS_MAX_BITMAP_NODES 100
-#define JBH_HASH_SHIFT 13 /* these are based on journal hash size of 8192 */
+/* these are based on journal hash size of 8192 */
+#define JBH_HASH_SHIFT 13
#define JBH_HASH_MASK 8191
#define _jhashfn(sb,block) \
@@ -2357,7 +2803,7 @@ struct reiserfs_journal_header {
(((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
-// We need these to make journal.c code more readable
+/* We need these to make journal.c code more readable */
#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
@@ -2365,12 +2811,14 @@ struct reiserfs_journal_header {
enum reiserfs_bh_state_bits {
BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */
BH_JDirty_wait,
- BH_JNew, /* disk block was taken off free list before
- * being in a finished transaction, or
- * written to disk. Can be reused immed. */
+ /*
+ * disk block was taken off free list before being in a
+ * finished transaction, or written to disk. Can be reused immed.
+ */
+ BH_JNew,
BH_JPrepared,
BH_JRestore_dirty,
- BH_JTest, // debugging only will go away
+ BH_JTest, /* debugging only will go away */
};
BUFFER_FNS(JDirty, journaled);
@@ -2386,27 +2834,36 @@ TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
BUFFER_FNS(JTest, journal_test);
TAS_BUFFER_FNS(JTest, journal_test);
-/*
-** transaction handle which is passed around for all journal calls
-*/
+/* transaction handle which is passed around for all journal calls */
struct reiserfs_transaction_handle {
- struct super_block *t_super; /* super for this FS when journal_begin was
- called. saves calls to reiserfs_get_super
- also used by nested transactions to make
- sure they are nesting on the right FS
- _must_ be first in the handle
- */
+ /*
+ * super for this FS when journal_begin was called. saves calls to
+ * reiserfs_get_super; also used by nested transactions to make
+ * sure they are nesting on the right FS. _must_ be first
+ * in the handle
+ */
+ struct super_block *t_super;
+
int t_refcount;
int t_blocks_logged; /* number of blocks this writer has logged */
int t_blocks_allocated; /* number of blocks this writer allocated */
- unsigned int t_trans_id; /* sanity check, equals the current trans id */
+
+ /* sanity check, equals the current trans id */
+ unsigned int t_trans_id;
+
void *t_handle_save; /* save existing current->journal_info */
- unsigned displace_new_blocks:1; /* if new block allocation occurres, that block
- should be displaced from others */
+
+ /*
+ * if new block allocation occurs, that block
+ * should be displaced from others
+ */
+ unsigned displace_new_blocks:1;
+
struct list_head t_list;
};
-/* used to keep track of ordered and tail writes, attached to the buffer
+/*
+ * used to keep track of ordered and tail writes, attached to the buffer
* head through b_journal_head.
*/
struct reiserfs_jh {
@@ -2419,7 +2876,7 @@ void reiserfs_free_jh(struct buffer_head *bh);
int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
int journal_mark_dirty(struct reiserfs_transaction_handle *,
- struct super_block *, struct buffer_head *bh);
+ struct buffer_head *bh);
static inline int reiserfs_file_data_log(struct inode *inode)
{
@@ -2469,10 +2926,8 @@ int journal_init(struct super_block *, const char *j_dev_name, int old_format,
int journal_release(struct reiserfs_transaction_handle *, struct super_block *);
int journal_release_error(struct reiserfs_transaction_handle *,
struct super_block *);
-int journal_end(struct reiserfs_transaction_handle *, struct super_block *,
- unsigned long);
-int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *,
- unsigned long);
+int journal_end(struct reiserfs_transaction_handle *);
+int journal_end_sync(struct reiserfs_transaction_handle *);
int journal_mark_freed(struct reiserfs_transaction_handle *,
struct super_block *, b_blocknr_t blocknr);
int journal_transaction_should_end(struct reiserfs_transaction_handle *, int);
@@ -2481,7 +2936,7 @@ int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr,
int journal_begin(struct reiserfs_transaction_handle *,
struct super_block *sb, unsigned long);
int journal_join_abort(struct reiserfs_transaction_handle *,
- struct super_block *sb, unsigned long);
+ struct super_block *sb);
void reiserfs_abort_journal(struct super_block *sb, int errno);
void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
int reiserfs_allocate_list_bitmaps(struct super_block *s,
@@ -2503,20 +2958,18 @@ int B_IS_IN_TREE(const struct buffer_head *);
extern void copy_item_head(struct item_head *to,
const struct item_head *from);
-// first key is in cpu form, second - le
+/* first key is in cpu form, second - le */
extern int comp_short_keys(const struct reiserfs_key *le_key,
const struct cpu_key *cpu_key);
extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from);
-// both are in le form
+/* both are in le form */
extern int comp_le_keys(const struct reiserfs_key *,
const struct reiserfs_key *);
extern int comp_short_le_keys(const struct reiserfs_key *,
const struct reiserfs_key *);
-//
-// get key version from on disk key - kludge
-//
+/* get key version from on disk key - kludge */
static inline int le_key_version(const struct reiserfs_key *key)
{
int type;
@@ -2593,12 +3046,12 @@ void padd_item(char *item, int total_length, int length);
/* inode.c */
/* args for the create parameter of reiserfs_get_block */
-#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
-#define GET_BLOCK_CREATE 1 /* add anything you need to find block */
-#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */
-#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */
-#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */
-#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */
+#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
+#define GET_BLOCK_CREATE 1 /* add anything you need to find block */
+#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */
+#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */
+#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */
+#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */
void reiserfs_read_locked_inode(struct inode *inode,
struct reiserfs_iget_args *args);
@@ -2797,25 +3250,49 @@ struct buffer_head *get_FEB(struct tree_balance *);
/* bitmap.c */
-/* structure contains hints for block allocator, and it is a container for
- * arguments, such as node, search path, transaction_handle, etc. */
+/*
+ * structure contains hints for block allocator, and it is a container for
+ * arguments, such as node, search path, transaction_handle, etc.
+ */
struct __reiserfs_blocknr_hint {
- struct inode *inode; /* inode passed to allocator, if we allocate unf. nodes */
+ /* inode passed to allocator, if we allocate unf. nodes */
+ struct inode *inode;
+
sector_t block; /* file offset, in blocks */
struct in_core_key key;
- struct treepath *path; /* search path, used by allocator to deternine search_start by
- * various ways */
- struct reiserfs_transaction_handle *th; /* transaction handle is needed to log super blocks and
- * bitmap blocks changes */
+
+ /*
+ * search path, used by allocator to determine search_start by
+ * various ways
+ */
+ struct treepath *path;
+
+ /*
+ * transaction handle is needed to log super blocks
+ * and bitmap blocks changes
+ */
+ struct reiserfs_transaction_handle *th;
+
b_blocknr_t beg, end;
- b_blocknr_t search_start; /* a field used to transfer search start value (block number)
- * between different block allocator procedures
- * (determine_search_start() and others) */
- int prealloc_size; /* is set in determine_prealloc_size() function, used by underlayed
- * function that do actual allocation */
-
- unsigned formatted_node:1; /* the allocator uses different polices for getting disk space for
- * formatted/unformatted blocks with/without preallocation */
+
+ /*
+ * a field used to transfer search start value (block number)
+ * between different block allocator procedures
+ * (determine_search_start() and others)
+ */
+ b_blocknr_t search_start;
+
+ /*
+ * is set in determine_prealloc_size() function,
+ * used by the underlying functions that do the actual allocation
+ */
+ int prealloc_size;
+
+ /*
+ * the allocator uses different policies for getting disk
+ * space for formatted/unformatted blocks with/without preallocation
+ */
+ unsigned formatted_node:1;
unsigned preallocate:1;
};
@@ -2909,13 +3386,15 @@ __u32 r5_hash(const signed char *msg, int len);
#define reiserfs_test_le_bit test_bit_le
#define reiserfs_find_next_zero_le_bit find_next_zero_bit_le
-/* sometimes reiserfs_truncate may require to allocate few new blocks
- to perform indirect2direct conversion. People probably used to
- think, that truncate should work without problems on a filesystem
- without free disk space. They may complain that they can not
- truncate due to lack of free disk space. This spare space allows us
- to not worry about it. 500 is probably too much, but it should be
- absolutely safe */
+/*
+ * sometimes reiserfs_truncate may require to allocate few new blocks
+ * to perform indirect2direct conversion. People probably used to
+ * think, that truncate should work without problems on a filesystem
+ * without free disk space. They may complain that they can not
+ * truncate due to lack of free disk space. This spare space allows us
+ * to not worry about it. 500 is probably too much, but it should be
+ * absolutely safe
+ */
#define SPARE_SPACE 500
/* prototypes from ioctl.c */
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index a4ef5cd606eb..6052d323bc9a 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -53,8 +53,10 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
}
bforget(bh);
- /* old disk layout detection; those partitions can be mounted, but
- * cannot be resized */
+ /*
+ * old disk layout detection; those partitions can be mounted, but
+ * cannot be resized
+ */
if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
!= REISERFS_DISK_OFFSET_IN_BYTES) {
printk
@@ -86,12 +88,14 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
return -ENOMEM;
}
- /* the new journal bitmaps are zero filled, now we copy in the bitmap
- ** node pointers from the old journal bitmap structs, and then
- ** transfer the new data structures into the journal struct.
- **
- ** using the copy_size var below allows this code to work for
- ** both shrinking and expanding the FS.
+ /*
+ * the new journal bitmaps are zero filled, now we copy in
+ * the bitmap node pointers from the old journal bitmap
+ * structs, and then transfer the new data structures
+ * into the journal struct.
+ *
+ * using the copy_size var below allows this code to work for
+ * both shrinking and expanding the FS.
*/
copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr;
copy_size =
@@ -101,36 +105,45 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
jb = SB_JOURNAL(s)->j_list_bitmap + i;
memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
- /* just in case vfree schedules on us, copy the new
- ** pointer into the journal struct before freeing the
- ** old one
+ /*
+ * just in case vfree schedules on us, copy the new
+ * pointer into the journal struct before freeing the
+ * old one
*/
node_tmp = jb->bitmaps;
jb->bitmaps = jbitmap[i].bitmaps;
vfree(node_tmp);
}
- /* allocate additional bitmap blocks, reallocate array of bitmap
- * block pointers */
+ /*
+ * allocate additional bitmap blocks, reallocate
+ * array of bitmap block pointers
+ */
bitmap =
vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
if (!bitmap) {
- /* Journal bitmaps are still supersized, but the memory isn't
- * leaked, so I guess it's ok */
+ /*
+ * Journal bitmaps are still supersized, but the
+ * memory isn't leaked, so I guess it's ok
+ */
printk("reiserfs_resize: unable to allocate memory.\n");
return -ENOMEM;
}
for (i = 0; i < bmap_nr; i++)
bitmap[i] = old_bitmap[i];
- /* This doesn't go through the journal, but it doesn't have to.
- * The changes are still atomic: We're synced up when the journal
- * transaction begins, and the new bitmaps don't matter if the
- * transaction fails. */
+ /*
+ * This doesn't go through the journal, but it doesn't have to.
+ * The changes are still atomic: We're synced up when the
+ * journal transaction begins, and the new bitmaps don't
+ * matter if the transaction fails.
+ */
for (i = bmap_nr; i < bmap_nr_new; i++) {
int depth;
- /* don't use read_bitmap_block since it will cache
- * the uninitialized bitmap */
+ /*
+ * don't use read_bitmap_block since it will cache
+ * the uninitialized bitmap
+ */
depth = reiserfs_write_unlock_nested(s);
bh = sb_bread(s, i * s->s_blocksize * 8);
reiserfs_write_lock_nested(s, depth);
@@ -147,7 +160,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
depth = reiserfs_write_unlock_nested(s);
sync_dirty_buffer(bh);
reiserfs_write_lock_nested(s, depth);
- // update bitmap_info stuff
+ /* update bitmap_info stuff */
bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
brelse(bh);
}
@@ -156,9 +169,11 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
vfree(old_bitmap);
}
- /* begin transaction, if there was an error, it's fine. Yes, we have
+ /*
+ * begin transaction, if there was an error, it's fine. Yes, we have
* incorrect bitmaps now, but none of it is ever going to touch the
- * disk anyway. */
+ * disk anyway.
+ */
err = journal_begin(&th, s, 10);
if (err)
return err;
@@ -167,7 +182,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
info = SB_AP_BITMAP(s) + bmap_nr - 1;
bh = reiserfs_read_bitmap_block(s, bmap_nr - 1);
if (!bh) {
- int jerr = journal_end(&th, s, 10);
+ int jerr = journal_end(&th);
if (jerr)
return jerr;
return -EIO;
@@ -178,14 +193,14 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
reiserfs_clear_le_bit(i, bh->b_data);
info->free_count += s->s_blocksize * 8 - block_r;
- journal_mark_dirty(&th, s, bh);
+ journal_mark_dirty(&th, bh);
brelse(bh);
/* Correct new last bitmap block - It may not be full */
info = SB_AP_BITMAP(s) + bmap_nr_new - 1;
bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1);
if (!bh) {
- int jerr = journal_end(&th, s, 10);
+ int jerr = journal_end(&th);
if (jerr)
return jerr;
return -EIO;
@@ -194,7 +209,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
reiserfs_prepare_for_journal(s, bh, 1);
for (i = block_r_new; i < s->s_blocksize * 8; i++)
reiserfs_set_le_bit(i, bh->b_data);
- journal_mark_dirty(&th, s, bh);
+ journal_mark_dirty(&th, bh);
brelse(bh);
info->free_count -= s->s_blocksize * 8 - block_r_new;
@@ -207,8 +222,8 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
PUT_SB_BLOCK_COUNT(s, block_count_new);
PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
SB_JOURNAL(s)->j_must_wait = 1;
- return journal_end(&th, s, 10);
+ return journal_end(&th);
}
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 615cd9ab7940..dd44468edc2b 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -8,46 +8,6 @@
* Pereslavl-Zalessky Russia
*/
-/*
- * This file contains functions dealing with S+tree
- *
- * B_IS_IN_TREE
- * copy_item_head
- * comp_short_keys
- * comp_keys
- * comp_short_le_keys
- * le_key2cpu_key
- * comp_le_keys
- * bin_search
- * get_lkey
- * get_rkey
- * key_in_buffer
- * decrement_bcount
- * reiserfs_check_path
- * pathrelse_and_restore
- * pathrelse
- * search_by_key_reada
- * search_by_key
- * search_for_position_by_key
- * comp_items
- * prepare_for_direct_item
- * prepare_for_direntry_item
- * prepare_for_delete_or_cut
- * calc_deleted_bytes_number
- * init_tb_struct
- * padd_item
- * reiserfs_delete_item
- * reiserfs_delete_solid_item
- * reiserfs_delete_object
- * maybe_indirect_to_direct
- * indirect_to_direct_roll_back
- * reiserfs_cut_from_item
- * truncate_directory
- * reiserfs_do_truncate
- * reiserfs_paste_into_item
- * reiserfs_insert_item
- */
-
#include <linux/time.h>
#include <linux/string.h>
#include <linux/pagemap.h>
@@ -65,21 +25,21 @@ inline int B_IS_IN_TREE(const struct buffer_head *bh)
return (B_LEVEL(bh) != FREE_LEVEL);
}
-//
-// to gets item head in le form
-//
+/* to get item head in le form */
inline void copy_item_head(struct item_head *to,
const struct item_head *from)
{
memcpy(to, from, IH_SIZE);
}
-/* k1 is pointer to on-disk structure which is stored in little-endian
- form. k2 is pointer to cpu variable. For key of items of the same
- object this returns 0.
- Returns: -1 if key1 < key2
- 0 if key1 == key2
- 1 if key1 > key2 */
+/*
+ * k1 is pointer to on-disk structure which is stored in little-endian
+ * form. k2 is pointer to cpu variable. For key of items of the same
+ * object this returns 0.
+ * Returns: -1 if key1 < key2
+ * 0 if key1 == key2
+ * 1 if key1 > key2
+ */
inline int comp_short_keys(const struct reiserfs_key *le_key,
const struct cpu_key *cpu_key)
{
@@ -97,11 +57,13 @@ inline int comp_short_keys(const struct reiserfs_key *le_key,
return 0;
}
-/* k1 is pointer to on-disk structure which is stored in little-endian
- form. k2 is pointer to cpu variable.
- Compare keys using all 4 key fields.
- Returns: -1 if key1 < key2 0
- if key1 = key2 1 if key1 > key2 */
+/*
+ * k1 is pointer to on-disk structure which is stored in little-endian
+ * form. k2 is pointer to cpu variable.
+ * Compare keys using all 4 key fields.
+ * Returns: -1 if key1 < key2, 0 if key1 == key2,
+ * 1 if key1 > key2
+ */
static inline int comp_keys(const struct reiserfs_key *le_key,
const struct cpu_key *cpu_key)
{
@@ -155,15 +117,17 @@ inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
- // find out version of the key
+ /* find out version of the key */
version = le_key_version(from);
to->version = version;
to->on_disk_key.k_offset = le_key_k_offset(version, from);
to->on_disk_key.k_type = le_key_k_type(version, from);
}
-// this does not say which one is bigger, it only returns 1 if keys
-// are not equal, 0 otherwise
+/*
+ * this does not say which one is bigger, it only returns 1 if keys
+ * are not equal, 0 otherwise
+ */
inline int comp_le_keys(const struct reiserfs_key *k1,
const struct reiserfs_key *k2)
{
@@ -177,24 +141,27 @@ inline int comp_le_keys(const struct reiserfs_key *k1,
* *pos = number of the searched element if found, else the *
* number of the first element that is larger than key. *
**************************************************************************/
-/* For those not familiar with binary search: lbound is the leftmost item that it
- could be, rbound the rightmost item that it could be. We examine the item
- halfway between lbound and rbound, and that tells us either that we can increase
- lbound, or decrease rbound, or that we have found it, or if lbound <= rbound that
- there are no possible items, and we have not found it. With each examination we
- cut the number of possible items it could be by one more than half rounded down,
- or we find it. */
+/*
+ * For those not familiar with binary search: lbound is the leftmost item
+ * that it could be, rbound the rightmost item that it could be. We examine
+ * the item halfway between lbound and rbound, and that tells us either
+ * that we can increase lbound, or decrease rbound, or that we have found it,
+ * or if lbound > rbound that there are no possible items, and we have not
+ * found it. With each examination we cut the number of possible items it
+ * could be by one more than half rounded down, or we find it.
+ */
static inline int bin_search(const void *key, /* Key to search for. */
const void *base, /* First item in the array. */
int num, /* Number of items in the array. */
- int width, /* Item size in the array.
- searched. Lest the reader be
- confused, note that this is crafted
- as a general function, and when it
- is applied specifically to the array
- of item headers in a node, width
- is actually the item header size not
- the item size. */
+ /*
+ * Item size in the array searched. Lest the
+ * reader be confused, note that this is crafted
+ * as a general function, and when it is applied
+ * specifically to the array of item headers in a
+ * node, width is actually the item header size
+ * not the item size.
+ */
+ int width,
int *pos /* Number of the searched for element. */
)
{
@@ -216,8 +183,10 @@ static inline int bin_search(const void *key, /* Key to search for. */
return ITEM_FOUND; /* Key found in the array. */
}
- /* bin_search did not find given key, it returns position of key,
- that is minimal and greater than the given one. */
+ /*
+ * bin_search did not find given key, it returns position of key,
+ * that is minimal and greater than the given one.
+ */
*pos = lbound;
return ITEM_NOT_FOUND;
}
@@ -234,10 +203,14 @@ static const struct reiserfs_key MAX_KEY = {
cpu_to_le32(0xffffffff)},}
};
-/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom
- of the path, and going upwards. We must check the path's validity at each step. If the key is not in
- the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this
- case we return a special key, either MIN_KEY or MAX_KEY. */
+/*
+ * Get delimiting key of the buffer by looking for it in the buffers in the
+ * path, starting from the bottom of the path, and going upwards. We must
+ * check the path's validity at each step. If the key is not in the path,
+ * there is no delimiting key in the tree (buffer is first or last buffer
+ * in tree), and in this case we return a special key, either MIN_KEY or
+ * MAX_KEY.
+ */
static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
const struct super_block *sb)
{
@@ -270,9 +243,12 @@ static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_pat
PATH_OFFSET_PBUFFER(chk_path,
path_offset + 1)->b_blocknr)
return &MAX_KEY;
- /* Return delimiting key if position in the parent is not equal to zero. */
+ /*
+ * Return delimiting key if position in the parent
+ * is not equal to zero.
+ */
if (position)
- return B_N_PDELIM_KEY(parent, position - 1);
+ return internal_key(parent, position - 1);
}
/* Return MIN_KEY if we are in the root of the buffer tree. */
if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
@@ -308,15 +284,23 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
path_offset)) >
B_NR_ITEMS(parent))
return &MIN_KEY;
- /* Check whether parent at the path really points to the child. */
+ /*
+ * Check whether parent at the path really points
+ * to the child.
+ */
if (B_N_CHILD_NUM(parent, position) !=
PATH_OFFSET_PBUFFER(chk_path,
path_offset + 1)->b_blocknr)
return &MIN_KEY;
- /* Return delimiting key if position in the parent is not the last one. */
+
+ /*
+ * Return delimiting key if position in the parent
+ * is not the last one.
+ */
if (position != B_NR_ITEMS(parent))
- return B_N_PDELIM_KEY(parent, position);
+ return internal_key(parent, position);
}
+
/* Return MAX_KEY if we are in the root of the buffer tree. */
if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
b_blocknr == SB_ROOT_BLOCK(sb))
@@ -324,13 +308,20 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
return &MIN_KEY;
}
-/* Check whether a key is contained in the tree rooted from a buffer at a path. */
-/* This works by looking at the left and right delimiting keys for the buffer in the last path_element in
- the path. These delimiting keys are stored at least one level above that buffer in the tree. If the
- buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in
- this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */
-static inline int key_in_buffer(struct treepath *chk_path, /* Path which should be checked. */
- const struct cpu_key *key, /* Key which should be checked. */
+/*
+ * Check whether a key is contained in the tree rooted from a buffer at a path.
+ * This works by looking at the left and right delimiting keys for the buffer
+ * in the last path_element in the path. These delimiting keys are stored
+ * at least one level above that buffer in the tree. If the buffer is the
+ * first or last node in the tree order then one of the delimiting keys may
+ * be absent, and in this case get_lkey and get_rkey return a special key
+ * which is MIN_KEY or MAX_KEY.
+ */
+static inline int key_in_buffer(
+ /* Path which should be checked. */
+ struct treepath *chk_path,
+ /* Key which should be checked. */
+ const struct cpu_key *key,
struct super_block *sb
)
{
@@ -359,9 +350,11 @@ int reiserfs_check_path(struct treepath *p)
return 0;
}
-/* Drop the reference to each buffer in a path and restore
+/*
+ * Drop the reference to each buffer in a path and restore
* dirty bits clean when preparing the buffer for the log.
- * This version should only be called from fix_nodes() */
+ * This version should only be called from fix_nodes()
+ */
void pathrelse_and_restore(struct super_block *sb,
struct treepath *search_path)
{
@@ -418,14 +411,17 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
}
ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
+
+ /* free space does not match the calculated amount of used space */
if (used_space != blocksize - blkh_free_space(blkh)) {
- /* free space does not match to calculated amount of use space */
reiserfs_warning(NULL, "reiserfs-5082",
"free space seems wrong: %z", bh);
return 0;
}
- // FIXME: it is_leaf will hit performance too much - we may have
- // return 1 here
+ /*
+ * FIXME: if is_leaf will hit performance too much - we may have
+ * to return 1 here
+ */
/* check tables of item heads */
ih = (struct item_head *)(buf + BLKH_SIZE);
@@ -460,7 +456,7 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
prev_location = ih_location(ih);
}
- // one may imagine much more checks
+ /* one may imagine many more checks */
return 1;
}
@@ -481,8 +477,8 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
}
nr = blkh_nr_item(blkh);
+ /* for internal which is not root we might check min number of keys */
if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
- /* for internal which is not root we might check min number of keys */
reiserfs_warning(NULL, "reiserfs-5088",
"number of key seems wrong: %z", bh);
return 0;
@@ -494,12 +490,15 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
"free space seems wrong: %z", bh);
return 0;
}
- // one may imagine much more checks
+
+ /* one may imagine many more checks */
return 1;
}
-// make sure that bh contains formatted node of reiserfs tree of
-// 'level'-th level
+/*
+ * make sure that bh contains formatted node of reiserfs tree of
+ * 'level'-th level
+ */
static int is_tree_node(struct buffer_head *bh, int level)
{
if (B_LEVEL(bh) != level) {
@@ -546,7 +545,8 @@ static int search_by_key_reada(struct super_block *s,
for (j = 0; j < i; j++) {
/*
* note, this needs attention if we are getting rid of the BKL
- * you have to make sure the prepared bit isn't set on this buffer
+ * you have to make sure the prepared bit isn't set on this
+ * buffer
*/
if (!buffer_uptodate(bh[j])) {
if (depth == -1)
@@ -558,39 +558,34 @@ static int search_by_key_reada(struct super_block *s,
return depth;
}
-/**************************************************************************
- * Algorithm SearchByKey *
- * look for item in the Disk S+Tree by its key *
- * Input: sb - super block *
- * key - pointer to the key to search *
- * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR *
- * search_path - path from the root to the needed leaf *
- **************************************************************************/
-
-/* This function fills up the path from the root to the leaf as it
- descends the tree looking for the key. It uses reiserfs_bread to
- try to find buffers in the cache given their block number. If it
- does not find them in the cache it reads them from disk. For each
- node search_by_key finds using reiserfs_bread it then uses
- bin_search to look through that node. bin_search will find the
- position of the block_number of the next node if it is looking
- through an internal node. If it is looking through a leaf node
- bin_search will find the position of the item which has key either
- equal to given key, or which is the maximal key less than the given
- key. search_by_key returns a path that must be checked for the
- correctness of the top of the path but need not be checked for the
- correctness of the bottom of the path */
-/* The function is NOT SCHEDULE-SAFE! */
-int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to search. */
- struct treepath *search_path,/* This structure was
- allocated and initialized
- by the calling
- function. It is filled up
- by this function. */
- int stop_level /* How far down the tree to search. To
- stop at leaf level - set to
- DISK_LEAF_NODE_LEVEL */
- )
+/*
+ * This function fills up the path from the root to the leaf as it
+ * descends the tree looking for the key. It uses reiserfs_bread to
+ * try to find buffers in the cache given their block number. If it
+ * does not find them in the cache it reads them from disk. For each
+ * node search_by_key finds using reiserfs_bread it then uses
+ * bin_search to look through that node. bin_search will find the
+ * position of the block_number of the next node if it is looking
+ * through an internal node. If it is looking through a leaf node
+ * bin_search will find the position of the item which has key either
+ * equal to given key, or which is the maximal key less than the given
+ * key. search_by_key returns a path that must be checked for the
+ * correctness of the top of the path but need not be checked for the
+ * correctness of the bottom of the path
+ */
+/*
+ * search_by_key - search for key (and item) in stree
+ * @sb: superblock
+ * @key: pointer to key to search for
+ * @search_path: Allocated and initialized struct treepath; Returned filled
+ * on success.
+ * @stop_level: How far down the tree to search. Use DISK_LEAF_NODE_LEVEL to
+ * stop at leaf level.
+ *
+ * The function is NOT SCHEDULE-SAFE!
+ */
+int search_by_key(struct super_block *sb, const struct cpu_key *key,
+ struct treepath *search_path, int stop_level)
{
b_blocknr_t block_number;
int expected_level;
@@ -609,17 +604,22 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
PROC_INFO_INC(sb, search_by_key);
- /* As we add each node to a path we increase its count. This means that
- we must be careful to release all nodes in a path before we either
- discard the path struct or re-use the path struct, as we do here. */
+ /*
+ * As we add each node to a path we increase its count. This means
+ * that we must be careful to release all nodes in a path before we
+ * either discard the path struct or re-use the path struct, as we
+ * do here.
+ */
pathrelse(search_path);
right_neighbor_of_leaf_node = 0;
- /* With each iteration of this loop we search through the items in the
- current node, and calculate the next current node(next path element)
- for the next iteration of this loop.. */
+ /*
+ * With each iteration of this loop we search through the items in the
+ * current node, and calculate the next current node(next path element)
+ * for the next iteration of this loop.
+ */
block_number = SB_ROOT_BLOCK(sb);
expected_level = -1;
while (1) {
@@ -639,8 +639,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
++search_path->path_length);
fs_gen = get_generation(sb);
- /* Read the next tree node, and set the last element in the path to
- have a pointer to it. */
+ /*
+ * Read the next tree node, and set the last element
+ * in the path to have a pointer to it.
+ */
if ((bh = last_element->pe_buffer =
sb_getblk(sb, block_number))) {
@@ -666,7 +668,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
if (!buffer_uptodate(bh))
goto io_error;
} else {
- io_error:
+io_error:
search_path->path_length--;
pathrelse(search_path);
return IO_ERROR;
@@ -676,9 +678,12 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
expected_level = SB_TREE_HEIGHT(sb);
expected_level--;
- /* It is possible that schedule occurred. We must check whether the key
- to search is still in the tree rooted from the current buffer. If
- not then repeat search from the root. */
+ /*
+ * It is possible that schedule occurred. We must check
+ * whether the key to search is still in the tree rooted
+ * from the current buffer. If not then repeat search
+ * from the root.
+ */
if (fs_changed(fs_gen, sb) &&
(!B_IS_IN_TREE(bh) ||
B_LEVEL(bh) != expected_level ||
@@ -689,8 +694,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
sbk_restarted[expected_level - 1]);
pathrelse(search_path);
- /* Get the root block number so that we can repeat the search
- starting from the root. */
+ /*
+ * Get the root block number so that we can
+ * repeat the search starting from the root.
+ */
block_number = SB_ROOT_BLOCK(sb);
expected_level = -1;
right_neighbor_of_leaf_node = 0;
@@ -699,9 +706,11 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
continue;
}
- /* only check that the key is in the buffer if key is not
- equal to the MAX_KEY. Latter case is only possible in
- "finish_unfinished()" processing during mount. */
+ /*
+ * only check that the key is in the buffer if key is not
+ * equal to the MAX_KEY. Latter case is only possible in
+ * "finish_unfinished()" processing during mount.
+ */
RFALSE(comp_keys(&MAX_KEY, key) &&
!key_in_buffer(search_path, key, sb),
"PAP-5130: key is not in the buffer");
@@ -713,8 +722,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
}
#endif
- // make sure, that the node contents look like a node of
- // certain level
+ /*
+ * make sure, that the node contents look like a node of
+ * certain level
+ */
if (!is_tree_node(bh, expected_level)) {
reiserfs_error(sb, "vs-5150",
"invalid format found in block %ld. "
@@ -732,32 +743,42 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
"vs-5152: tree level (%d) is less than stop level (%d)",
node_level, stop_level);
- retval = bin_search(key, B_N_PITEM_HEAD(bh, 0),
+ retval = bin_search(key, item_head(bh, 0),
B_NR_ITEMS(bh),
(node_level ==
DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
KEY_SIZE,
- &(last_element->pe_position));
+ &last_element->pe_position);
if (node_level == stop_level) {
return retval;
}
/* we are not in the stop level */
+ /*
+ * item has been found, so we choose the pointer which
+ * is to the right of the found one
+ */
if (retval == ITEM_FOUND)
- /* item has been found, so we choose the pointer which is to the right of the found one */
last_element->pe_position++;
- /* if item was not found we choose the position which is to
- the left of the found item. This requires no code,
- bin_search did it already. */
+ /*
+ * if item was not found we choose the position which is to
+ * the left of the found item. This requires no code,
+ * bin_search did it already.
+ */
- /* So we have chosen a position in the current node which is
- an internal node. Now we calculate child block number by
- position in the node. */
+ /*
+ * So we have chosen a position in the current node which is
+ * an internal node. Now we calculate child block number by
+ * position in the node.
+ */
block_number =
B_N_CHILD_NUM(bh, last_element->pe_position);
- /* if we are going to read leaf nodes, try for read ahead as well */
+ /*
+ * if we are going to read leaf nodes, try for read
+ * ahead as well
+ */
if ((search_path->reada & PATH_READA) &&
node_level == DISK_LEAF_NODE_LEVEL + 1) {
int pos = last_element->pe_position;
@@ -779,7 +800,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
/*
* check to make sure we're in the same object
*/
- le_key = B_N_PDELIM_KEY(bh, pos);
+ le_key = internal_key(bh, pos);
if (le32_to_cpu(le_key->k_objectid) !=
key->on_disk_key.k_objectid) {
break;
@@ -789,26 +810,28 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
}
}
-/* Form the path to an item and position in this item which contains
- file byte defined by key. If there is no such item
- corresponding to the key, we point the path to the item with
- maximal key less than key, and *pos_in_item is set to one
- past the last entry/byte in the item. If searching for entry in a
- directory item, and it is not found, *pos_in_item is set to one
- entry more than the entry with maximal key which is less than the
- sought key.
-
- Note that if there is no entry in this same node which is one more,
- then we point to an imaginary entry. for direct items, the
- position is in units of bytes, for indirect items the position is
- in units of blocknr entries, for directory items the position is in
- units of directory entries. */
-
+/*
+ * Form the path to an item and position in this item which contains
+ * file byte defined by key. If there is no such item
+ * corresponding to the key, we point the path to the item with
+ * maximal key less than key, and *pos_in_item is set to one
+ * past the last entry/byte in the item. If searching for entry in a
+ * directory item, and it is not found, *pos_in_item is set to one
+ * entry more than the entry with maximal key which is less than the
+ * sought key.
+ *
+ * Note that if there is no entry in this same node which is one more,
+ * then we point to an imaginary entry. for direct items, the
+ * position is in units of bytes, for indirect items the position is
+ * in units of blocknr entries, for directory items the position is in
+ * units of directory entries.
+ */
/* The function is NOT SCHEDULE-SAFE! */
-int search_for_position_by_key(struct super_block *sb, /* Pointer to the super block. */
- const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */
- struct treepath *search_path /* Filled up by this function. */
- )
+int search_for_position_by_key(struct super_block *sb,
+ /* Key to search (cpu variable) */
+ const struct cpu_key *p_cpu_key,
+ /* Filled up by this function. */
+ struct treepath *search_path)
{
struct item_head *p_le_ih; /* pointer to on-disk structure */
int blk_size;
@@ -830,7 +853,7 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b
if (retval == ITEM_FOUND) {
RFALSE(!ih_item_len
- (B_N_PITEM_HEAD
+ (item_head
(PATH_PLAST_BUFFER(search_path),
PATH_LAST_POSITION(search_path))),
"PAP-5165: item length equals zero");
@@ -844,14 +867,14 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b
/* Item is not found. Set path to the previous item. */
p_le_ih =
- B_N_PITEM_HEAD(PATH_PLAST_BUFFER(search_path),
+ item_head(PATH_PLAST_BUFFER(search_path),
--PATH_LAST_POSITION(search_path));
blk_size = sb->s_blocksize;
- if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) {
+ if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key))
return FILE_NOT_FOUND;
- }
- // FIXME: quite ugly this far
+
+ /* FIXME: quite ugly this far */
item_offset = le_ih_k_offset(p_le_ih);
offset = cpu_key_k_offset(p_cpu_key);
@@ -866,8 +889,10 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b
return POSITION_FOUND;
}
- /* Needed byte is not contained in the item pointed to by the
- path. Set pos_in_item out of the item. */
+ /*
+ * Needed byte is not contained in the item pointed to by the
+ * path. Set pos_in_item out of the item.
+ */
if (is_indirect_le_ih(p_le_ih))
pos_in_item(search_path) =
ih_item_len(p_le_ih) / UNFM_P_SIZE;
@@ -892,19 +917,17 @@ int comp_items(const struct item_head *stored_ih, const struct treepath *path)
return 1;
/* we need only to know, whether it is the same item */
- ih = get_ih(path);
+ ih = tp_item_head(path);
return memcmp(stored_ih, ih, IH_SIZE);
}
-/* unformatted nodes are not logged anymore, ever. This is safe
-** now
-*/
+/* unformatted nodes are not logged anymore, ever. This is safe now */
#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1)
-// block can not be forgotten as it is in I/O or held by someone
+/* block can not be forgotten as it is in I/O or held by someone */
#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh)))
-// prepare for delete or cut of direct item
+/* prepare for delete or cut of direct item */
static inline int prepare_for_direct_item(struct treepath *path,
struct item_head *le_ih,
struct inode *inode,
@@ -917,9 +940,8 @@ static inline int prepare_for_direct_item(struct treepath *path,
*cut_size = -(IH_SIZE + ih_item_len(le_ih));
return M_DELETE;
}
- // new file gets truncated
+ /* new file gets truncated */
if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
- //
round_len = ROUND_UP(new_file_length);
/* this was new_file_length < le_ih ... */
if (round_len < le_ih_k_offset(le_ih)) {
@@ -933,12 +955,13 @@ static inline int prepare_for_direct_item(struct treepath *path,
return M_CUT; /* Cut from this item. */
}
- // old file: items may have any length
+ /* old file: items may have any length */
if (new_file_length < le_ih_k_offset(le_ih)) {
*cut_size = -(IH_SIZE + ih_item_len(le_ih));
return M_DELETE; /* Delete this item. */
}
+
/* Calculate first position and size for cutting from item. */
*cut_size = -(ih_item_len(le_ih) -
(pos_in_item(path) =
@@ -957,12 +980,15 @@ static inline int prepare_for_direntry_item(struct treepath *path,
RFALSE(ih_entry_count(le_ih) != 2,
"PAP-5220: incorrect empty directory item (%h)", le_ih);
*cut_size = -(IH_SIZE + ih_item_len(le_ih));
- return M_DELETE; /* Delete the directory item containing "." and ".." entry. */
+ /* Delete the directory item containing "." and ".." entry. */
+ return M_DELETE;
}
if (ih_entry_count(le_ih) == 1) {
- /* Delete the directory item such as there is one record only
- in this item */
+ /*
+ * Delete the directory item, since there is only one record
+ * in this item
+ */
*cut_size = -(IH_SIZE + ih_item_len(le_ih));
return M_DELETE;
}
@@ -976,18 +1002,34 @@ static inline int prepare_for_direntry_item(struct treepath *path,
#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
-/* If the path points to a directory or direct item, calculate mode and the size cut, for balance.
- If the path points to an indirect item, remove some number of its unformatted nodes.
- In case of file truncate calculate whether this item must be deleted/truncated or last
- unformatted node of this item will be converted to a direct item.
- This function returns a determination of what balance mode the calling function should employ. */
-static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, const struct cpu_key *item_key, int *removed, /* Number of unformatted nodes which were removed
- from end of the file. */
- int *cut_size, unsigned long long new_file_length /* MAX_KEY_OFFSET in case of delete. */
+/*
+ * If the path points to a directory or direct item, calculate mode
+ * and the size cut, for balance.
+ * If the path points to an indirect item, remove some number of its
+ * unformatted nodes.
+ * In case of file truncate calculate whether this item must be
+ * deleted/truncated or last unformatted node of this item will be
+ * converted to a direct item.
+ * This function returns a determination of what balance mode the
+ * calling function should employ.
+ */
+static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th,
+ struct inode *inode,
+ struct treepath *path,
+ const struct cpu_key *item_key,
+ /*
+ * Number of unformatted nodes
+ * which were removed from end
+ * of the file.
+ */
+ int *removed,
+ int *cut_size,
+ /* MAX_KEY_OFFSET in case of delete. */
+ unsigned long long new_file_length
)
{
struct super_block *sb = inode->i_sb;
- struct item_head *p_le_ih = PATH_PITEM_HEAD(path);
+ struct item_head *p_le_ih = tp_item_head(path);
struct buffer_head *bh = PATH_PLAST_BUFFER(path);
BUG_ON(!th->t_trans_id);
@@ -1023,8 +1065,10 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
int pos = 0;
if ( new_file_length == max_reiserfs_offset (inode) ) {
- /* prepare_for_delete_or_cut() is called by
- * reiserfs_delete_item() */
+ /*
+ * prepare_for_delete_or_cut() is called by
+ * reiserfs_delete_item()
+ */
new_file_length = 0;
delete = 1;
}
@@ -1033,27 +1077,30 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
need_re_search = 0;
*cut_size = 0;
bh = PATH_PLAST_BUFFER(path);
- copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
+ copy_item_head(&s_ih, tp_item_head(path));
pos = I_UNFM_NUM(&s_ih);
while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
__le32 *unfm;
__u32 block;
- /* Each unformatted block deletion may involve one additional
- * bitmap block into the transaction, thereby the initial
- * journal space reservation might not be enough. */
+ /*
+ * Each unformatted block deletion may involve
+ * one additional bitmap block into the transaction,
+ * thereby the initial journal space reservation
+ * might not be enough.
+ */
if (!delete && (*cut_size) != 0 &&
reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
break;
- unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1;
+ unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1;
block = get_block_num(unfm, 0);
if (block != 0) {
reiserfs_prepare_for_journal(sb, bh, 1);
put_block_num(unfm, 0, 0);
- journal_mark_dirty(th, sb, bh);
+ journal_mark_dirty(th, bh);
reiserfs_free_block(th, inode, block, 1);
}
@@ -1074,17 +1121,21 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
break;
}
}
- /* a trick. If the buffer has been logged, this will do nothing. If
- ** we've broken the loop without logging it, it will restore the
- ** buffer */
+ /*
+ * a trick. If the buffer has been logged, this will
+ * do nothing. If we've broken the loop without logging
+ * it, it will restore the buffer
+ */
reiserfs_restore_prepared_buffer(sb, bh);
} while (need_re_search &&
search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
pos_in_item(path) = pos * UNFM_P_SIZE;
if (*cut_size == 0) {
- /* Nothing were cut. maybe convert last unformatted node to the
- * direct item? */
+ /*
+ * Nothing was cut. maybe convert last unformatted node to the
+ * direct item?
+ */
result = M_CONVERT;
}
return result;
@@ -1095,7 +1146,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
{
int del_size;
- struct item_head *p_le_ih = PATH_PITEM_HEAD(tb->tb_path);
+ struct item_head *p_le_ih = tp_item_head(tb->tb_path);
if (is_statdata_le_ih(p_le_ih))
return 0;
@@ -1104,9 +1155,11 @@ static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
(mode ==
M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
if (is_direntry_le_ih(p_le_ih)) {
- /* return EMPTY_DIR_SIZE; We delete emty directoris only.
- * we can't use EMPTY_DIR_SIZE, as old format dirs have a different
- * empty size. ick. FIXME, is this right? */
+ /*
+ * return EMPTY_DIR_SIZE; We delete empty directories only.
+ * we can't use EMPTY_DIR_SIZE, as old format dirs have a
+ * different empty size. ick. FIXME, is this right?
+ */
return del_size;
}
@@ -1169,7 +1222,8 @@ char head2type(struct item_head *ih)
}
#endif
-/* Delete object item.
+/*
+ * Delete object item.
* th - active transaction handle
* path - path to the deleted item
* item_key - key to search for the deleted item
@@ -1212,7 +1266,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
- copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
+ copy_item_head(&s_ih, tp_item_head(path));
s_del_balance.insert_size[0] = del_size;
ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
@@ -1221,7 +1275,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
PROC_INFO_INC(sb, delete_item_restarted);
- // file system changed, repeat search
+ /* file system changed, repeat search */
ret_value =
search_for_position_by_key(sb, item_key, path);
if (ret_value == IO_ERROR)
@@ -1238,16 +1292,18 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
unfix_nodes(&s_del_balance);
return 0;
}
- // reiserfs_delete_item returns item length when success
+
+ /* reiserfs_delete_item returns item length when success */
ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
- q_ih = get_ih(path);
+ q_ih = tp_item_head(path);
quota_cut_bytes = ih_item_len(q_ih);
- /* hack so the quota code doesn't have to guess if the file
- ** has a tail. On tail insert, we allocate quota for 1 unformatted node.
- ** We test the offset because the tail might have been
- ** split into multiple items, and we only want to decrement for
- ** the unfm node once
+ /*
+ * hack so the quota code doesn't have to guess if the file has a
+ * tail. On tail insert, we allocate quota for 1 unformatted node.
+ * We test the offset because the tail might have been
+ * split into multiple items, and we only want to decrement for
+ * the unfm node once
*/
if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
@@ -1261,33 +1317,38 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
int off;
char *data;
- /* We are in direct2indirect conversion, so move tail contents
- to the unformatted node */
- /* note, we do the copy before preparing the buffer because we
- ** don't care about the contents of the unformatted node yet.
- ** the only thing we really care about is the direct item's data
- ** is in the unformatted node.
- **
- ** Otherwise, we would have to call reiserfs_prepare_for_journal on
- ** the unformatted node, which might schedule, meaning we'd have to
- ** loop all the way back up to the start of the while loop.
- **
- ** The unformatted node must be dirtied later on. We can't be
- ** sure here if the entire tail has been deleted yet.
- **
- ** un_bh is from the page cache (all unformatted nodes are
- ** from the page cache) and might be a highmem page. So, we
- ** can't use un_bh->b_data.
- ** -clm
+ /*
+ * We are in direct2indirect conversion, so move tail contents
+ * to the unformatted node
+ */
+ /*
+ * note, we do the copy before preparing the buffer because we
+ * don't care about the contents of the unformatted node yet.
+ * the only thing we really care about is the direct item's
+ * data is in the unformatted node.
+ *
+ * Otherwise, we would have to call
+ * reiserfs_prepare_for_journal on the unformatted node,
+ * which might schedule, meaning we'd have to loop all the
+ * way back up to the start of the while loop.
+ *
+ * The unformatted node must be dirtied later on. We can't be
+ * sure here if the entire tail has been deleted yet.
+ *
+ * un_bh is from the page cache (all unformatted nodes are
+ * from the page cache) and might be a highmem page. So, we
+ * can't use un_bh->b_data.
+ * -clm
*/
data = kmap_atomic(un_bh->b_page);
off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
memcpy(data + off,
- B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih),
+ ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
ret_value);
kunmap_atomic(data);
}
+
/* Perform balancing after all resources have been collected at once. */
do_balance(&s_del_balance, NULL, NULL, M_DELETE);
@@ -1304,20 +1365,21 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
return ret_value;
}
-/* Summary Of Mechanisms For Handling Collisions Between Processes:
-
- deletion of the body of the object is performed by iput(), with the
- result that if multiple processes are operating on a file, the
- deletion of the body of the file is deferred until the last process
- that has an open inode performs its iput().
-
- writes and truncates are protected from collisions by use of
- semaphores.
-
- creates, linking, and mknod are protected from collisions with other
- processes by making the reiserfs_add_entry() the last step in the
- creation, and then rolling back all changes if there was a collision.
- - Hans
+/*
+ * Summary Of Mechanisms For Handling Collisions Between Processes:
+ *
+ * deletion of the body of the object is performed by iput(), with the
+ * result that if multiple processes are operating on a file, the
+ * deletion of the body of the file is deferred until the last process
+ * that has an open inode performs its iput().
+ *
+ * writes and truncates are protected from collisions by use of
+ * semaphores.
+ *
+ * creates, linking, and mknod are protected from collisions with other
+ * processes by making the reiserfs_add_entry() the last step in the
+ * creation, and then rolling back all changes if there was a collision.
+ * - Hans
*/
/* this deletes item which never gets split */
@@ -1347,7 +1409,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
}
if (retval != ITEM_FOUND) {
pathrelse(&path);
- // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir
+ /*
+ * No need for a warning, if there is just no free
+ * space to insert '..' item into the
+ * newly-created subdir
+ */
if (!
((unsigned long long)
GET_HASH_VALUE(le_key_k_offset
@@ -1362,11 +1428,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
}
if (!tb_init) {
tb_init = 1;
- item_len = ih_item_len(PATH_PITEM_HEAD(&path));
+ item_len = ih_item_len(tp_item_head(&path));
init_tb_struct(th, &tb, th->t_super, &path,
-(IH_SIZE + item_len));
}
- quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path));
+ quota_cut_bytes = ih_item_len(tp_item_head(&path));
retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
if (retval == REPEAT_SEARCH) {
@@ -1376,7 +1442,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
if (retval == CARRY_ON) {
do_balance(&tb, NULL, NULL, M_DELETE);
- if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */
+ /*
+ * Should we count quota for item? (we don't
+ * count quotas for save-links)
+ */
+ if (inode) {
int depth;
#ifdef REISERQUOTA_DEBUG
reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
@@ -1391,7 +1461,8 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
}
break;
}
- // IO_ERROR, NO_DISK_SPACE, etc
+
+ /* IO_ERROR, NO_DISK_SPACE, etc */
reiserfs_warning(th->t_super, "vs-5360",
"could not delete %K due to fix_nodes failure",
&cpu_key);
@@ -1447,11 +1518,13 @@ static void unmap_buffers(struct page *page, loff_t pos)
do {
next = bh->b_this_page;
- /* we want to unmap the buffers that contain the tail, and
- ** all the buffers after it (since the tail must be at the
- ** end of the file). We don't want to unmap file data
- ** before the tail, since it might be dirty and waiting to
- ** reach disk
+ /*
+ * we want to unmap the buffers that contain
+ * the tail, and all the buffers after it
+ * (since the tail must be at the end of the
+ * file). We don't want to unmap file data
+ * before the tail, since it might be dirty
+ * and waiting to reach disk
*/
cur_index += bh->b_size;
if (cur_index > tail_index) {
@@ -1476,9 +1549,10 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
BUG_ON(!th->t_trans_id);
BUG_ON(new_file_size != inode->i_size);
- /* the page being sent in could be NULL if there was an i/o error
- ** reading in the last block. The user will hit problems trying to
- ** read the file, but for now we just skip the indirect2direct
+ /*
+ * the page being sent in could be NULL if there was an i/o error
+ * reading in the last block. The user will hit problems trying to
+ * read the file, but for now we just skip the indirect2direct
*/
if (atomic_read(&inode->i_count) > 1 ||
!tail_has_to_be_packed(inode) ||
@@ -1490,17 +1564,18 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
pathrelse(path);
return cut_bytes;
}
+
/* Perform the conversion to a direct_item. */
- /* return indirect_to_direct(inode, path, item_key,
- new_file_size, mode); */
return indirect2direct(th, inode, page, path, item_key,
new_file_size, mode);
}
-/* we did indirect_to_direct conversion. And we have inserted direct
- item successesfully, but there were no disk space to cut unfm
- pointer being converted. Therefore we have to delete inserted
- direct item(s) */
+/*
+ * we did indirect_to_direct conversion. And we have inserted direct
+ * item successfully, but there was no disk space to cut unfm
+ * pointer being converted. Therefore we have to delete inserted
+ * direct item(s)
+ */
static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
struct inode *inode, struct treepath *path)
{
@@ -1509,7 +1584,7 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
int removed;
BUG_ON(!th->t_trans_id);
- make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); // !!!!
+ make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);
tail_key.key_length = 4;
tail_len =
@@ -1521,7 +1596,7 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
reiserfs_panic(inode->i_sb, "vs-5615",
"found invalid item");
RFALSE(path->pos_in_item !=
- ih_item_len(PATH_PITEM_HEAD(path)) - 1,
+ ih_item_len(tp_item_head(path)) - 1,
"vs-5616: appended bytes found");
PATH_LAST_POSITION(path)--;
@@ -1539,7 +1614,6 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
"conversion has been rolled back due to "
"lack of disk space");
- //mark_file_without_tail (inode);
mark_inode_dirty(inode);
}
@@ -1551,15 +1625,18 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
struct page *page, loff_t new_file_size)
{
struct super_block *sb = inode->i_sb;
- /* Every function which is going to call do_balance must first
- create a tree_balance structure. Then it must fill up this
- structure by using the init_tb_struct and fix_nodes functions.
- After that we can make tree balancing. */
+ /*
+ * Every function which is going to call do_balance must first
+ * create a tree_balance structure. Then it must fill up this
+ * structure by using the init_tb_struct and fix_nodes functions.
+ * After that we can make tree balancing.
+ */
struct tree_balance s_cut_balance;
struct item_head *p_le_ih;
- int cut_size = 0, /* Amount to be cut. */
- ret_value = CARRY_ON, removed = 0, /* Number of the removed unformatted nodes. */
- is_inode_locked = 0;
+ int cut_size = 0; /* Amount to be cut. */
+ int ret_value = CARRY_ON;
+ int removed = 0; /* Number of the removed unformatted nodes. */
+ int is_inode_locked = 0;
char mode; /* Mode of the balance. */
int retval2 = -1;
int quota_cut_bytes;
@@ -1571,21 +1648,27 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
cut_size);
- /* Repeat this loop until we either cut the item without needing
- to balance, or we fix_nodes without schedule occurring */
+ /*
+ * Repeat this loop until we either cut the item without needing
+ * to balance, or we fix_nodes without schedule occurring
+ */
while (1) {
- /* Determine the balance mode, position of the first byte to
- be cut, and size to be cut. In case of the indirect item
- free unformatted nodes which are pointed to by the cut
- pointers. */
+ /*
+ * Determine the balance mode, position of the first byte to
+ * be cut, and size to be cut. In case of the indirect item
+ * free unformatted nodes which are pointed to by the cut
+ * pointers.
+ */
mode =
prepare_for_delete_or_cut(th, inode, path,
item_key, &removed,
&cut_size, new_file_size);
if (mode == M_CONVERT) {
- /* convert last unformatted node to direct item or leave
- tail in the unformatted node */
+ /*
+ * convert last unformatted node to direct item or
+ * leave tail in the unformatted node
+ */
RFALSE(ret_value != CARRY_ON,
"PAP-5570: can not convert twice");
@@ -1599,15 +1682,20 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
is_inode_locked = 1;
- /* removing of last unformatted node will change value we
- have to return to truncate. Save it */
+ /*
+ * removing of last unformatted node will
+ * change value we have to return to truncate.
+ * Save it
+ */
retval2 = ret_value;
- /*retval2 = sb->s_blocksize - (new_file_size & (sb->s_blocksize - 1)); */
- /* So, we have performed the first part of the conversion:
- inserting the new direct item. Now we are removing the
- last unformatted node pointer. Set key to search for
- it. */
+ /*
+ * So, we have performed the first part of the
+ * conversion:
+ * inserting the new direct item. Now we are
+ * removing the last unformatted node pointer.
+ * Set key to search for it.
+ */
set_cpu_key_k_type(item_key, TYPE_INDIRECT);
item_key->key_length = 4;
new_file_size -=
@@ -1650,11 +1738,13 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
} /* while */
- // check fix_nodes results (IO_ERROR or NO_DISK_SPACE)
+ /* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */
if (ret_value != CARRY_ON) {
if (is_inode_locked) {
- // FIXME: this seems to be not needed: we are always able
- // to cut item
+ /*
+ * FIXME: this seems to be not needed: we are always
+ * able to cut item
+ */
indirect_to_direct_roll_back(th, inode, path);
}
if (ret_value == NO_DISK_SPACE)
@@ -1671,22 +1761,23 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
/* Calculate number of bytes that need to be cut from the item. */
quota_cut_bytes =
(mode ==
- M_DELETE) ? ih_item_len(get_ih(path)) : -s_cut_balance.
+ M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance.
insert_size[0];
if (retval2 == -1)
ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
else
ret_value = retval2;
- /* For direct items, we only change the quota when deleting the last
- ** item.
+ /*
+ * For direct items, we only change the quota when deleting the last
+ * item.
*/
- p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path);
+ p_le_ih = tp_item_head(s_cut_balance.tb_path);
if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
if (mode == M_DELETE &&
(le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
1) {
- // FIXME: this is to keep 3.5 happy
+ /* FIXME: this is to keep 3.5 happy */
REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
} else {
@@ -1696,10 +1787,12 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
#ifdef CONFIG_REISERFS_CHECK
if (is_inode_locked) {
struct item_head *le_ih =
- PATH_PITEM_HEAD(s_cut_balance.tb_path);
- /* we are going to complete indirect2direct conversion. Make
- sure, that we exactly remove last unformatted node pointer
- of the item */
+ tp_item_head(s_cut_balance.tb_path);
+ /*
+ * we are going to complete indirect2direct conversion. Make
+ * sure, that we exactly remove last unformatted node pointer
+ * of the item
+ */
if (!is_indirect_le_ih(le_ih))
reiserfs_panic(sb, "vs-5652",
"item must be indirect %h", le_ih);
@@ -1717,17 +1810,20 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
"(CUT, insert_size==%d)",
le_ih, s_cut_balance.insert_size[0]);
}
- /* it would be useful to make sure, that right neighboring
- item is direct item of this file */
+ /*
+ * it would be useful to make sure, that right neighboring
+ * item is direct item of this file
+ */
}
#endif
do_balance(&s_cut_balance, NULL, NULL, mode);
if (is_inode_locked) {
- /* we've done an indirect->direct conversion. when the data block
- ** was freed, it was removed from the list of blocks that must
- ** be flushed before the transaction commits, make sure to
- ** unmap and invalidate it
+ /*
+ * we've done an indirect->direct conversion. when the
+ * data block was freed, it was removed from the list of
+ * blocks that must be flushed before the transaction
+ * commits, make sure to unmap and invalidate it
*/
unmap_buffers(page, tail_pos);
REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
@@ -1758,20 +1854,25 @@ static void truncate_directory(struct reiserfs_transaction_handle *th,
set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
}
-/* Truncate file to the new size. Note, this must be called with a transaction
- already started */
+/*
+ * Truncate file to the new size. Note, this must be called with a
+ * transaction already started
+ */
int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
- struct inode *inode, /* ->i_size contains new size */
+ struct inode *inode, /* ->i_size contains new size */
struct page *page, /* up to date for last block */
- int update_timestamps /* when it is called by
- file_release to convert
- the tail - no timestamps
- should be updated */
+ /*
+ * when it is called by file_release to convert
+ * the tail - no timestamps should be updated
+ */
+ int update_timestamps
)
{
INITIALIZE_PATH(s_search_path); /* Path to the current object item. */
struct item_head *p_le_ih; /* Pointer to an item header. */
- struct cpu_key s_item_key; /* Key to search for a previous file item. */
+
+ /* Key to search for a previous file item. */
+ struct cpu_key s_item_key;
loff_t file_size, /* Old file size. */
new_file_size; /* New file size. */
int deleted; /* Number of deleted or truncated bytes. */
@@ -1784,8 +1885,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
|| S_ISLNK(inode->i_mode)))
return 0;
+ /* deletion of directory - no need to update timestamps */
if (S_ISDIR(inode->i_mode)) {
- // deletion of directory - no need to update timestamps
truncate_directory(th, inode);
return 0;
}
@@ -1793,7 +1894,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
/* Get new file size. */
new_file_size = inode->i_size;
- // FIXME: note, that key type is unimportant here
+ /* FIXME: note, that key type is unimportant here */
make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
TYPE_DIRECT, 3);
@@ -1819,7 +1920,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
s_search_path.pos_in_item--;
/* Get real file size (total length of all file items) */
- p_le_ih = PATH_PITEM_HEAD(&s_search_path);
+ p_le_ih = tp_item_head(&s_search_path);
if (is_statdata_le_ih(p_le_ih))
file_size = 0;
else {
@@ -1827,9 +1928,11 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
int bytes =
op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
- /* this may mismatch with real file size: if last direct item
- had no padding zeros and last unformatted node had no free
- space, this file would have this file size */
+ /*
+ * this may mismatch with real file size: if last direct item
+ * had no padding zeros and last unformatted node had no free
+ * space, this file would have this file size
+ */
file_size = offset + bytes - 1;
}
/*
@@ -1867,18 +1970,20 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
set_cpu_key_k_offset(&s_item_key, file_size);
- /* While there are bytes to truncate and previous file item is presented in the tree. */
+ /*
+ * While there are bytes to truncate and previous
+ * file item is present in the tree.
+ */
/*
- ** This loop could take a really long time, and could log
- ** many more blocks than a transaction can hold. So, we do a polite
- ** journal end here, and if the transaction needs ending, we make
- ** sure the file is consistent before ending the current trans
- ** and starting a new one
+ * This loop could take a really long time, and could log
+ * many more blocks than a transaction can hold. So, we do
+ * a polite journal end here, and if the transaction needs
+ * ending, we make sure the file is consistent before ending
+ * the current trans and starting a new one
*/
if (journal_transaction_should_end(th, 0) ||
reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
- int orig_len_alloc = th->t_blocks_allocated;
pathrelse(&s_search_path);
if (update_timestamps) {
@@ -1887,7 +1992,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
}
reiserfs_update_sd(th, inode);
- err = journal_end(th, inode->i_sb, orig_len_alloc);
+ err = journal_end(th);
if (err)
goto out;
err = journal_begin(th, inode->i_sb,
@@ -1904,25 +2009,25 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
"PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d",
new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
- update_and_out:
+update_and_out:
if (update_timestamps) {
- // this is truncate, not file closing
+ /* this is truncate, not file closing */
inode->i_mtime = CURRENT_TIME_SEC;
inode->i_ctime = CURRENT_TIME_SEC;
}
reiserfs_update_sd(th, inode);
- out:
+out:
pathrelse(&s_search_path);
return err;
}
#ifdef CONFIG_REISERFS_CHECK
-// this makes sure, that we __append__, not overwrite or add holes
+/* this makes sure, that we __append__, not overwrite or add holes */
static void check_research_for_paste(struct treepath *path,
const struct cpu_key *key)
{
- struct item_head *found_ih = get_ih(path);
+ struct item_head *found_ih = tp_item_head(path);
if (is_direct_le_ih(found_ih)) {
if (le_ih_k_offset(found_ih) +
@@ -1952,13 +2057,22 @@ static void check_research_for_paste(struct treepath *path,
}
#endif /* config reiserfs check */
-/* Paste bytes to the existing item. Returns bytes number pasted into the item. */
-int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *search_path, /* Path to the pasted item. */
- const struct cpu_key *key, /* Key to search for the needed item. */
- struct inode *inode, /* Inode item belongs to */
- const char *body, /* Pointer to the bytes to paste. */
+/*
+ * Paste bytes to the existing item.
+ * Returns bytes number pasted into the item.
+ */
+int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
+ /* Path to the pasted item. */
+ struct treepath *search_path,
+ /* Key to search for the needed item. */
+ const struct cpu_key *key,
+ /* Inode item belongs to */
+ struct inode *inode,
+ /* Pointer to the bytes to paste. */
+ const char *body,
+ /* Size of pasted bytes. */
int pasted_size)
-{ /* Size of pasted bytes. */
+{
struct super_block *sb = inode->i_sb;
struct tree_balance s_paste_balance;
int retval;
@@ -1973,7 +2087,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
"reiserquota paste_into_item(): allocating %u id=%u type=%c",
pasted_size, inode->i_uid,
- key2type(&(key->on_disk_key)));
+ key2type(&key->on_disk_key));
#endif
depth = reiserfs_write_unlock_nested(sb);
@@ -1997,7 +2111,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
while ((retval =
fix_nodes(M_PASTE, &s_paste_balance, NULL,
body)) == REPEAT_SEARCH) {
- search_again:
+search_again:
/* file system changed while we were in the fix_nodes */
PROC_INFO_INC(th->t_super, paste_into_item_restarted);
retval =
@@ -2019,21 +2133,23 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
#endif
}
- /* Perform balancing after all resources are collected by fix_nodes, and
- accessing them will not risk triggering schedule. */
+ /*
+ * Perform balancing after all resources are collected by fix_nodes,
+ * and accessing them will not risk triggering schedule.
+ */
if (retval == CARRY_ON) {
do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
return 0;
}
retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
- error_out:
+error_out:
/* this also releases the path */
unfix_nodes(&s_paste_balance);
#ifdef REISERQUOTA_DEBUG
reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
"reiserquota paste_into_item(): freeing %u id=%u type=%c",
pasted_size, inode->i_uid,
- key2type(&(key->on_disk_key)));
+ key2type(&key->on_disk_key));
#endif
depth = reiserfs_write_unlock_nested(sb);
dquot_free_space_nodirty(inode, pasted_size);
@@ -2041,7 +2157,8 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
return retval;
}
-/* Insert new item into the buffer at the path.
+/*
+ * Insert new item into the buffer at the path.
* th - active transaction handle
* path - path to the inserted item
* ih - pointer to the item header to insert
@@ -2064,8 +2181,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
fs_gen = get_generation(inode->i_sb);
quota_bytes = ih_item_len(ih);
- /* hack so the quota code doesn't have to guess if the file has
- ** a tail, links are always tails, so there's no guessing needed
+ /*
+ * hack so the quota code doesn't have to guess
+ * if the file has a tail, links are always tails,
+ * so there's no guessing needed
*/
if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
@@ -2074,8 +2193,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
"reiserquota insert_item(): allocating %u id=%u type=%c",
quota_bytes, inode->i_uid, head2type(ih));
#endif
- /* We can't dirty inode here. It would be immediately written but
- * appropriate stat item isn't inserted yet... */
+ /*
+ * We can't dirty inode here. It would be immediately
+ * written but appropriate stat item isn't inserted yet...
+ */
depth = reiserfs_write_unlock_nested(inode->i_sb);
retval = dquot_alloc_space_nodirty(inode, quota_bytes);
reiserfs_write_lock_nested(inode->i_sb, depth);
@@ -2089,7 +2210,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
s_ins_balance.key = key->on_disk_key;
#endif
- /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */
+ /*
+ * DQUOT_* can schedule, must check to be sure calling
+ * fix_nodes is safe
+ */
if (inode && fs_changed(fs_gen, inode->i_sb)) {
goto search_again;
}
@@ -2097,7 +2221,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
while ((retval =
fix_nodes(M_INSERT, &s_ins_balance, ih,
body)) == REPEAT_SEARCH) {
- search_again:
+search_again:
/* file system changed while we were in the fix_nodes */
PROC_INFO_INC(th->t_super, insert_item_restarted);
retval = search_item(th->t_super, key, path);
@@ -2121,7 +2245,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
}
retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
- error_out:
+error_out:
/* also releases the path */
unfix_nodes(&s_ins_balance);
#ifdef REISERQUOTA_DEBUG
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 9fb20426005e..a392cef6acc6 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -74,7 +74,7 @@ static int reiserfs_sync_fs(struct super_block *s, int wait)
dquot_writeback_dquots(s, -1);
reiserfs_write_lock(s);
if (!journal_begin(&th, s, 1))
- if (!journal_end_sync(&th, s, 1))
+ if (!journal_end_sync(&th))
reiserfs_flush_old_commits(s);
reiserfs_write_unlock(s);
return 0;
@@ -136,9 +136,9 @@ static int reiserfs_freeze(struct super_block *s)
} else {
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
1);
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
reiserfs_block_writes(&th);
- journal_end_sync(&th, s, 1);
+ journal_end_sync(&th);
}
}
reiserfs_write_unlock(s);
@@ -153,13 +153,15 @@ static int reiserfs_unfreeze(struct super_block *s)
extern const struct in_core_key MAX_IN_CORE_KEY;
-/* this is used to delete "save link" when there are no items of a
- file it points to. It can either happen if unlink is completed but
- "save unlink" removal, or if file has both unlink and truncate
- pending and as unlink completes first (because key of "save link"
- protecting unlink is bigger that a key lf "save link" which
- protects truncate), so there left no items to make truncate
- completion on */
+/*
+ * this is used to delete "save link" when there are no items of a
+ * file it points to. It can either happen if unlink is completed but
+ * "save link" removal is not, or if file has both unlink and truncate
+ * pending and unlink completes first (because key of "save link"
+ * protecting unlink is bigger than a key of "save link" which
+ * protects truncate), so there are no items left to make truncate
+ * completion on
+ */
static int remove_save_link_only(struct super_block *s,
struct reiserfs_key *key, int oid_free)
{
@@ -176,7 +178,7 @@ static int remove_save_link_only(struct super_block *s,
/* removals are protected by direct items */
reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid));
- return journal_end(&th, s, JOURNAL_PER_BALANCE_CNT);
+ return journal_end(&th);
}
#ifdef CONFIG_QUOTA
@@ -258,7 +260,7 @@ static int finish_unfinished(struct super_block *s)
break;
}
item_pos--;
- ih = B_N_PITEM_HEAD(bh, item_pos);
+ ih = item_head(bh, item_pos);
if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
/* there are no "save" links anymore */
@@ -271,7 +273,7 @@ static int finish_unfinished(struct super_block *s)
truncate = 0;
/* reiserfs_iget needs k_dirid and k_objectid only */
- item = B_I_PITEM(bh, ih);
+ item = ih_item_body(bh, ih);
obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item);
obj_key.on_disk_key.k_objectid =
le32_to_cpu(ih->ih_key.k_objectid);
@@ -282,8 +284,10 @@ static int finish_unfinished(struct super_block *s)
inode = reiserfs_iget(s, &obj_key);
if (!inode) {
- /* the unlink almost completed, it just did not manage to remove
- "save" link and release objectid */
+ /*
+ * the unlink almost completed, it just did not
+ * manage to remove "save" link and release objectid
+ */
reiserfs_warning(s, "vs-2180", "iget failed for %K",
&obj_key);
retval = remove_save_link_only(s, &save_link_key, 1);
@@ -303,10 +307,13 @@ static int finish_unfinished(struct super_block *s)
reiserfs_write_lock_nested(inode->i_sb, depth);
if (truncate && S_ISDIR(inode->i_mode)) {
- /* We got a truncate request for a dir which is impossible.
- The only imaginable way is to execute unfinished truncate request
- then boot into old kernel, remove the file and create dir with
- the same key. */
+ /*
+ * We got a truncate request for a dir which
+ * is impossible. The only imaginable way is to
+ * execute unfinished truncate request then boot
+ * into old kernel, remove the file and create dir
+ * with the same key.
+ */
reiserfs_warning(s, "green-2101",
"impossible truncate on a "
"directory %k. Please report",
@@ -320,14 +327,16 @@ static int finish_unfinished(struct super_block *s)
if (truncate) {
REISERFS_I(inode)->i_flags |=
i_link_saved_truncate_mask;
- /* not completed truncate found. New size was committed together
- with "save" link */
+ /*
+ * not completed truncate found. New size was
+ * committed together with "save" link
+ */
reiserfs_info(s, "Truncating %k to %Ld ..",
INODE_PKEY(inode), inode->i_size);
- reiserfs_truncate_file(inode,
- 0
- /*don't update modification time */
- );
+
+ /* don't update modification time */
+ reiserfs_truncate_file(inode, 0);
+
retval = remove_save_link(inode, truncate);
} else {
REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
@@ -373,10 +382,12 @@ static int finish_unfinished(struct super_block *s)
return retval;
}
-/* to protect file being unlinked from getting lost we "safe" link files
- being unlinked. This link will be deleted in the same transaction with last
- item of file. mounting the filesystem we scan all these links and remove
- files which almost got lost */
+/*
+ * to protect a file being unlinked from getting lost we "save" link files
+ * being unlinked. This link will be deleted in the same transaction as the
+ * last item of the file. On mount we scan all these links and remove
+ * files which almost got lost
+ */
void add_save_link(struct reiserfs_transaction_handle *th,
struct inode *inode, int truncate)
{
@@ -495,7 +506,7 @@ int remove_save_link(struct inode *inode, int truncate)
} else
REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask;
- return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
+ return journal_end(&th);
}
static void reiserfs_kill_sb(struct super_block *s)
@@ -530,19 +541,23 @@ static void reiserfs_put_super(struct super_block *s)
reiserfs_write_lock(s);
- /* change file system state to current state if it was mounted with read-write permissions */
+ /*
+ * change file system state to current state if it was mounted
+ * with read-write permissions
+ */
if (!(s->s_flags & MS_RDONLY)) {
if (!journal_begin(&th, s, 10)) {
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
1);
set_sb_umount_state(SB_DISK_SUPER_BLOCK(s),
REISERFS_SB(s)->s_mount_state);
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
}
}
- /* note, journal_release checks for readonly mount, and can decide not
- ** to do a journal_end
+ /*
+ * note, journal_release checks for readonly mount, and can
+ * decide not to do a journal_end
*/
journal_release(&th, s);
@@ -559,6 +574,7 @@ static void reiserfs_put_super(struct super_block *s)
reiserfs_write_unlock(s);
mutex_destroy(&REISERFS_SB(s)->lock);
+ destroy_workqueue(REISERFS_SB(s)->commit_wq);
kfree(s->s_fs_info);
s->s_fs_info = NULL;
}
@@ -634,15 +650,16 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags)
}
reiserfs_write_lock(inode->i_sb);
- /* this is really only used for atime updates, so they don't have
- ** to be included in O_SYNC or fsync
+ /*
+ * this is really only used for atime updates, so they don't have
+ * to be included in O_SYNC or fsync
*/
err = journal_begin(&th, inode->i_sb, 1);
if (err)
goto out;
reiserfs_update_sd(&th, inode);
- journal_end(&th, inode->i_sb, 1);
+ journal_end(&th);
out:
reiserfs_write_unlock(inode->i_sb);
@@ -788,31 +805,53 @@ static const struct export_operations reiserfs_export_ops = {
.get_parent = reiserfs_get_parent,
};
-/* this struct is used in reiserfs_getopt () for containing the value for those
- mount options that have values rather than being toggles. */
+/*
+ * this struct is used in reiserfs_getopt () for containing the value for
+ * those mount options that have values rather than being toggles.
+ */
typedef struct {
char *value;
- int setmask; /* bitmask which is to set on mount_options bitmask when this
- value is found, 0 is no bits are to be changed. */
- int clrmask; /* bitmask which is to clear on mount_options bitmask when this
- value is found, 0 is no bits are to be changed. This is
- applied BEFORE setmask */
+ /*
+ * bitmask which is to be set in the mount_options bitmask
+ * when this value is found, 0 if no bits are to be changed.
+ */
+ int setmask;
+ /*
+ * bitmask which is to be cleared in the mount_options bitmask
+ * when this value is found, 0 if no bits are to be changed.
+ * This is applied BEFORE setmask
+ */
+ int clrmask;
} arg_desc_t;
/* Set this bit in arg_required to allow empty arguments */
#define REISERFS_OPT_ALLOWEMPTY 31
-/* this struct is used in reiserfs_getopt() for describing the set of reiserfs
- mount options */
+/*
+ * this struct is used in reiserfs_getopt() for describing the
+ * set of reiserfs mount options
+ */
typedef struct {
char *option_name;
- int arg_required; /* 0 if argument is not required, not 0 otherwise */
- const arg_desc_t *values; /* list of values accepted by an option */
- int setmask; /* bitmask which is to set on mount_options bitmask when this
- value is found, 0 is no bits are to be changed. */
- int clrmask; /* bitmask which is to clear on mount_options bitmask when this
- value is found, 0 is no bits are to be changed. This is
- applied BEFORE setmask */
+
+ /* 0 if argument is not required, not 0 otherwise */
+ int arg_required;
+
+ /* list of values accepted by an option */
+ const arg_desc_t *values;
+
+ /*
+ * bitmask which is to be set in the mount_options bitmask
+ * when this value is found, 0 if no bits are to be changed.
+ */
+ int setmask;
+
+ /*
+ * bitmask which is to be cleared in the mount_options bitmask
+ * when this value is found, 0 if no bits are to be changed.
+ * This is applied BEFORE setmask
+ */
+ int clrmask;
} opt_desc_t;
/* possible values for -o data= */
@@ -833,8 +872,10 @@ static const arg_desc_t barrier_mode[] = {
{.value = NULL}
};
-/* possible values for "-o block-allocator=" and bits which are to be set in
- s_mount_opt of reiserfs specific part of in-core super block */
+/*
+ * possible values for "-o block-allocator=" and bits which are to be set in
+ * s_mount_opt of reiserfs specific part of in-core super block
+ */
static const arg_desc_t balloc[] = {
{"noborder", 1 << REISERFS_NO_BORDER, 0},
{"border", 0, 1 << REISERFS_NO_BORDER},
@@ -864,21 +905,25 @@ static const arg_desc_t error_actions[] = {
{NULL, 0, 0},
};
-/* proceed only one option from a list *cur - string containing of mount options
- opts - array of options which are accepted
- opt_arg - if option is found and requires an argument and if it is specifed
- in the input - pointer to the argument is stored here
- bit_flags - if option requires to set a certain bit - it is set here
- return -1 if unknown option is found, opt->arg_required otherwise */
+/*
+ * process only one option from a list
+ * *cur - string containing mount options
+ * opts - array of options which are accepted
+ * opt_arg - if option is found and requires an argument and if it is specified
+ * in the input - pointer to the argument is stored here
+ * bit_flags - if option requires a certain bit to be set - it is set here
+ * return -1 if unknown option is found, opt->arg_required otherwise
+ */
static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
char **opt_arg, unsigned long *bit_flags)
{
char *p;
- /* foo=bar,
- ^ ^ ^
- | | +-- option_end
- | +-- arg_start
- +-- option_start
+ /*
+ * foo=bar,
+ * ^ ^ ^
+ * | | +-- option_end
+ * | +-- arg_start
+ * +-- option_start
*/
const opt_desc_t *opt;
const arg_desc_t *arg;
@@ -893,9 +938,12 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
}
if (!strncmp(p, "alloc=", 6)) {
- /* Ugly special case, probably we should redo options parser so that
- it can understand several arguments for some options, also so that
- it can fill several bitfields with option values. */
+ /*
+ * Ugly special case, probably we should redo options
+ * parser so that it can understand several arguments for
+ * some options, also so that it can fill several bitfields
+ * with option values.
+ */
if (reiserfs_parse_alloc_options(s, p + 6)) {
return -1;
} else {
@@ -958,7 +1006,10 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
return -1;
}
- /* move to the argument, or to next option if argument is not required */
+ /*
+ * move to the argument, or to next option if argument is not
+ * required
+ */
p++;
if (opt->arg_required
@@ -995,12 +1046,20 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
}
/* returns 0 if something is wrong in option string, 1 - otherwise */
-static int reiserfs_parse_options(struct super_block *s, char *options, /* string given via mount's -o */
+static int reiserfs_parse_options(struct super_block *s,
+
+ /* string given via mount's -o */
+ char *options,
+
+ /*
+ * after the parsing phase, contains the
+ * collection of bitflags defining what
+ * mount options were selected.
+ */
unsigned long *mount_options,
- /* after the parsing phase, contains the
- collection of bitflags defining what
- mount options were selected. */
- unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */
+
+ /* strtol-ed from NNN of resize=NNN */
+ unsigned long *blocks,
char **jdev_name,
unsigned int *commit_max_age,
char **qf_names,
@@ -1010,7 +1069,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
char *arg = NULL;
char *pos;
opt_desc_t opts[] = {
- /* Compatibility stuff, so that -o notail for old setups still work */
+ /*
+ * Compatibility stuff, so that -o notail for old
+ * setups still work
+ */
{"tails",.arg_required = 't',.values = tails},
{"notail",.clrmask =
(1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
@@ -1055,8 +1117,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
*blocks = 0;
if (!options || !*options)
- /* use default configuration: create tails, journaling on, no
- conversion to newest format */
+ /*
+ * use default configuration: create tails, journaling on, no
+ * conversion to newest format
+ */
return 1;
for (pos = options; pos;) {
@@ -1109,7 +1173,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
if (c == 'j') {
if (arg && *arg && jdev_name) {
- if (*jdev_name) { //Hm, already assigned?
+ /* Hm, already assigned? */
+ if (*jdev_name) {
reiserfs_warning(s, "super-6510",
"journal device was "
"already specified to "
@@ -1362,8 +1427,10 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
safe_mask |= 1 << REISERFS_USRQUOTA;
safe_mask |= 1 << REISERFS_GRPQUOTA;
- /* Update the bitmask, taking care to keep
- * the bits we're not allowed to change here */
+ /*
+ * Update the bitmask, taking care to keep
+ * the bits we're not allowed to change here
+ */
REISERFS_SB(s)->s_mount_opt =
(REISERFS_SB(s)->
s_mount_opt & ~safe_mask) | (mount_options & safe_mask);
@@ -1410,7 +1477,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
/* Mounting a rw partition read-only. */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state);
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
} else {
/* remount read-write */
if (!(s->s_flags & MS_RDONLY)) {
@@ -1427,7 +1494,9 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
handle_data_mode(s, mount_options);
handle_barrier_mode(s, mount_options);
REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
- s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */
+
+ /* now it is safe to call journal_begin */
+ s->s_flags &= ~MS_RDONLY;
err = journal_begin(&th, s, 10);
if (err)
goto out_err_unlock;
@@ -1440,12 +1509,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
if (!old_format_only(s))
set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
/* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
}
/* this will force a full flush of all journal lists */
SB_JOURNAL(s)->j_must_wait = 1;
- err = journal_end(&th, s, 10);
+ err = journal_end(&th);
if (err)
goto out_err_unlock;
@@ -1489,9 +1558,9 @@ static int read_super_block(struct super_block *s, int offset)
brelse(bh);
return 1;
}
- //
- // ok, reiserfs signature (old or new) found in at the given offset
- //
+ /*
+ * ok, reiserfs signature (old or new) found at the given offset
+ */
fs_blocksize = sb_blocksize(rs);
brelse(bh);
sb_set_blocksize(s, fs_blocksize);
@@ -1529,9 +1598,11 @@ static int read_super_block(struct super_block *s, int offset)
SB_BUFFER_WITH_SB(s) = bh;
SB_DISK_SUPER_BLOCK(s) = rs;
+ /*
+ * magic is of non-standard journal filesystem, look at s_version to
+ * find which format is in use
+ */
if (is_reiserfs_jr(rs)) {
- /* magic is of non-standard journal filesystem, look at s_version to
- find which format is in use */
if (sb_version(rs) == REISERFS_VERSION_2)
reiserfs_info(s, "found reiserfs format \"3.6\""
" with non-standard journal\n");
@@ -1545,8 +1616,10 @@ static int read_super_block(struct super_block *s, int offset)
return 1;
}
} else
- /* s_version of standard format may contain incorrect information,
- so we just look at the magic string */
+ /*
+ * s_version of standard format may contain incorrect
+ * information, so we just look at the magic string
+ */
reiserfs_info(s,
"found reiserfs format \"%s\" with standard journal\n",
is_reiserfs_3_5(rs) ? "3.5" : "3.6");
@@ -1558,8 +1631,9 @@ static int read_super_block(struct super_block *s, int offset)
s->dq_op = &reiserfs_quota_operations;
#endif
- /* new format is limited by the 32 bit wide i_blocks field, want to
- ** be one full block below that.
+ /*
+ * new format is limited by the 32 bit wide i_blocks field, want to
+ * be one full block below that.
*/
s->s_maxbytes = (512LL << 32) - s->s_blocksize;
return 0;
@@ -1568,7 +1642,7 @@ static int read_super_block(struct super_block *s, int offset)
/* after journal replay, reread all bitmap and super blocks */
static int reread_meta_blocks(struct super_block *s)
{
- ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
+ ll_rw_block(READ, 1, &SB_BUFFER_WITH_SB(s));
wait_on_buffer(SB_BUFFER_WITH_SB(s));
if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
reiserfs_warning(s, "reiserfs-2504", "error reading the super");
@@ -1578,14 +1652,15 @@ static int reread_meta_blocks(struct super_block *s)
return 0;
}
-/////////////////////////////////////////////////////
-// hash detection stuff
+/* hash detection stuff */
-// if root directory is empty - we set default - Yura's - hash and
-// warn about it
-// FIXME: we look for only one name in a directory. If tea and yura
-// bith have the same value - we ask user to send report to the
-// mailing list
+/*
+ * if root directory is empty - we set default - Yura's - hash and
+ * warn about it
+ * FIXME: we look for only one name in a directory. If tea and yura
+ * both have the same value - we ask user to send report to the
+ * mailing list
+ */
static __u32 find_hash_out(struct super_block *s)
{
int retval;
@@ -1593,92 +1668,83 @@ static __u32 find_hash_out(struct super_block *s)
struct cpu_key key;
INITIALIZE_PATH(path);
struct reiserfs_dir_entry de;
+ struct reiserfs_de_head *deh;
__u32 hash = DEFAULT_HASH;
+ __u32 deh_hashval, teahash, r5hash, yurahash;
inode = s->s_root->d_inode;
- do { // Some serious "goto"-hater was there ;)
- u32 teahash, r5hash, yurahash;
+ make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
+ retval = search_by_entry_key(s, &key, &path, &de);
+ if (retval == IO_ERROR) {
+ pathrelse(&path);
+ return UNSET_HASH;
+ }
+ if (retval == NAME_NOT_FOUND)
+ de.de_entry_num--;
+
+ set_de_name_and_namelen(&de);
+ deh = de.de_deh + de.de_entry_num;
- make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
- retval = search_by_entry_key(s, &key, &path, &de);
- if (retval == IO_ERROR) {
- pathrelse(&path);
- return UNSET_HASH;
- }
- if (retval == NAME_NOT_FOUND)
- de.de_entry_num--;
- set_de_name_and_namelen(&de);
- if (deh_offset(&(de.de_deh[de.de_entry_num])) == DOT_DOT_OFFSET) {
- /* allow override in this case */
- if (reiserfs_rupasov_hash(s)) {
- hash = YURA_HASH;
- }
- reiserfs_info(s, "FS seems to be empty, autodetect "
- "is using the default hash\n");
- break;
- }
- r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
- teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
- yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
- if (((teahash == r5hash)
- &&
- (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num])))
- == r5hash)) || ((teahash == yurahash)
- && (yurahash ==
- GET_HASH_VALUE(deh_offset
- (&
- (de.
- de_deh[de.
- de_entry_num])))))
- || ((r5hash == yurahash)
- && (yurahash ==
- GET_HASH_VALUE(deh_offset
- (&(de.de_deh[de.de_entry_num])))))) {
- reiserfs_warning(s, "reiserfs-2506", "Unable to "
- "automatically detect hash function. "
- "Please mount with -o "
- "hash={tea,rupasov,r5}");
- hash = UNSET_HASH;
- break;
- }
- if (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) ==
- yurahash)
+ if (deh_offset(deh) == DOT_DOT_OFFSET) {
+ /* allow override in this case */
+ if (reiserfs_rupasov_hash(s))
hash = YURA_HASH;
- else if (GET_HASH_VALUE
- (deh_offset(&(de.de_deh[de.de_entry_num]))) == teahash)
- hash = TEA_HASH;
- else if (GET_HASH_VALUE
- (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash)
- hash = R5_HASH;
- else {
- reiserfs_warning(s, "reiserfs-2506",
- "Unrecognised hash function");
- hash = UNSET_HASH;
- }
- } while (0);
+ reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n");
+ goto out;
+ }
+
+ deh_hashval = GET_HASH_VALUE(deh_offset(deh));
+ r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
+ teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
+ yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
+
+ if ((teahash == r5hash && deh_hashval == r5hash) ||
+ (teahash == yurahash && deh_hashval == yurahash) ||
+ (r5hash == yurahash && deh_hashval == yurahash)) {
+ reiserfs_warning(s, "reiserfs-2506",
+ "Unable to automatically detect hash "
+ "function. Please mount with -o "
+ "hash={tea,rupasov,r5}");
+ hash = UNSET_HASH;
+ goto out;
+ }
+ if (deh_hashval == yurahash)
+ hash = YURA_HASH;
+ else if (deh_hashval == teahash)
+ hash = TEA_HASH;
+ else if (deh_hashval == r5hash)
+ hash = R5_HASH;
+ else {
+ reiserfs_warning(s, "reiserfs-2506",
+ "Unrecognised hash function");
+ hash = UNSET_HASH;
+ }
+out:
pathrelse(&path);
return hash;
}
-// finds out which hash names are sorted with
+/* finds out which hash names are sorted with */
static int what_hash(struct super_block *s)
{
__u32 code;
code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
- /* reiserfs_hash_detect() == true if any of the hash mount options
- ** were used. We must check them to make sure the user isn't
- ** using a bad hash value
+ /*
+ * reiserfs_hash_detect() == true if any of the hash mount options
+ * were used. We must check them to make sure the user isn't
+ * using a bad hash value
*/
if (code == UNSET_HASH || reiserfs_hash_detect(s))
code = find_hash_out(s);
if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
- /* detection has found the hash, and we must check against the
- ** mount options
+ /*
+ * detection has found the hash, and we must check against the
+ * mount options
*/
if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
reiserfs_warning(s, "reiserfs-2507",
@@ -1700,7 +1766,10 @@ static int what_hash(struct super_block *s)
code = UNSET_HASH;
}
} else {
- /* find_hash_out was not called or could not determine the hash */
+ /*
+ * find_hash_out was not called or
+ * could not determine the hash
+ */
if (reiserfs_rupasov_hash(s)) {
code = YURA_HASH;
} else if (reiserfs_tea_hash(s)) {
@@ -1710,8 +1779,9 @@ static int what_hash(struct super_block *s)
}
}
- /* if we are mounted RW, and we have a new valid hash code, update
- ** the super
+ /*
+ * if we are mounted RW, and we have a new valid hash code, update
+ * the super
*/
if (code != UNSET_HASH &&
!(s->s_flags & MS_RDONLY) &&
@@ -1721,7 +1791,7 @@ static int what_hash(struct super_block *s)
return code;
}
-// return pointer to appropriate function
+/* return pointer to appropriate function */
static hashf_t hash_function(struct super_block *s)
{
switch (what_hash(s)) {
@@ -1738,7 +1808,7 @@ static hashf_t hash_function(struct super_block *s)
return NULL;
}
-// this is used to set up correct value for old partitions
+/* this is used to set up correct value for old partitions */
static int function2code(hashf_t func)
{
if (func == keyed_hash)
@@ -1748,7 +1818,7 @@ static int function2code(hashf_t func)
if (func == r5_hash)
return R5_HASH;
- BUG(); // should never happen
+ BUG(); /* should never happen */
return 0;
}
@@ -1783,8 +1853,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
- /* no preallocation minimum, be smart in
- reiserfs_file_write instead */
+ /* no preallocation minimum, be smart in reiserfs_file_write instead */
sbi->s_alloc_options.preallocmin = 0;
/* Preallocate by 16 blocks (17-1) at once */
sbi->s_alloc_options.preallocsize = 17;
@@ -1796,9 +1865,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
mutex_init(&sbi->lock);
sbi->lock_depth = -1;
+ sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0,
+ s->s_id);
+ if (!sbi->commit_wq) {
+ SWARN(silent, s, "", "Cannot allocate commit workqueue");
+ errval = -ENOMEM;
+ goto error_unlocked;
+ }
+
jdev_name = NULL;
if (reiserfs_parse_options
- (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
+ (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name,
&commit_max_age, qf_names, &qfmt) == 0) {
goto error_unlocked;
}
@@ -1819,10 +1896,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
goto error_unlocked;
}
- /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */
+ /*
+ * try old format (undistributed bitmap, super block in 8-th 1k
+ * block of a device)
+ */
if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
old_format = 1;
- /* try new format (64-th 1k block), which can contain reiserfs super block */
+
+ /*
+ * try new format (64-th 1k block), which can contain reiserfs
+ * super block
+ */
else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
s->s_id);
@@ -1830,9 +1914,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
}
rs = SB_DISK_SUPER_BLOCK(s);
- /* Let's do basic sanity check to verify that underlying device is not
- smaller than the filesystem. If the check fails then abort and scream,
- because bad stuff will happen otherwise. */
+ /*
+ * Let's do basic sanity check to verify that underlying device is not
+ * smaller than the filesystem. If the check fails then abort and
+ * scream, because bad stuff will happen otherwise.
+ */
if (s->s_bdev && s->s_bdev->bd_inode
&& i_size_read(s->s_bdev->bd_inode) <
sb_block_count(rs) * sb_blocksize(rs)) {
@@ -1876,15 +1962,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
printk("reiserfs: using flush barriers\n");
}
- // set_device_ro(s->s_dev, 1) ;
if (journal_init(s, jdev_name, old_format, commit_max_age)) {
SWARN(silent, s, "sh-2022",
"unable to initialize journal space");
goto error_unlocked;
} else {
- jinit_done = 1; /* once this is set, journal_release must be called
- ** if we error out of the mount
- */
+ /*
+ * once this is set, journal_release must be called
+ * if we error out of the mount
+ */
+ jinit_done = 1;
}
if (reread_meta_blocks(s)) {
@@ -1905,7 +1992,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
root_inode =
iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
- reiserfs_init_locked_inode, (void *)(&args));
+ reiserfs_init_locked_inode, (void *)&args);
if (!root_inode) {
SWARN(silent, s, "jmacd-10", "get root inode failed");
goto error_unlocked;
@@ -1929,7 +2016,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
s->s_root = d_make_root(root_inode);
if (!s->s_root)
goto error;
- // define and initialize hash function
+ /* define and initialize hash function */
sbi->s_hash_function = hash_function(s);
if (sbi->s_hash_function == NULL) {
dput(s->s_root);
@@ -1939,11 +2026,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (is_reiserfs_3_5(rs)
|| (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1))
- set_bit(REISERFS_3_5, &(sbi->s_properties));
+ set_bit(REISERFS_3_5, &sbi->s_properties);
else if (old_format)
- set_bit(REISERFS_OLD_FORMAT, &(sbi->s_properties));
+ set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties);
else
- set_bit(REISERFS_3_6, &(sbi->s_properties));
+ set_bit(REISERFS_3_6, &sbi->s_properties);
if (!(s->s_flags & MS_RDONLY)) {
@@ -1958,10 +2045,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
set_sb_umount_state(rs, REISERFS_ERROR_FS);
set_sb_fs_state(rs, 0);
- /* Clear out s_bmap_nr if it would wrap. We can handle this
+ /*
+ * Clear out s_bmap_nr if it would wrap. We can handle this
* case, but older revisions can't. This will cause the
* file system to fail mount on those older implementations,
- * avoiding corruption. -jeffm */
+ * avoiding corruption. -jeffm
+ */
if (bmap_would_wrap(reiserfs_bmap_count(s)) &&
sb_bmap_nr(rs) != 0) {
reiserfs_warning(s, "super-2030", "This file system "
@@ -1974,8 +2063,10 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
}
if (old_format_only(s)) {
- /* filesystem of format 3.5 either with standard or non-standard
- journal */
+ /*
+ * filesystem of format 3.5 either with standard
+ * or non-standard journal
+ */
if (convert_reiserfs(s)) {
/* and -o conv is given */
if (!silent)
@@ -1983,8 +2074,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
"converting 3.5 filesystem to the 3.6 format");
if (is_reiserfs_3_5(rs))
- /* put magic string of 3.6 format. 2.2 will not be able to
- mount this filesystem anymore */
+ /*
+ * put magic string of 3.6 format.
+ * 2.2 will not be able to
+ * mount this filesystem anymore
+ */
memcpy(rs->s_v1.s_magic,
reiserfs_3_6_magic_string,
sizeof
@@ -1992,8 +2086,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
set_sb_version(rs, REISERFS_VERSION_2);
reiserfs_convert_objectid_map_v1(s);
- set_bit(REISERFS_3_6, &(sbi->s_properties));
- clear_bit(REISERFS_3_5, &(sbi->s_properties));
+ set_bit(REISERFS_3_6, &sbi->s_properties);
+ clear_bit(REISERFS_3_5, &sbi->s_properties);
} else if (!silent) {
reiserfs_info(s, "using 3.5.x disk format\n");
}
@@ -2001,8 +2095,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
- errval = journal_end(&th, s, 1);
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
+ errval = journal_end(&th);
if (errval) {
dput(s->s_root);
s->s_root = NULL;
@@ -2018,7 +2112,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
}
reiserfs_write_lock(s);
- /* look for files which were to be removed in previous session */
+ /*
+ * look for files which were to be removed in previous session
+ */
finish_unfinished(s);
} else {
if (old_format_only(s) && !silent) {
@@ -2034,7 +2130,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
}
reiserfs_write_lock(s);
}
- // mark hash in super block: it could be unset. overwrite should be ok
+ /*
+ * mark hash in super block: it could be unset. overwrite should be ok
+ */
set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
handle_attrs(s);
@@ -2111,9 +2209,7 @@ static int reiserfs_write_dquot(struct dquot *dquot)
depth = reiserfs_write_unlock_nested(dquot->dq_sb);
ret = dquot_commit(dquot);
reiserfs_write_lock_nested(dquot->dq_sb, depth);
- err =
- journal_end(&th, dquot->dq_sb,
- REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+ err = journal_end(&th);
if (!ret && err)
ret = err;
out:
@@ -2136,9 +2232,7 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
depth = reiserfs_write_unlock_nested(dquot->dq_sb);
ret = dquot_acquire(dquot);
reiserfs_write_lock_nested(dquot->dq_sb, depth);
- err =
- journal_end(&th, dquot->dq_sb,
- REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+ err = journal_end(&th);
if (!ret && err)
ret = err;
out:
@@ -2163,9 +2257,7 @@ static int reiserfs_release_dquot(struct dquot *dquot)
}
ret = dquot_release(dquot);
reiserfs_write_lock(dquot->dq_sb);
- err =
- journal_end(&th, dquot->dq_sb,
- REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+ err = journal_end(&th);
if (!ret && err)
ret = err;
reiserfs_write_unlock(dquot->dq_sb);
@@ -2198,7 +2290,7 @@ static int reiserfs_write_info(struct super_block *sb, int type)
depth = reiserfs_write_unlock_nested(sb);
ret = dquot_commit_info(sb, type);
reiserfs_write_lock_nested(sb, depth);
- err = journal_end(&th, sb, 2);
+ err = journal_end(&th);
if (!ret && err)
ret = err;
out:
@@ -2238,7 +2330,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
goto out;
}
inode = path->dentry->d_inode;
- /* We must not pack tails for quota files on reiserfs for quota IO to work */
+ /*
+ * We must not pack tails for quota files on reiserfs for quota
+ * IO to work
+ */
if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
err = reiserfs_unpack(inode, NULL);
if (err) {
@@ -2268,7 +2363,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
err = journal_begin(&th, sb, 1);
if (err)
goto out;
- err = journal_end_sync(&th, sb, 1);
+ err = journal_end_sync(&th);
if (err)
goto out;
}
@@ -2279,10 +2374,12 @@ out:
return err;
}
-/* Read data from quotafile - avoid pagecache and such because we cannot afford
+/*
+ * Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
* itself serializes the operations (and no one else should touch the files)
- * we don't have to be afraid of races */
+ * we don't have to be afraid of races
+ */
static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off)
{
@@ -2303,7 +2400,10 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
sb->s_blocksize - offset <
toread ? sb->s_blocksize - offset : toread;
tmp_bh.b_state = 0;
- /* Quota files are without tails so we can safely use this function */
+ /*
+ * Quota files are without tails so we can safely
+ * use this function
+ */
reiserfs_write_lock(sb);
err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
reiserfs_write_unlock(sb);
@@ -2326,8 +2426,10 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
return len;
}
-/* Write to quotafile (we know the transaction is already started and has
- * enough credits) */
+/*
+ * Write to quotafile (we know the transaction is already started and has
+ * enough credits)
+ */
static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off)
{
@@ -2368,7 +2470,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
unlock_buffer(bh);
reiserfs_write_lock(sb);
reiserfs_prepare_for_journal(sb, bh, 1);
- journal_mark_dirty(current->journal_info, sb, bh);
+ journal_mark_dirty(current->journal_info, bh);
if (!journal_quota)
reiserfs_add_ordered_list(inode, bh);
reiserfs_write_unlock(sb);
@@ -2402,18 +2504,18 @@ static int __init init_reiserfs_fs(void)
{
int ret;
- if ((ret = init_inodecache())) {
+ ret = init_inodecache();
+ if (ret)
return ret;
- }
reiserfs_proc_info_global_init();
ret = register_filesystem(&reiserfs_fs_type);
+ if (ret)
+ goto out;
- if (ret == 0) {
- return 0;
- }
-
+ return 0;
+out:
reiserfs_proc_info_global_done();
destroy_inodecache();
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index 5e2624d12f70..f41e19b4bb42 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -1,5 +1,6 @@
/*
- * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details
+ * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright
+ * details
*/
#include <linux/time.h>
@@ -7,29 +8,41 @@
#include <linux/buffer_head.h>
#include "reiserfs.h"
-/* access to tail : when one is going to read tail it must make sure, that is not running.
- direct2indirect and indirect2direct can not run concurrently */
+/*
+ * access to tail : when one is going to read tail it must make sure, that is
+ * not running. direct2indirect and indirect2direct can not run concurrently
+ */
-/* Converts direct items to an unformatted node. Panics if file has no
- tail. -ENOSPC if no disk space for conversion */
-/* path points to first direct item of the file regarless of how many of
- them are there */
+/*
+ * Converts direct items to an unformatted node. Panics if file has no
+ * tail. -ENOSPC if no disk space for conversion
+ */
+/*
+ * path points to first direct item of the file regardless of how many of
+ * them are there
+ */
int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
struct treepath *path, struct buffer_head *unbh,
loff_t tail_offset)
{
struct super_block *sb = inode->i_sb;
struct buffer_head *up_to_date_bh;
- struct item_head *p_le_ih = PATH_PITEM_HEAD(path);
+ struct item_head *p_le_ih = tp_item_head(path);
unsigned long total_tail = 0;
- struct cpu_key end_key; /* Key to search for the last byte of the
- converted item. */
- struct item_head ind_ih; /* new indirect item to be inserted or
- key of unfm pointer to be pasted */
- int blk_size, retval; /* returned value for reiserfs_insert_item and clones */
- unp_t unfm_ptr; /* Handle on an unformatted node
- that will be inserted in the
- tree. */
+
+ /* Key to search for the last byte of the converted item. */
+ struct cpu_key end_key;
+
+ /*
+ * new indirect item to be inserted or key
+ * of unfm pointer to be pasted
+ */
+ struct item_head ind_ih;
+ int blk_size;
+ /* returned value for reiserfs_insert_item and clones */
+ int retval;
+ /* Handle on an unformatted node that will be inserted in the tree. */
+ unp_t unfm_ptr;
BUG_ON(!th->t_trans_id);
@@ -37,8 +50,10 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
blk_size = sb->s_blocksize;
- /* and key to search for append or insert pointer to the new
- unformatted node. */
+ /*
+ * and key to search for append or insert pointer to the new
+ * unformatted node.
+ */
copy_item_head(&ind_ih, p_le_ih);
set_le_ih_k_offset(&ind_ih, tail_offset);
set_le_ih_k_type(&ind_ih, TYPE_INDIRECT);
@@ -55,7 +70,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
return -EIO;
}
- p_le_ih = PATH_PITEM_HEAD(path);
+ p_le_ih = tp_item_head(path);
unfm_ptr = cpu_to_le32(unbh->b_blocknr);
@@ -76,36 +91,43 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
if (retval) {
return retval;
}
- // note: from here there are two keys which have matching first
- // three key components. They only differ by the fourth one.
+ /*
+ * note: from here there are two keys which have matching first
+ * three key components. They only differ by the fourth one.
+ */
/* Set the key to search for the direct items of the file */
make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT,
4);
- /* Move bytes from the direct items to the new unformatted node
- and delete them. */
+ /*
+ * Move bytes from the direct items to the new unformatted node
+ * and delete them.
+ */
while (1) {
int tail_size;
- /* end_key.k_offset is set so, that we will always have found
- last item of the file */
+ /*
+ * end_key.k_offset is set so, that we will always have found
+ * last item of the file
+ */
if (search_for_position_by_key(sb, &end_key, path) ==
POSITION_FOUND)
reiserfs_panic(sb, "PAP-14050",
"direct item (%K) not found", &end_key);
- p_le_ih = PATH_PITEM_HEAD(path);
+ p_le_ih = tp_item_head(path);
RFALSE(!is_direct_le_ih(p_le_ih),
"vs-14055: direct item expected(%K), found %h",
&end_key, p_le_ih);
tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1))
+ ih_item_len(p_le_ih) - 1;
- /* we only send the unbh pointer if the buffer is not up to date.
- ** this avoids overwriting good data from writepage() with old data
- ** from the disk or buffer cache
- ** Special case: unbh->b_page will be NULL if we are coming through
- ** DIRECT_IO handler here.
+ /*
+ * we only send the unbh pointer if the buffer is not
+ * up to date. this avoids overwriting good data from
+ * writepage() with old data from the disk or buffer cache
+ * Special case: unbh->b_page will be NULL if we are coming
+ * through DIRECT_IO handler here.
*/
if (!unbh->b_page || buffer_uptodate(unbh)
|| PageUptodate(unbh->b_page)) {
@@ -117,13 +139,15 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
up_to_date_bh);
total_tail += retval;
+
+ /* done: file does not have direct items anymore */
if (tail_size == retval)
- // done: file does not have direct items anymore
break;
}
- /* if we've copied bytes from disk into the page, we need to zero
- ** out the unused part of the block (it was not up to date before)
+ /*
+ * if we've copied bytes from disk into the page, we need to zero
+ * out the unused part of the block (it was not up to date before)
*/
if (up_to_date_bh) {
unsigned pgoff =
@@ -146,9 +170,11 @@ void reiserfs_unmap_buffer(struct buffer_head *bh)
BUG();
}
clear_buffer_dirty(bh);
- /* Remove the buffer from whatever list it belongs to. We are mostly
- interested in removing it from per-sb j_dirty_buffers list, to avoid
- BUG() on attempt to write not mapped buffer */
+ /*
+ * Remove the buffer from whatever list it belongs to. We are mostly
+ * interested in removing it from per-sb j_dirty_buffers list, to avoid
+ * BUG() on attempt to write not mapped buffer
+ */
if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
struct inode *inode = bh->b_page->mapping->host;
struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
@@ -164,12 +190,14 @@ void reiserfs_unmap_buffer(struct buffer_head *bh)
unlock_buffer(bh);
}
-/* this first locks inode (neither reads nor sync are permitted),
- reads tail through page cache, insert direct item. When direct item
- inserted successfully inode is left locked. Return value is always
- what we expect from it (number of cut bytes). But when tail remains
- in the unformatted node, we set mode to SKIP_BALANCING and unlock
- inode */
+/*
+ * this first locks inode (neither reads nor sync are permitted),
+ * reads tail through page cache, insert direct item. When direct item
+ * inserted successfully inode is left locked. Return value is always
+ * what we expect from it (number of cut bytes). But when tail remains
+ * in the unformatted node, we set mode to SKIP_BALANCING and unlock
+ * inode
+ */
int indirect2direct(struct reiserfs_transaction_handle *th,
struct inode *inode, struct page *page,
struct treepath *path, /* path to the indirect item. */
@@ -194,7 +222,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
*mode = M_SKIP_BALANCING;
/* store item head path points to. */
- copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
+ copy_item_head(&s_ih, tp_item_head(path));
tail_len = (n_new_file_size & (block_size - 1));
if (get_inode_sd_version(inode) == STAT_DATA_V2)
@@ -207,9 +235,11 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
1) * sb->s_blocksize;
pos1 = pos;
- // we are protected by i_mutex. The tail can not disapper, not
- // append can be done either
- // we are in truncate or packing tail in file_release
+ /*
+ * we are protected by i_mutex. The tail can not disapper, not
+ * append can be done either
+ * we are in truncate or packing tail in file_release
+ */
tail = (char *)kmap(page); /* this can schedule */
@@ -220,7 +250,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
reiserfs_panic(sb, "PAP-5520",
"item to be converted %K does not exist",
item_key);
- copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
+ copy_item_head(&s_ih, tp_item_head(path));
#ifdef CONFIG_REISERFS_CHECK
pos = le_ih_k_offset(&s_ih) - 1 +
(ih_item_len(&s_ih) / UNFM_P_SIZE -
@@ -236,9 +266,10 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
pos1 + 1, TYPE_DIRECT, round_tail_len,
0xffff /*ih_free_space */ );
- /* we want a pointer to the first byte of the tail in the page.
- ** the page was locked and this part of the page was up to date when
- ** indirect2direct was called, so we know the bytes are still valid
+ /*
+ * we want a pointer to the first byte of the tail in the page.
+ * the page was locked and this part of the page was up to date when
+ * indirect2direct was called, so we know the bytes are still valid
*/
tail = tail + (pos & (PAGE_CACHE_SIZE - 1));
@@ -250,12 +281,14 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
/* Insert tail as new direct item in the tree */
if (reiserfs_insert_item(th, path, &key, &s_ih, inode,
tail ? tail : NULL) < 0) {
- /* No disk memory. So we can not convert last unformatted node
- to the direct item. In this case we used to adjust
- indirect items's ih_free_space. Now ih_free_space is not
- used, it would be ideal to write zeros to corresponding
- unformatted node. For now i_size is considered as guard for
- going out of file size */
+ /*
+ * No disk memory. So we can not convert last unformatted node
+ * to the direct item. In this case we used to adjust
+ * indirect items's ih_free_space. Now ih_free_space is not
+ * used, it would be ideal to write zeros to corresponding
+ * unformatted node. For now i_size is considered as guard for
+ * going out of file size
+ */
kunmap(page);
return block_size - round_tail_len;
}
@@ -264,12 +297,16 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
/* make sure to get the i_blocks changes from reiserfs_insert_item */
reiserfs_update_sd(th, inode);
- // note: we have now the same as in above direct2indirect
- // conversion: there are two keys which have matching first three
- // key components. They only differ by the fouhth one.
+ /*
+ * note: we have now the same as in above direct2indirect
+ * conversion: there are two keys which have matching first three
+ * key components. They only differ by the fourth one.
+ */
- /* We have inserted new direct item and must remove last
- unformatted node. */
+ /*
+ * We have inserted new direct item and must remove last
+ * unformatted node.
+ */
*mode = M_CUT;
/* we store position of first direct item in the in-core inode */
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 5cdfbd638b5c..ca416d099e7d 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -56,9 +56,11 @@
#define XAROOT_NAME "xattrs"
-/* Helpers for inode ops. We do this so that we don't have all the VFS
+/*
+ * Helpers for inode ops. We do this so that we don't have all the VFS
* overhead and also for proper i_mutex annotation.
- * dir->i_mutex must be held for all of them. */
+ * dir->i_mutex must be held for all of them.
+ */
#ifdef CONFIG_REISERFS_FS_XATTR
static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
{
@@ -73,10 +75,12 @@ static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return dir->i_op->mkdir(dir, dentry, mode);
}
-/* We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
+/*
+ * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
* mutation ops aren't called during rename or splace, which are the
* only other users of I_MUTEX_CHILD. It violates the ordering, but that's
- * better than allocating another subclass just for this code. */
+ * better than allocating another subclass just for this code.
+ */
static int xattr_unlink(struct inode *dir, struct dentry *dentry)
{
int error;
@@ -166,9 +170,11 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
return xadir;
}
-/* The following are side effects of other operations that aren't explicitly
+/*
+ * The following are side effects of other operations that aren't explicitly
* modifying extended attributes. This includes operations such as permissions
- * or ownership changes, object deletions, etc. */
+ * or ownership changes, object deletions, etc.
+ */
struct reiserfs_dentry_buf {
struct dir_context ctx;
struct dentry *xadir;
@@ -267,11 +273,13 @@ static int reiserfs_for_each_xattr(struct inode *inode,
cleanup_dentry_buf(&buf);
if (!err) {
- /* We start a transaction here to avoid a ABBA situation
+ /*
+ * We start a transaction here to avoid a ABBA situation
* between the xattr root's i_mutex and the journal lock.
* This doesn't incur much additional overhead since the
* new transaction will just nest inside the
- * outer transaction. */
+ * outer transaction.
+ */
int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
struct reiserfs_transaction_handle th;
@@ -284,7 +292,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
I_MUTEX_XATTR);
err = action(dir, data);
reiserfs_write_lock(inode->i_sb);
- jerror = journal_end(&th, inode->i_sb, blocks);
+ jerror = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
mutex_unlock(&dir->d_parent->d_inode->i_mutex);
err = jerror ?: err;
@@ -349,9 +357,11 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
}
#ifdef CONFIG_REISERFS_FS_XATTR
-/* Returns a dentry corresponding to a specific extended attribute file
+/*
+ * Returns a dentry corresponding to a specific extended attribute file
* for the inode. If flags allow, the file is created. Otherwise, a
- * valid or negative dentry, or an error is returned. */
+ * valid or negative dentry, or an error is returned.
+ */
static struct dentry *xattr_lookup(struct inode *inode, const char *name,
int flags)
{
@@ -400,8 +410,10 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
{
struct address_space *mapping = dir->i_mapping;
struct page *page;
- /* We can deadlock if we try to free dentries,
- and an unlink/rmdir has just occurred - GFP_NOFS avoids this */
+ /*
+ * We can deadlock if we try to free dentries,
+ * and an unlink/rmdir has just occurred - GFP_NOFS avoids this
+ */
mapping_set_gfp_mask(mapping, GFP_NOFS);
page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL);
if (!IS_ERR(page)) {
@@ -411,7 +423,7 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
}
return page;
- fail:
+fail:
reiserfs_put_page(page);
return ERR_PTR(-EIO);
}
@@ -589,7 +601,7 @@ int reiserfs_xattr_set(struct inode *inode, const char *name,
buffer, buffer_size, flags);
reiserfs_write_lock(inode->i_sb);
- error2 = journal_end(&th, inode->i_sb, jbegin_count);
+ error2 = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
if (error == 0)
error = error2;
@@ -615,8 +627,10 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
if (name == NULL)
return -EINVAL;
- /* We can't have xattrs attached to v1 items since they don't have
- * generation numbers */
+ /*
+ * We can't have xattrs attached to v1 items since they don't have
+ * generation numbers
+ */
if (get_inode_sd_version(inode) == STAT_DATA_V1)
return -EOPNOTSUPP;
@@ -913,12 +927,16 @@ static const struct xattr_handler *reiserfs_xattr_handlers[] = {
static int xattr_mount_check(struct super_block *s)
{
- /* We need generation numbers to ensure that the oid mapping is correct
- * v3.5 filesystems don't have them. */
+ /*
+ * We need generation numbers to ensure that the oid mapping is correct
+ * v3.5 filesystems don't have them.
+ */
if (old_format_only(s)) {
if (reiserfs_xattrs_optional(s)) {
- /* Old format filesystem, but optional xattrs have
- * been enabled. Error out. */
+ /*
+ * Old format filesystem, but optional xattrs have
+ * been enabled. Error out.
+ */
reiserfs_warning(s, "jdm-2005",
"xattrs/ACLs not supported "
"on pre-v3.6 format filesystems. "
@@ -972,9 +990,11 @@ int reiserfs_lookup_privroot(struct super_block *s)
return err;
}
-/* We need to take a copy of the mount flags since things like
+/*
+ * We need to take a copy of the mount flags since things like
* MS_RDONLY don't get set until *after* we're called.
- * mount_flags != mount_options */
+ * mount_flags != mount_options
+ */
int reiserfs_xattr_init(struct super_block *s, int mount_flags)
{
int err = 0;
@@ -1007,8 +1027,8 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
error:
if (err) {
- clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt));
- clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt));
+ clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt);
+ clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt);
}
/* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index f59626c5d33b..857ec7e3016f 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -61,7 +61,8 @@ static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
return ret;
}
-/* We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
+/*
+ * We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
* Let's try to be smart about it.
* xattr root: We cache it. If it's not cached, we may need to create it.
* xattr dir: If anything has been loaded for this inode, we can set a flag
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index a6ce532402dc..44503e293790 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -25,8 +25,10 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
- /* Pessimism: We can't assume that anything from the xattr root up
- * has been created. */
+ /*
+ * Pessimism: We can't assume that anything from the xattr root up
+ * has been created.
+ */
jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) +
reiserfs_xattr_nblocks(inode, size) * 2;
@@ -37,7 +39,7 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (error == 0) {
error = __reiserfs_set_acl(&th, inode, type, acl);
reiserfs_write_lock(inode->i_sb);
- error2 = journal_end(&th, inode->i_sb, jcreate_blocks);
+ error2 = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
if (error2)
error = error2;
@@ -111,7 +113,7 @@ static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t
goto fail;
return acl;
- fail:
+fail:
posix_acl_release(acl);
return ERR_PTR(-EINVAL);
}
@@ -164,7 +166,7 @@ static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * si
}
return (char *)ext_acl;
- fail:
+fail:
kfree(ext_acl);
return ERR_PTR(-EINVAL);
}
@@ -208,8 +210,10 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
retval = reiserfs_xattr_get(inode, name, value, size);
if (retval == -ENODATA || retval == -ENOSYS) {
- /* This shouldn't actually happen as it should have
- been caught above.. but just in case */
+ /*
+ * This shouldn't actually happen as it should have
+ * been caught above.. but just in case
+ */
acl = NULL;
} else if (retval < 0) {
acl = ERR_PTR(retval);
@@ -290,8 +294,10 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
return error;
}
-/* dir->i_mutex: locked,
- * inode is new and not released into the wild yet */
+/*
+ * dir->i_mutex: locked,
+ * inode is new and not released into the wild yet
+ */
int
reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
struct inode *dir, struct dentry *dentry,
@@ -304,14 +310,18 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
if (S_ISLNK(inode->i_mode))
return 0;
- /* ACLs can only be used on "new" objects, so if it's an old object
- * there is nothing to inherit from */
+ /*
+ * ACLs can only be used on "new" objects, so if it's an old object
+ * there is nothing to inherit from
+ */
if (get_inode_sd_version(dir) == STAT_DATA_V1)
goto apply_umask;
- /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This
+ /*
+ * Don't apply ACLs to objects in the .reiserfs_priv tree.. This
* would be useless since permissions are ignored, and a pain because
- * it introduces locking cycles */
+ * it introduces locking cycles
+ */
if (IS_PRIVATE(dir)) {
inode->i_flags |= S_PRIVATE;
goto apply_umask;
@@ -335,7 +345,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
return err;
- apply_umask:
+apply_umask:
/* no ACL, apply umask */
inode->i_mode &= ~current_umask();
return err;
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index f373bde8f545..ea06c7554860 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -72,8 +72,8 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
const struct file_operations romfs_ro_fops = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
.mmap = romfs_mmap,
.get_unmapped_area = romfs_get_unmapped_area,
diff --git a/fs/splice.c b/fs/splice.c
index e246954ea48c..f5cb9ba84510 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -32,6 +32,7 @@
#include <linux/gfp.h>
#include <linux/socket.h>
#include <linux/compat.h>
+#include <linux/aio.h>
#include "internal.h"
/*
@@ -717,63 +718,6 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
sd->len, &pos, more);
}
-/*
- * This is a little more tricky than the file -> pipe splicing. There are
- * basically three cases:
- *
- * - Destination page already exists in the address space and there
- * are users of it. For that case we have no other option that
- * copying the data. Tough luck.
- * - Destination page already exists in the address space, but there
- * are no users of it. Make sure it's uptodate, then drop it. Fall
- * through to last case.
- * - Destination page does not exist, we can add the pipe page to
- * the page cache and avoid the copy.
- *
- * If asked to move pages to the output file (SPLICE_F_MOVE is set in
- * sd->flags), we attempt to migrate pages from the pipe to the output
- * file address space page cache. This is possible if no one else has
- * the pipe page referenced outside of the pipe and page cache. If
- * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
- * a new page in the output file page cache and fill/dirty that.
- */
-int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
- struct splice_desc *sd)
-{
- struct file *file = sd->u.file;
- struct address_space *mapping = file->f_mapping;
- unsigned int offset, this_len;
- struct page *page;
- void *fsdata;
- int ret;
-
- offset = sd->pos & ~PAGE_CACHE_MASK;
-
- this_len = sd->len;
- if (this_len + offset > PAGE_CACHE_SIZE)
- this_len = PAGE_CACHE_SIZE - offset;
-
- ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
- AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
- if (unlikely(ret))
- goto out;
-
- if (buf->page != page) {
- char *src = kmap_atomic(buf->page);
- char *dst = kmap_atomic(page);
-
- memcpy(dst + offset, src + buf->offset, this_len);
- flush_dcache_page(page);
- kunmap_atomic(dst);
- kunmap_atomic(src);
- }
- ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
- page, fsdata);
-out:
- return ret;
-}
-EXPORT_SYMBOL(pipe_to_file);
-
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
smp_mb();
@@ -802,7 +746,7 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
* locking is required around copying the pipe buffers to the
* destination.
*/
-int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
+static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
splice_actor *actor)
{
int ret;
@@ -849,7 +793,6 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
return 1;
}
-EXPORT_SYMBOL(splice_from_pipe_feed);
/**
* splice_from_pipe_next - wait for some data to splice from
@@ -861,7 +804,7 @@ EXPORT_SYMBOL(splice_from_pipe_feed);
* value (one) if pipe buffers are available. It will return zero
* or -errno if no more data needs to be spliced.
*/
-int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
+static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
while (!pipe->nrbufs) {
if (!pipe->writers)
@@ -886,7 +829,6 @@ int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
return 1;
}
-EXPORT_SYMBOL(splice_from_pipe_next);
/**
* splice_from_pipe_begin - start splicing from pipe
@@ -897,12 +839,11 @@ EXPORT_SYMBOL(splice_from_pipe_next);
* splice_from_pipe_next() and splice_from_pipe_feed() to
* initialize the necessary fields of @sd.
*/
-void splice_from_pipe_begin(struct splice_desc *sd)
+static void splice_from_pipe_begin(struct splice_desc *sd)
{
sd->num_spliced = 0;
sd->need_wakeup = false;
}
-EXPORT_SYMBOL(splice_from_pipe_begin);
/**
* splice_from_pipe_end - finish splicing from pipe
@@ -914,12 +855,11 @@ EXPORT_SYMBOL(splice_from_pipe_begin);
* be called after a loop containing splice_from_pipe_next() and
* splice_from_pipe_feed().
*/
-void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
+static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
if (sd->need_wakeup)
wakeup_pipe_writers(pipe);
}
-EXPORT_SYMBOL(splice_from_pipe_end);
/**
* __splice_from_pipe - splice data from a pipe to given actor
@@ -985,7 +925,7 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
}
/**
- * generic_file_splice_write - splice data from a pipe to a file
+ * iter_file_splice_write - splice data from a pipe to a file
* @pipe: pipe info
* @out: file to write to
* @ppos: position in @out
@@ -995,40 +935,122 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
* Description:
* Will either move or copy pages (determined by @flags options) from
* the given pipe inode to the given file.
+ * This one is ->write_iter-based.
*
*/
ssize_t
-generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
+iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
- struct address_space *mapping = out->f_mapping;
- struct inode *inode = mapping->host;
struct splice_desc sd = {
.total_len = len,
.flags = flags,
.pos = *ppos,
.u.file = out,
};
+ int nbufs = pipe->buffers;
+ struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
+ GFP_KERNEL);
ssize_t ret;
+ if (unlikely(!array))
+ return -ENOMEM;
+
pipe_lock(pipe);
splice_from_pipe_begin(&sd);
- do {
+ while (sd.total_len) {
+ struct iov_iter from;
+ struct kiocb kiocb;
+ size_t left;
+ int n, idx;
+
ret = splice_from_pipe_next(pipe, &sd);
if (ret <= 0)
break;
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- ret = file_remove_suid(out);
- if (!ret) {
- ret = file_update_time(out);
- if (!ret)
- ret = splice_from_pipe_feed(pipe, &sd,
- pipe_to_file);
+ if (unlikely(nbufs < pipe->buffers)) {
+ kfree(array);
+ nbufs = pipe->buffers;
+ array = kcalloc(nbufs, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ if (!array) {
+ ret = -ENOMEM;
+ break;
+ }
}
- mutex_unlock(&inode->i_mutex);
- } while (ret > 0);
+
+ /* build the vector */
+ left = sd.total_len;
+ for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
+ struct pipe_buffer *buf = pipe->bufs + idx;
+ size_t this_len = buf->len;
+
+ if (this_len > left)
+ this_len = left;
+
+ if (idx == pipe->buffers - 1)
+ idx = -1;
+
+ ret = buf->ops->confirm(pipe, buf);
+ if (unlikely(ret)) {
+ if (ret == -ENODATA)
+ ret = 0;
+ goto done;
+ }
+
+ array[n].bv_page = buf->page;
+ array[n].bv_len = this_len;
+ array[n].bv_offset = buf->offset;
+ left -= this_len;
+ }
+
+ /* ... iov_iter */
+ from.type = ITER_BVEC | WRITE;
+ from.bvec = array;
+ from.nr_segs = n;
+ from.count = sd.total_len - left;
+ from.iov_offset = 0;
+
+ /* ... and iocb */
+ init_sync_kiocb(&kiocb, out);
+ kiocb.ki_pos = sd.pos;
+ kiocb.ki_nbytes = sd.total_len - left;
+
+ /* now, send it */
+ ret = out->f_op->write_iter(&kiocb, &from);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+
+ if (ret <= 0)
+ break;
+
+ sd.num_spliced += ret;
+ sd.total_len -= ret;
+ *ppos = sd.pos = kiocb.ki_pos;
+
+ /* dismiss the fully eaten buffers, adjust the partial one */
+ while (ret) {
+ struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+ if (ret >= buf->len) {
+ const struct pipe_buf_operations *ops = buf->ops;
+ ret -= buf->len;
+ buf->len = 0;
+ buf->ops = NULL;
+ ops->release(pipe, buf);
+ pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+ pipe->nrbufs--;
+ if (pipe->files)
+ sd.need_wakeup = true;
+ } else {
+ buf->offset += ret;
+ buf->len -= ret;
+ ret = 0;
+ }
+ }
+ }
+done:
+ kfree(array);
splice_from_pipe_end(pipe, &sd);
pipe_unlock(pipe);
@@ -1036,21 +1058,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
if (sd.num_spliced)
ret = sd.num_spliced;
- if (ret > 0) {
- int err;
-
- err = generic_write_sync(out, *ppos, ret);
- if (err)
- ret = err;
- else
- *ppos += ret;
- balance_dirty_pages_ratelimited(mapping);
- }
-
return ret;
}
-EXPORT_SYMBOL(generic_file_splice_write);
+EXPORT_SYMBOL(iter_file_splice_write);
static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
@@ -1549,7 +1560,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
goto out;
count = ret;
- iov_iter_init(&iter, iov, nr_segs, count, 0);
+ iov_iter_init(&iter, READ, iov, nr_segs, count);
sd.len = 0;
sd.total_len = count;
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 9d4dc6831792..b00811c75b24 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -21,10 +21,10 @@
*/
const struct file_operations sysv_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index e8e01d74dc05..eb997e9c4ab0 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -437,7 +437,6 @@ static int calc_dd_growth(const struct ubifs_info *c,
*/
int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
{
- int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
int err, idx_growth, data_growth, dd_growth, retried = 0;
ubifs_assert(req->new_page <= 1);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 5157b866a853..177b0152fef4 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -745,8 +745,10 @@ void ubifs_dump_lprops(struct ubifs_info *c)
for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
err = ubifs_read_one_lp(c, lnum, &lp);
- if (err)
+ if (err) {
ubifs_err("cannot read lprops for LEB %d", lnum);
+ continue;
+ }
ubifs_dump_lprop(c, &lp);
}
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 4f34dbae823d..b5b593c45270 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -903,8 +903,9 @@ static int do_writepage(struct page *page, int len)
struct ubifs_info *c = inode->i_sb->s_fs_info;
#ifdef UBIFS_DEBUG
+ struct ubifs_inode *ui = ubifs_inode(inode);
spin_lock(&ui->ui_lock);
- ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE);
+ ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT);
spin_unlock(&ui->ui_lock);
#endif
@@ -1363,17 +1364,17 @@ static inline int mctime_update_needed(const struct inode *inode,
/**
* update_ctime - update mtime and ctime of an inode.
- * @c: UBIFS file-system description object
* @inode: inode to update
*
* This function updates mtime and ctime of the inode if it is not equivalent to
* current time. Returns zero in case of success and a negative error code in
* case of failure.
*/
-static int update_mctime(struct ubifs_info *c, struct inode *inode)
+static int update_mctime(struct inode *inode)
{
struct timespec now = ubifs_current_time(inode);
struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
if (mctime_update_needed(inode, &now)) {
int err, release;
@@ -1396,18 +1397,13 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode)
return 0;
}
-static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- int err;
- struct inode *inode = iocb->ki_filp->f_mapping->host;
- struct ubifs_info *c = inode->i_sb->s_fs_info;
-
- err = update_mctime(c, inode);
+ int err = update_mctime(file_inode(iocb->ki_filp));
if (err)
return err;
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
+ return generic_file_write_iter(iocb, from);
}
static int ubifs_set_page_dirty(struct page *page)
@@ -1525,8 +1521,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
}
wait_for_stable_page(page);
- unlock_page(page);
- return 0;
+ return VM_FAULT_LOCKED;
out_unlock:
unlock_page(page);
@@ -1582,15 +1577,15 @@ const struct inode_operations ubifs_symlink_inode_operations = {
const struct file_operations ubifs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ubifs_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ubifs_write_iter,
.mmap = ubifs_file_mmap,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e18b9889a51b..2290d5866725 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -988,30 +988,32 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
return err;
if (type != ch->node_type) {
- ubifs_err("bad node type (%d but expected %d)",
- ch->node_type, type);
+ ubifs_errc(c, "bad node type (%d but expected %d)",
+ ch->node_type, type);
goto out;
}
err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
if (err) {
- ubifs_err("expected node type %d", type);
+ ubifs_errc(c, "expected node type %d", type);
return err;
}
l = le32_to_cpu(ch->len);
if (l != len) {
- ubifs_err("bad node length %d, expected %d", l, len);
+ ubifs_errc(c, "bad node length %d, expected %d", l, len);
goto out;
}
return 0;
out:
- ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
- ubi_is_mapped(c->ubi, lnum));
- ubifs_dump_node(c, buf);
- dump_stack();
+ ubifs_errc(c, "bad node at LEB %d:%d, LEB mapping status %d", lnum,
+ offs, ubi_is_mapped(c->ubi, lnum));
+ if (!c->probing) {
+ ubifs_dump_node(c, buf);
+ dump_stack();
+ }
return -EINVAL;
}
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index f35135e28e96..9a9fb94a41c6 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -128,7 +128,6 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
freed = ubifs_destroy_tnc_subtree(znode);
atomic_long_sub(freed, &ubifs_clean_zn_cnt);
atomic_long_sub(freed, &c->clean_zn_cnt);
- ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0);
total_freed += freed;
znode = zprev;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a81c7b556896..3904c8574ef9 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1149,6 +1149,9 @@ static int mount_ubifs(struct ubifs_info *c)
size_t sz;
c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
+ /* Suppress error messages while probing if MS_SILENT is set */
+ c->probing = !!(c->vfs_sb->s_flags & MS_SILENT);
+
err = init_constants_early(c);
if (err)
return err;
@@ -1214,6 +1217,8 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
goto out_free;
+ c->probing = 0;
+
/*
* Make sure the compressor which is set as default in the superblock
* or overridden by mount options is actually compiled in.
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 9083bc7ed4ae..8a40cf9c02d7 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2859,10 +2859,11 @@ void ubifs_tnc_close(struct ubifs_info *c)
{
tnc_destroy_cnext(c);
if (c->zroot.znode) {
- long n;
+ long n, freed;
- ubifs_destroy_tnc_subtree(c->zroot.znode);
n = atomic_long_read(&c->clean_zn_cnt);
+ freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
+ ubifs_assert(freed == n);
atomic_long_sub(n, &ubifs_clean_zn_cnt);
}
kfree(c->gap_lebs);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index e8c8cfe1435c..c1f71fe17cc0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -51,6 +51,15 @@
#define ubifs_warn(fmt, ...) \
pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \
current->pid, __func__, ##__VA_ARGS__)
+/*
+ * A variant of 'ubifs_err()' which takes the UBIFS file-system description
+ * object as an argument.
+ */
+#define ubifs_errc(c, fmt, ...) \
+ do { \
+ if (!(c)->probing) \
+ ubifs_err(fmt, ##__VA_ARGS__); \
+ } while (0)
/* UBIFS file system VFS magic number */
#define UBIFS_SUPER_MAGIC 0x24051905
@@ -1209,6 +1218,7 @@ struct ubifs_debug_info;
* @need_recovery: %1 if the file-system needs recovery
* @replaying: %1 during journal replay
* @mounting: %1 while mounting
+ * @probing: %1 while attempting to mount if MS_SILENT mount flag is set
* @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
* @replay_list: temporary list used during journal replay
* @replay_buds: list of buds to replay
@@ -1441,6 +1451,7 @@ struct ubifs_info {
unsigned int replaying:1;
unsigned int mounting:1;
unsigned int remounting_rw:1;
+ unsigned int probing:1;
struct list_head replay_list;
struct list_head replay_buds;
unsigned long long cs_sqnum;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index d2c170f8b035..d80738fdf424 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -119,8 +119,8 @@ static int udf_adinicb_write_end(struct file *file,
}
static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
/* Fallback to buffered I/O. */
return 0;
@@ -134,8 +134,7 @@ const struct address_space_operations udf_adinicb_aops = {
.direct_IO = udf_adinicb_direct_IO,
};
-static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t ppos)
+static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t retval;
struct file *file = iocb->ki_filp;
@@ -150,7 +149,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (file->f_flags & O_APPEND)
pos = inode->i_size;
else
- pos = ppos;
+ pos = iocb->ki_pos;
if (inode->i_sb->s_blocksize <
(udf_file_entry_alloc_offset(inode) +
@@ -171,7 +170,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
} else
up_write(&iinfo->i_data_sem);
- retval = __generic_file_aio_write(iocb, iov, nr_segs);
+ retval = __generic_file_write_iter(iocb, from);
mutex_unlock(&inode->i_mutex);
if (retval > 0) {
@@ -252,13 +251,13 @@ static int udf_release_file(struct inode *inode, struct file *filp)
}
const struct file_operations udf_file_operations = {
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
.unlocked_ioctl = udf_ioctl,
.open = generic_file_open,
.mmap = generic_file_mmap,
- .write = do_sync_write,
- .aio_write = udf_file_aio_write,
+ .write = new_sync_write,
+ .write_iter = udf_file_write_iter,
.release = udf_release_file,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 5d643706212f..236cd48184c2 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -217,18 +217,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
}
static ssize_t udf_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- udf_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, udf_get_block);
if (unlikely(ret < 0 && (rw & WRITE)))
- udf_write_failed(mapping, offset + iov_length(iov, nr_segs));
+ udf_write_failed(mapping, offset + count);
return ret;
}
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 33afa20d4509..c84ec010a676 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -35,10 +35,10 @@
const struct file_operations ufs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = generic_file_open,
.fsync = generic_file_fsync,
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 0fdd4109c624..6e247a99f5db 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -160,30 +160,38 @@ typedef struct xfs_agi {
* still being referenced.
*/
__be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
-
+ /*
+ * This marks the end of logging region 1 and start of logging region 2.
+ */
uuid_t agi_uuid; /* uuid of filesystem */
__be32 agi_crc; /* crc of agi sector */
__be32 agi_pad32;
__be64 agi_lsn; /* last write sequence */
+ __be32 agi_free_root; /* root of the free inode btree */
+ __be32 agi_free_level;/* levels in free inode btree */
+
/* structure must be padded to 64 bit alignment */
} xfs_agi_t;
#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
-#define XFS_AGI_MAGICNUM 0x00000001
-#define XFS_AGI_VERSIONNUM 0x00000002
-#define XFS_AGI_SEQNO 0x00000004
-#define XFS_AGI_LENGTH 0x00000008
-#define XFS_AGI_COUNT 0x00000010
-#define XFS_AGI_ROOT 0x00000020
-#define XFS_AGI_LEVEL 0x00000040
-#define XFS_AGI_FREECOUNT 0x00000080
-#define XFS_AGI_NEWINO 0x00000100
-#define XFS_AGI_DIRINO 0x00000200
-#define XFS_AGI_UNLINKED 0x00000400
-#define XFS_AGI_NUM_BITS 11
-#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1)
+#define XFS_AGI_MAGICNUM (1 << 0)
+#define XFS_AGI_VERSIONNUM (1 << 1)
+#define XFS_AGI_SEQNO (1 << 2)
+#define XFS_AGI_LENGTH (1 << 3)
+#define XFS_AGI_COUNT (1 << 4)
+#define XFS_AGI_ROOT (1 << 5)
+#define XFS_AGI_LEVEL (1 << 6)
+#define XFS_AGI_FREECOUNT (1 << 7)
+#define XFS_AGI_NEWINO (1 << 8)
+#define XFS_AGI_DIRINO (1 << 9)
+#define XFS_AGI_UNLINKED (1 << 10)
+#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
+#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1 << 11)
+#define XFS_AGI_FREE_LEVEL (1 << 12)
+#define XFS_AGI_NUM_BITS_R2 13
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index c1cf6a336a72..d43813267a80 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -257,16 +257,14 @@ xfs_alloc_fix_len(
k = rlen % args->prod;
if (k == args->mod)
return;
- if (k > args->mod) {
- if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen)
- return;
- } else {
- if ((int)(rlen = rlen - args->prod - (args->mod - k)) <
- (int)args->minlen)
- return;
- }
- ASSERT(rlen >= args->minlen);
- ASSERT(rlen <= args->maxlen);
+ if (k > args->mod)
+ rlen = rlen - (k - args->mod);
+ else
+ rlen = rlen - args->prod + (args->mod - k);
+ if ((int)rlen < (int)args->minlen)
+ return;
+ ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+ ASSERT(rlen % args->prod == args->mod);
args->len = rlen;
}
@@ -541,7 +539,6 @@ xfs_alloc_read_agfl(
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(bp));
xfs_buf_set_ref(bp, XFS_AGFL_REF);
*bpp = bp;
return 0;
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index cc1eadcbb049..8358f1ded94d 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -70,7 +70,6 @@ xfs_allocbt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
- int length,
int *stat)
{
int error;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 0479c32c5eb1..faaf716e2080 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -975,14 +975,39 @@ xfs_vm_writepage(
* Given that we do not allow direct reclaim to call us, we should
* never be called while in a filesystem transaction.
*/
- if (WARN_ON(current->flags & PF_FSTRANS))
+ if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
goto redirty;
/* Is this page beyond the end of the file? */
offset = i_size_read(inode);
end_index = offset >> PAGE_CACHE_SHIFT;
last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
- if (page->index >= end_index) {
+
+ /*
+ * The page index is less than the end_index, adjust the end_offset
+ * to the highest offset that this page should represent.
+ * -----------------------------------------------------
+ * | file mapping | <EOF> |
+ * -----------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | |
+ * ^--------------------------------^----------|--------
+ * | desired writeback range | see else |
+ * ---------------------------------^------------------|
+ */
+ if (page->index < end_index)
+ end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
+ else {
+ /*
+ * Check whether the page to write out is beyond or straddles
+ * i_size or not.
+ * -------------------------------------------------------
+ * | file mapping | <EOF> |
+ * -------------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
+ * ^--------------------------------^-----------|---------
+ * | | Straddles |
+ * ---------------------------------^-----------|--------|
+ */
unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
/*
@@ -990,24 +1015,36 @@ xfs_vm_writepage(
* truncate operation that is in progress. We must redirty the
* page so that reclaim stops reclaiming it. Otherwise
* xfs_vm_releasepage() is called on it and gets confused.
+ *
+ * Note that the end_index is unsigned long, it would overflow
+ * if the given offset is greater than 16TB on 32-bit system
+ * and if we do check the page is fully outside i_size or not
+ * via "if (page->index >= end_index + 1)" as "end_index + 1"
+ * will be evaluated to 0. Hence this page will be redirtied
+ * and be written out repeatedly which would result in an
+ * infinite loop, the user program that performs this operation
+ * will hang. Instead, we can verify this situation by checking
+ * if the page to write is totally beyond the i_size or if its
+ * offset is just equal to the EOF.
*/
- if (page->index >= end_index + 1 || offset_into_page == 0)
+ if (page->index > end_index ||
+ (page->index == end_index && offset_into_page == 0))
goto redirty;
/*
* The page straddles i_size. It must be zeroed out on each
* and every writepage invocation because it may be mmapped.
* "A file is mapped in multiples of the page size. For a file
- * that is not a multiple of the page size, the remaining
+ * that is not a multiple of the page size, the remaining
* memory is zeroed when mapped, and writes to that region are
* not written out to the file."
*/
zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
+
+ /* Adjust the end_offset to the end of file */
+ end_offset = offset;
}
- end_offset = min_t(unsigned long long,
- (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
- offset);
len = 1 << inode->i_blkbits;
bh = head = page_buffers(page);
@@ -1188,9 +1225,9 @@ xfs_vm_releasepage(
xfs_count_page_state(page, &delalloc, &unwritten);
- if (WARN_ON(delalloc))
+ if (WARN_ON_ONCE(delalloc))
return 0;
- if (WARN_ON(unwritten))
+ if (WARN_ON_ONCE(unwritten))
return 0;
return try_to_free_buffers(page);
@@ -1449,9 +1486,8 @@ STATIC ssize_t
xfs_vm_direct_IO(
int rw,
struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct block_device *bdev = xfs_find_bdev_for_inode(inode);
@@ -1459,7 +1495,7 @@ xfs_vm_direct_IO(
ssize_t ret;
if (rw & WRITE) {
- size_t size = iov_length(iov, nr_segs);
+ size_t size = iov_iter_count(iter);
/*
* We cannot preallocate a size update transaction here as we
@@ -1471,17 +1507,15 @@ xfs_vm_direct_IO(
if (offset + size > XFS_I(inode)->i_d.di_size)
ioend->io_isdirect = 1;
- ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
- offset, nr_segs,
- xfs_get_blocks_direct,
+ ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+ offset, xfs_get_blocks_direct,
xfs_end_io_direct_write, NULL,
DIO_ASYNC_EXTEND);
if (ret != -EIOCBQUEUED && iocb->private)
goto out_destroy_ioend;
} else {
- ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
- offset, nr_segs,
- xfs_get_blocks_direct,
+ ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+ offset, xfs_get_blocks_direct,
NULL, NULL, 0);
}
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index abda1124a70f..bfe36fc2cdc2 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -77,17 +77,27 @@ STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
STATIC int
-xfs_attr_name_to_xname(
- struct xfs_name *xname,
- const unsigned char *aname)
+xfs_attr_args_init(
+ struct xfs_da_args *args,
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ int flags)
{
- if (!aname)
+
+ if (!name)
return EINVAL;
- xname->name = aname;
- xname->len = strlen((char *)aname);
- if (xname->len >= MAXNAMELEN)
+
+ memset(args, 0, sizeof(*args));
+ args->geo = dp->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->dp = dp;
+ args->flags = flags;
+ args->name = name;
+ args->namelen = strlen((const char *)name);
+ if (args->namelen >= MAXNAMELEN)
return EFAULT; /* match IRIX behaviour */
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
return 0;
}
@@ -106,79 +116,46 @@ xfs_inode_hasattr(
* Overall external interface routines.
*========================================================================*/
-STATIC int
-xfs_attr_get_int(
+int
+xfs_attr_get(
struct xfs_inode *ip,
- struct xfs_name *name,
+ const unsigned char *name,
unsigned char *value,
int *valuelenp,
int flags)
{
- xfs_da_args_t args;
- int error;
+ struct xfs_da_args args;
+ uint lock_mode;
+ int error;
+
+ XFS_STATS_INC(xs_attr_get);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return EIO;
if (!xfs_inode_hasattr(ip))
return ENOATTR;
- /*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name->name;
- args.namelen = name->len;
+ error = xfs_attr_args_init(&args, ip, name, flags);
+ if (error)
+ return error;
+
args.value = value;
args.valuelen = *valuelenp;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = ip;
- args.whichfork = XFS_ATTR_FORK;
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ lock_mode = xfs_ilock_attr_map_shared(ip);
+ if (!xfs_inode_hasattr(ip))
+ error = ENOATTR;
+ else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
error = xfs_attr_shortform_getvalue(&args);
- } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
+ else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
error = xfs_attr_leaf_get(&args);
- } else {
+ else
error = xfs_attr_node_get(&args);
- }
+ xfs_iunlock(ip, lock_mode);
- /*
- * Return the number of bytes in the value to the caller.
- */
*valuelenp = args.valuelen;
-
- if (error == EEXIST)
- error = 0;
- return(error);
-}
-
-int
-xfs_attr_get(
- xfs_inode_t *ip,
- const unsigned char *name,
- unsigned char *value,
- int *valuelenp,
- int flags)
-{
- int error;
- struct xfs_name xname;
- uint lock_mode;
-
- XFS_STATS_INC(xs_attr_get);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return(EIO);
-
- error = xfs_attr_name_to_xname(&xname, name);
- if (error)
- return error;
-
- lock_mode = xfs_ilock_attr_map_shared(ip);
- error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
- xfs_iunlock(ip, lock_mode);
- return(error);
+ return error == EEXIST ? 0 : error;
}
/*
@@ -186,12 +163,10 @@ xfs_attr_get(
*/
STATIC int
xfs_attr_calc_size(
- struct xfs_inode *ip,
- int namelen,
- int valuelen,
+ struct xfs_da_args *args,
int *local)
{
- struct xfs_mount *mp = ip->i_mount;
+ struct xfs_mount *mp = args->dp->i_mount;
int size;
int nblks;
@@ -199,12 +174,10 @@ xfs_attr_calc_size(
* Determine space new attribute will use, and if it would be
* "local" or "remote" (note: local != inline).
*/
- size = xfs_attr_leaf_newentsize(namelen, valuelen,
- mp->m_sb.sb_blocksize, local);
-
+ size = xfs_attr_leaf_newentsize(args, local);
nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
if (*local) {
- if (size > (mp->m_sb.sb_blocksize >> 1)) {
+ if (size > (args->geo->blksize / 2)) {
/* Double split possible */
nblks *= 2;
}
@@ -213,7 +186,7 @@ xfs_attr_calc_size(
* Out of line attribute, cannot double split, but
* make room for the attribute value itself.
*/
- uint dblocks = xfs_attr3_rmt_blocks(mp, valuelen);
+ uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
nblks += dblocks;
nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
}
@@ -221,26 +194,38 @@ xfs_attr_calc_size(
return nblks;
}
-STATIC int
-xfs_attr_set_int(
- struct xfs_inode *dp,
- struct xfs_name *name,
- unsigned char *value,
- int valuelen,
- int flags)
+int
+xfs_attr_set(
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ unsigned char *value,
+ int valuelen,
+ int flags)
{
- xfs_da_args_t args;
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
- int error, err2, committed;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_args args;
+ struct xfs_bmap_free flist;
struct xfs_trans_res tres;
+ xfs_fsblock_t firstblock;
int rsvd = (flags & ATTR_ROOT) != 0;
- int local;
+ int error, err2, committed, local;
+
+ XFS_STATS_INC(xs_attr_set);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return EIO;
+
+ error = xfs_attr_args_init(&args, dp, name, flags);
+ if (error)
+ return error;
+
+ args.value = value;
+ args.valuelen = valuelen;
+ args.firstblock = &firstblock;
+ args.flist = &flist;
+ args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+ args.total = xfs_attr_calc_size(&args, &local);
- /*
- * Attach the dquots to the inode.
- */
error = xfs_qm_dqattach(dp, 0);
if (error)
return error;
@@ -251,32 +236,14 @@ xfs_attr_set_int(
*/
if (XFS_IFORK_Q(dp) == 0) {
int sf_size = sizeof(xfs_attr_sf_hdr_t) +
- XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen);
+ XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
- if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd)))
- return(error);
+ error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ if (error)
+ return error;
}
/*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name->name;
- args.namelen = name->len;
- args.value = value;
- args.valuelen = valuelen;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = dp;
- args.firstblock = &firstblock;
- args.flist = &flist;
- args.whichfork = XFS_ATTR_FORK;
- args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-
- /* Size is now blocks for attribute data */
- args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);
-
- /*
* Start our first transaction of the day.
*
* All future transactions during this code must be "chained" off
@@ -303,7 +270,7 @@ xfs_attr_set_int(
error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
if (error) {
xfs_trans_cancel(args.trans, 0);
- return(error);
+ return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -313,7 +280,7 @@ xfs_attr_set_int(
if (error) {
xfs_iunlock(dp, XFS_ILOCK_EXCL);
xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
- return (error);
+ return error;
}
xfs_trans_ijoin(args.trans, dp, 0);
@@ -322,9 +289,9 @@ xfs_attr_set_int(
* If the attribute list is non-existent or a shortform list,
* upgrade it to a single-leaf-block attribute list.
*/
- if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
- ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) &&
- (dp->i_d.di_anextents == 0))) {
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+ (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ dp->i_d.di_anextents == 0)) {
/*
* Build initial attribute list (if required).
@@ -349,9 +316,8 @@ xfs_attr_set_int(
* the transaction goes to disk before returning
* to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
if (!error && (flags & ATTR_KERNOTIME) == 0) {
xfs_trans_ichgtime(args.trans, dp,
@@ -361,7 +327,7 @@ xfs_attr_set_int(
XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error == 0 ? err2 : error);
+ return error ? error : err2;
}
/*
@@ -399,22 +365,19 @@ xfs_attr_set_int(
}
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
error = xfs_attr_leaf_addname(&args);
- } else {
+ else
error = xfs_attr_node_addname(&args);
- }
- if (error) {
+ if (error)
goto out;
- }
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
if ((flags & ATTR_KERNOTIME) == 0)
xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
@@ -426,65 +389,47 @@ xfs_attr_set_int(
error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
out:
- if (args.trans)
+ if (args.trans) {
xfs_trans_cancel(args.trans,
XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ }
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
}
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ */
int
-xfs_attr_set(
- xfs_inode_t *dp,
- const unsigned char *name,
- unsigned char *value,
- int valuelen,
- int flags)
+xfs_attr_remove(
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ int flags)
{
- int error;
- struct xfs_name xname;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_args args;
+ struct xfs_bmap_free flist;
+ xfs_fsblock_t firstblock;
+ int error;
- XFS_STATS_INC(xs_attr_set);
+ XFS_STATS_INC(xs_attr_remove);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return (EIO);
+ return EIO;
+
+ if (!xfs_inode_hasattr(dp))
+ return ENOATTR;
- error = xfs_attr_name_to_xname(&xname, name);
+ error = xfs_attr_args_init(&args, dp, name, flags);
if (error)
return error;
- return xfs_attr_set_int(dp, &xname, value, valuelen, flags);
-}
-
-/*
- * Generic handler routine to remove a name from an attribute list.
- * Transitions attribute list from Btree to shortform as necessary.
- */
-STATIC int
-xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
-{
- xfs_da_args_t args;
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
- int error;
- xfs_mount_t *mp = dp->i_mount;
-
- /*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name->name;
- args.namelen = name->len;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = dp;
args.firstblock = &firstblock;
args.flist = &flist;
- args.total = 0;
- args.whichfork = XFS_ATTR_FORK;
/*
* we have no control over the attribute names that userspace passes us
@@ -493,9 +438,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
*/
args.op_flags = XFS_DA_OP_OKNOENT;
- /*
- * Attach the dquots to the inode.
- */
error = xfs_qm_dqattach(dp, 0);
if (error)
return error;
@@ -524,7 +466,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
XFS_ATTRRM_SPACE_RES(mp), 0);
if (error) {
xfs_trans_cancel(args.trans, 0);
- return(error);
+ return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -534,35 +476,26 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
*/
xfs_trans_ijoin(args.trans, dp, 0);
- /*
- * Decide on what work routines to call based on the inode size.
- */
if (!xfs_inode_hasattr(dp)) {
error = XFS_ERROR(ENOATTR);
- goto out;
- }
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
error = xfs_attr_shortform_remove(&args);
- if (error) {
- goto out;
- }
} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
error = xfs_attr_leaf_removename(&args);
} else {
error = xfs_attr_node_removename(&args);
}
- if (error) {
+
+ if (error)
goto out;
- }
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
if ((flags & ATTR_KERNOTIME) == 0)
xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
@@ -574,45 +507,17 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
out:
- if (args.trans)
+ if (args.trans) {
xfs_trans_cancel(args.trans,
XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
-}
-
-int
-xfs_attr_remove(
- xfs_inode_t *dp,
- const unsigned char *name,
- int flags)
-{
- int error;
- struct xfs_name xname;
-
- XFS_STATS_INC(xs_attr_remove);
-
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return (EIO);
-
- error = xfs_attr_name_to_xname(&xname, name);
- if (error)
- return error;
-
- xfs_ilock(dp, XFS_ILOCK_SHARED);
- if (!xfs_inode_hasattr(dp)) {
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
- return XFS_ERROR(ENOATTR);
}
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
- return xfs_attr_remove_int(dp, &xname, flags);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return error;
}
-
/*========================================================================
* External routines when attribute list is inside the inode
*========================================================================*/
@@ -958,7 +863,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
}
/*========================================================================
- * External routines when attribute list size > XFS_LBSIZE(mp).
+ * External routines when attribute list size > geo->blksize
*========================================================================*/
/*
@@ -991,8 +896,6 @@ restart:
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name already exists, and get back a pointer
@@ -1170,8 +1073,6 @@ restart:
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
state->inleaf = 0;
error = xfs_da3_node_lookup_int(state, &retval);
if (error)
@@ -1262,8 +1163,6 @@ xfs_attr_node_removename(xfs_da_args_t *args)
state = xfs_da_state_alloc();
state->args = args;
state->mp = dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name exists, and get back a pointer to it.
@@ -1525,8 +1424,6 @@ xfs_attr_node_get(xfs_da_args_t *args)
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name exists, and get back a pointer to it.
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 511c283459b1..28712d29e43c 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -80,11 +80,12 @@ STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
/*
* Utility routines.
*/
-STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf,
+STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
+ struct xfs_attr_leafblock *src_leaf,
struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
struct xfs_attr_leafblock *dst_leaf,
struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
- int move_count, struct xfs_mount *mp);
+ int move_count);
STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
void
@@ -711,6 +712,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
memset((char *)&nargs, 0, sizeof(nargs));
nargs.dp = dp;
+ nargs.geo = args->geo;
nargs.firstblock = args->firstblock;
nargs.flist = args->flist;
nargs.total = args->total;
@@ -805,18 +807,18 @@ xfs_attr3_leaf_to_shortform(
trace_xfs_attr_leaf_to_sf(args);
- tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
+ tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
if (!tmpbuffer)
return ENOMEM;
- memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount));
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
leaf = (xfs_attr_leafblock_t *)tmpbuffer;
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
entry = xfs_attr3_leaf_entryp(leaf);
/* XXX (dgc): buffer is about to be marked stale - why zero it? */
- memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount));
+ memset(bp->b_addr, 0, args->geo->blksize);
/*
* Clean out the prior contents of the attribute list.
@@ -838,6 +840,7 @@ xfs_attr3_leaf_to_shortform(
* Copy the attributes
*/
memset((char *)&nargs, 0, sizeof(nargs));
+ nargs.geo = args->geo;
nargs.dp = dp;
nargs.firstblock = args->firstblock;
nargs.flist = args->flist;
@@ -904,12 +907,12 @@ xfs_attr3_leaf_to_node(
/* copy leaf to new buffer, update identifiers */
xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
bp2->b_ops = bp1->b_ops;
- memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(mp));
+ memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
hdr3->blkno = cpu_to_be64(bp2->b_bn);
}
- xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(mp) - 1);
+ xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
/*
* Set up the new root node.
@@ -930,7 +933,7 @@ xfs_attr3_leaf_to_node(
btree[0].before = cpu_to_be32(blkno);
icnodehdr.count = 1;
dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
- xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(mp) - 1);
+ xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
error = 0;
out:
return error;
@@ -966,10 +969,10 @@ xfs_attr3_leaf_create(
bp->b_ops = &xfs_attr3_leaf_buf_ops;
xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
leaf = bp->b_addr;
- memset(leaf, 0, XFS_LBSIZE(mp));
+ memset(leaf, 0, args->geo->blksize);
memset(&ichdr, 0, sizeof(ichdr));
- ichdr.firstused = XFS_LBSIZE(mp);
+ ichdr.firstused = args->geo->blksize;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
@@ -988,7 +991,7 @@ xfs_attr3_leaf_create(
ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
- xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(mp) - 1);
+ xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
*bpp = bp;
return 0;
@@ -1074,8 +1077,7 @@ xfs_attr3_leaf_add(
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
ASSERT(args->index >= 0 && args->index <= ichdr.count);
- entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- args->trans->t_mountp->m_sb.sb_blocksize, NULL);
+ entsize = xfs_attr_leaf_newentsize(args, NULL);
/*
* Search through freemap for first-fit on new name length.
@@ -1174,17 +1176,14 @@ xfs_attr3_leaf_add_work(
* Allocate space for the new string (at the end of the run).
*/
mp = args->trans->t_mountp;
- ASSERT(ichdr->freemap[mapindex].base < XFS_LBSIZE(mp));
+ ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
ASSERT(ichdr->freemap[mapindex].size >=
- xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- mp->m_sb.sb_blocksize, NULL));
- ASSERT(ichdr->freemap[mapindex].size < XFS_LBSIZE(mp));
+ xfs_attr_leaf_newentsize(args, NULL));
+ ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
- ichdr->freemap[mapindex].size -=
- xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- mp->m_sb.sb_blocksize, &tmp);
+ ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
ichdr->freemap[mapindex].size);
@@ -1269,14 +1268,13 @@ xfs_attr3_leaf_compact(
struct xfs_attr_leafblock *leaf_dst;
struct xfs_attr3_icleaf_hdr ichdr_src;
struct xfs_trans *trans = args->trans;
- struct xfs_mount *mp = trans->t_mountp;
char *tmpbuffer;
trace_xfs_attr_leaf_compact(args);
- tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
- memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
- memset(bp->b_addr, 0, XFS_LBSIZE(mp));
+ tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+ memset(bp->b_addr, 0, args->geo->blksize);
leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
leaf_dst = bp->b_addr;
@@ -1289,7 +1287,7 @@ xfs_attr3_leaf_compact(
/* Initialise the incore headers */
ichdr_src = *ichdr_dst; /* struct copy */
- ichdr_dst->firstused = XFS_LBSIZE(mp);
+ ichdr_dst->firstused = args->geo->blksize;
ichdr_dst->usedbytes = 0;
ichdr_dst->count = 0;
ichdr_dst->holes = 0;
@@ -1304,13 +1302,13 @@ xfs_attr3_leaf_compact(
* Copy all entry's in the same (sorted) order,
* but allocate name/value pairs packed and in sequence.
*/
- xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0,
- ichdr_src.count, mp);
+ xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
+ leaf_dst, ichdr_dst, 0, ichdr_src.count);
/*
* this logs the entire buffer, but the caller must write the header
* back to the buffer when it is finished modifying it.
*/
- xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
+ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
kmem_free(tmpbuffer);
}
@@ -1461,8 +1459,8 @@ xfs_attr3_leaf_rebalance(
/*
* Move high entries from leaf1 to low end of leaf2.
*/
- xfs_attr3_leaf_moveents(leaf1, &ichdr1, ichdr1.count - count,
- leaf2, &ichdr2, 0, count, state->mp);
+ xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
+ ichdr1.count - count, leaf2, &ichdr2, 0, count);
} else if (count > ichdr1.count) {
/*
@@ -1490,14 +1488,14 @@ xfs_attr3_leaf_rebalance(
/*
* Move low entries from leaf2 to high end of leaf1.
*/
- xfs_attr3_leaf_moveents(leaf2, &ichdr2, 0, leaf1, &ichdr1,
- ichdr1.count, count, state->mp);
+ xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
+ ichdr1.count, count);
}
xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
- xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
- xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+ xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
+ xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
/*
* Copy out last hashval in each block for B-tree code.
@@ -1592,11 +1590,9 @@ xfs_attr3_leaf_figure_balance(
max = ichdr1->count + ichdr2->count;
half = (max + 1) * sizeof(*entry);
half += ichdr1->usedbytes + ichdr2->usedbytes +
- xfs_attr_leaf_newentsize(state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ xfs_attr_leaf_newentsize(state->args, NULL);
half /= 2;
- lastdelta = state->blocksize;
+ lastdelta = state->args->geo->blksize;
entry = xfs_attr3_leaf_entryp(leaf1);
for (count = index = 0; count < max; entry++, index++, count++) {
@@ -1606,10 +1602,7 @@ xfs_attr3_leaf_figure_balance(
*/
if (count == blk1->index) {
tmp = totallen + sizeof(*entry) +
- xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ xfs_attr_leaf_newentsize(state->args, NULL);
if (XFS_ATTR_ABS(half - tmp) > lastdelta)
break;
lastdelta = XFS_ATTR_ABS(half - tmp);
@@ -1645,10 +1638,7 @@ xfs_attr3_leaf_figure_balance(
totallen -= count * sizeof(*entry);
if (foundit) {
totallen -= sizeof(*entry) +
- xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ xfs_attr_leaf_newentsize(state->args, NULL);
}
*countarg = count;
@@ -1700,7 +1690,7 @@ xfs_attr3_leaf_toosmall(
bytes = xfs_attr3_leaf_hdr_size(leaf) +
ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
ichdr.usedbytes;
- if (bytes > (state->blocksize >> 1)) {
+ if (bytes > (state->args->geo->blksize >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return(0);
}
@@ -1754,7 +1744,8 @@ xfs_attr3_leaf_toosmall(
xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
- bytes = state->blocksize - (state->blocksize >> 2) -
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2) -
ichdr.usedbytes - ichdr2.usedbytes -
((ichdr.count + ichdr2.count) *
sizeof(xfs_attr_leaf_entry_t)) -
@@ -1805,7 +1796,6 @@ xfs_attr3_leaf_remove(
struct xfs_attr_leafblock *leaf;
struct xfs_attr3_icleaf_hdr ichdr;
struct xfs_attr_leaf_entry *entry;
- struct xfs_mount *mp = args->trans->t_mountp;
int before;
int after;
int smallest;
@@ -1819,7 +1809,7 @@ xfs_attr3_leaf_remove(
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
- ASSERT(ichdr.count > 0 && ichdr.count < XFS_LBSIZE(mp) / 8);
+ ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
ASSERT(args->index >= 0 && args->index < ichdr.count);
ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
xfs_attr3_leaf_hdr_size(leaf));
@@ -1827,7 +1817,7 @@ xfs_attr3_leaf_remove(
entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
- ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
/*
* Scan through free region table:
@@ -1842,8 +1832,8 @@ xfs_attr3_leaf_remove(
smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
entsize = xfs_attr_leaf_entsize(leaf, args->index);
for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
- ASSERT(ichdr.freemap[i].base < XFS_LBSIZE(mp));
- ASSERT(ichdr.freemap[i].size < XFS_LBSIZE(mp));
+ ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+ ASSERT(ichdr.freemap[i].size < args->geo->blksize);
if (ichdr.freemap[i].base == tablesize) {
ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
@@ -1920,11 +1910,11 @@ xfs_attr3_leaf_remove(
* removing the name.
*/
if (smallest) {
- tmp = XFS_LBSIZE(mp);
+ tmp = args->geo->blksize;
entry = xfs_attr3_leaf_entryp(leaf);
for (i = ichdr.count - 1; i >= 0; entry++, i--) {
ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
- ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
if (be16_to_cpu(entry->nameidx) < tmp)
tmp = be16_to_cpu(entry->nameidx);
@@ -1947,7 +1937,7 @@ xfs_attr3_leaf_remove(
tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
ichdr.count * sizeof(xfs_attr_leaf_entry_t);
- return tmp < mp->m_attr_magicpct; /* leaf is < 37% full */
+ return tmp < args->geo->magicpct; /* leaf is < 37% full */
}
/*
@@ -1964,7 +1954,6 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr3_icleaf_hdr drophdr;
struct xfs_attr3_icleaf_hdr savehdr;
struct xfs_attr_leaf_entry *entry;
- struct xfs_mount *mp = state->mp;
trace_xfs_attr_leaf_unbalance(state->args);
@@ -1991,13 +1980,15 @@ xfs_attr3_leaf_unbalance(
*/
if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
drop_blk->bp, &drophdr)) {
- xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
save_leaf, &savehdr, 0,
- drophdr.count, mp);
+ drophdr.count);
} else {
- xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
save_leaf, &savehdr,
- savehdr.count, drophdr.count, mp);
+ savehdr.count, drophdr.count);
}
} else {
/*
@@ -2007,7 +1998,7 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr_leafblock *tmp_leaf;
struct xfs_attr3_icleaf_hdr tmphdr;
- tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP);
+ tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
/*
* Copy the header into the temp leaf so that all the stuff
@@ -2020,35 +2011,39 @@ xfs_attr3_leaf_unbalance(
tmphdr.magic = savehdr.magic;
tmphdr.forw = savehdr.forw;
tmphdr.back = savehdr.back;
- tmphdr.firstused = state->blocksize;
+ tmphdr.firstused = state->args->geo->blksize;
/* write the header to the temp buffer to initialise it */
xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
drop_blk->bp, &drophdr)) {
- xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
tmp_leaf, &tmphdr, 0,
- drophdr.count, mp);
- xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0,
+ drophdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
tmp_leaf, &tmphdr, tmphdr.count,
- savehdr.count, mp);
+ savehdr.count);
} else {
- xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0,
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
tmp_leaf, &tmphdr, 0,
- savehdr.count, mp);
- xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ savehdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
tmp_leaf, &tmphdr, tmphdr.count,
- drophdr.count, mp);
+ drophdr.count);
}
- memcpy(save_leaf, tmp_leaf, state->blocksize);
+ memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
savehdr = tmphdr; /* struct copy */
kmem_free(tmp_leaf);
}
xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
- state->blocksize - 1);
+ state->args->geo->blksize - 1);
/*
* Copy out last hashval in each block for B-tree code.
@@ -2094,7 +2089,7 @@ xfs_attr3_leaf_lookup_int(
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
- ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8);
+ ASSERT(ichdr.count < args->geo->blksize / 8);
/*
* Binary search. (note: small blocks will skip this loop)
@@ -2198,7 +2193,7 @@ xfs_attr3_leaf_getvalue(
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
- ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8);
+ ASSERT(ichdr.count < args->geo->blksize / 8);
ASSERT(args->index < ichdr.count);
entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
@@ -2249,14 +2244,14 @@ xfs_attr3_leaf_getvalue(
/*ARGSUSED*/
STATIC void
xfs_attr3_leaf_moveents(
+ struct xfs_da_args *args,
struct xfs_attr_leafblock *leaf_s,
struct xfs_attr3_icleaf_hdr *ichdr_s,
int start_s,
struct xfs_attr_leafblock *leaf_d,
struct xfs_attr3_icleaf_hdr *ichdr_d,
int start_d,
- int count,
- struct xfs_mount *mp)
+ int count)
{
struct xfs_attr_leaf_entry *entry_s;
struct xfs_attr_leaf_entry *entry_d;
@@ -2276,10 +2271,10 @@ xfs_attr3_leaf_moveents(
ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
ASSERT(ichdr_s->magic == ichdr_d->magic);
- ASSERT(ichdr_s->count > 0 && ichdr_s->count < XFS_LBSIZE(mp) / 8);
+ ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+ xfs_attr3_leaf_hdr_size(leaf_s));
- ASSERT(ichdr_d->count < XFS_LBSIZE(mp) / 8);
+ ASSERT(ichdr_d->count < args->geo->blksize / 8);
ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+ xfs_attr3_leaf_hdr_size(leaf_d));
@@ -2331,11 +2326,11 @@ xfs_attr3_leaf_moveents(
entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
entry_d->flags = entry_s->flags;
ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
- <= XFS_LBSIZE(mp));
+ <= args->geo->blksize);
memmove(xfs_attr3_leaf_name(leaf_d, desti),
xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
- <= XFS_LBSIZE(mp));
+ <= args->geo->blksize);
memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
ichdr_s->usedbytes -= tmp;
ichdr_d->usedbytes += tmp;
@@ -2356,7 +2351,7 @@ xfs_attr3_leaf_moveents(
tmp = count * sizeof(xfs_attr_leaf_entry_t);
entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
ASSERT(((char *)entry_s + tmp) <=
- ((char *)leaf_s + XFS_LBSIZE(mp)));
+ ((char *)leaf_s + args->geo->blksize));
memset(entry_s, 0, tmp);
} else {
/*
@@ -2371,7 +2366,7 @@ xfs_attr3_leaf_moveents(
tmp = count * sizeof(xfs_attr_leaf_entry_t);
entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
ASSERT(((char *)entry_s + tmp) <=
- ((char *)leaf_s + XFS_LBSIZE(mp)));
+ ((char *)leaf_s + args->geo->blksize));
memset(entry_s, 0, tmp);
}
@@ -2439,22 +2434,21 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
* a "local" or a "remote" attribute.
*/
int
-xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
+xfs_attr_leaf_newentsize(
+ struct xfs_da_args *args,
+ int *local)
{
- int size;
+ int size;
- size = xfs_attr_leaf_entsize_local(namelen, valuelen);
- if (size < xfs_attr_leaf_entsize_local_max(blocksize)) {
- if (local) {
+ size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
+ if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
+ if (local)
*local = 1;
- }
- } else {
- size = xfs_attr_leaf_entsize_remote(namelen);
- if (local) {
- *local = 0;
- }
+ return size;
}
- return size;
+ if (local)
+ *local = 0;
+ return xfs_attr_leaf_entsize_remote(args->namelen);
}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 3ec5ec0b8678..e2929da7c3ba 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -96,8 +96,7 @@ int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
struct xfs_buf *leaf2_bp);
-int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
- int *local);
+int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 833fe5d98d80..90e2eeb21207 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -444,6 +444,7 @@ xfs_attr3_leaf_list_int(
xfs_da_args_t args;
memset((char *)&args, 0, sizeof(args));
+ args.geo = context->dp->i_mount->m_attr_geo;
args.dp = context->dp;
args.whichfork = XFS_ATTR_FORK;
args.valuelen = valuelen;
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index d2e6e948cec7..b5adfecbb8ee 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -68,7 +68,6 @@ xfs_attr3_rmt_blocks(
*/
static bool
xfs_attr3_rmt_hdr_ok(
- struct xfs_mount *mp,
void *ptr,
xfs_ino_t ino,
uint32_t offset,
@@ -126,6 +125,7 @@ xfs_attr3_rmt_read_verify(
char *ptr;
int len;
xfs_daddr_t bno;
+ int blksize = mp->m_attr_geo->blksize;
/* no verification of non-crc buffers */
if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -134,21 +134,20 @@ xfs_attr3_rmt_read_verify(
ptr = bp->b_addr;
bno = bp->b_bn;
len = BBTOB(bp->b_length);
- ASSERT(len >= XFS_LBSIZE(mp));
+ ASSERT(len >= blksize);
while (len > 0) {
- if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp),
- XFS_ATTR3_RMT_CRC_OFF)) {
+ if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
xfs_buf_ioerror(bp, EFSBADCRC);
break;
}
- if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
+ if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
xfs_buf_ioerror(bp, EFSCORRUPTED);
break;
}
- len -= XFS_LBSIZE(mp);
- ptr += XFS_LBSIZE(mp);
- bno += mp->m_bsize;
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
}
if (bp->b_error)
@@ -166,6 +165,7 @@ xfs_attr3_rmt_write_verify(
char *ptr;
int len;
xfs_daddr_t bno;
+ int blksize = mp->m_attr_geo->blksize;
/* no verification of non-crc buffers */
if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -174,10 +174,10 @@ xfs_attr3_rmt_write_verify(
ptr = bp->b_addr;
bno = bp->b_bn;
len = BBTOB(bp->b_length);
- ASSERT(len >= XFS_LBSIZE(mp));
+ ASSERT(len >= blksize);
while (len > 0) {
- if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
+ if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
xfs_buf_ioerror(bp, EFSCORRUPTED);
xfs_verifier_error(bp);
return;
@@ -188,11 +188,11 @@ xfs_attr3_rmt_write_verify(
rmt = (struct xfs_attr3_rmt_hdr *)ptr;
rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
}
- xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF);
+ xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
- len -= XFS_LBSIZE(mp);
- ptr += XFS_LBSIZE(mp);
- bno += mp->m_bsize;
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
}
ASSERT(len == 0);
}
@@ -241,17 +241,18 @@ xfs_attr_rmtval_copyout(
char *src = bp->b_addr;
xfs_daddr_t bno = bp->b_bn;
int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
- ASSERT(len >= XFS_LBSIZE(mp));
+ ASSERT(len >= blksize);
while (len > 0 && *valuelen > 0) {
int hdr_size = 0;
- int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
byte_cnt = min(*valuelen, byte_cnt);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
+ if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
byte_cnt, bno)) {
xfs_alert(mp,
"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
@@ -264,9 +265,9 @@ xfs_attr_rmtval_copyout(
memcpy(*dst, src + hdr_size, byte_cnt);
/* roll buffer forwards */
- len -= XFS_LBSIZE(mp);
- src += XFS_LBSIZE(mp);
- bno += mp->m_bsize;
+ len -= blksize;
+ src += blksize;
+ bno += BTOBB(blksize);
/* roll attribute data forwards */
*valuelen -= byte_cnt;
@@ -288,12 +289,13 @@ xfs_attr_rmtval_copyin(
char *dst = bp->b_addr;
xfs_daddr_t bno = bp->b_bn;
int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
- ASSERT(len >= XFS_LBSIZE(mp));
+ ASSERT(len >= blksize);
while (len > 0 && *valuelen > 0) {
int hdr_size;
- int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
byte_cnt = min(*valuelen, byte_cnt);
hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
@@ -305,17 +307,17 @@ xfs_attr_rmtval_copyin(
* If this is the last block, zero the remainder of it.
* Check that we are actually the last block, too.
*/
- if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) {
+ if (byte_cnt + hdr_size < blksize) {
ASSERT(*valuelen - byte_cnt == 0);
- ASSERT(len == XFS_LBSIZE(mp));
+ ASSERT(len == blksize);
memset(dst + hdr_size + byte_cnt, 0,
- XFS_LBSIZE(mp) - hdr_size - byte_cnt);
+ blksize - hdr_size - byte_cnt);
}
/* roll buffer forwards */
- len -= XFS_LBSIZE(mp);
- dst += XFS_LBSIZE(mp);
- bno += mp->m_bsize;
+ len -= blksize;
+ dst += blksize;
+ bno += BTOBB(blksize);
/* roll attribute data forwards */
*valuelen -= byte_cnt;
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index f1e3c907044d..e1649c0d3e02 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -66,8 +66,11 @@ static inline int xfs_lowbit64(__uint64_t v)
n = ffs(w);
} else { /* upper bits */
w = (__uint32_t)(v >> 32);
- if (w && (n = ffs(w)))
- n += 32;
+ if (w) {
+ n = ffs(w);
+ if (n)
+ n += 32;
+ }
}
return n - 1;
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f0efc7e970ef..96175df211b1 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -94,7 +94,7 @@ xfs_bmap_compute_maxlevels(
maxleafents = MAXAEXTNUM;
sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
}
- maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
+ maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
minleafrecs = mp->m_bmap_dmnr[0];
minnoderecs = mp->m_bmap_dmnr[1];
maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
@@ -233,7 +233,6 @@ xfs_default_attroffset(
*/
STATIC void
xfs_bmap_forkoff_reset(
- xfs_mount_t *mp,
xfs_inode_t *ip,
int whichfork)
{
@@ -905,7 +904,7 @@ xfs_bmap_local_to_extents_empty(
ASSERT(ifp->if_bytes == 0);
ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
- xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
+ xfs_bmap_forkoff_reset(ip, whichfork);
ifp->if_flags &= ~XFS_IFINLINE;
ifp->if_flags |= XFS_IFEXTENTS;
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
@@ -1099,10 +1098,11 @@ xfs_bmap_add_attrfork_local(
if (S_ISDIR(ip->i_d.di_mode)) {
memset(&dargs, 0, sizeof(dargs));
+ dargs.geo = ip->i_mount->m_dir_geo;
dargs.dp = ip;
dargs.firstblock = firstblock;
dargs.flist = flist;
- dargs.total = ip->i_mount->m_dirblkfsbs;
+ dargs.total = dargs.geo->fsbcount;
dargs.whichfork = XFS_DATA_FORK;
dargs.trans = tp;
return xfs_dir2_sf_to_block(&dargs);
@@ -1675,7 +1675,6 @@ xfs_bmap_isaeof(
*/
int
xfs_bmap_last_offset(
- struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t *last_block,
int whichfork)
@@ -3517,6 +3516,67 @@ xfs_bmap_adjacent(
#undef ISVALID
}
+static int
+xfs_bmap_longest_free_extent(
+ struct xfs_trans *tp,
+ xfs_agnumber_t ag,
+ xfs_extlen_t *blen,
+ int *notinit)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest;
+ int error = 0;
+
+ pag = xfs_perag_get(mp, ag);
+ if (!pag->pagf_init) {
+ error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+ if (error)
+ goto out;
+
+ if (!pag->pagf_init) {
+ *notinit = 1;
+ goto out;
+ }
+ }
+
+ longest = xfs_alloc_longest_free_extent(mp, pag);
+ if (*blen < longest)
+ *blen = longest;
+
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+
+static void
+xfs_bmap_select_minlen(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen,
+ int notinit)
+{
+ if (notinit || *blen < ap->minlen) {
+ /*
+ * Since we did a BUF_TRYLOCK above, it is possible that
+ * there is space for this request.
+ */
+ args->minlen = ap->minlen;
+ } else if (*blen < args->maxlen) {
+ /*
+ * If the best seen length is less than the request length,
+ * use the best as the minimum.
+ */
+ args->minlen = *blen;
+ } else {
+ /*
+ * Otherwise we've seen an extent as big as maxlen, use that
+ * as the minimum.
+ */
+ args->minlen = args->maxlen;
+ }
+}
+
STATIC int
xfs_bmap_btalloc_nullfb(
struct xfs_bmalloca *ap,
@@ -3524,111 +3584,74 @@ xfs_bmap_btalloc_nullfb(
xfs_extlen_t *blen)
{
struct xfs_mount *mp = ap->ip->i_mount;
- struct xfs_perag *pag;
xfs_agnumber_t ag, startag;
int notinit = 0;
int error;
- if (ap->userdata && xfs_inode_is_filestream(ap->ip))
- args->type = XFS_ALLOCTYPE_NEAR_BNO;
- else
- args->type = XFS_ALLOCTYPE_START_BNO;
+ args->type = XFS_ALLOCTYPE_START_BNO;
args->total = ap->total;
- /*
- * Search for an allocation group with a single extent large enough
- * for the request. If one isn't found, then adjust the minimum
- * allocation size to the largest space found.
- */
startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
if (startag == NULLAGNUMBER)
startag = ag = 0;
- pag = xfs_perag_get(mp, ag);
while (*blen < args->maxlen) {
- if (!pag->pagf_init) {
- error = xfs_alloc_pagf_init(mp, args->tp, ag,
- XFS_ALLOC_FLAG_TRYLOCK);
- if (error) {
- xfs_perag_put(pag);
- return error;
- }
- }
-
- /*
- * See xfs_alloc_fix_freelist...
- */
- if (pag->pagf_init) {
- xfs_extlen_t longest;
- longest = xfs_alloc_longest_free_extent(mp, pag);
- if (*blen < longest)
- *blen = longest;
- } else
- notinit = 1;
-
- if (xfs_inode_is_filestream(ap->ip)) {
- if (*blen >= args->maxlen)
- break;
-
- if (ap->userdata) {
- /*
- * If startag is an invalid AG, we've
- * come here once before and
- * xfs_filestream_new_ag picked the
- * best currently available.
- *
- * Don't continue looping, since we
- * could loop forever.
- */
- if (startag == NULLAGNUMBER)
- break;
-
- error = xfs_filestream_new_ag(ap, &ag);
- xfs_perag_put(pag);
- if (error)
- return error;
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
- /* loop again to set 'blen'*/
- startag = NULLAGNUMBER;
- pag = xfs_perag_get(mp, ag);
- continue;
- }
- }
if (++ag == mp->m_sb.sb_agcount)
ag = 0;
if (ag == startag)
break;
- xfs_perag_put(pag);
- pag = xfs_perag_get(mp, ag);
}
- xfs_perag_put(pag);
- /*
- * Since the above loop did a BUF_TRYLOCK, it is
- * possible that there is space for this request.
- */
- if (notinit || *blen < ap->minlen)
- args->minlen = ap->minlen;
- /*
- * If the best seen length is less than the request
- * length, use the best as the minimum.
- */
- else if (*blen < args->maxlen)
- args->minlen = *blen;
- /*
- * Otherwise we've seen an extent as big as maxlen,
- * use that as the minimum.
- */
- else
- args->minlen = args->maxlen;
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
+ return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc_filestreams(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_agnumber_t ag;
+ int notinit = 0;
+ int error;
+
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ args->total = ap->total;
+
+ ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (ag == NULLAGNUMBER)
+ ag = 0;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
+ if (error)
+ return error;
+
+ if (*blen < args->maxlen) {
+ error = xfs_filestream_new_ag(ap, &ag);
+ if (error)
+ return error;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
+
+ }
+
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
/*
- * set the failure fallback case to look in the selected
- * AG as the stream may have moved.
+ * Set the failure fallback case to look in the selected AG as stream
+ * may have moved.
*/
- if (xfs_inode_is_filestream(ap->ip))
- ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
-
+ ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
return 0;
}
@@ -3708,7 +3731,15 @@ xfs_bmap_btalloc(
args.firstblock = *ap->firstblock;
blen = 0;
if (nullfb) {
- error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+ /*
+ * Search for an allocation group with a single extent large
+ * enough for the request. If one isn't found, then adjust
+ * the minimum allocation size to the largest space found.
+ */
+ if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+ error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+ else
+ error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
if (error)
return error;
} else if (ap->flist->xbf_low) {
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index f84bd7af43be..38ba36e9b2f0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -156,8 +156,8 @@ int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *last_block, int whichfork);
-int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t *unused, int whichfork);
+int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
+ int whichfork);
int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork);
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 818d546664e7..948836c4fd90 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -84,7 +84,7 @@ xfs_bmdr_to_bmbt(
rblock->bb_level = dblock->bb_level;
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
rblock->bb_numrecs = dblock->bb_numrecs;
- dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
@@ -443,7 +443,7 @@ xfs_bmbt_to_bmdr(
ASSERT(rblock->bb_level != 0);
dblock->bb_level = rblock->bb_level;
dblock->bb_numrecs = rblock->bb_numrecs;
- dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
@@ -519,7 +519,6 @@ xfs_bmbt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
- int length,
int *stat)
{
xfs_alloc_arg_t args; /* block allocation args */
@@ -672,8 +671,7 @@ xfs_bmbt_get_dmaxrecs(
{
if (level != cur->bc_nlevels - 1)
return cur->bc_mp->m_bmap_dmxr[level != 0];
- return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
- level == 0);
+ return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
}
STATIC void
@@ -914,7 +912,6 @@ xfs_bmbt_maxrecs(
*/
int
xfs_bmdr_maxrecs(
- struct xfs_mount *mp,
int blocklen,
int leaf)
{
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 6e42e1e50b89..819a8a4dee95 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -130,7 +130,7 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
xfs_bmdr_block_t *, int);
extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
-extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 296160b8e78c..703b3ec1796c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -258,14 +258,23 @@ xfs_bmapi_allocate_worker(
struct xfs_bmalloca *args = container_of(work,
struct xfs_bmalloca, work);
unsigned long pflags;
+ unsigned long new_pflags = PF_FSTRANS;
- /* we are in a transaction context here */
- current_set_flags_nested(&pflags, PF_FSTRANS);
+ /*
+ * we are in a transaction context here, but may also be doing work
+ * in kswapd context, and hence we may need to inherit that state
+ * temporarily to ensure that we don't block waiting for memory reclaim
+ * in any way.
+ */
+ if (args->kswapd)
+ new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+
+ current_set_flags_nested(&pflags, new_pflags);
args->result = __xfs_bmapi_allocate(args);
complete(args->done);
- current_restore_flags_nested(&pflags, PF_FSTRANS);
+ current_restore_flags_nested(&pflags, new_pflags);
}
/*
@@ -284,6 +293,7 @@ xfs_bmapi_allocate(
args->done = &done;
+ args->kswapd = current_is_kswapd();
INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
queue_work(xfs_alloc_wq, &args->work);
wait_for_completion(&done);
@@ -1519,7 +1529,6 @@ xfs_collapse_file_space(
while (!error && !done) {
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- tp->t_flags |= XFS_TRANS_RESERVE;
/*
* We would need to reserve permanent block for transaction.
* This will come into picture when after shifting extent into
@@ -1529,7 +1538,6 @@ xfs_collapse_file_space(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
if (error) {
- ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
xfs_trans_cancel(tp, 0);
break;
}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 935ed2b24edf..075f72232a64 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -50,12 +50,13 @@ struct xfs_bmalloca {
xfs_extlen_t total; /* total blocks needed for xaction */
xfs_extlen_t minlen; /* minimum allocation size (blocks) */
xfs_extlen_t minleft; /* amount must be left after alloc */
- char eof; /* set if allocating past last extent */
- char wasdel; /* replacing a delayed allocation */
- char userdata;/* set if is user data */
- char aeof; /* allocated space at eof */
- char conv; /* overwriting unwritten extents */
- char stack_switch;
+ bool eof; /* set if allocating past last extent */
+ bool wasdel; /* replacing a delayed allocation */
+ bool userdata;/* set if is user data */
+ bool aeof; /* allocated space at eof */
+ bool conv; /* overwriting unwritten extents */
+ bool stack_switch;
+ bool kswapd; /* allocation in kswapd context */
int flags;
struct completion *done;
struct work_struct work;
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e80d59fdf89a..bf810c6baf2b 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -43,9 +43,10 @@ kmem_zone_t *xfs_btree_cur_zone;
* Btree magic numbers.
*/
static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
- { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC },
+ { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+ XFS_FIBT_MAGIC },
{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
- XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC }
+ XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
};
#define xfs_btree_magic(cur) \
xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
@@ -552,14 +553,11 @@ xfs_btree_get_bufl(
xfs_fsblock_t fsbno, /* file system block number */
uint lock) /* lock flags for get_buf */
{
- xfs_buf_t *bp; /* buffer pointer (return value) */
xfs_daddr_t d; /* real disk block address */
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
- ASSERT(!xfs_buf_geterror(bp));
- return bp;
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
}
/*
@@ -574,15 +572,12 @@ xfs_btree_get_bufs(
xfs_agblock_t agbno, /* allocation group block number */
uint lock) /* lock flags for get_buf */
{
- xfs_buf_t *bp; /* buffer pointer (return value) */
xfs_daddr_t d; /* real disk block address */
ASSERT(agno != NULLAGNUMBER);
ASSERT(agbno != NULLAGBLOCK);
d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
- ASSERT(!xfs_buf_geterror(bp));
- return bp;
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
}
/*
@@ -722,7 +717,6 @@ xfs_btree_read_bufl(
mp->m_bsize, lock, &bp, ops);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(bp));
if (bp)
xfs_buf_set_ref(bp, refval);
*bpp = bp;
@@ -1115,6 +1109,7 @@ xfs_btree_set_refs(
xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
break;
case XFS_BTNUM_INO:
+ case XFS_BTNUM_FINO:
xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
break;
case XFS_BTNUM_BMAP:
@@ -1159,7 +1154,6 @@ STATIC int
xfs_btree_read_buf_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr,
- int level,
int flags,
struct xfs_btree_block **block,
struct xfs_buf **bpp)
@@ -1178,7 +1172,6 @@ xfs_btree_read_buf_block(
if (error)
return error;
- ASSERT(!xfs_buf_geterror(*bpp));
xfs_btree_set_refs(cur, *bpp);
*block = XFS_BUF_TO_BLOCK(*bpp);
return 0;
@@ -1517,8 +1510,8 @@ xfs_btree_increment(
union xfs_btree_ptr *ptrp;
ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
- error = xfs_btree_read_buf_block(cur, ptrp, --lev,
- 0, &block, &bp);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
@@ -1616,8 +1609,8 @@ xfs_btree_decrement(
union xfs_btree_ptr *ptrp;
ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
- error = xfs_btree_read_buf_block(cur, ptrp, --lev,
- 0, &block, &bp);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
@@ -1667,7 +1660,7 @@ xfs_btree_lookup_get_block(
return 0;
}
- error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
+ error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
if (error)
return error;
@@ -2018,7 +2011,7 @@ xfs_btree_lshift(
goto out0;
/* Set up the left neighbor as "left". */
- error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
@@ -2202,7 +2195,7 @@ xfs_btree_rshift(
goto out0;
/* Set up the right neighbor as "right". */
- error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
@@ -2372,7 +2365,7 @@ xfs_btree_split(
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2470,7 +2463,7 @@ xfs_btree_split(
* point back to right instead of to left.
*/
if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
- error = xfs_btree_read_buf_block(cur, &rrptr, level,
+ error = xfs_btree_read_buf_block(cur, &rrptr,
0, &rrblock, &rrbp);
if (error)
goto error0;
@@ -2545,7 +2538,7 @@ xfs_btree_new_iroot(
pp = xfs_btree_ptr_addr(cur, 1, block);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
if (error)
goto error0;
if (*stat == 0) {
@@ -2649,7 +2642,7 @@ xfs_btree_new_root(
cur->bc_ops->init_ptr_from_cur(cur, &rptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2684,8 +2677,7 @@ xfs_btree_new_root(
lbp = bp;
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
left = block;
- error = xfs_btree_read_buf_block(cur, &rptr,
- cur->bc_nlevels - 1, 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
bp = rbp;
@@ -2696,8 +2688,7 @@ xfs_btree_new_root(
xfs_btree_buf_to_ptr(cur, rbp, &rptr);
right = block;
xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
- error = xfs_btree_read_buf_block(cur, &lptr,
- cur->bc_nlevels - 1, 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
bp = lbp;
@@ -3649,8 +3640,7 @@ xfs_btree_delrec(
rptr = cptr;
right = block;
rbp = bp;
- error = xfs_btree_read_buf_block(cur, &lptr, level,
- 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
@@ -3667,8 +3657,7 @@ xfs_btree_delrec(
lptr = cptr;
left = block;
lbp = bp;
- error = xfs_btree_read_buf_block(cur, &rptr, level,
- 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
@@ -3740,8 +3729,7 @@ xfs_btree_delrec(
/* If there is a right sibling, point it to the remaining block. */
xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
if (!xfs_btree_ptr_is_null(cur, &cptr)) {
- error = xfs_btree_read_buf_block(cur, &cptr, level,
- 0, &rrblock, &rrbp);
+ error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
if (error)
goto error0;
xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 91e34f21bace..a04b69422f67 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -62,6 +62,7 @@ union xfs_btree_rec {
#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
+#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
/*
* For logging record fields.
@@ -92,6 +93,7 @@ do { \
case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -105,6 +107,7 @@ do { \
case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -129,7 +132,7 @@ struct xfs_btree_ops {
int (*alloc_block)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *start_bno,
union xfs_btree_ptr *new_bno,
- int length, int *stat);
+ int *stat);
int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
/* update last record information */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index cb10a0aaab3a..7a34a1ae6552 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -216,8 +216,7 @@ _xfs_buf_alloc(
STATIC int
_xfs_buf_get_pages(
xfs_buf_t *bp,
- int page_count,
- xfs_buf_flags_t flags)
+ int page_count)
{
/* Make sure that we have a page list */
if (bp->b_pages == NULL) {
@@ -330,7 +329,7 @@ use_alloc_page:
end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
>> PAGE_SHIFT;
page_count = end - start;
- error = _xfs_buf_get_pages(bp, page_count, flags);
+ error = _xfs_buf_get_pages(bp, page_count);
if (unlikely(error))
return error;
@@ -778,7 +777,7 @@ xfs_buf_associate_memory(
bp->b_pages = NULL;
bp->b_addr = mem;
- rval = _xfs_buf_get_pages(bp, page_count, 0);
+ rval = _xfs_buf_get_pages(bp, page_count);
if (rval)
return rval;
@@ -811,7 +810,7 @@ xfs_buf_get_uncached(
goto fail;
page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
- error = _xfs_buf_get_pages(bp, page_count, 0);
+ error = _xfs_buf_get_pages(bp, page_count);
if (error)
goto fail_free_buf;
@@ -1615,7 +1614,6 @@ xfs_free_buftarg(
int
xfs_setsize_buftarg(
xfs_buftarg_t *btp,
- unsigned int blocksize,
unsigned int sectorsize)
{
/* Set up metadata sector size info */
@@ -1650,16 +1648,13 @@ xfs_setsize_buftarg_early(
xfs_buftarg_t *btp,
struct block_device *bdev)
{
- return xfs_setsize_buftarg(btp, PAGE_SIZE,
- bdev_logical_block_size(bdev));
+ return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}
xfs_buftarg_t *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev,
- int external,
- const char *fsname)
+ struct block_device *bdev)
{
xfs_buftarg_t *btp;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b8a3abf6cf47..3a7a5523d3dc 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -298,11 +298,6 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
extern int xfs_bioerror_relse(struct xfs_buf *);
-static inline int xfs_buf_geterror(xfs_buf_t *bp)
-{
- return bp ? bp->b_error : ENOMEM;
-}
-
/* Buffer Utility Routines */
extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
@@ -387,10 +382,10 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
* Handling of buftargs.
*/
extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
- struct block_device *, int, const char *);
+ struct block_device *);
extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
-extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
+extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 8752821443be..4654338b03fc 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -812,7 +812,6 @@ xfs_buf_item_init(
*/
static void
xfs_buf_item_log_segment(
- struct xfs_buf_log_item *bip,
uint first,
uint last,
uint *map)
@@ -920,7 +919,7 @@ xfs_buf_item_log(
if (end > last)
end = last;
- xfs_buf_item_log_segment(bip, first, end,
+ xfs_buf_item_log_segment(first, end,
&bip->bli_formats[i].blf_data_map[0]);
start += bp->b_maps[i].bm_len;
@@ -1053,7 +1052,7 @@ xfs_buf_iodone_callbacks(
static ulong lasttime;
static xfs_buftarg_t *lasttarg;
- if (likely(!xfs_buf_geterror(bp)))
+ if (likely(!bp->b_error))
goto do_callbacks;
/*
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 6cc5f6785a77..a514ab616650 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -167,8 +167,8 @@ xfs_da3_node_verify(
* we don't know if the node is for and attribute or directory tree,
* so only fail if the count is outside both bounds
*/
- if (ichdr.count > mp->m_dir_node_ents &&
- ichdr.count > mp->m_attr_node_ents)
+ if (ichdr.count > mp->m_dir_geo->node_ents &&
+ ichdr.count > mp->m_attr_geo->node_ents)
return false;
/* XXX: hash order check? */
@@ -598,7 +598,7 @@ xfs_da3_root_split(
* Set up the new root node.
*/
error = xfs_da3_node_create(args,
- (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
+ (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
level + 1, &bp, args->whichfork);
if (error)
return error;
@@ -616,10 +616,10 @@ xfs_da3_root_split(
#ifdef DEBUG
if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
- ASSERT(blk1->blkno >= mp->m_dirleafblk &&
- blk1->blkno < mp->m_dirfreeblk);
- ASSERT(blk2->blkno >= mp->m_dirleafblk &&
- blk2->blkno < mp->m_dirfreeblk);
+ ASSERT(blk1->blkno >= args->geo->leafblk &&
+ blk1->blkno < args->geo->freeblk);
+ ASSERT(blk2->blkno >= args->geo->leafblk &&
+ blk2->blkno < args->geo->freeblk);
}
#endif
@@ -663,7 +663,7 @@ xfs_da3_node_split(
/*
* Do we have to split the node?
*/
- if (nodehdr.count + newcount > state->node_ents) {
+ if (nodehdr.count + newcount > state->args->geo->node_ents) {
/*
* Allocate a new node, add to the doubly linked chain of
* nodes, then move some of our excess entries into it.
@@ -894,8 +894,8 @@ xfs_da3_node_add(
ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
ASSERT(newblk->blkno != 0);
if (state->args->whichfork == XFS_DATA_FORK)
- ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
- newblk->blkno < state->mp->m_dirfreeblk);
+ ASSERT(newblk->blkno >= state->args->geo->leafblk &&
+ newblk->blkno < state->args->geo->freeblk);
/*
* We may need to make some room before we insert the new node.
@@ -1089,14 +1089,15 @@ xfs_da3_root_join(
* that could occur. For dir3 blocks we also need to update the block
* number in the buffer header.
*/
- memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
+ memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
root_blk->bp->b_ops = bp->b_ops;
xfs_trans_buf_copy_type(root_blk->bp, bp);
if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
}
- xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
+ xfs_trans_log_buf(args->trans, root_blk->bp, 0,
+ args->geo->blksize - 1);
error = xfs_da_shrink_inode(args, child, bp);
return(error);
}
@@ -1139,7 +1140,7 @@ xfs_da3_node_toosmall(
info = blk->bp->b_addr;
node = (xfs_da_intnode_t *)info;
dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- if (nodehdr.count > (state->node_ents >> 1)) {
+ if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return(0); /* blk over 50%, don't try to join */
}
@@ -1176,8 +1177,8 @@ xfs_da3_node_toosmall(
* We prefer coalescing with the lower numbered sibling so as
* to shrink a directory over time.
*/
- count = state->node_ents;
- count -= state->node_ents >> 2;
+ count = state->args->geo->node_ents;
+ count -= state->args->geo->node_ents >> 2;
count -= nodehdr.count;
/* start with smaller blk num */
@@ -1472,7 +1473,7 @@ xfs_da3_node_lookup_int(
* Descend thru the B-tree searching each level for the right
* node to use, until the right hashval is found.
*/
- blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0;
+ blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
for (blk = &state->path.blk[0], state->path.active = 1;
state->path.active <= XFS_DA_NODE_MAXDEPTH;
blk++, state->path.active++) {
@@ -2090,20 +2091,12 @@ xfs_da_grow_inode(
xfs_dablk_t *new_blkno)
{
xfs_fileoff_t bno;
- int count;
int error;
trace_xfs_da_grow_inode(args);
- if (args->whichfork == XFS_DATA_FORK) {
- bno = args->dp->i_mount->m_dirleafblk;
- count = args->dp->i_mount->m_dirblkfsbs;
- } else {
- bno = 0;
- count = 1;
- }
-
- error = xfs_da_grow_inode_int(args, &bno, count);
+ bno = args->geo->leafblk;
+ error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
if (!error)
*new_blkno = (xfs_dablk_t)bno;
return error;
@@ -2158,7 +2151,7 @@ xfs_da3_swap_lastblock(
w = args->whichfork;
ASSERT(w == XFS_DATA_FORK);
mp = dp->i_mount;
- lastoff = mp->m_dirfreeblk;
+ lastoff = args->geo->freeblk;
error = xfs_bmap_last_before(tp, dp, &lastoff, w);
if (error)
return error;
@@ -2170,15 +2163,15 @@ xfs_da3_swap_lastblock(
/*
* Read the last block in the btree space.
*/
- last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
+ last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
if (error)
return error;
/*
* Copy the last block into the dead buffer and log it.
*/
- memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize);
- xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
+ memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+ xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
dead_info = dead_buf->b_addr;
/*
* Get values from the moved block.
@@ -2247,7 +2240,7 @@ xfs_da3_swap_lastblock(
sizeof(sib_info->back)));
sib_buf = NULL;
}
- par_blkno = mp->m_dirleafblk;
+ par_blkno = args->geo->leafblk;
level = -1;
/*
* Walk down the tree looking for the parent of the moved block.
@@ -2357,10 +2350,7 @@ xfs_da_shrink_inode(
w = args->whichfork;
tp = args->trans;
mp = dp->i_mount;
- if (w == XFS_DATA_FORK)
- count = mp->m_dirblkfsbs;
- else
- count = 1;
+ count = args->geo->fsbcount;
for (;;) {
/*
* Remove extents. If we get ENOSPC for a dir we have to move
@@ -2462,7 +2452,6 @@ xfs_buf_map_from_irec(
*/
static int
xfs_dabuf_map(
- struct xfs_trans *trans,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mappedbno,
@@ -2480,7 +2469,10 @@ xfs_dabuf_map(
ASSERT(map && *map);
ASSERT(*nmaps == 1);
- nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
+ if (whichfork == XFS_DATA_FORK)
+ nfsb = mp->m_dir_geo->fsbcount;
+ else
+ nfsb = mp->m_attr_geo->fsbcount;
/*
* Caller doesn't have a mapping. -2 means don't complain
@@ -2558,7 +2550,7 @@ xfs_da_get_buf(
*bpp = NULL;
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
&mapp, &nmap);
if (error) {
/* mapping a hole is not an error, but we don't continue */
@@ -2606,7 +2598,7 @@ xfs_da_read_buf(
*bpp = NULL;
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
&mapp, &nmap);
if (error) {
/* mapping a hole is not an error, but we don't continue */
@@ -2625,47 +2617,6 @@ xfs_da_read_buf(
xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
else
xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
-
- /*
- * This verification code will be moved to a CRC verification callback
- * function so just leave it here unchanged until then.
- */
- {
- xfs_dir2_data_hdr_t *hdr = bp->b_addr;
- xfs_dir2_free_t *free = bp->b_addr;
- xfs_da_blkinfo_t *info = bp->b_addr;
- uint magic, magic1;
- struct xfs_mount *mp = dp->i_mount;
-
- magic = be16_to_cpu(info->magic);
- magic1 = be32_to_cpu(hdr->magic);
- if (unlikely(
- XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
- (magic != XFS_DA3_NODE_MAGIC) &&
- (magic != XFS_ATTR_LEAF_MAGIC) &&
- (magic != XFS_ATTR3_LEAF_MAGIC) &&
- (magic != XFS_DIR2_LEAF1_MAGIC) &&
- (magic != XFS_DIR3_LEAF1_MAGIC) &&
- (magic != XFS_DIR2_LEAFN_MAGIC) &&
- (magic != XFS_DIR3_LEAFN_MAGIC) &&
- (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
- (magic1 != XFS_DIR3_BLOCK_MAGIC) &&
- (magic1 != XFS_DIR2_DATA_MAGIC) &&
- (magic1 != XFS_DIR3_DATA_MAGIC) &&
- (free->hdr.magic !=
- cpu_to_be32(XFS_DIR2_FREE_MAGIC)) &&
- (free->hdr.magic !=
- cpu_to_be32(XFS_DIR3_FREE_MAGIC)),
- mp, XFS_ERRTAG_DA_READ_BUF,
- XFS_RANDOM_DA_READ_BUF))) {
- trace_xfs_da_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
- XFS_ERRLEVEL_LOW, mp, info);
- error = XFS_ERROR(EFSCORRUPTED);
- xfs_trans_brelse(trans, bp);
- goto out_free;
- }
- }
*bpp = bp;
out_free:
if (mapp != &map)
@@ -2679,7 +2630,6 @@ out_free:
*/
xfs_daddr_t
xfs_da_reada_buf(
- struct xfs_trans *trans,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mappedbno,
@@ -2693,7 +2643,7 @@ xfs_da_reada_buf(
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
&mapp, &nmap);
if (error) {
/* mapping a hole is not an error, but we don't continue */
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 201c6091d26a..6e153e399a77 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -25,6 +25,23 @@ struct xfs_trans;
struct zone;
struct xfs_dir_ops;
+/*
+ * Directory/attribute geometry information. There will be one of these for each
+ * data fork type, and it will be passed around via the xfs_da_args. Global
+ * structures will be attached to the xfs_mount.
+ */
+struct xfs_da_geometry {
+ int blksize; /* da block size in bytes */
+ int fsbcount; /* da block size in filesystem blocks */
+ uint8_t fsblog; /* log2 of _filesystem_ block size */
+ uint8_t blklog; /* log2 of da block size */
+ uint node_ents; /* # of entries in a danode */
+ int magicpct; /* 37% of block size in bytes */
+ xfs_dablk_t datablk; /* blockno of dir data v2 */
+ xfs_dablk_t leafblk; /* blockno of leaf data v2 */
+ xfs_dablk_t freeblk; /* blockno of free data v2 */
+};
+
/*========================================================================
* Btree searching and modification structure definitions.
*========================================================================*/
@@ -42,6 +59,7 @@ enum xfs_dacmp {
* Structure to ease passing around component names.
*/
typedef struct xfs_da_args {
+ struct xfs_da_geometry *geo; /* da block geometry */
const __uint8_t *name; /* string (maybe not NULL terminated) */
int namelen; /* length of string (maybe no NULL) */
__uint8_t filetype; /* filetype of inode for directories */
@@ -110,8 +128,6 @@ typedef struct xfs_da_state_path {
typedef struct xfs_da_state {
xfs_da_args_t *args; /* filename arguments */
struct xfs_mount *mp; /* filesystem mount point */
- unsigned int blocksize; /* logical block size */
- unsigned int node_ents; /* how many entries in danode */
xfs_da_state_path_t path; /* search/split paths */
xfs_da_state_path_t altpath; /* alternate path for join */
unsigned char inleaf; /* insert into 1->lf, 0->splf */
@@ -185,9 +201,9 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp, int whichfork,
const struct xfs_buf_ops *ops);
-xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mapped_bno,
- int whichfork, const struct xfs_buf_ops *ops);
+xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno, int whichfork,
+ const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
struct xfs_buf *dead_buf);
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
index e6c83e1fbc8a..c9aee52a37e2 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/xfs_da_format.c
@@ -26,8 +26,10 @@
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
/*
* Shortform directory ops
@@ -425,9 +427,9 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
* Directory Leaf block operations
*/
static int
-xfs_dir2_max_leaf_ents(struct xfs_mount *mp)
+xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
{
- return (mp->m_dirblksize - sizeof(struct xfs_dir2_leaf_hdr)) /
+ return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
(uint)sizeof(struct xfs_dir2_leaf_entry);
}
@@ -438,9 +440,9 @@ xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
}
static int
-xfs_dir3_max_leaf_ents(struct xfs_mount *mp)
+xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
{
- return (mp->m_dirblksize - sizeof(struct xfs_dir3_leaf_hdr)) /
+ return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
(uint)sizeof(struct xfs_dir2_leaf_entry);
}
@@ -591,9 +593,9 @@ xfs_da3_node_hdr_to_disk(
* Directory free space block operations
*/
static int
-xfs_dir2_free_max_bests(struct xfs_mount *mp)
+xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
{
- return (mp->m_dirblksize - sizeof(struct xfs_dir2_free_hdr)) /
+ return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
sizeof(xfs_dir2_data_off_t);
}
@@ -607,24 +609,25 @@ xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
* Convert data space db to the corresponding free db.
*/
static xfs_dir2_db_t
-xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
{
- return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir2_free_max_bests(mp);
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / xfs_dir2_free_max_bests(geo));
}
/*
* Convert data space db to the corresponding index in a free db.
*/
static int
-xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
{
- return db % xfs_dir2_free_max_bests(mp);
+ return db % xfs_dir2_free_max_bests(geo);
}
static int
-xfs_dir3_free_max_bests(struct xfs_mount *mp)
+xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
{
- return (mp->m_dirblksize - sizeof(struct xfs_dir3_free_hdr)) /
+ return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
sizeof(xfs_dir2_data_off_t);
}
@@ -638,18 +641,19 @@ xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
* Convert data space db to the corresponding free db.
*/
static xfs_dir2_db_t
-xfs_dir3_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
+xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
{
- return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp);
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / xfs_dir3_free_max_bests(geo));
}
/*
* Convert data space db to the corresponding index in a free db.
*/
static int
-xfs_dir3_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
+xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
{
- return db % xfs_dir3_free_max_bests(mp);
+ return db % xfs_dir3_free_max_bests(geo);
}
static void
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h
index a19d3f8f639c..0a49b0286372 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/xfs_da_format.h
@@ -19,10 +19,6 @@
#ifndef __XFS_DA_FORMAT_H__
#define __XFS_DA_FORMAT_H__
-/*========================================================================
- * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
/*
* This structure is common to both leaf nodes and non-leaf nodes in the Btree.
*
@@ -122,8 +118,6 @@ struct xfs_da3_icnode_hdr {
__uint16_t level;
};
-#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
-
/*
* Directory version 2.
*
@@ -330,8 +324,6 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
#define XFS_DIR2_DATA_SPACE 0
#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_DATA_FIRSTDB(mp) \
- xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET)
/*
* Describe a free area in the data block.
@@ -456,8 +448,6 @@ xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
*/
#define XFS_DIR2_LEAF_SPACE 1
#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_LEAF_FIRSTDB(mp) \
- xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET)
/*
* Leaf block header.
@@ -514,17 +504,6 @@ struct xfs_dir3_leaf {
#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc)
/*
- * Get address of the bestcount field in the single-leaf block.
- */
-static inline struct xfs_dir2_leaf_tail *
-xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
-{
- return (struct xfs_dir2_leaf_tail *)
- ((char *)lp + mp->m_dirblksize -
- sizeof(struct xfs_dir2_leaf_tail));
-}
-
-/*
* Get address of the bests array in the single-leaf block.
*/
static inline __be16 *
@@ -534,123 +513,6 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
}
/*
- * DB blocks here are logical directory block numbers, not filesystem blocks.
- */
-
-/*
- * Convert dataptr to byte in file space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
-}
-
-/*
- * Convert byte in file space to dataptr. It had better be aligned.
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
-}
-
-/*
- * Convert byte in space to (DB) block
- */
-static inline xfs_dir2_db_t
-xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_db_t)
- (by >> (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog));
-}
-
-/*
- * Convert dataptr to a block number
- */
-static inline xfs_dir2_db_t
-xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp));
-}
-
-/*
- * Convert byte in space to offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_data_aoff_t)(by &
- ((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) - 1));
-}
-
-/*
- * Convert dataptr to a byte offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp));
-}
-
-/*
- * Convert block and offset to byte in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
- xfs_dir2_data_aoff_t o)
-{
- return ((xfs_dir2_off_t)db <<
- (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) + o;
-}
-
-/*
- * Convert block (DB) to block (dablk)
- */
-static inline xfs_dablk_t
-xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
- return (xfs_dablk_t)(db << mp->m_sb.sb_dirblklog);
-}
-
-/*
- * Convert byte in space to (DA) block
- */
-static inline xfs_dablk_t
-xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by));
-}
-
-/*
- * Convert block and offset to dataptr
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
- xfs_dir2_data_aoff_t o)
-{
- return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o));
-}
-
-/*
- * Convert block (dablk) to block (DB)
- */
-static inline xfs_dir2_db_t
-xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da)
-{
- return (xfs_dir2_db_t)(da >> mp->m_sb.sb_dirblklog);
-}
-
-/*
- * Convert block (dablk) to byte offset in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da)
-{
- return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0);
-}
-
-/*
* Free space block defintions for the node format.
*/
@@ -659,8 +521,6 @@ xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da)
*/
#define XFS_DIR2_FREE_SPACE 2
#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_FREE_FIRSTDB(mp) \
- xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET)
typedef struct xfs_dir2_free_hdr {
__be32 magic; /* XFS_DIR2_FREE_MAGIC */
@@ -736,16 +596,6 @@ typedef struct xfs_dir2_block_tail {
} xfs_dir2_block_tail_t;
/*
- * Pointer to the leaf header embedded in a data block (1-block format)
- */
-static inline struct xfs_dir2_block_tail *
-xfs_dir2_block_tail_p(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr)
-{
- return ((struct xfs_dir2_block_tail *)
- ((char *)hdr + mp->m_dirblksize)) - 1;
-}
-
-/*
* Pointer to the leaf entries embedded in a data block (1-block format)
*/
static inline struct xfs_dir2_leaf_entry *
@@ -764,10 +614,6 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
* of an attribute name may not be unique, we may have duplicate keys. The
* internal links in the Btree are logical block offsets into the file.
*
- *========================================================================
- * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
- *========================================================================
- *
* Struct leaf_entry's are packed from the top. Name/values grow from the
* bottom but are not packed. The freemap contains run-length-encoded entries
* for the free bytes after the leaf_entry's, but only the N largest such,
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index fda46253966a..79670cda48ae 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -85,38 +85,74 @@ static struct xfs_nameops xfs_ascii_ci_nameops = {
.compname = xfs_ascii_ci_compname,
};
-void
-xfs_dir_mount(
- xfs_mount_t *mp)
+int
+xfs_da_mount(
+ struct xfs_mount *mp)
{
- int nodehdr_size;
+ struct xfs_da_geometry *dageo;
+ int nodehdr_size;
- ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb));
+ ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
XFS_MAX_BLOCKSIZE);
mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
- mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
- mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
- mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp));
- mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
- mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
-
nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
- mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) /
+ mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_SLEEP | KM_MAYFAIL);
+ mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_SLEEP | KM_MAYFAIL);
+ if (!mp->m_dir_geo || !mp->m_attr_geo) {
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
+ return ENOMEM;
+ }
+
+ /* set up directory geometry */
+ dageo = mp->m_dir_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = 1 << dageo->blklog;
+ dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+
+ /*
+ * Now we've set up the block conversion variables, we can calculate the
+ * segment block constants using the geometry structure.
+ */
+ dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
+ dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
+ dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
+ dageo->node_ents = (dageo->blksize - nodehdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
- mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) /
+ dageo->magicpct = (dageo->blksize * 37) / 100;
+
+ /* set up attribute geometry - single fsb only */
+ dageo = mp->m_attr_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = 1 << dageo->blklog;
+ dageo->fsbcount = 1;
+ dageo->node_ents = (dageo->blksize - nodehdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+ dageo->magicpct = (dageo->blksize * 37) / 100;
- mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
if (xfs_sb_version_hasasciici(&mp->m_sb))
mp->m_dirnameops = &xfs_ascii_ci_nameops;
else
mp->m_dirnameops = &xfs_default_nameops;
+ return 0;
+}
+
+void
+xfs_da_unmount(
+ struct xfs_mount *mp)
+{
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
}
/*
@@ -192,6 +228,7 @@ xfs_dir_init(
if (!args)
return ENOMEM;
+ args->geo = dp->i_mount->m_dir_geo;
args->dp = dp;
args->trans = tp;
error = xfs_dir2_sf_create(args, pdp->i_ino);
@@ -226,6 +263,7 @@ xfs_dir_createname(
if (!args)
return ENOMEM;
+ args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
@@ -244,7 +282,7 @@ xfs_dir_createname(
goto out_free;
}
- rval = xfs_dir2_isblock(tp, dp, &v);
+ rval = xfs_dir2_isblock(args, &v);
if (rval)
goto out_free;
if (v) {
@@ -252,7 +290,7 @@ xfs_dir_createname(
goto out_free;
}
- rval = xfs_dir2_isleaf(tp, dp, &v);
+ rval = xfs_dir2_isleaf(args, &v);
if (rval)
goto out_free;
if (v)
@@ -320,6 +358,7 @@ xfs_dir_lookup(
* annotations into the reclaim path for the ilock.
*/
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
@@ -336,7 +375,7 @@ xfs_dir_lookup(
goto out_check_rval;
}
- rval = xfs_dir2_isblock(tp, dp, &v);
+ rval = xfs_dir2_isblock(args, &v);
if (rval)
goto out_free;
if (v) {
@@ -344,7 +383,7 @@ xfs_dir_lookup(
goto out_check_rval;
}
- rval = xfs_dir2_isleaf(tp, dp, &v);
+ rval = xfs_dir2_isleaf(args, &v);
if (rval)
goto out_free;
if (v)
@@ -391,6 +430,7 @@ xfs_dir_removename(
if (!args)
return ENOMEM;
+ args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
@@ -408,7 +448,7 @@ xfs_dir_removename(
goto out_free;
}
- rval = xfs_dir2_isblock(tp, dp, &v);
+ rval = xfs_dir2_isblock(args, &v);
if (rval)
goto out_free;
if (v) {
@@ -416,7 +456,7 @@ xfs_dir_removename(
goto out_free;
}
- rval = xfs_dir2_isleaf(tp, dp, &v);
+ rval = xfs_dir2_isleaf(args, &v);
if (rval)
goto out_free;
if (v)
@@ -455,6 +495,7 @@ xfs_dir_replace(
if (!args)
return ENOMEM;
+ args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
@@ -472,7 +513,7 @@ xfs_dir_replace(
goto out_free;
}
- rval = xfs_dir2_isblock(tp, dp, &v);
+ rval = xfs_dir2_isblock(args, &v);
if (rval)
goto out_free;
if (v) {
@@ -480,7 +521,7 @@ xfs_dir_replace(
goto out_free;
}
- rval = xfs_dir2_isleaf(tp, dp, &v);
+ rval = xfs_dir2_isleaf(args, &v);
if (rval)
goto out_free;
if (v)
@@ -516,6 +557,7 @@ xfs_dir_canenter(
if (!args)
return ENOMEM;
+ args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
@@ -531,7 +573,7 @@ xfs_dir_canenter(
goto out_free;
}
- rval = xfs_dir2_isblock(tp, dp, &v);
+ rval = xfs_dir2_isblock(args, &v);
if (rval)
goto out_free;
if (v) {
@@ -539,7 +581,7 @@ xfs_dir_canenter(
goto out_free;
}
- rval = xfs_dir2_isleaf(tp, dp, &v);
+ rval = xfs_dir2_isleaf(args, &v);
if (rval)
goto out_free;
if (v)
@@ -579,13 +621,13 @@ xfs_dir2_grow_inode(
* Set lowest possible block in the space requested.
*/
bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
- count = mp->m_dirblkfsbs;
+ count = args->geo->fsbcount;
error = xfs_da_grow_inode_int(args, &bno, count);
if (error)
return error;
- *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
+ *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
/*
* Update file's size if this is the data space and it grew.
@@ -607,19 +649,16 @@ xfs_dir2_grow_inode(
*/
int
xfs_dir2_isblock(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- int *vp) /* out: 1 is block, 0 is not block */
+ struct xfs_da_args *args,
+ int *vp) /* out: 1 is block, 0 is not block */
{
- xfs_fileoff_t last; /* last file offset */
- xfs_mount_t *mp;
- int rval;
+ xfs_fileoff_t last; /* last file offset */
+ int rval;
- mp = dp->i_mount;
- if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
+ if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
- rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
- ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
+ rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
+ ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
*vp = rval;
return 0;
}
@@ -629,18 +668,15 @@ xfs_dir2_isblock(
*/
int
xfs_dir2_isleaf(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- int *vp) /* out: 1 is leaf, 0 is not leaf */
+ struct xfs_da_args *args,
+ int *vp) /* out: 1 is block, 0 is not block */
{
- xfs_fileoff_t last; /* last file offset */
- xfs_mount_t *mp;
- int rval;
+ xfs_fileoff_t last; /* last file offset */
+ int rval;
- mp = dp->i_mount;
- if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
+ if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
- *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
+ *vp = last == args->geo->leafblk + args->geo->fsbcount;
return 0;
}
@@ -668,11 +704,11 @@ xfs_dir2_shrink_inode(
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
- da = xfs_dir2_db_to_da(mp, db);
+ da = xfs_dir2_db_to_da(args->geo, db);
/*
* Unmap the fsblock(s).
*/
- if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
+ if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
&done))) {
/*
@@ -699,12 +735,12 @@ xfs_dir2_shrink_inode(
/*
* If it's not a data block, we're done.
*/
- if (db >= XFS_DIR2_LEAF_FIRSTDB(mp))
+ if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
return 0;
/*
* If the block isn't the last one in the directory, we're done.
*/
- if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0))
+ if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
return 0;
bno = da;
if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
@@ -713,7 +749,7 @@ xfs_dir2_shrink_inode(
*/
return error;
}
- if (db == mp->m_dirdatablk)
+ if (db == args->geo->datablk)
ASSERT(bno == 0);
else
ASSERT(bno > 0);
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index cec70e0781ab..c8e86b0b5e99 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -80,7 +80,7 @@ struct xfs_dir_ops {
struct xfs_dir3_icleaf_hdr *from);
void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
struct xfs_dir2_leaf *from);
- int (*leaf_max_ents)(struct xfs_mount *mp);
+ int (*leaf_max_ents)(struct xfs_da_geometry *geo);
struct xfs_dir2_leaf_entry *
(*leaf_ents_p)(struct xfs_dir2_leaf *lp);
@@ -97,10 +97,12 @@ struct xfs_dir_ops {
struct xfs_dir3_icfree_hdr *from);
void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
struct xfs_dir2_free *from);
- int (*free_max_bests)(struct xfs_mount *mp);
+ int (*free_max_bests)(struct xfs_da_geometry *geo);
__be16 * (*free_bests_p)(struct xfs_dir2_free *free);
- xfs_dir2_db_t (*db_to_fdb)(struct xfs_mount *mp, xfs_dir2_db_t db);
- int (*db_to_fdindex)(struct xfs_mount *mp, xfs_dir2_db_t db);
+ xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
+ xfs_dir2_db_t db);
+ int (*db_to_fdindex)(struct xfs_da_geometry *geo,
+ xfs_dir2_db_t db);
};
extern const struct xfs_dir_ops *
@@ -112,7 +114,9 @@ extern const struct xfs_dir_ops *
* Generic directory interface routines
*/
extern void xfs_dir_startup(void);
-extern void xfs_dir_mount(struct xfs_mount *mp);
+extern int xfs_da_mount(struct xfs_mount *mp);
+extern void xfs_da_unmount(struct xfs_mount *mp);
+
extern int xfs_dir_isempty(struct xfs_inode *dp);
extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_inode *pdp);
@@ -142,23 +146,23 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
/*
* Interface routines used by userspace utilities
*/
-extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
-extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
+extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
struct xfs_dir2_data_hdr *hdr, int *loghead);
-extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_inode *dp,
+extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
-extern void xfs_dir2_data_log_header(struct xfs_trans *tp, struct xfs_inode *dp,
+extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
struct xfs_buf *bp);
-extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_dir2_data_unused *dup);
-extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_inode *dp,
+extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_inode *dp,
+extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
int *needlogp, int *needscanp);
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 4f6a38cb83a4..c7cd3154026a 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -136,7 +136,7 @@ xfs_dir3_block_read(
struct xfs_mount *mp = dp->i_mount;
int err;
- err = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
+ err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
if (!err && tp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
@@ -281,8 +281,7 @@ out:
*/
static void
xfs_dir2_block_compact(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
struct xfs_dir2_data_hdr *hdr,
struct xfs_dir2_block_tail *btp,
@@ -315,18 +314,17 @@ xfs_dir2_block_compact(
*lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
*lfloghigh -= be32_to_cpu(btp->stale) - 1;
be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
- xfs_dir2_data_make_free(tp, dp, bp,
+ xfs_dir2_data_make_free(args, bp,
(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
needlog, &needscan);
- blp += be32_to_cpu(btp->stale) - 1;
btp->stale = cpu_to_be32(1);
/*
* If we now need to rebuild the bestfree map, do so.
* This needs to happen before the next call to use_free.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, needlog);
+ xfs_dir2_data_freescan(args->dp, hdr, needlog);
}
/*
@@ -378,7 +376,7 @@ xfs_dir2_block_addname(
* Set up pointers to parts of the block.
*/
hdr = bp->b_addr;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
@@ -421,7 +419,7 @@ xfs_dir2_block_addname(
* If need to compact the leaf entries, do it now.
*/
if (compact) {
- xfs_dir2_block_compact(tp, dp, bp, hdr, btp, blp, &needlog,
+ xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
&lfloghigh, &lfloglow);
/* recalculate blp post-compaction */
blp = xfs_dir2_block_leaf_p(btp);
@@ -456,7 +454,7 @@ xfs_dir2_block_addname(
/*
* Mark the space needed for the new leaf entry, now in use.
*/
- xfs_dir2_data_use_free(tp, dp, bp, enddup,
+ xfs_dir2_data_use_free(args, bp, enddup,
(xfs_dir2_data_aoff_t)
((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
sizeof(*blp)),
@@ -537,13 +535,13 @@ xfs_dir2_block_addname(
* Fill in the leaf entry.
*/
blp[mid].hashval = cpu_to_be32(args->hashval);
- blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
/*
* Mark space for the data entry used.
*/
- xfs_dir2_data_use_free(tp, dp, bp, dup,
+ xfs_dir2_data_use_free(args, bp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
/*
@@ -561,9 +559,9 @@ xfs_dir2_block_addname(
if (needscan)
xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dp, bp);
+ xfs_dir2_data_log_header(args, bp);
xfs_dir2_block_log_tail(tp, bp);
- xfs_dir2_data_log_entry(tp, dp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
xfs_dir3_data_check(dp, bp);
return 0;
}
@@ -582,7 +580,7 @@ xfs_dir2_block_log_leaf(
xfs_dir2_leaf_entry_t *blp;
xfs_dir2_block_tail_t *btp;
- btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr);
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
(uint)((char *)&blp[last + 1] - (char *)hdr - 1));
@@ -599,7 +597,7 @@ xfs_dir2_block_log_tail(
xfs_dir2_data_hdr_t *hdr = bp->b_addr;
xfs_dir2_block_tail_t *btp;
- btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr);
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
(uint)((char *)(btp + 1) - (char *)hdr - 1));
}
@@ -634,13 +632,14 @@ xfs_dir2_block_lookup(
mp = dp->i_mount;
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Get the offset from the leaf entry, to point to the data.
*/
dep = (xfs_dir2_data_entry_t *)((char *)hdr +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
/*
* Fill in inode number, CI name if appropriate, release the block.
*/
@@ -686,7 +685,7 @@ xfs_dir2_block_lookup_int(
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Loop doing a binary search for our hash value.
@@ -724,7 +723,7 @@ xfs_dir2_block_lookup_int(
* Get pointer to the entry from the leaf.
*/
dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr));
+ ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
/*
* Compare name and if it's an exact match, return the index
* and buffer. If it's the first case-insensitive match, store
@@ -791,18 +790,19 @@ xfs_dir2_block_removename(
tp = args->trans;
mp = dp->i_mount;
hdr = bp->b_addr;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Point to the data entry using the leaf entry.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
/*
* Mark the data entry's space free.
*/
needlog = needscan = 0;
- xfs_dir2_data_make_free(tp, dp, bp,
+ xfs_dir2_data_make_free(args, bp,
(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
@@ -821,7 +821,7 @@ xfs_dir2_block_removename(
if (needscan)
xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dp, bp);
+ xfs_dir2_data_log_header(args, bp);
xfs_dir3_data_check(dp, bp);
/*
* See if the size as a shortform is good enough.
@@ -866,20 +866,21 @@ xfs_dir2_block_replace(
dp = args->dp;
mp = dp->i_mount;
hdr = bp->b_addr;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Point to the data entry we need to change.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
/*
* Change the inode number to the new value.
*/
dep->inumber = cpu_to_be64(args->inumber);
dp->d_ops->data_put_ftype(dep, args->filetype);
- xfs_dir2_data_log_entry(args->trans, dp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
xfs_dir3_data_check(dp, bp);
return 0;
}
@@ -939,7 +940,7 @@ xfs_dir2_leaf_to_block(
leaf = lbp->b_addr;
dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ents = dp->d_ops->leaf_ents_p(leaf);
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
@@ -949,13 +950,13 @@ xfs_dir2_leaf_to_block(
* been left behind during no-space-reservation operations.
* These will show up in the leaf bests table.
*/
- while (dp->i_d.di_size > mp->m_dirblksize) {
+ while (dp->i_d.di_size > args->geo->blksize) {
int hdrsz;
hdrsz = dp->d_ops->data_entry_offset;
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
- mp->m_dirblksize - hdrsz) {
+ args->geo->blksize - hdrsz) {
if ((error =
xfs_dir2_leaf_trim_data(args, lbp,
(xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
@@ -967,7 +968,7 @@ xfs_dir2_leaf_to_block(
* Read the data block if we don't already have it, give up if it fails.
*/
if (!dbp) {
- error = xfs_dir3_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
if (error)
return error;
}
@@ -983,7 +984,7 @@ xfs_dir2_leaf_to_block(
/*
* Look at the last data entry.
*/
- tagp = (__be16 *)((char *)hdr + mp->m_dirblksize) - 1;
+ tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
/*
* If it's not free or is too short we can't do it.
@@ -1002,12 +1003,12 @@ xfs_dir2_leaf_to_block(
/*
* Use up the space at the end of the block (blp/btp).
*/
- xfs_dir2_data_use_free(tp, dp, dbp, dup, mp->m_dirblksize - size, size,
+ xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
&needlog, &needscan);
/*
* Initialize the block tail.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
btp->stale = 0;
xfs_dir2_block_log_tail(tp, dbp);
@@ -1028,11 +1029,11 @@ xfs_dir2_leaf_to_block(
if (needscan)
xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* Pitch the old leaf block.
*/
- error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
+ error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
if (error)
return error;
@@ -1141,13 +1142,13 @@ xfs_dir2_sf_to_block(
*/
dup = dp->d_ops->data_unused_p(hdr);
needlog = needscan = 0;
- xfs_dir2_data_use_free(tp, dp, bp, dup, mp->m_dirblksize - i, i, &needlog,
- &needscan);
+ xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+ i, &needlog, &needscan);
ASSERT(needscan == 0);
/*
* Fill in the tail.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */
btp->stale = 0;
blp = xfs_dir2_block_leaf_p(btp);
@@ -1155,7 +1156,7 @@ xfs_dir2_sf_to_block(
/*
* Remove the freespace, we'll manage it.
*/
- xfs_dir2_data_use_free(tp, dp, bp, dup,
+ xfs_dir2_data_use_free(args, bp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
be16_to_cpu(dup->length), &needlog, &needscan);
/*
@@ -1168,9 +1169,9 @@ xfs_dir2_sf_to_block(
dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, dp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
- blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
/*
* Create entry for ..
@@ -1182,9 +1183,9 @@ xfs_dir2_sf_to_block(
dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, dp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
- blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
offset = dp->d_ops->data_first_offset;
/*
@@ -1216,7 +1217,7 @@ xfs_dir2_sf_to_block(
dup->length = cpu_to_be16(newoffset - offset);
*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
((char *)dup - (char *)hdr));
- xfs_dir2_data_log_unused(tp, bp, dup);
+ xfs_dir2_data_log_unused(args, bp, dup);
xfs_dir2_data_freeinsert(hdr,
dp->d_ops->data_bestfree_p(hdr),
dup, &dummy);
@@ -1233,12 +1234,12 @@ xfs_dir2_sf_to_block(
memcpy(dep->name, sfep->name, dep->namelen);
tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, dp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
name.name = sfep->name;
name.len = sfep->namelen;
blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
hashname(&name));
- blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
offset = (int)((char *)(tagp + 1) - (char *)hdr);
if (++i == sfp->count)
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index afa4ad523f3f..8c2f6422648e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -63,8 +63,10 @@ __xfs_dir3_data_check(
int stale; /* count of stale leaves */
struct xfs_name name;
const struct xfs_dir_ops *ops;
+ struct xfs_da_geometry *geo;
mp = bp->b_target->bt_mount;
+ geo = mp->m_dir_geo;
/*
* We can be passed a null dp here from a verifier, so we need to go the
@@ -78,7 +80,7 @@ __xfs_dir3_data_check(
switch (hdr->magic) {
case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
lep = xfs_dir2_block_leaf_p(btp);
endp = (char *)lep;
@@ -94,7 +96,7 @@ __xfs_dir3_data_check(
break;
case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
- endp = (char *)hdr + mp->m_dirblksize;
+ endp = (char *)hdr + geo->blksize;
break;
default:
XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
@@ -172,9 +174,9 @@ __xfs_dir3_data_check(
lastfree = 0;
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
- addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- (xfs_dir2_data_aoff_t)
- ((char *)dep - (char *)hdr));
+ addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ (xfs_dir2_data_aoff_t)
+ ((char *)dep - (char *)hdr));
name.name = dep->name;
name.len = dep->namelen;
hash = mp->m_dirnameops->hashname(&name);
@@ -329,12 +331,11 @@ xfs_dir3_data_read(
int
xfs_dir3_data_readahead(
- struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mapped_bno)
{
- return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
+ return xfs_da_reada_buf(dp, bno, mapped_bno,
XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
}
@@ -510,6 +511,7 @@ xfs_dir2_data_freescan(
struct xfs_dir2_data_free *bf;
char *endp; /* end of block's data */
char *p; /* current entry pointer */
+ struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -528,10 +530,10 @@ xfs_dir2_data_freescan(
p = (char *)dp->d_ops->data_entry_p(hdr);
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
- btp = xfs_dir2_block_tail_p(dp->i_mount, hdr);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
endp = (char *)xfs_dir2_block_leaf_p(btp);
} else
- endp = (char *)hdr + dp->i_mount->m_dirblksize;
+ endp = (char *)hdr + geo->blksize;
/*
* Loop over the block's entries.
*/
@@ -585,8 +587,8 @@ xfs_dir3_data_init(
/*
* Get the buffer set up for the block.
*/
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
- XFS_DATA_FORK);
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+ -1, &bp, XFS_DATA_FORK);
if (error)
return error;
bp->b_ops = &xfs_dir3_data_buf_ops;
@@ -621,15 +623,15 @@ xfs_dir3_data_init(
dup = dp->d_ops->data_unused_p(hdr);
dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
- t = mp->m_dirblksize - (uint)dp->d_ops->data_entry_offset;
+ t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
bf[0].length = cpu_to_be16(t);
dup->length = cpu_to_be16(t);
*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
/*
* Log it and return it.
*/
- xfs_dir2_data_log_header(tp, dp, bp);
- xfs_dir2_data_log_unused(tp, bp, dup);
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir2_data_log_unused(args, bp, dup);
*bpp = bp;
return 0;
}
@@ -639,8 +641,7 @@ xfs_dir3_data_init(
*/
void
xfs_dir2_data_log_entry(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_entry_t *dep) /* data entry pointer */
{
@@ -651,8 +652,8 @@ xfs_dir2_data_log_entry(
hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
- xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
- (uint)((char *)(dp->d_ops->data_entry_tag_p(dep) + 1) -
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
+ (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
(char *)hdr - 1));
}
@@ -661,8 +662,7 @@ xfs_dir2_data_log_entry(
*/
void
xfs_dir2_data_log_header(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
#ifdef DEBUG
@@ -674,7 +674,8 @@ xfs_dir2_data_log_header(
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
#endif
- xfs_trans_log_buf(tp, bp, 0, dp->d_ops->data_entry_offset - 1);
+ xfs_trans_log_buf(args->trans, bp, 0,
+ args->dp->d_ops->data_entry_offset - 1);
}
/*
@@ -682,7 +683,7 @@ xfs_dir2_data_log_header(
*/
void
xfs_dir2_data_log_unused(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_unused_t *dup) /* data unused pointer */
{
@@ -696,13 +697,13 @@ xfs_dir2_data_log_unused(
/*
* Log the first part of the unused entry.
*/
- xfs_trans_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr),
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
(uint)((char *)&dup->length + sizeof(dup->length) -
1 - (char *)hdr));
/*
* Log the end (tag) of the unused entry.
*/
- xfs_trans_log_buf(tp, bp,
+ xfs_trans_log_buf(args->trans, bp,
(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
sizeof(xfs_dir2_data_off_t) - 1));
@@ -714,8 +715,7 @@ xfs_dir2_data_log_unused(
*/
void
xfs_dir2_data_make_free(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_aoff_t offset, /* starting byte offset */
xfs_dir2_data_aoff_t len, /* length in bytes */
@@ -725,14 +725,12 @@ xfs_dir2_data_make_free(
xfs_dir2_data_hdr_t *hdr; /* data block pointer */
xfs_dir2_data_free_t *dfp; /* bestfree pointer */
char *endptr; /* end of data area */
- xfs_mount_t *mp; /* filesystem mount point */
int needscan; /* need to regen bestfree */
xfs_dir2_data_unused_t *newdup; /* new unused entry */
xfs_dir2_data_unused_t *postdup; /* unused entry after us */
xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
struct xfs_dir2_data_free *bf;
- mp = tp->t_mountp;
hdr = bp->b_addr;
/*
@@ -740,20 +738,20 @@ xfs_dir2_data_make_free(
*/
if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
- endptr = (char *)hdr + mp->m_dirblksize;
+ endptr = (char *)hdr + args->geo->blksize;
else {
xfs_dir2_block_tail_t *btp; /* block tail */
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
}
/*
* If this isn't the start of the block, then back up to
* the previous entry and see if it's free.
*/
- if (offset > dp->d_ops->data_entry_offset) {
+ if (offset > args->dp->d_ops->data_entry_offset) {
__be16 *tagp; /* tag just before us */
tagp = (__be16 *)((char *)hdr + offset) - 1;
@@ -779,7 +777,7 @@ xfs_dir2_data_make_free(
* Previous and following entries are both free,
* merge everything into a single free entry.
*/
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = args->dp->d_ops->data_bestfree_p(hdr);
if (prevdup && postdup) {
xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
@@ -801,7 +799,7 @@ xfs_dir2_data_make_free(
be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
*xfs_dir2_data_unused_tag_p(prevdup) =
cpu_to_be16((char *)prevdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, prevdup);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
if (!needscan) {
/*
* Has to be the case that entries 0 and 1 are
@@ -836,7 +834,7 @@ xfs_dir2_data_make_free(
be16_add_cpu(&prevdup->length, len);
*xfs_dir2_data_unused_tag_p(prevdup) =
cpu_to_be16((char *)prevdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, prevdup);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
/*
* If the previous entry was in the table, the new entry
* is longer, so it will be in the table too. Remove
@@ -864,7 +862,7 @@ xfs_dir2_data_make_free(
newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If the following entry was in the table, the new entry
* is longer, so it will be in the table too. Remove
@@ -891,7 +889,7 @@ xfs_dir2_data_make_free(
newdup->length = cpu_to_be16(len);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
}
*needscanp = needscan;
@@ -902,8 +900,7 @@ xfs_dir2_data_make_free(
*/
void
xfs_dir2_data_use_free(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_unused_t *dup, /* unused entry */
xfs_dir2_data_aoff_t offset, /* starting offset to use */
@@ -934,7 +931,7 @@ xfs_dir2_data_use_free(
* Look up the entry in the bestfree table.
*/
oldlen = be16_to_cpu(dup->length);
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = args->dp->d_ops->data_bestfree_p(hdr);
dfp = xfs_dir2_data_freefind(hdr, bf, dup);
ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
/*
@@ -966,7 +963,7 @@ xfs_dir2_data_use_free(
newdup->length = cpu_to_be16(oldlen - len);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If it was in the table, remove it and add the new one.
*/
@@ -994,7 +991,7 @@ xfs_dir2_data_use_free(
newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If it was in the table, remove it and add the new one.
*/
@@ -1022,13 +1019,13 @@ xfs_dir2_data_use_free(
newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
*xfs_dir2_data_unused_tag_p(newdup2) =
cpu_to_be16((char *)newdup2 - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup2);
+ xfs_dir2_data_log_unused(args, bp, newdup2);
/*
* If the old entry was in the table, we need to scan
* if the 3rd entry was valid, since these entries
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index d36e97df1187..fb0aad4440c1 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -41,9 +41,10 @@
*/
static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
int *indexp, struct xfs_buf **dbpp);
-static void xfs_dir3_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
- int first, int last);
-static void xfs_dir3_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
+static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
+ struct xfs_buf *bp, int first, int last);
+static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
+ struct xfs_buf *bp);
/*
* Check the internal consistency of a leaf1 block.
@@ -92,6 +93,7 @@ xfs_dir3_leaf_check_int(
int i;
const struct xfs_dir_ops *ops;
struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
/*
* we can be passed a null dp here from a verifier, so we need to go the
@@ -105,14 +107,14 @@ xfs_dir3_leaf_check_int(
}
ents = ops->leaf_ents_p(leaf);
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
/*
* XXX (dgc): This value is not restrictive enough.
* Should factor in the size of the bests table as well.
* We can deduce a value for that from di_size.
*/
- if (hdr->count > ops->leaf_max_ents(mp))
+ if (hdr->count > ops->leaf_max_ents(geo))
return false;
/* Leaves and bests don't overlap in leaf format. */
@@ -323,7 +325,7 @@ xfs_dir3_leaf_init(
if (type == XFS_DIR2_LEAF1_MAGIC) {
struct xfs_dir2_leaf_tail *ltp;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
ltp->bestcount = 0;
bp->b_ops = &xfs_dir3_leaf1_buf_ops;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
@@ -347,18 +349,18 @@ xfs_dir3_leaf_get_buf(
int error;
ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
- ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
- bno < XFS_DIR2_FREE_FIRSTDB(mp));
+ ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+ bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
- XFS_DATA_FORK);
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
+ -1, &bp, XFS_DATA_FORK);
if (error)
return error;
xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
- xfs_dir3_leaf_log_header(tp, dp, bp);
+ xfs_dir3_leaf_log_header(args, bp);
if (magic == XFS_DIR2_LEAF1_MAGIC)
- xfs_dir3_leaf_log_tail(tp, bp);
+ xfs_dir3_leaf_log_tail(args, bp);
*bpp = bp;
return 0;
}
@@ -403,8 +405,8 @@ xfs_dir2_block_to_leaf(
if ((error = xfs_da_grow_inode(args, &blkno))) {
return error;
}
- ldb = xfs_dir2_da_to_db(mp, blkno);
- ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp));
+ ldb = xfs_dir2_da_to_db(args->geo, blkno);
+ ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
/*
* Initialize the leaf block, get a buffer for it.
*/
@@ -415,7 +417,7 @@ xfs_dir2_block_to_leaf(
leaf = lbp->b_addr;
hdr = dbp->b_addr;
xfs_dir3_data_check(dp, dbp);
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
bf = dp->d_ops->data_bestfree_p(hdr);
ents = dp->d_ops->leaf_ents_p(leaf);
@@ -427,23 +429,23 @@ xfs_dir2_block_to_leaf(
leafhdr.count = be32_to_cpu(btp->count);
leafhdr.stale = be32_to_cpu(btp->stale);
dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, dp, lbp);
+ xfs_dir3_leaf_log_header(args, lbp);
/*
* Could compact these but I think we always do the conversion
* after squeezing out stale entries.
*/
memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(tp, dp, lbp, 0, leafhdr.count - 1);
+ xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
needscan = 0;
needlog = 1;
/*
* Make the space formerly occupied by the leaf entries and block
* tail be free.
*/
- xfs_dir2_data_make_free(tp, dp, dbp,
+ xfs_dir2_data_make_free(args, dbp,
(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
- (xfs_dir2_data_aoff_t)((char *)hdr + mp->m_dirblksize -
+ (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
(char *)blp),
&needlog, &needscan);
/*
@@ -461,7 +463,7 @@ xfs_dir2_block_to_leaf(
/*
* Set up leaf tail and bests table.
*/
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ltp->bestcount = cpu_to_be32(1);
bestsp = xfs_dir2_leaf_bests_p(ltp);
bestsp[0] = bf[0].length;
@@ -469,10 +471,10 @@ xfs_dir2_block_to_leaf(
* Log the data header and leaf bests table.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
xfs_dir3_leaf_check(dp, lbp);
xfs_dir3_data_check(dp, dbp);
- xfs_dir3_leaf_log_bests(tp, lbp, 0, 0);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
return 0;
}
@@ -641,7 +643,7 @@ xfs_dir2_leaf_addname(
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
@@ -653,7 +655,7 @@ xfs_dir2_leaf_addname(
*/
index = xfs_dir2_leaf_search_hash(args, lbp);
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ents = dp->d_ops->leaf_ents_p(leaf);
dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
@@ -670,7 +672,7 @@ xfs_dir2_leaf_addname(
index++, lep++) {
if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
continue;
- i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
ASSERT(i < be32_to_cpu(ltp->bestcount));
ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
if (be16_to_cpu(bestsp[i]) >= length) {
@@ -810,14 +812,15 @@ xfs_dir2_leaf_addname(
memmove(&bestsp[0], &bestsp[1],
be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
be32_add_cpu(&ltp->bestcount, 1);
- xfs_dir3_leaf_log_tail(tp, lbp);
- xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
}
/*
* If we're filling in a previously empty block just log it.
*/
else
- xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block);
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
hdr = dbp->b_addr;
bf = dp->d_ops->data_bestfree_p(hdr);
bestsp[use_block] = bf[0].length;
@@ -828,8 +831,8 @@ xfs_dir2_leaf_addname(
* Just read that one in.
*/
error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, use_block),
- -1, &dbp);
+ xfs_dir2_db_to_da(args->geo, use_block),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -848,7 +851,7 @@ xfs_dir2_leaf_addname(
/*
* Mark the initial part of our freespace in use for the new entry.
*/
- xfs_dir2_data_use_free(tp, dp, dbp, dup,
+ xfs_dir2_data_use_free(args, dbp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
&needlog, &needscan);
/*
@@ -870,8 +873,8 @@ xfs_dir2_leaf_addname(
* Need to log the data block's header.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dp, dbp);
- xfs_dir2_data_log_entry(tp, dp, dbp, dep);
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir2_data_log_entry(args, dbp, dep);
/*
* If the bests table needs to be changed, do it.
* Log the change unless we've already done that.
@@ -879,7 +882,7 @@ xfs_dir2_leaf_addname(
if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
bestsp[use_block] = bf[0].length;
if (!grown)
- xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block);
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
}
lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
@@ -889,14 +892,15 @@ xfs_dir2_leaf_addname(
* Fill in the new leaf entry.
*/
lep->hashval = cpu_to_be32(args->hashval);
- lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block,
+ lep->address = cpu_to_be32(
+ xfs_dir2_db_off_to_dataptr(args->geo, use_block,
be16_to_cpu(*tagp)));
/*
* Log the leaf fields and give up the buffers.
*/
dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, dp, lbp);
- xfs_dir3_leaf_log_ents(tp, dp, lbp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
xfs_dir3_leaf_check(dp, lbp);
xfs_dir3_data_check(dp, dbp);
return 0;
@@ -948,9 +952,9 @@ xfs_dir3_leaf_compact(
leafhdr->stale = 0;
dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
- xfs_dir3_leaf_log_header(args->trans, dp, bp);
+ xfs_dir3_leaf_log_header(args, bp);
if (loglow != -1)
- xfs_dir3_leaf_log_ents(args->trans, dp, bp, loglow, to - 1);
+ xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
}
/*
@@ -1052,7 +1056,7 @@ xfs_dir3_leaf_compact_x1(
*/
static void
xfs_dir3_leaf_log_bests(
- xfs_trans_t *tp, /* transaction pointer */
+ struct xfs_da_args *args,
struct xfs_buf *bp, /* leaf buffer */
int first, /* first entry to log */
int last) /* last entry to log */
@@ -1065,10 +1069,11 @@ xfs_dir3_leaf_log_bests(
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
- ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
firstb = xfs_dir2_leaf_bests_p(ltp) + first;
lastb = xfs_dir2_leaf_bests_p(ltp) + last;
- xfs_trans_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstb - (char *)leaf),
(uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
}
@@ -1077,8 +1082,7 @@ xfs_dir3_leaf_log_bests(
*/
void
xfs_dir3_leaf_log_ents(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
int first,
int last)
@@ -1093,10 +1097,11 @@ xfs_dir3_leaf_log_ents(
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
- ents = dp->d_ops->leaf_ents_p(leaf);
+ ents = args->dp->d_ops->leaf_ents_p(leaf);
firstlep = &ents[first];
lastlep = &ents[last];
- xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstlep - (char *)leaf),
(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
}
@@ -1105,8 +1110,7 @@ xfs_dir3_leaf_log_ents(
*/
void
xfs_dir3_leaf_log_header(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
struct xfs_dir2_leaf *leaf = bp->b_addr;
@@ -1116,8 +1120,9 @@ xfs_dir3_leaf_log_header(
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
- xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
- dp->d_ops->leaf_hdr_size - 1);
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)&leaf->hdr - (char *)leaf),
+ args->dp->d_ops->leaf_hdr_size - 1);
}
/*
@@ -1125,21 +1130,20 @@ xfs_dir3_leaf_log_header(
*/
STATIC void
xfs_dir3_leaf_log_tail(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
struct xfs_dir2_leaf *leaf = bp->b_addr;
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- struct xfs_mount *mp = tp->t_mountp;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
- (uint)(mp->m_dirblksize - 1));
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
+ (uint)(args->geo->blksize - 1));
}
/*
@@ -1185,7 +1189,7 @@ xfs_dir2_leaf_lookup(
*/
dep = (xfs_dir2_data_entry_t *)
((char *)dbp->b_addr +
- xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
/*
* Return the found inode number & CI name if appropriate
*/
@@ -1231,7 +1235,7 @@ xfs_dir2_leaf_lookup_int(
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
@@ -1260,7 +1264,8 @@ xfs_dir2_leaf_lookup_int(
/*
* Get the new data block number.
*/
- newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* If it's not the same as the old data block number,
* need to pitch the old one and read the new one.
@@ -1269,8 +1274,8 @@ xfs_dir2_leaf_lookup_int(
if (dbp)
xfs_trans_brelse(tp, dbp);
error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, newdb),
- -1, &dbp);
+ xfs_dir2_db_to_da(args->geo, newdb),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1281,7 +1286,8 @@ xfs_dir2_leaf_lookup_int(
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
/*
* Compare name and if it's an exact match, return the index
* and buffer. If it's the first case-insensitive match, store
@@ -1310,8 +1316,8 @@ xfs_dir2_leaf_lookup_int(
if (cidb != curdb) {
xfs_trans_brelse(tp, dbp);
error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, cidb),
- -1, &dbp);
+ xfs_dir2_db_to_da(args->geo, cidb),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1380,18 +1386,18 @@ xfs_dir2_leaf_removename(
* Point to the leaf entry, use that to point to the data entry.
*/
lep = &ents[index];
- db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
needscan = needlog = 0;
oldbest = be16_to_cpu(bf[0].length);
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
/*
* Mark the former data entry unused.
*/
- xfs_dir2_data_make_free(tp, dp, dbp,
+ xfs_dir2_data_make_free(args, dbp,
(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
@@ -1399,10 +1405,10 @@ xfs_dir2_leaf_removename(
*/
leafhdr.stale++;
dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, dp, lbp);
+ xfs_dir3_leaf_log_header(args, lbp);
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir3_leaf_log_ents(tp, dp, lbp, index, index);
+ xfs_dir3_leaf_log_ents(args, lbp, index, index);
/*
* Scan the freespace in the data block again if necessary,
@@ -1411,22 +1417,22 @@ xfs_dir2_leaf_removename(
if (needscan)
xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* If the longest freespace in the data block has changed,
* put the new value in the bests table and log that.
*/
if (be16_to_cpu(bf[0].length) != oldbest) {
bestsp[db] = bf[0].length;
- xfs_dir3_leaf_log_bests(tp, lbp, db, db);
+ xfs_dir3_leaf_log_bests(args, lbp, db, db);
}
xfs_dir3_data_check(dp, dbp);
/*
* If the data block is now empty then get rid of the data block.
*/
if (be16_to_cpu(bf[0].length) ==
- mp->m_dirblksize - dp->d_ops->data_entry_offset) {
- ASSERT(db != mp->m_dirdatablk);
+ args->geo->blksize - dp->d_ops->data_entry_offset) {
+ ASSERT(db != args->geo->datablk);
if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
/*
* Nope, can't get rid of it because it caused
@@ -1459,15 +1465,16 @@ xfs_dir2_leaf_removename(
memmove(&bestsp[db - i], bestsp,
(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
be32_add_cpu(&ltp->bestcount, -(db - i));
- xfs_dir3_leaf_log_tail(tp, lbp);
- xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
} else
bestsp[db] = cpu_to_be16(NULLDATAOFF);
}
/*
* If the data block was not the first one, drop it.
*/
- else if (db != mp->m_dirdatablk)
+ else if (db != args->geo->datablk)
dbp = NULL;
xfs_dir3_leaf_check(dp, lbp);
@@ -1515,7 +1522,7 @@ xfs_dir2_leaf_replace(
*/
dep = (xfs_dir2_data_entry_t *)
((char *)dbp->b_addr +
- xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
ASSERT(args->inumber != be64_to_cpu(dep->inumber));
/*
* Put the new inode number in, log it.
@@ -1523,7 +1530,7 @@ xfs_dir2_leaf_replace(
dep->inumber = cpu_to_be64(args->inumber);
dp->d_ops->data_put_ftype(dep, args->filetype);
tp = args->trans;
- xfs_dir2_data_log_entry(tp, dp, dbp, dep);
+ xfs_dir2_data_log_entry(args, dbp, dep);
xfs_dir3_leaf_check(dp, lbp);
xfs_trans_brelse(tp, lbp);
return 0;
@@ -1609,12 +1616,13 @@ xfs_dir2_leaf_trim_data(
/*
* Read the offending data block. We need its buffer.
*/
- error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
+ -1, &dbp);
if (error)
return error;
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
#ifdef DEBUG
{
@@ -1624,7 +1632,7 @@ xfs_dir2_leaf_trim_data(
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
ASSERT(be16_to_cpu(bf[0].length) ==
- mp->m_dirblksize - dp->d_ops->data_entry_offset);
+ args->geo->blksize - dp->d_ops->data_entry_offset);
ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
}
#endif
@@ -1643,8 +1651,8 @@ xfs_dir2_leaf_trim_data(
bestsp = xfs_dir2_leaf_bests_p(ltp);
be32_add_cpu(&ltp->bestcount, -1);
memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
- xfs_dir3_leaf_log_tail(tp, lbp);
- xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
return 0;
}
@@ -1708,22 +1716,22 @@ xfs_dir2_node_to_leaf(
/*
* Get the last offset in the file.
*/
- if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
+ if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
return error;
}
- fo -= mp->m_dirblkfsbs;
+ fo -= args->geo->fsbcount;
/*
* If there are freespace blocks other than the first one,
* take this opportunity to remove trailing empty freespace blocks
* that may have been left behind during no-space-reservation
* operations.
*/
- while (fo > mp->m_dirfreeblk) {
+ while (fo > args->geo->freeblk) {
if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
return error;
}
if (rval)
- fo -= mp->m_dirblkfsbs;
+ fo -= args->geo->fsbcount;
else
return 0;
}
@@ -1736,7 +1744,7 @@ xfs_dir2_node_to_leaf(
/*
* If it's not the single leaf block, give up.
*/
- if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
+ if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
return 0;
lbp = state->path.blk[0].bp;
leaf = lbp->b_addr;
@@ -1748,7 +1756,7 @@ xfs_dir2_node_to_leaf(
/*
* Read the freespace block.
*/
- error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
+ error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
if (error)
return error;
free = fbp->b_addr;
@@ -1760,7 +1768,7 @@ xfs_dir2_node_to_leaf(
* Now see if the leafn and free data will fit in a leaf1.
* If not, release the buffer and give up.
*/
- if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > mp->m_dirblksize) {
+ if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
xfs_trans_brelse(tp, fbp);
return 0;
}
@@ -1780,7 +1788,7 @@ xfs_dir2_node_to_leaf(
/*
* Set up the leaf tail from the freespace block.
*/
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ltp->bestcount = cpu_to_be32(freehdr.nvalid);
/*
@@ -1790,15 +1798,17 @@ xfs_dir2_node_to_leaf(
freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, dp, lbp);
- xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
- xfs_dir3_leaf_log_tail(tp, lbp);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
xfs_dir3_leaf_check(dp, lbp);
/*
* Get rid of the freespace block.
*/
- error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp);
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
+ fbp);
if (error) {
/*
* This can't fail here because it can only happen when
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index cb434d732681..da43d304fca2 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -195,17 +195,18 @@ xfs_dir2_free_try_read(
static int
xfs_dir3_free_get_buf(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ xfs_da_args_t *args,
xfs_dir2_db_t fbno,
struct xfs_buf **bpp)
{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp;
int error;
struct xfs_dir3_icfree_hdr hdr;
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno),
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
-1, &bp, XFS_DATA_FORK);
if (error)
return error;
@@ -240,8 +241,7 @@ xfs_dir3_free_get_buf(
*/
STATIC void
xfs_dir2_free_log_bests(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
int first, /* first entry to log */
int last) /* last entry to log */
@@ -250,10 +250,10 @@ xfs_dir2_free_log_bests(
__be16 *bests;
free = bp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
+ bests = args->dp->d_ops->free_bests_p(free);
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
- xfs_trans_log_buf(tp, bp,
+ xfs_trans_log_buf(args->trans, bp,
(uint)((char *)&bests[first] - (char *)free),
(uint)((char *)&bests[last] - (char *)free +
sizeof(bests[0]) - 1));
@@ -264,8 +264,7 @@ xfs_dir2_free_log_bests(
*/
static void
xfs_dir2_free_log_header(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
#ifdef DEBUG
@@ -275,7 +274,8 @@ xfs_dir2_free_log_header(
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
#endif
- xfs_trans_log_buf(tp, bp, 0, dp->d_ops->free_hdr_size - 1);
+ xfs_trans_log_buf(args->trans, bp, 0,
+ args->dp->d_ops->free_hdr_size - 1);
}
/*
@@ -315,20 +315,20 @@ xfs_dir2_leaf_to_node(
if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
return error;
}
- ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp));
+ ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
/*
* Get the buffer for the new freespace block.
*/
- error = xfs_dir3_free_get_buf(tp, dp, fdb, &fbp);
+ error = xfs_dir3_free_get_buf(args, fdb, &fbp);
if (error)
return error;
free = fbp->b_addr;
dp->d_ops->free_hdr_from_disk(&freehdr, free);
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ASSERT(be32_to_cpu(ltp->bestcount) <=
- (uint)dp->i_d.di_size / mp->m_dirblksize);
+ (uint)dp->i_d.di_size / args->geo->blksize);
/*
* Copy freespace entries from the leaf block to the new block.
@@ -349,8 +349,8 @@ xfs_dir2_leaf_to_node(
freehdr.nvalid = be32_to_cpu(ltp->bestcount);
dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
- xfs_dir2_free_log_bests(tp, dp, fbp, 0, freehdr.nvalid - 1);
- xfs_dir2_free_log_header(tp, dp, fbp);
+ xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
+ xfs_dir2_free_log_header(args, fbp);
/*
* Converting the leaf to a leafnode is just a matter of changing the
@@ -364,7 +364,7 @@ xfs_dir2_leaf_to_node(
leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
lbp->b_ops = &xfs_dir3_leafn_buf_ops;
xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
- xfs_dir3_leaf_log_header(tp, dp, lbp);
+ xfs_dir3_leaf_log_header(args, lbp);
xfs_dir3_leaf_check(dp, lbp);
return 0;
}
@@ -415,7 +415,7 @@ xfs_dir2_leafn_add(
* a compact.
*/
- if (leafhdr.count == dp->d_ops->leaf_max_ents(mp)) {
+ if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
if (!leafhdr.stale)
return XFS_ERROR(ENOSPC);
compact = leafhdr.stale > 1;
@@ -450,12 +450,12 @@ xfs_dir2_leafn_add(
highstale, &lfloglow, &lfloghigh);
lep->hashval = cpu_to_be32(args->hashval);
- lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp,
+ lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
args->blkno, args->index));
dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, dp, bp);
- xfs_dir3_leaf_log_ents(tp, dp, bp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_log_header(args, bp);
+ xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
xfs_dir3_leaf_check(dp, bp);
return 0;
}
@@ -471,7 +471,8 @@ xfs_dir2_free_hdr_check(
dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
- ASSERT((hdr.firstdb % dp->d_ops->free_max_bests(dp->i_mount)) == 0);
+ ASSERT((hdr.firstdb %
+ dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
ASSERT(hdr.firstdb <= db);
ASSERT(db < hdr.firstdb + hdr.nvalid);
}
@@ -576,7 +577,8 @@ xfs_dir2_leafn_lookup_for_addname(
/*
* Pull the data block number from the entry.
*/
- newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* For addname, we're looking for a place to put the new entry.
* We want to use a data block with an entry of equal
@@ -593,7 +595,7 @@ xfs_dir2_leafn_lookup_for_addname(
* Convert the data block to the free block
* holding its freespace information.
*/
- newfdb = dp->d_ops->db_to_fdb(mp, newdb);
+ newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
/*
* If it's not the one we have in hand, read it in.
*/
@@ -605,7 +607,8 @@ xfs_dir2_leafn_lookup_for_addname(
xfs_trans_brelse(tp, curbp);
error = xfs_dir2_free_read(tp, dp,
- xfs_dir2_db_to_da(mp, newfdb),
+ xfs_dir2_db_to_da(args->geo,
+ newfdb),
&curbp);
if (error)
return error;
@@ -616,7 +619,7 @@ xfs_dir2_leafn_lookup_for_addname(
/*
* Get the index for our entry.
*/
- fi = dp->d_ops->db_to_fdindex(mp, curdb);
+ fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
/*
* If it has room, return it.
*/
@@ -721,7 +724,8 @@ xfs_dir2_leafn_lookup_for_entry(
/*
* Pull the data block number from the entry.
*/
- newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* Not adding a new entry, so we really want to find
* the name given to us.
@@ -746,7 +750,8 @@ xfs_dir2_leafn_lookup_for_entry(
curbp = state->extrablk.bp;
} else {
error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, newdb),
+ xfs_dir2_db_to_da(args->geo,
+ newdb),
-1, &curbp);
if (error)
return error;
@@ -758,7 +763,8 @@ xfs_dir2_leafn_lookup_for_entry(
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
/*
* Compare the entry and if it's an exact match, return
* EEXIST immediately. If it's the first case-insensitive
@@ -844,7 +850,6 @@ xfs_dir3_leafn_moveents(
int start_d,/* destination leaf index */
int count) /* count of leaves to copy */
{
- struct xfs_trans *tp = args->trans;
int stale; /* count stale leaves copied */
trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
@@ -863,7 +868,7 @@ xfs_dir3_leafn_moveents(
if (start_d < dhdr->count) {
memmove(&dents[start_d + count], &dents[start_d],
(dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(tp, args->dp, bp_d, start_d + count,
+ xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
count + dhdr->count - 1);
}
/*
@@ -885,8 +890,7 @@ xfs_dir3_leafn_moveents(
*/
memcpy(&dents[start_d], &sents[start_s],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(tp, args->dp, bp_d,
- start_d, start_d + count - 1);
+ xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
/*
* If there are source entries after the ones we copied,
@@ -895,8 +899,7 @@ xfs_dir3_leafn_moveents(
if (start_s + count < shdr->count) {
memmove(&sents[start_s], &sents[start_s + count],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(tp, args->dp, bp_s,
- start_s, start_s + count - 1);
+ xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
}
/*
@@ -1032,8 +1035,8 @@ xfs_dir2_leafn_rebalance(
/* log the changes made when moving the entries */
dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
- xfs_dir3_leaf_log_header(args->trans, dp, blk1->bp);
- xfs_dir3_leaf_log_header(args->trans, dp, blk2->bp);
+ xfs_dir3_leaf_log_header(args, blk1->bp);
+ xfs_dir3_leaf_log_header(args, blk2->bp);
xfs_dir3_leaf_check(dp, blk1->bp);
xfs_dir3_leaf_check(dp, blk2->bp);
@@ -1076,7 +1079,6 @@ xfs_dir3_data_block_free(
struct xfs_buf *fbp,
int longest)
{
- struct xfs_trans *tp = args->trans;
int logfree = 0;
__be16 *bests;
struct xfs_dir3_icfree_hdr freehdr;
@@ -1090,7 +1092,7 @@ xfs_dir3_data_block_free(
* value.
*/
bests[findex] = cpu_to_be16(longest);
- xfs_dir2_free_log_bests(tp, dp, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
return 0;
}
@@ -1118,7 +1120,7 @@ xfs_dir3_data_block_free(
}
dp->d_ops->free_hdr_to_disk(free, &freehdr);
- xfs_dir2_free_log_header(tp, dp, fbp);
+ xfs_dir2_free_log_header(args, fbp);
/*
* If there are no useful entries left in the block, get rid of the
@@ -1142,7 +1144,7 @@ xfs_dir3_data_block_free(
/* Log the free entry that changed, unless we got rid of it. */
if (logfree)
- xfs_dir2_free_log_bests(tp, dp, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
return 0;
}
@@ -1193,9 +1195,9 @@ xfs_dir2_leafn_remove(
/*
* Extract the data block and offset from the entry.
*/
- db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
ASSERT(dblk->blkno == db);
- off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address));
+ off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
ASSERT(dblk->index == off);
/*
@@ -1204,10 +1206,10 @@ xfs_dir2_leafn_remove(
*/
leafhdr.stale++;
dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, dp, bp);
+ xfs_dir3_leaf_log_header(args, bp);
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir3_leaf_log_ents(tp, dp, bp, index, index);
+ xfs_dir3_leaf_log_ents(args, bp, index, index);
/*
* Make the data entry free. Keep track of the longest freespace
@@ -1219,7 +1221,7 @@ xfs_dir2_leafn_remove(
bf = dp->d_ops->data_bestfree_p(hdr);
longest = be16_to_cpu(bf[0].length);
needlog = needscan = 0;
- xfs_dir2_data_make_free(tp, dp, dbp, off,
+ xfs_dir2_data_make_free(args, dbp, off,
dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
* Rescan the data block freespaces for bestfree.
@@ -1228,7 +1230,7 @@ xfs_dir2_leafn_remove(
if (needscan)
xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
xfs_dir3_data_check(dp, dbp);
/*
* If the longest data block freespace changes, need to update
@@ -1245,8 +1247,9 @@ xfs_dir2_leafn_remove(
* Convert the data block number to a free block,
* read in the free block.
*/
- fdb = dp->d_ops->db_to_fdb(mp, db);
- error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+ fdb = dp->d_ops->db_to_fdb(args->geo, db);
+ error = xfs_dir2_free_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fdb),
&fbp);
if (error)
return error;
@@ -1255,20 +1258,21 @@ xfs_dir2_leafn_remove(
{
struct xfs_dir3_icfree_hdr freehdr;
dp->d_ops->free_hdr_from_disk(&freehdr, free);
- ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(mp) *
- (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+ ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
+ (fdb - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)));
}
#endif
/*
* Calculate which entry we need to fix.
*/
- findex = dp->d_ops->db_to_fdindex(mp, db);
+ findex = dp->d_ops->db_to_fdindex(args->geo, db);
longest = be16_to_cpu(bf[0].length);
/*
* If the data block is now empty we can get rid of it
* (usually).
*/
- if (longest == mp->m_dirblksize -
+ if (longest == args->geo->blksize -
dp->d_ops->data_entry_offset) {
/*
* Try to punch out the data block.
@@ -1303,7 +1307,7 @@ xfs_dir2_leafn_remove(
*/
*rval = (dp->d_ops->leaf_hdr_size +
(uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
- mp->m_dir_magicpct;
+ args->geo->magicpct;
return 0;
}
@@ -1336,7 +1340,7 @@ xfs_dir2_leafn_split(
/*
* Initialize the new leaf block.
*/
- error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(mp, blkno),
+ error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
&newblk->bp, XFS_DIR2_LEAFN_MAGIC);
if (error)
return error;
@@ -1410,7 +1414,7 @@ xfs_dir2_leafn_toosmall(
count = leafhdr.count - leafhdr.stale;
bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
- if (bytes > (state->blocksize >> 1)) {
+ if (bytes > (state->args->geo->blksize >> 1)) {
/*
* Blk over 50%, don't try to join.
*/
@@ -1463,7 +1467,8 @@ xfs_dir2_leafn_toosmall(
* Count bytes in the two blocks combined.
*/
count = leafhdr.count - leafhdr.stale;
- bytes = state->blocksize - (state->blocksize >> 2);
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2);
leaf = bp->b_addr;
dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
@@ -1560,8 +1565,8 @@ xfs_dir2_leafn_unbalance(
/* log the changes made when moving the entries */
dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
- xfs_dir3_leaf_log_header(args->trans, dp, save_blk->bp);
- xfs_dir3_leaf_log_header(args->trans, dp, drop_blk->bp);
+ xfs_dir3_leaf_log_header(args, save_blk->bp);
+ xfs_dir3_leaf_log_header(args, drop_blk->bp);
xfs_dir3_leaf_check(dp, save_blk->bp);
xfs_dir3_leaf_check(dp, drop_blk->bp);
@@ -1587,8 +1592,6 @@ xfs_dir2_node_addname(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
/*
* Look up the name. We're not supposed to find it, but
* this gives us the insertion point.
@@ -1727,9 +1730,9 @@ xfs_dir2_node_addname_int(
if (dbno == -1) {
xfs_fileoff_t fo; /* freespace block number */
- if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK)))
+ if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
return error;
- lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo);
+ lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
fbno = ifbno;
}
/*
@@ -1747,7 +1750,8 @@ xfs_dir2_node_addname_int(
* us a freespace block to start with.
*/
if (++fbno == 0)
- fbno = XFS_DIR2_FREE_FIRSTDB(mp);
+ fbno = xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET);
/*
* If it's ifbno we already looked at it.
*/
@@ -1765,8 +1769,8 @@ xfs_dir2_node_addname_int(
* to avoid it.
*/
error = xfs_dir2_free_try_read(tp, dp,
- xfs_dir2_db_to_da(mp, fbno),
- &fbp);
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
if (error)
return error;
if (!fbp)
@@ -1834,10 +1838,10 @@ xfs_dir2_node_addname_int(
* Get the freespace block corresponding to the data block
* that was just allocated.
*/
- fbno = dp->d_ops->db_to_fdb(mp, dbno);
+ fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
error = xfs_dir2_free_try_read(tp, dp,
- xfs_dir2_db_to_da(mp, fbno),
- &fbp);
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
if (error)
return error;
@@ -1851,12 +1855,13 @@ xfs_dir2_node_addname_int(
if (error)
return error;
- if (unlikely(dp->d_ops->db_to_fdb(mp, dbno) != fbno)) {
+ if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
xfs_alert(mp,
"%s: dir ino %llu needed freesp block %lld for\n"
" data block %lld, got %lld ifbno %llu lastfbno %d",
__func__, (unsigned long long)dp->i_ino,
- (long long)dp->d_ops->db_to_fdb(mp, dbno),
+ (long long)dp->d_ops->db_to_fdb(
+ args->geo, dbno),
(long long)dbno, (long long)fbno,
(unsigned long long)ifbno, lastfbno);
if (fblk) {
@@ -1877,7 +1882,7 @@ xfs_dir2_node_addname_int(
/*
* Get a buffer for the new block.
*/
- error = xfs_dir3_free_get_buf(tp, dp, fbno, &fbp);
+ error = xfs_dir3_free_get_buf(args, fbno, &fbp);
if (error)
return error;
free = fbp->b_addr;
@@ -1887,8 +1892,10 @@ xfs_dir2_node_addname_int(
/*
* Remember the first slot as our empty slot.
*/
- freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
- dp->d_ops->free_max_bests(mp);
+ freehdr.firstdb =
+ (fbno - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)) *
+ dp->d_ops->free_max_bests(args->geo);
} else {
free = fbp->b_addr;
bests = dp->d_ops->free_bests_p(free);
@@ -1898,13 +1905,13 @@ xfs_dir2_node_addname_int(
/*
* Set the freespace block index from the data block number.
*/
- findex = dp->d_ops->db_to_fdindex(mp, dbno);
+ findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
/*
* If it's after the end of the current entries in the
* freespace block, extend that table.
*/
if (findex >= freehdr.nvalid) {
- ASSERT(findex < dp->d_ops->free_max_bests(mp));
+ ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
freehdr.nvalid = findex + 1;
/*
* Tag new entry so nused will go up.
@@ -1918,7 +1925,7 @@ xfs_dir2_node_addname_int(
if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
freehdr.nused++;
dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
- xfs_dir2_free_log_header(tp, dp, fbp);
+ xfs_dir2_free_log_header(args, fbp);
}
/*
* Update the real value in the table.
@@ -1943,7 +1950,8 @@ xfs_dir2_node_addname_int(
/*
* Read the data block in.
*/
- error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, dbno),
-1, &dbp);
if (error)
return error;
@@ -1961,7 +1969,7 @@ xfs_dir2_node_addname_int(
/*
* Mark the first part of the unused space, inuse for us.
*/
- xfs_dir2_data_use_free(tp, dp, dbp, dup,
+ xfs_dir2_data_use_free(args, dbp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
&needlog, &needscan);
/*
@@ -1974,7 +1982,7 @@ xfs_dir2_node_addname_int(
dp->d_ops->data_put_ftype(dep, args->filetype);
tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, dp, dbp, dep);
+ xfs_dir2_data_log_entry(args, dbp, dep);
/*
* Rescan the block for bestfree if needed.
*/
@@ -1984,7 +1992,7 @@ xfs_dir2_node_addname_int(
* Log the data block header if needed.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* If the freespace entry is now wrong, update it.
*/
@@ -1997,7 +2005,7 @@ xfs_dir2_node_addname_int(
* Log the freespace entry if needed.
*/
if (logfree)
- xfs_dir2_free_log_bests(tp, dp, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
/*
* Return the data block and offset in args, then drop the data block.
*/
@@ -2028,8 +2036,6 @@ xfs_dir2_node_lookup(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
/*
* Fill in the path to the entry in the cursor.
*/
@@ -2083,8 +2089,6 @@ xfs_dir2_node_removename(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
/* Look up the entry we're deleting, set up the cursor. */
error = xfs_da3_node_lookup_int(state, &rval);
@@ -2153,8 +2157,6 @@ xfs_dir2_node_replace(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
inum = args->inumber;
/*
* Lookup the entry to change in the btree.
@@ -2186,15 +2188,15 @@ xfs_dir2_node_replace(
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
dep = (xfs_dir2_data_entry_t *)
((char *)hdr +
- xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
ASSERT(inum != be64_to_cpu(dep->inumber));
/*
* Fill in the new inode number and log the entry.
*/
dep->inumber = cpu_to_be64(inum);
args->dp->d_ops->data_put_ftype(dep, args->filetype);
- xfs_dir2_data_log_entry(args->trans, args->dp,
- state->extrablk.bp, dep);
+ xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
rval = 0;
}
/*
@@ -2262,9 +2264,9 @@ xfs_dir2_node_trim_free(
/*
* Blow the block away.
*/
- if ((error =
- xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo),
- bp))) {
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
+ if (error) {
/*
* Can't fail with ENOSPC since that only happens with no
* space reservation, when breaking up an extent into two
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 8b9d2281f85b..27ce0794d196 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -20,6 +20,140 @@
struct dir_context;
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+ return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr. It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+ return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in file space to (DB) block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_db_t)(by >> geo->blklog);
+}
+
+/*
+ * Convert dataptr to a (DB) block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert byte in file space to byte offset within a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert (DB) block and offset in block to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return ((xfs_dir2_off_t)db << geo->blklog) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert byte in file space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert block (dablk) to byte offset in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
+}
+
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+ return ((struct xfs_dir2_block_tail *)
+ ((char *)hdr + geo->blksize)) - 1;
+}
+
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+ return (struct xfs_dir2_leaf_tail *)
+ ((char *)lp + geo->blksize -
+ sizeof(struct xfs_dir2_leaf_tail));
+}
+
/* xfs_dir2.c */
extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
@@ -54,8 +188,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
-extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mapped_bno);
+extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno);
extern struct xfs_dir2_data_free *
xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
@@ -77,9 +211,9 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
struct xfs_buf **bpp, __uint16_t magic);
-extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_inode *dp,
+extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
struct xfs_buf *bp, int first, int last);
-extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp, struct xfs_inode *dp,
+extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
struct xfs_buf *bp);
extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index aead369e1c30..48e99afb9cb0 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -76,26 +76,25 @@ const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
STATIC int
xfs_dir2_sf_getdents(
- xfs_inode_t *dp, /* incore directory inode */
+ struct xfs_da_args *args,
struct dir_context *ctx)
{
int i; /* shortform entry number */
- xfs_mount_t *mp; /* filesystem mount point */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
xfs_dir2_dataptr_t off; /* current entry's offset */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
xfs_dir2_dataptr_t dot_offset;
xfs_dir2_dataptr_t dotdot_offset;
xfs_ino_t ino;
-
- mp = dp->i_mount;
+ struct xfs_da_geometry *geo = args->geo;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
/*
* Give up if the directory is way too short.
*/
if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
return XFS_ERROR(EIO);
}
@@ -109,18 +108,18 @@ xfs_dir2_sf_getdents(
/*
* If the block number in the offset is out of range, we're done.
*/
- if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
+ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
/*
* Precalculate offsets for . and .. as we will always need them.
*
* XXX(hch): the second argument is sometimes 0 and sometimes
- * mp->m_dirdatablk.
+ * geo->datablk.
*/
- dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
dp->d_ops->data_dot_offset);
- dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
dp->d_ops->data_dotdot_offset);
/*
@@ -149,7 +148,7 @@ xfs_dir2_sf_getdents(
for (i = 0; i < sfp->count; i++) {
__uint8_t filetype;
- off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
xfs_dir2_sf_get_offset(sfep));
if (ctx->pos > off) {
@@ -161,13 +160,13 @@ xfs_dir2_sf_getdents(
filetype = dp->d_ops->sf_get_ftype(sfep);
ctx->pos = off & 0x7fffffff;
if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
- xfs_dir3_get_dtype(mp, filetype)))
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
return 0;
sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
- ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
- 0x7fffffff;
+ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+ 0x7fffffff;
return 0;
}
@@ -176,9 +175,10 @@ xfs_dir2_sf_getdents(
*/
STATIC int
xfs_dir2_block_getdents(
- xfs_inode_t *dp, /* incore inode */
+ struct xfs_da_args *args,
struct dir_context *ctx)
{
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
xfs_dir2_data_hdr_t *hdr; /* block header */
struct xfs_buf *bp; /* buffer for block */
xfs_dir2_block_tail_t *btp; /* block tail */
@@ -186,16 +186,15 @@ xfs_dir2_block_getdents(
xfs_dir2_data_unused_t *dup; /* block unused entry */
char *endptr; /* end of the data entries */
int error; /* error return value */
- xfs_mount_t *mp; /* filesystem mount point */
char *ptr; /* current data entry */
int wantoff; /* starting block offset */
xfs_off_t cook;
+ struct xfs_da_geometry *geo = args->geo;
- mp = dp->i_mount;
/*
* If the block number in the offset is out of range, we're done.
*/
- if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
+ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
error = xfs_dir3_block_read(NULL, dp, &bp);
@@ -206,13 +205,13 @@ xfs_dir2_block_getdents(
* Extract the byte offset we start at from the seek pointer.
* We'll skip entries before this.
*/
- wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
+ wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos);
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
/*
* Set up values for the loop.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
ptr = (char *)dp->d_ops->data_entry_p(hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
@@ -244,7 +243,7 @@ xfs_dir2_block_getdents(
if ((char *)dep - (char *)hdr < wantoff)
continue;
- cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
(char *)dep - (char *)hdr);
ctx->pos = cook & 0x7fffffff;
@@ -254,7 +253,7 @@ xfs_dir2_block_getdents(
*/
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
- xfs_dir3_get_dtype(mp, filetype))) {
+ xfs_dir3_get_dtype(dp->i_mount, filetype))) {
xfs_trans_brelse(NULL, bp);
return 0;
}
@@ -264,8 +263,8 @@ xfs_dir2_block_getdents(
* Reached the end of the block.
* Set the offset to a non-existent block 1 and return.
*/
- ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
- 0x7fffffff;
+ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+ 0x7fffffff;
xfs_trans_brelse(NULL, bp);
return 0;
}
@@ -286,13 +285,13 @@ struct xfs_dir2_leaf_map_info {
STATIC int
xfs_dir2_leaf_readbuf(
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
size_t bufsize,
struct xfs_dir2_leaf_map_info *mip,
xfs_dir2_off_t *curoff,
struct xfs_buf **bpp)
{
- struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = *bpp;
struct xfs_bmbt_irec *map = mip->map;
struct blk_plug plug;
@@ -300,6 +299,7 @@ xfs_dir2_leaf_readbuf(
int length;
int i;
int j;
+ struct xfs_da_geometry *geo = args->geo;
/*
* If we have a buffer, we need to release it and
@@ -309,12 +309,12 @@ xfs_dir2_leaf_readbuf(
if (bp) {
xfs_trans_brelse(NULL, bp);
bp = NULL;
- mip->map_blocks -= mp->m_dirblkfsbs;
+ mip->map_blocks -= geo->fsbcount;
/*
* Loop to get rid of the extents for the
* directory block.
*/
- for (i = mp->m_dirblkfsbs; i > 0; ) {
+ for (i = geo->fsbcount; i > 0; ) {
j = min_t(int, map->br_blockcount, i);
map->br_blockcount -= j;
map->br_startblock += j;
@@ -333,8 +333,7 @@ xfs_dir2_leaf_readbuf(
/*
* Recalculate the readahead blocks wanted.
*/
- mip->ra_want = howmany(bufsize + mp->m_dirblksize,
- mp->m_sb.sb_blocksize) - 1;
+ mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1;
ASSERT(mip->ra_want >= 0);
/*
@@ -342,14 +341,14 @@ xfs_dir2_leaf_readbuf(
* run out of data blocks, get some more mappings.
*/
if (1 + mip->ra_want > mip->map_blocks &&
- mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
+ mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) {
/*
* Get more bmaps, fill in after the ones
* we already have in the table.
*/
mip->nmap = mip->map_size - mip->map_valid;
error = xfs_bmapi_read(dp, mip->map_off,
- xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
+ xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) -
mip->map_off,
&map[mip->map_valid], &mip->nmap, 0);
@@ -370,7 +369,7 @@ xfs_dir2_leaf_readbuf(
i = mip->map_valid + mip->nmap - 1;
mip->map_off = map[i].br_startoff + map[i].br_blockcount;
} else
- mip->map_off = xfs_dir2_byte_to_da(mp,
+ mip->map_off = xfs_dir2_byte_to_da(geo,
XFS_DIR2_LEAF_OFFSET);
/*
@@ -396,18 +395,18 @@ xfs_dir2_leaf_readbuf(
* No valid mappings, so no more data blocks.
*/
if (!mip->map_valid) {
- *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
+ *curoff = xfs_dir2_da_to_byte(geo, mip->map_off);
goto out;
}
/*
* Read the directory block starting at the first mapping.
*/
- mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
+ mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff);
error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
- map->br_blockcount >= mp->m_dirblkfsbs ?
- XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
-
+ map->br_blockcount >= geo->fsbcount ?
+ XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) :
+ -1, &bp);
/*
* Should just skip over the data block instead of giving up.
*/
@@ -419,7 +418,7 @@ xfs_dir2_leaf_readbuf(
* was previously ra.
*/
if (mip->ra_current)
- mip->ra_current -= mp->m_dirblkfsbs;
+ mip->ra_current -= geo->fsbcount;
/*
* Do we need more readahead?
@@ -427,16 +426,16 @@ xfs_dir2_leaf_readbuf(
blk_start_plug(&plug);
for (mip->ra_index = mip->ra_offset = i = 0;
mip->ra_want > mip->ra_current && i < mip->map_blocks;
- i += mp->m_dirblkfsbs) {
+ i += geo->fsbcount) {
ASSERT(mip->ra_index < mip->map_valid);
/*
* Read-ahead a contiguous directory block.
*/
if (i > mip->ra_current &&
- map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
- xfs_dir3_data_readahead(NULL, dp,
+ map[mip->ra_index].br_blockcount >= geo->fsbcount) {
+ xfs_dir3_data_readahead(dp,
map[mip->ra_index].br_startoff + mip->ra_offset,
- XFS_FSB_TO_DADDR(mp,
+ XFS_FSB_TO_DADDR(dp->i_mount,
map[mip->ra_index].br_startblock +
mip->ra_offset));
mip->ra_current = i;
@@ -447,7 +446,7 @@ xfs_dir2_leaf_readbuf(
* use our mapping, but this is a very rare case.
*/
else if (i > mip->ra_current) {
- xfs_dir3_data_readahead(NULL, dp,
+ xfs_dir3_data_readahead(dp,
map[mip->ra_index].br_startoff +
mip->ra_offset, -1);
mip->ra_current = i;
@@ -456,15 +455,14 @@ xfs_dir2_leaf_readbuf(
/*
* Advance offset through the mapping table.
*/
- for (j = 0; j < mp->m_dirblkfsbs; j++) {
+ for (j = 0; j < geo->fsbcount; j += length ) {
/*
* The rest of this extent but not more than a dir
* block.
*/
- length = min_t(int, mp->m_dirblkfsbs,
+ length = min_t(int, geo->fsbcount,
map[mip->ra_index].br_blockcount -
mip->ra_offset);
- j += length;
mip->ra_offset += length;
/*
@@ -489,22 +487,23 @@ out:
*/
STATIC int
xfs_dir2_leaf_getdents(
- xfs_inode_t *dp, /* incore directory inode */
+ struct xfs_da_args *args,
struct dir_context *ctx,
size_t bufsize)
{
+ struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = NULL; /* data block buffer */
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_dir2_data_unused_t *dup; /* unused entry */
int error = 0; /* error return value */
int length; /* temporary length value */
- xfs_mount_t *mp; /* filesystem mount point */
int byteoff; /* offset in current block */
xfs_dir2_off_t curoff; /* current overall offset */
xfs_dir2_off_t newoff; /* new curoff after new blk */
char *ptr = NULL; /* pointer to current data */
struct xfs_dir2_leaf_map_info *map_info;
+ struct xfs_da_geometry *geo = args->geo;
/*
* If the offset is at or past the largest allowed value,
@@ -513,15 +512,12 @@ xfs_dir2_leaf_getdents(
if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
return 0;
- mp = dp->i_mount;
-
/*
* Set up to bmap a number of blocks based on the caller's
* buffer size, the directory block size, and the filesystem
* block size.
*/
- length = howmany(bufsize + mp->m_dirblksize,
- mp->m_sb.sb_blocksize);
+ length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
(length * sizeof(struct xfs_bmbt_irec)),
KM_SLEEP | KM_NOFS);
@@ -531,14 +527,14 @@ xfs_dir2_leaf_getdents(
* Inside the loop we keep the main offset value as a byte offset
* in the directory file.
*/
- curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
+ curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
/*
* Force this conversion through db so we truncate the offset
* down to get the start of the data block.
*/
- map_info->map_off = xfs_dir2_db_to_da(mp,
- xfs_dir2_byte_to_db(mp, curoff));
+ map_info->map_off = xfs_dir2_db_to_da(geo,
+ xfs_dir2_byte_to_db(geo, curoff));
/*
* Loop over directory entries until we reach the end offset.
@@ -551,9 +547,9 @@ xfs_dir2_leaf_getdents(
* If we have no buffer, or we're off the end of the
* current buffer, need to get another one.
*/
- if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
+ if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
- error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
+ error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
&curoff, &bp);
if (error || !map_info->map_valid)
break;
@@ -561,7 +557,8 @@ xfs_dir2_leaf_getdents(
/*
* Having done a read, we need to set a new offset.
*/
- newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
+ newoff = xfs_dir2_db_off_to_byte(geo,
+ map_info->curdb, 0);
/*
* Start of the current block.
*/
@@ -571,7 +568,7 @@ xfs_dir2_leaf_getdents(
* Make sure we're in the right block.
*/
else if (curoff > newoff)
- ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
+ ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
map_info->curdb);
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
@@ -579,7 +576,7 @@ xfs_dir2_leaf_getdents(
* Find our position in the block.
*/
ptr = (char *)dp->d_ops->data_entry_p(hdr);
- byteoff = xfs_dir2_byte_to_off(mp, curoff);
+ byteoff = xfs_dir2_byte_to_off(geo, curoff);
/*
* Skip past the header.
*/
@@ -608,10 +605,10 @@ xfs_dir2_leaf_getdents(
* Now set our real offset.
*/
curoff =
- xfs_dir2_db_off_to_byte(mp,
- xfs_dir2_byte_to_db(mp, curoff),
+ xfs_dir2_db_off_to_byte(geo,
+ xfs_dir2_byte_to_db(geo, curoff),
(char *)ptr - (char *)hdr);
- if (ptr >= (char *)hdr + mp->m_dirblksize) {
+ if (ptr >= (char *)hdr + geo->blksize) {
continue;
}
}
@@ -635,10 +632,10 @@ xfs_dir2_leaf_getdents(
length = dp->d_ops->data_entsize(dep->namelen);
filetype = dp->d_ops->data_get_ftype(dep);
- ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+ ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
- xfs_dir3_get_dtype(mp, filetype)))
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
break;
/*
@@ -653,10 +650,10 @@ xfs_dir2_leaf_getdents(
/*
* All done. Set output offset value to current offset.
*/
- if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
+ if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR))
ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
else
- ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+ ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
kmem_free(map_info);
if (bp)
xfs_trans_brelse(NULL, bp);
@@ -668,13 +665,14 @@ xfs_dir2_leaf_getdents(
*/
int
xfs_readdir(
- xfs_inode_t *dp,
- struct dir_context *ctx,
- size_t bufsize)
+ struct xfs_inode *dp,
+ struct dir_context *ctx,
+ size_t bufsize)
{
- int rval; /* return value */
- int v; /* type-checking value */
- uint lock_mode;
+ struct xfs_da_args args = { NULL };
+ int rval;
+ int v;
+ uint lock_mode;
trace_xfs_readdir(dp);
@@ -684,15 +682,18 @@ xfs_readdir(
ASSERT(S_ISDIR(dp->i_d.di_mode));
XFS_STATS_INC(xs_dir_getdents);
+ args.dp = dp;
+ args.geo = dp->i_mount->m_dir_geo;
+
lock_mode = xfs_ilock_data_map_shared(dp);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_getdents(dp, ctx);
- else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
+ rval = xfs_dir2_sf_getdents(&args, ctx);
+ else if ((rval = xfs_dir2_isblock(&args, &v)))
;
else if (v)
- rval = xfs_dir2_block_getdents(dp, ctx);
+ rval = xfs_dir2_block_getdents(&args, ctx);
else
- rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
+ rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
xfs_iunlock(dp, lock_mode);
return rval;
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 3725fb1b902b..53c3be619db5 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -82,8 +82,10 @@ xfs_dir2_block_sfsize(
xfs_ino_t parent = 0; /* parent inode number */
int size=0; /* total computed size */
int has_ftype;
+ struct xfs_da_geometry *geo;
mp = dp->i_mount;
+ geo = mp->m_dir_geo;
/*
* if there is a filetype field, add the extra byte to the namelen
@@ -92,7 +94,7 @@ xfs_dir2_block_sfsize(
has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
count = i8count = namelen = 0;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
@@ -104,8 +106,8 @@ xfs_dir2_block_sfsize(
/*
* Calculate the pointer to the entry at hand.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(geo, addr));
/*
* Detect . and .., so we can special-case them.
* . is not included in sf directories.
@@ -195,7 +197,7 @@ xfs_dir2_block_to_sf(
/*
* Set up to loop over the block's entries.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
ptr = (char *)dp->d_ops->data_entry_p(hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
sfep = xfs_dir2_sf_firstentry(sfp);
@@ -247,7 +249,7 @@ xfs_dir2_block_to_sf(
/* now we are done with the block, we can shrink the inode */
logflags = XFS_ILOG_CORE;
- error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp);
+ error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
if (error) {
ASSERT(error != ENOSPC);
goto out;
@@ -285,14 +287,12 @@ int /* error */
xfs_dir2_sf_addname(
xfs_da_args_t *args) /* operation arguments */
{
- int add_entsize; /* size of the new entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
int incr_isize; /* total change in size */
int new_isize; /* di_size after adding name */
int objchange; /* changing to 8-byte inodes */
xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
- int old_isize; /* di_size before adding name */
int pick; /* which algorithm to use */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
@@ -316,8 +316,7 @@ xfs_dir2_sf_addname(
/*
* Compute entry (and change in) size.
*/
- add_entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
- incr_isize = add_entsize;
+ incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
objchange = 0;
#if XFS_BIG_INUMS
/*
@@ -325,11 +324,8 @@ xfs_dir2_sf_addname(
*/
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
/*
- * Yes, adjust the entry size and the total size.
+ * Yes, adjust the inode size. old count + (parent + new)
*/
- add_entsize +=
- (uint)sizeof(xfs_dir2_ino8_t) -
- (uint)sizeof(xfs_dir2_ino4_t);
incr_isize +=
(sfp->count + 2) *
((uint)sizeof(xfs_dir2_ino8_t) -
@@ -337,8 +333,7 @@ xfs_dir2_sf_addname(
objchange = 1;
}
#endif
- old_isize = (int)dp->i_d.di_size;
- new_isize = old_isize + incr_isize;
+ new_isize = (int)dp->i_d.di_size + incr_isize;
/*
* Won't fit as shortform any more (due to size),
* or the pick routine says it won't (due to offset values).
@@ -593,7 +588,7 @@ xfs_dir2_sf_addname_pick(
* we'll go back, convert to block, then try the insert and convert
* to leaf.
*/
- if (used + (holefit ? 0 : size) > mp->m_dirblksize)
+ if (used + (holefit ? 0 : size) > args->geo->blksize)
return 0;
/*
* If changing the inode number size, do it the hard way.
@@ -608,7 +603,7 @@ xfs_dir2_sf_addname_pick(
/*
* If it won't fit at the end then do it the hard way (use the hole).
*/
- if (used + size > mp->m_dirblksize)
+ if (used + size > args->geo->blksize)
return 2;
/*
* Do it the easy way.
@@ -659,7 +654,7 @@ xfs_dir2_sf_check(
ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
ASSERT(offset +
(sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
- (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dirblksize);
+ (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
}
#endif /* DEBUG */
@@ -1110,9 +1105,9 @@ xfs_dir2_sf_toino4(
}
/*
- * Convert from 4-byte inode numbers to 8-byte inode numbers.
- * The new 8-byte inode number is not there yet, we leave with the
- * count 1 but no corresponding entry.
+ * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
+ * The new entry w/ an 8-byte inode number is not there yet; we leave with
+ * i8count set to 1, but no corresponding 8-byte entry.
*/
static void
xfs_dir2_sf_toino8(
@@ -1145,7 +1140,7 @@ xfs_dir2_sf_toino8(
ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
/*
- * Compute the new inode size.
+ * Compute the new inode size (nb: entry count + 1 for parent)
*/
newsize =
oldsize +
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 868b19f096bf..3ee0cd43edc0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -353,10 +353,10 @@ xfs_qm_dqalloc(
dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen,
0);
-
- error = xfs_buf_geterror(bp);
- if (error)
+ if (!bp) {
+ error = ENOMEM;
goto error1;
+ }
bp->b_ops = &xfs_dquot_buf_ops;
/*
@@ -832,47 +832,6 @@ restart:
return (0);
}
-
-STATIC void
-xfs_qm_dqput_final(
- struct xfs_dquot *dqp)
-{
- struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
- struct xfs_dquot *gdqp;
- struct xfs_dquot *pdqp;
-
- trace_xfs_dqput_free(dqp);
-
- if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
- XFS_STATS_INC(xs_qm_dquot_unused);
-
- /*
- * If we just added a udquot to the freelist, then we want to release
- * the gdquot/pdquot reference that it (probably) has. Otherwise it'll
- * keep the gdquot/pdquot from getting reclaimed.
- */
- gdqp = dqp->q_gdquot;
- if (gdqp) {
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
-
- pdqp = dqp->q_pdquot;
- if (pdqp) {
- xfs_dqlock(pdqp);
- dqp->q_pdquot = NULL;
- }
- xfs_dqunlock(dqp);
-
- /*
- * If we had a group/project quota hint, release it now.
- */
- if (gdqp)
- xfs_qm_dqput(gdqp);
- if (pdqp)
- xfs_qm_dqput(pdqp);
-}
-
/*
* Release a reference to the dquot (decrement ref-count) and unlock it.
*
@@ -888,10 +847,14 @@ xfs_qm_dqput(
trace_xfs_dqput(dqp);
- if (--dqp->q_nrefs > 0)
- xfs_dqunlock(dqp);
- else
- xfs_qm_dqput_final(dqp);
+ if (--dqp->q_nrefs == 0) {
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
+ trace_xfs_dqput_free(dqp);
+
+ if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
+ XFS_STATS_INC(xs_qm_dquot_unused);
+ }
+ xfs_dqunlock(dqp);
}
/*
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index d22ed0053c32..68a68f704837 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -52,8 +52,6 @@ typedef struct xfs_dquot {
int q_bufoffset; /* off of dq in buffer (# dquots) */
xfs_fileoff_t q_fileoffset; /* offset in quotas file */
- struct xfs_dquot*q_gdquot; /* group dquot, hint only */
- struct xfs_dquot*q_pdquot; /* project dquot, hint only */
xfs_disk_dquot_t q_core; /* actual usage & quotas */
xfs_dq_logitem_t q_logitem; /* dquot log item */
xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
index 610da8177737..c2ac0c611ad8 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -35,7 +35,6 @@
int
xfs_calc_dquots_per_chunk(
- struct xfs_mount *mp,
unsigned int nbblks) /* basic block units */
{
unsigned int ndquots;
@@ -194,7 +193,7 @@ xfs_dquot_buf_verify_crc(
if (mp->m_quotainfo)
ndquots = mp->m_quotainfo->qi_dqperchunk;
else
- ndquots = xfs_calc_dquots_per_chunk(mp,
+ ndquots = xfs_calc_dquots_per_chunk(
XFS_BB_TO_FSB(mp, bp->b_length));
for (i = 0; i < ndquots; i++, d++) {
@@ -225,7 +224,7 @@ xfs_dquot_buf_verify(
if (mp->m_quotainfo)
ndquots = mp->m_quotainfo->qi_dqperchunk;
else
- ndquots = xfs_calc_dquots_per_chunk(mp, bp->b_length);
+ ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
/*
* On the first read of the buffer, verify that each dquot is valid.
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 830c1c937b88..1f66779d7a46 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -229,34 +229,27 @@ xfs_file_fsync(
}
STATIC ssize_t
-xfs_file_aio_read(
+xfs_file_read_iter(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+ struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- size_t size = 0;
+ size_t size = iov_iter_count(to);
ssize_t ret = 0;
int ioflags = 0;
xfs_fsize_t n;
+ loff_t pos = iocb->ki_pos;
XFS_STATS_INC(xs_read_calls);
- BUG_ON(iocb->ki_pos != pos);
-
if (unlikely(file->f_flags & O_DIRECT))
ioflags |= IO_ISDIRECT;
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= IO_INVIS;
- ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
- if (ret < 0)
- return ret;
-
if (unlikely(ioflags & IO_ISDIRECT)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
@@ -309,7 +302,7 @@ xfs_file_aio_read(
trace_xfs_file_read(ip, size, pos, ioflags);
- ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
+ ret = generic_file_read_iter(iocb, to);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
@@ -350,47 +343,6 @@ xfs_file_splice_read(
}
/*
- * xfs_file_splice_write() does not use xfs_rw_ilock() because
- * generic_file_splice_write() takes the i_mutex itself. This, in theory,
- * couuld cause lock inversions between the aio_write path and the splice path
- * if someone is doing concurrent splice(2) based writes and write(2) based
- * writes to the same inode. The only real way to fix this is to re-implement
- * the generic code here with correct locking orders.
- */
-STATIC ssize_t
-xfs_file_splice_write(
- struct pipe_inode_info *pipe,
- struct file *outfilp,
- loff_t *ppos,
- size_t count,
- unsigned int flags)
-{
- struct inode *inode = outfilp->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- int ioflags = 0;
- ssize_t ret;
-
- XFS_STATS_INC(xs_write_calls);
-
- if (outfilp->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
- trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-
- ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return ret;
-}
-
-/*
* This routine is called to handle zeroing any space in the last block of the
* file that is beyond the EOF. We do this since the size is being increased
* without writing anything to that block and we don't want to read the
@@ -625,10 +577,7 @@ restart:
STATIC ssize_t
xfs_file_dio_aio_write(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos,
- size_t ocount)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -636,9 +585,10 @@ xfs_file_dio_aio_write(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t ret = 0;
- size_t count = ocount;
int unaligned_io = 0;
int iolock;
+ size_t count = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -677,6 +627,7 @@ xfs_file_dio_aio_write(
ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
goto out;
+ iov_iter_truncate(from, count);
if (mapping->nrpages) {
ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
@@ -698,8 +649,7 @@ xfs_file_dio_aio_write(
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_direct_write(iocb, iovp,
- &nr_segs, pos, count, ocount);
+ ret = generic_file_direct_write(iocb, from, pos);
out:
xfs_rw_iunlock(ip, iolock);
@@ -712,10 +662,7 @@ out:
STATIC ssize_t
xfs_file_buffered_aio_write(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos,
- size_t count)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -724,7 +671,8 @@ xfs_file_buffered_aio_write(
ssize_t ret;
int enospc = 0;
int iolock = XFS_IOLOCK_EXCL;
- struct iov_iter from;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
xfs_rw_ilock(ip, iolock);
@@ -732,13 +680,13 @@ xfs_file_buffered_aio_write(
if (ret)
goto out;
- iov_iter_init(&from, iovp, nr_segs, count, 0);
+ iov_iter_truncate(from, count);
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
write_retry:
trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
- ret = generic_perform_write(file, &from, pos);
+ ret = generic_perform_write(file, from, pos);
if (likely(ret >= 0))
iocb->ki_pos = pos + ret;
/*
@@ -759,40 +707,29 @@ out:
}
STATIC ssize_t
-xfs_file_aio_write(
+xfs_file_write_iter(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- size_t ocount = 0;
+ size_t ocount = iov_iter_count(from);
XFS_STATS_INC(xs_write_calls);
- BUG_ON(iocb->ki_pos != pos);
-
- ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
- if (ret)
- return ret;
-
if (ocount == 0)
return 0;
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- ret = -EIO;
- goto out;
- }
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
if (unlikely(file->f_flags & O_DIRECT))
- ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
+ ret = xfs_file_dio_aio_write(iocb, from);
else
- ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
- ocount);
+ ret = xfs_file_buffered_aio_write(iocb, from);
if (ret > 0) {
ssize_t err;
@@ -804,8 +741,6 @@ xfs_file_aio_write(
if (err < 0)
ret = err;
}
-
-out:
return ret;
}
@@ -944,7 +879,7 @@ xfs_dir_open(
*/
mode = xfs_ilock_data_map_shared(ip);
if (ip->i_d.di_nextents > 0)
- xfs_dir3_data_readahead(NULL, ip, 0, -1);
+ xfs_dir3_data_readahead(ip, 0, -1);
xfs_iunlock(ip, mode);
return 0;
}
@@ -1461,12 +1396,12 @@ xfs_file_llseek(
const struct file_operations xfs_file_operations = {
.llseek = xfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = xfs_file_aio_read,
- .aio_write = xfs_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = xfs_file_read_iter,
+ .write_iter = xfs_file_write_iter,
.splice_read = xfs_file_splice_read,
- .splice_write = xfs_file_splice_write,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 12b6e7701985..8ec81bed7992 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * Copyright (c) 2014 Christoph Hellwig.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -32,100 +33,20 @@
#include "xfs_filestream.h"
#include "xfs_trace.h"
-#ifdef XFS_FILESTREAMS_TRACE
-
-ktrace_t *xfs_filestreams_trace_buf;
-
-STATIC void
-xfs_filestreams_trace(
- xfs_mount_t *mp, /* mount point */
- int type, /* type of trace */
- const char *func, /* source function */
- int line, /* source line number */
- __psunsigned_t arg0,
- __psunsigned_t arg1,
- __psunsigned_t arg2,
- __psunsigned_t arg3,
- __psunsigned_t arg4,
- __psunsigned_t arg5)
-{
- ktrace_enter(xfs_filestreams_trace_buf,
- (void *)(__psint_t)(type | (line << 16)),
- (void *)func,
- (void *)(__psunsigned_t)current_pid(),
- (void *)mp,
- (void *)(__psunsigned_t)arg0,
- (void *)(__psunsigned_t)arg1,
- (void *)(__psunsigned_t)arg2,
- (void *)(__psunsigned_t)arg3,
- (void *)(__psunsigned_t)arg4,
- (void *)(__psunsigned_t)arg5,
- NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0)
-#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0)
-#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0)
-#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0)
-#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0)
-#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0)
-#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
- xfs_filestreams_trace(mp, t, __func__, __LINE__, \
- (__psunsigned_t)a0, (__psunsigned_t)a1, \
- (__psunsigned_t)a2, (__psunsigned_t)a3, \
- (__psunsigned_t)a4, (__psunsigned_t)a5)
-
-#define TRACE_AG_SCAN(mp, ag, ag2) \
- TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2);
-#define TRACE_AG_PICK1(mp, max_ag, maxfree) \
- TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree);
-#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
- TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
- cnt, free, scan, flag)
-#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
- TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
-#define TRACE_FREE(mp, ip, pip, ag, cnt) \
- TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
-#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
- TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
-#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
- TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
-#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
- TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
-#define TRACE_ORPHAN(mp, ip, ag) \
- TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag);
-
-
-#else
-#define TRACE_AG_SCAN(mp, ag, ag2)
-#define TRACE_AG_PICK1(mp, max_ag, maxfree)
-#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
-#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
-#define TRACE_FREE(mp, ip, pip, ag, cnt)
-#define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
-#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
-#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
-#define TRACE_ORPHAN(mp, ip, ag)
-#endif
-
-static kmem_zone_t *item_zone;
+struct xfs_fstrm_item {
+ struct xfs_mru_cache_elem mru;
+ struct xfs_inode *ip;
+ xfs_agnumber_t ag; /* AG in use for this directory */
+};
-/*
- * Structure for associating a file or a directory with an allocation group.
- * The parent directory pointer is only needed for files, but since there will
- * generally be vastly more files than directories in the cache, using the same
- * data structure simplifies the code with very little memory overhead.
- */
-typedef struct fstrm_item
-{
- xfs_agnumber_t ag; /* AG currently in use for the file/directory. */
- xfs_inode_t *ip; /* inode self-pointer. */
- xfs_inode_t *pip; /* Parent directory inode pointer. */
-} fstrm_item_t;
+enum xfs_fstrm_alloc {
+ XFS_PICK_USERDATA = 1,
+ XFS_PICK_LOWSPACE = 2,
+};
/*
* Allocation group filestream associations are tracked with per-ag atomic
- * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a
+ * counters. These counters allow xfs_filestream_pick_ag() to tell whether a
* particular AG already has active filestreams associated with it. The mount
* point's m_peraglock is used to protect these counters from per-ag array
* re-allocation during a growfs operation. When xfs_growfs_data_private() is
@@ -160,7 +81,7 @@ typedef struct fstrm_item
* the cache that reference per-ag array elements that have since been
* reallocated.
*/
-static int
+int
xfs_filestream_peek_ag(
xfs_mount_t *mp,
xfs_agnumber_t agno)
@@ -200,23 +121,40 @@ xfs_filestream_put_ag(
xfs_perag_put(pag);
}
+static void
+xfs_fstrm_free_func(
+ struct xfs_mru_cache_elem *mru)
+{
+ struct xfs_fstrm_item *item =
+ container_of(mru, struct xfs_fstrm_item, mru);
+
+ xfs_filestream_put_ag(item->ip->i_mount, item->ag);
+
+ trace_xfs_filestream_free(item->ip, item->ag);
+
+ kmem_free(item);
+}
+
/*
* Scan the AGs starting at startag looking for an AG that isn't in use and has
* at least minlen blocks free.
*/
static int
-_xfs_filestream_pick_ag(
- xfs_mount_t *mp,
- xfs_agnumber_t startag,
- xfs_agnumber_t *agp,
- int flags,
- xfs_extlen_t minlen)
+xfs_filestream_pick_ag(
+ struct xfs_inode *ip,
+ xfs_agnumber_t startag,
+ xfs_agnumber_t *agp,
+ int flags,
+ xfs_extlen_t minlen)
{
- int streams, max_streams;
- int err, trylock, nscan;
- xfs_extlen_t longest, free, minfree, maxfree = 0;
- xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
- struct xfs_perag *pag;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_fstrm_item *item;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest, free = 0, minfree, maxfree = 0;
+ xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
+ int err, trylock, nscan;
+
+ ASSERT(S_ISDIR(ip->i_d.di_mode));
/* 2% of an AG's blocks must be free for it to be chosen. */
minfree = mp->m_sb.sb_agblocks / 50;
@@ -228,8 +166,9 @@ _xfs_filestream_pick_ag(
trylock = XFS_ALLOC_FLAG_TRYLOCK;
for (nscan = 0; 1; nscan++) {
+ trace_xfs_filestream_scan(ip, ag);
+
pag = xfs_perag_get(mp, ag);
- TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
if (!pag->pagf_init) {
err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
@@ -246,7 +185,6 @@ _xfs_filestream_pick_ag(
/* Keep track of the AG with the most free blocks. */
if (pag->pagf_freeblks > maxfree) {
maxfree = pag->pagf_freeblks;
- max_streams = atomic_read(&pag->pagf_fstrms);
max_ag = ag;
}
@@ -269,7 +207,6 @@ _xfs_filestream_pick_ag(
/* Break out, retaining the reference on the AG. */
free = pag->pagf_freeblks;
- streams = atomic_read(&pag->pagf_fstrms);
xfs_perag_put(pag);
*agp = ag;
break;
@@ -305,317 +242,98 @@ next_ag:
*/
if (max_ag != NULLAGNUMBER) {
xfs_filestream_get_ag(mp, max_ag);
- TRACE_AG_PICK1(mp, max_ag, maxfree);
- streams = max_streams;
free = maxfree;
*agp = max_ag;
break;
}
/* take AG 0 if none matched */
- TRACE_AG_PICK1(mp, max_ag, maxfree);
+ trace_xfs_filestream_pick(ip, *agp, free, nscan);
*agp = 0;
return 0;
}
- TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
-
- return 0;
-}
+ trace_xfs_filestream_pick(ip, *agp, free, nscan);
-/*
- * Set the allocation group number for a file or a directory, updating inode
- * references and per-AG references as appropriate.
- */
-static int
-_xfs_filestream_update_ag(
- xfs_inode_t *ip,
- xfs_inode_t *pip,
- xfs_agnumber_t ag)
-{
- int err = 0;
- xfs_mount_t *mp;
- xfs_mru_cache_t *cache;
- fstrm_item_t *item;
- xfs_agnumber_t old_ag;
- xfs_inode_t *old_pip;
-
- /*
- * Either ip is a regular file and pip is a directory, or ip is a
- * directory and pip is NULL.
- */
- ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip &&
- S_ISDIR(pip->i_d.di_mode)) ||
- (S_ISDIR(ip->i_d.di_mode) && !pip)));
-
- mp = ip->i_mount;
- cache = mp->m_filestream;
-
- item = xfs_mru_cache_lookup(cache, ip->i_ino);
- if (item) {
- ASSERT(item->ip == ip);
- old_ag = item->ag;
- item->ag = ag;
- old_pip = item->pip;
- item->pip = pip;
- xfs_mru_cache_done(cache);
-
- /*
- * If the AG has changed, drop the old ref and take a new one,
- * effectively transferring the reference from old to new AG.
- */
- if (ag != old_ag) {
- xfs_filestream_put_ag(mp, old_ag);
- xfs_filestream_get_ag(mp, ag);
- }
-
- /*
- * If ip is a file and its pip has changed, drop the old ref and
- * take a new one.
- */
- if (pip && pip != old_pip) {
- IRELE(old_pip);
- IHOLD(pip);
- }
-
- TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag),
- ag, xfs_filestream_peek_ag(mp, ag));
+ if (*agp == NULLAGNUMBER)
return 0;
- }
- item = kmem_zone_zalloc(item_zone, KM_MAYFAIL);
+ err = ENOMEM;
+ item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
if (!item)
- return ENOMEM;
+ goto out_put_ag;
- item->ag = ag;
+ item->ag = *agp;
item->ip = ip;
- item->pip = pip;
- err = xfs_mru_cache_insert(cache, ip->i_ino, item);
+ err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
if (err) {
- kmem_zone_free(item_zone, item);
- return err;
+ if (err == EEXIST)
+ err = 0;
+ goto out_free_item;
}
- /* Take a reference on the AG. */
- xfs_filestream_get_ag(mp, ag);
-
- /*
- * Take a reference on the inode itself regardless of whether it's a
- * regular file or a directory.
- */
- IHOLD(ip);
-
- /*
- * In the case of a regular file, take a reference on the parent inode
- * as well to ensure it remains in-core.
- */
- if (pip)
- IHOLD(pip);
-
- TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag),
- ag, xfs_filestream_peek_ag(mp, ag));
-
return 0;
-}
-
-/* xfs_fstrm_free_func(): callback for freeing cached stream items. */
-STATIC void
-xfs_fstrm_free_func(
- unsigned long ino,
- void *data)
-{
- fstrm_item_t *item = (fstrm_item_t *)data;
- xfs_inode_t *ip = item->ip;
-
- ASSERT(ip->i_ino == ino);
-
- xfs_iflags_clear(ip, XFS_IFILESTREAM);
-
- /* Drop the reference taken on the AG when the item was added. */
- xfs_filestream_put_ag(ip->i_mount, item->ag);
-
- TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
- xfs_filestream_peek_ag(ip->i_mount, item->ag));
-
- /*
- * _xfs_filestream_update_ag() always takes a reference on the inode
- * itself, whether it's a file or a directory. Release it here.
- * This can result in the inode being freed and so we must
- * not hold any inode locks when freeing filesstreams objects
- * otherwise we can deadlock here.
- */
- IRELE(ip);
-
- /*
- * In the case of a regular file, _xfs_filestream_update_ag() also
- * takes a ref on the parent inode to keep it in-core. Release that
- * too.
- */
- if (item->pip)
- IRELE(item->pip);
-
- /* Finally, free the memory allocated for the item. */
- kmem_zone_free(item_zone, item);
-}
-
-/*
- * xfs_filestream_init() is called at xfs initialisation time to set up the
- * memory zone that will be used for filestream data structure allocation.
- */
-int
-xfs_filestream_init(void)
-{
- item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
- if (!item_zone)
- return -ENOMEM;
-
- return 0;
-}
-
-/*
- * xfs_filestream_uninit() is called at xfs termination time to destroy the
- * memory zone that was used for filestream data structure allocation.
- */
-void
-xfs_filestream_uninit(void)
-{
- kmem_zone_destroy(item_zone);
-}
-
-/*
- * xfs_filestream_mount() is called when a file system is mounted with the
- * filestream option. It is responsible for allocating the data structures
- * needed to track the new file system's file streams.
- */
-int
-xfs_filestream_mount(
- xfs_mount_t *mp)
-{
- int err;
- unsigned int lifetime, grp_count;
-
- /*
- * The filestream timer tunable is currently fixed within the range of
- * one second to four minutes, with five seconds being the default. The
- * group count is somewhat arbitrary, but it'd be nice to adhere to the
- * timer tunable to within about 10 percent. This requires at least 10
- * groups.
- */
- lifetime = xfs_fstrm_centisecs * 10;
- grp_count = 10;
-
- err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
- xfs_fstrm_free_func);
+out_free_item:
+ kmem_free(item);
+out_put_ag:
+ xfs_filestream_put_ag(mp, *agp);
return err;
}
-/*
- * xfs_filestream_unmount() is called when a file system that was mounted with
- * the filestream option is unmounted. It drains the data structures created
- * to track the file system's file streams and frees all the memory that was
- * allocated.
- */
-void
-xfs_filestream_unmount(
- xfs_mount_t *mp)
+static struct xfs_inode *
+xfs_filestream_get_parent(
+ struct xfs_inode *ip)
{
- xfs_mru_cache_destroy(mp->m_filestream);
-}
+ struct inode *inode = VFS_I(ip), *dir = NULL;
+ struct dentry *dentry, *parent;
-/*
- * Return the AG of the filestream the file or directory belongs to, or
- * NULLAGNUMBER otherwise.
- */
-xfs_agnumber_t
-xfs_filestream_lookup_ag(
- xfs_inode_t *ip)
-{
- xfs_mru_cache_t *cache;
- fstrm_item_t *item;
- xfs_agnumber_t ag;
- int ref;
-
- if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) {
- ASSERT(0);
- return NULLAGNUMBER;
- }
+ dentry = d_find_alias(inode);
+ if (!dentry)
+ goto out;
- cache = ip->i_mount->m_filestream;
- item = xfs_mru_cache_lookup(cache, ip->i_ino);
- if (!item) {
- TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0);
- return NULLAGNUMBER;
- }
+ parent = dget_parent(dentry);
+ if (!parent)
+ goto out_dput;
- ASSERT(ip == item->ip);
- ag = item->ag;
- ref = xfs_filestream_peek_ag(ip->i_mount, ag);
- xfs_mru_cache_done(cache);
+ dir = igrab(parent->d_inode);
+ dput(parent);
- TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref);
- return ag;
+out_dput:
+ dput(dentry);
+out:
+ return dir ? XFS_I(dir) : NULL;
}
/*
- * xfs_filestream_associate() should only be called to associate a regular file
- * with its parent directory. Calling it with a child directory isn't
- * appropriate because filestreams don't apply to entire directory hierarchies.
- * Creating a file in a child directory of an existing filestream directory
- * starts a new filestream with its own allocation group association.
+ * Find the right allocation group for a file, either by finding an
+ * existing file stream or creating a new one.
*
- * Returns < 0 on error, 0 if successful association occurred, > 0 if
- * we failed to get an association because of locking issues.
+ * Returns NULLAGNUMBER in case of an error.
*/
-int
-xfs_filestream_associate(
- xfs_inode_t *pip,
- xfs_inode_t *ip)
+xfs_agnumber_t
+xfs_filestream_lookup_ag(
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp;
- xfs_mru_cache_t *cache;
- fstrm_item_t *item;
- xfs_agnumber_t ag, rotorstep, startag;
- int err = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_inode *pip = NULL;
+ xfs_agnumber_t startag, ag = NULLAGNUMBER;
+ struct xfs_mru_cache_elem *mru;
- ASSERT(S_ISDIR(pip->i_d.di_mode));
ASSERT(S_ISREG(ip->i_d.di_mode));
- if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode))
- return -EINVAL;
- mp = pip->i_mount;
- cache = mp->m_filestream;
+ pip = xfs_filestream_get_parent(ip);
+ if (!pip)
+ goto out;
- /*
- * We have a problem, Houston.
- *
- * Taking the iolock here violates inode locking order - we already
- * hold the ilock. Hence if we block getting this lock we may never
- * wake. Unfortunately, that means if we can't get the lock, we're
- * screwed in terms of getting a stream association - we can't spin
- * waiting for the lock because someone else is waiting on the lock we
- * hold and we cannot drop that as we are in a transaction here.
- *
- * Lucky for us, this inversion is not a problem because it's a
- * directory inode that we are trying to lock here.
- *
- * So, if we can't get the iolock without sleeping then just give up
- */
- if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
- return 1;
-
- /* If the parent directory is already in the cache, use its AG. */
- item = xfs_mru_cache_lookup(cache, pip->i_ino);
- if (item) {
- ASSERT(item->ip == pip);
- ag = item->ag;
- xfs_mru_cache_done(cache);
-
- TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
- err = _xfs_filestream_update_ag(ip, pip, ag);
+ mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
+ if (mru) {
+ ag = container_of(mru, struct xfs_fstrm_item, mru)->ag;
+ xfs_mru_cache_done(mp->m_filestream);
- goto exit;
+ trace_xfs_filestream_lookup(ip, ag);
+ goto out;
}
/*
@@ -623,202 +341,94 @@ xfs_filestream_associate(
* use the directory inode's AG.
*/
if (mp->m_flags & XFS_MOUNT_32BITINODES) {
- rotorstep = xfs_rotorstep;
+ xfs_agnumber_t rotorstep = xfs_rotorstep;
startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
mp->m_agfrotor = (mp->m_agfrotor + 1) %
(mp->m_sb.sb_agcount * rotorstep);
} else
startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
- /* Pick a new AG for the parent inode starting at startag. */
- err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
- if (err || ag == NULLAGNUMBER)
- goto exit_did_pick;
-
- /* Associate the parent inode with the AG. */
- err = _xfs_filestream_update_ag(pip, NULL, ag);
- if (err)
- goto exit_did_pick;
-
- /* Associate the file inode with the AG. */
- err = _xfs_filestream_update_ag(ip, pip, ag);
- if (err)
- goto exit_did_pick;
-
- TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));
-
-exit_did_pick:
- /*
- * If _xfs_filestream_pick_ag() returned a valid AG, remove the
- * reference it took on it, since the file and directory will have taken
- * their own now if they were successfully cached.
- */
- if (ag != NULLAGNUMBER)
- xfs_filestream_put_ag(mp, ag);
-
-exit:
- xfs_iunlock(pip, XFS_IOLOCK_EXCL);
- return -err;
+ if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0))
+ ag = NULLAGNUMBER;
+out:
+ IRELE(pip);
+ return ag;
}
/*
- * Pick a new allocation group for the current file and its file stream. This
- * function is called by xfs_bmap_filestreams() with the mount point's per-ag
- * lock held.
+ * Pick a new allocation group for the current file and its file stream.
+ *
+ * This is called when the allocator can't find a suitable extent in the
+ * current AG, and we have to move the stream into a new AG with more space.
*/
int
xfs_filestream_new_ag(
struct xfs_bmalloca *ap,
xfs_agnumber_t *agp)
{
- int flags, err;
- xfs_inode_t *ip, *pip = NULL;
- xfs_mount_t *mp;
- xfs_mru_cache_t *cache;
- xfs_extlen_t minlen;
- fstrm_item_t *dir, *file;
- xfs_agnumber_t ag = NULLAGNUMBER;
-
- ip = ap->ip;
- mp = ip->i_mount;
- cache = mp->m_filestream;
- minlen = ap->length;
- *agp = NULLAGNUMBER;
+ struct xfs_inode *ip = ap->ip, *pip;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_extlen_t minlen = ap->length;
+ xfs_agnumber_t startag = 0;
+ int flags, err = 0;
+ struct xfs_mru_cache_elem *mru;
- /*
- * Look for the file in the cache, removing it if it's found. Doing
- * this allows it to be held across the dir lookup that follows.
- */
- file = xfs_mru_cache_remove(cache, ip->i_ino);
- if (file) {
- ASSERT(ip == file->ip);
-
- /* Save the file's parent inode and old AG number for later. */
- pip = file->pip;
- ag = file->ag;
-
- /* Look for the file's directory in the cache. */
- dir = xfs_mru_cache_lookup(cache, pip->i_ino);
- if (dir) {
- ASSERT(pip == dir->ip);
-
- /*
- * If the directory has already moved on to a new AG,
- * use that AG as the new AG for the file. Don't
- * forget to twiddle the AG refcounts to match the
- * movement.
- */
- if (dir->ag != file->ag) {
- xfs_filestream_put_ag(mp, file->ag);
- xfs_filestream_get_ag(mp, dir->ag);
- *agp = file->ag = dir->ag;
- }
-
- xfs_mru_cache_done(cache);
- }
+ *agp = NULLAGNUMBER;
- /*
- * Put the file back in the cache. If this fails, the free
- * function needs to be called to tidy up in the same way as if
- * the item had simply expired from the cache.
- */
- err = xfs_mru_cache_insert(cache, ip->i_ino, file);
- if (err) {
- xfs_fstrm_free_func(ip->i_ino, file);
- return err;
- }
+ pip = xfs_filestream_get_parent(ip);
+ if (!pip)
+ goto exit;
- /*
- * If the file's AG was moved to the directory's new AG, there's
- * nothing more to be done.
- */
- if (*agp != NULLAGNUMBER) {
- TRACE_MOVEAG(mp, ip, pip,
- ag, xfs_filestream_peek_ag(mp, ag),
- *agp, xfs_filestream_peek_ag(mp, *agp));
- return 0;
- }
+ mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino);
+ if (mru) {
+ struct xfs_fstrm_item *item =
+ container_of(mru, struct xfs_fstrm_item, mru);
+ startag = (item->ag + 1) % mp->m_sb.sb_agcount;
}
- /*
- * If the file's parent directory is known, take its iolock in exclusive
- * mode to prevent two sibling files from racing each other to migrate
- * themselves and their parent to different AGs.
- *
- * Note that we lock the parent directory iolock inside the child
- * iolock here. That's fine as we never hold both parent and child
- * iolock in any other place. This is different from the ilock,
- * which requires locking of the child after the parent for namespace
- * operations.
- */
- if (pip)
- xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
-
- /*
- * A new AG needs to be found for the file. If the file's parent
- * directory is also known, it will be moved to the new AG as well to
- * ensure that files created inside it in future use the new AG.
- */
- ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
(ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
- err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
- if (err || *agp == NULLAGNUMBER)
- goto exit;
+ err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
/*
- * If the file wasn't found in the file cache, then its parent directory
- * inode isn't known. For this to have happened, the file must either
- * be pre-existing, or it was created long enough ago that its cache
- * entry has expired. This isn't the sort of usage that the filestreams
- * allocator is trying to optimise, so there's no point trying to track
- * its new AG somehow in the filestream data structures.
+ * Only free the item here so we skip over the old AG earlier.
*/
- if (!pip) {
- TRACE_ORPHAN(mp, ip, *agp);
- goto exit;
- }
-
- /* Associate the parent inode with the AG. */
- err = _xfs_filestream_update_ag(pip, NULL, *agp);
- if (err)
- goto exit;
-
- /* Associate the file inode with the AG. */
- err = _xfs_filestream_update_ag(ip, pip, *agp);
- if (err)
- goto exit;
-
- TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
- *agp, xfs_filestream_peek_ag(mp, *agp));
+ if (mru)
+ xfs_fstrm_free_func(mru);
+ IRELE(pip);
exit:
- /*
- * If _xfs_filestream_pick_ag() returned a valid AG, remove the
- * reference it took on it, since the file and directory will have taken
- * their own now if they were successfully cached.
- */
- if (*agp != NULLAGNUMBER)
- xfs_filestream_put_ag(mp, *agp);
- else
+ if (*agp == NULLAGNUMBER)
*agp = 0;
-
- if (pip)
- xfs_iunlock(pip, XFS_IOLOCK_EXCL);
-
return err;
}
-/*
- * Remove an association between an inode and a filestream object.
- * Typically this is done on last close of an unlinked file.
- */
void
xfs_filestream_deassociate(
- xfs_inode_t *ip)
+ struct xfs_inode *ip)
{
- xfs_mru_cache_t *cache = ip->i_mount->m_filestream;
+ xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino);
+}
+
+int
+xfs_filestream_mount(
+ xfs_mount_t *mp)
+{
+ /*
+ * The filestream timer tunable is currently fixed within the range of
+ * one second to four minutes, with five seconds being the default. The
+ * group count is somewhat arbitrary, but it'd be nice to adhere to the
+ * timer tunable to within about 10 percent. This requires at least 10
+ * groups.
+ */
+ return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10,
+ 10, xfs_fstrm_free_func);
+}
- xfs_mru_cache_delete(cache, ip->i_ino);
+void
+xfs_filestream_unmount(
+ xfs_mount_t *mp)
+{
+ xfs_mru_cache_destroy(mp->m_filestream);
}
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 6d61dbee8564..2ef43406e53b 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -20,50 +20,20 @@
struct xfs_mount;
struct xfs_inode;
-struct xfs_perag;
struct xfs_bmalloca;
-#ifdef XFS_FILESTREAMS_TRACE
-#define XFS_FSTRM_KTRACE_INFO 1
-#define XFS_FSTRM_KTRACE_AGSCAN 2
-#define XFS_FSTRM_KTRACE_AGPICK1 3
-#define XFS_FSTRM_KTRACE_AGPICK2 4
-#define XFS_FSTRM_KTRACE_UPDATE 5
-#define XFS_FSTRM_KTRACE_FREE 6
-#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7
-#define XFS_FSTRM_KTRACE_ASSOCIATE 8
-#define XFS_FSTRM_KTRACE_MOVEAG 9
-#define XFS_FSTRM_KTRACE_ORPHAN 10
-
-#define XFS_FSTRM_KTRACE_SIZE 16384
-extern ktrace_t *xfs_filestreams_trace_buf;
-
-#endif
-
-/* allocation selection flags */
-typedef enum xfs_fstrm_alloc {
- XFS_PICK_USERDATA = 1,
- XFS_PICK_LOWSPACE = 2,
-} xfs_fstrm_alloc_t;
-
-/* prototypes for filestream.c */
-int xfs_filestream_init(void);
-void xfs_filestream_uninit(void);
int xfs_filestream_mount(struct xfs_mount *mp);
void xfs_filestream_unmount(struct xfs_mount *mp);
-xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
-int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
void xfs_filestream_deassociate(struct xfs_inode *ip);
+xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
+int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno);
-
-/* filestreams for the inode? */
static inline int
xfs_inode_is_filestream(
struct xfs_inode *ip)
{
return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
- xfs_iflags_test(ip, XFS_IFILESTREAM) ||
(ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
}
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
index 9898f31d05d8..34d85aca3058 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/xfs_format.h
@@ -202,6 +202,8 @@ typedef __be32 xfs_alloc_ptr_t;
*/
#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */
+#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */
+#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */
typedef __uint64_t xfs_inofree_t;
#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
@@ -244,7 +246,17 @@ typedef __be32 xfs_inobt_ptr_t;
* block numbers in the AG.
*/
#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
-#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+
+/*
+ * The first data block of an AG depends on whether the filesystem was formatted
+ * with the finobt feature. If so, account for the finobt reserved root btree
+ * block.
+ */
+#define XFS_PREALLOC_BLOCKS(mp) \
+ (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
+ XFS_FIBT_BLOCK(mp) + 1 : \
+ XFS_IBT_BLOCK(mp) + 1)
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c5fc116dfaa3..d34703dbcb42 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -238,6 +238,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
+#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
/*
* Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 02fb943cbf22..d2295561570a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,6 +24,8 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
@@ -74,23 +76,18 @@ xfs_fs_geometry(
}
if (new_version >= 3) {
geo->version = XFS_FSOP_GEOM_VERSION;
- geo->flags =
+ geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
+ XFS_FSOP_GEOM_FLAGS_DIRV2 |
(xfs_sb_version_hasattr(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
- (xfs_sb_version_hasnlink(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
(xfs_sb_version_hasquota(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
(xfs_sb_version_hasalign(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
(xfs_sb_version_hasdalign(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
- (xfs_sb_version_hasshared(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
(xfs_sb_version_hasextflgbit(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
- (xfs_sb_version_hasdirv2(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
(xfs_sb_version_hassector(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
(xfs_sb_version_hasasciici(&mp->m_sb) ?
@@ -104,11 +101,13 @@ xfs_fs_geometry(
(xfs_sb_version_hascrc(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_V5SB : 0) |
(xfs_sb_version_hasftype(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_FTYPE : 0);
+ XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
+ (xfs_sb_version_hasfinobt(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
- geo->dirblocksize = mp->m_dirblksize;
+ geo->dirblocksize = mp->m_dir_geo->blksize;
}
if (new_version >= 4) {
geo->flags |=
@@ -316,6 +315,10 @@ xfs_growfs_data_private(
agi->agi_dirino = cpu_to_be32(NULLAGINO);
if (xfs_sb_version_hascrc(&mp->m_sb))
uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+ agi->agi_free_level = cpu_to_be32(1);
+ }
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
@@ -407,6 +410,34 @@ xfs_growfs_data_private(
xfs_buf_relse(bp);
if (error)
goto error0;
+
+ /*
+ * FINO btree root block
+ */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_inobt_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC,
+ 0, 0, agno,
+ XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0,
+ 0, agno, 0);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+ }
+
}
xfs_trans_agblocks_delta(tp, nfree);
/*
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 8f711db61a0c..5960e5593fe0 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -112,6 +112,66 @@ xfs_inobt_get_rec(
}
/*
+ * Insert a single inobt record. Cursor must already point to desired location.
+ */
+STATIC int
+xfs_inobt_insert_rec(
+ struct xfs_btree_cur *cur,
+ __int32_t freecount,
+ xfs_inofree_t free,
+ int *stat)
+{
+ cur->bc_rec.i.ir_freecount = freecount;
+ cur->bc_rec.i.ir_free = free;
+ return xfs_btree_insert(cur, stat);
+}
+
+/*
+ * Insert records describing a newly allocated inode chunk into the inobt.
+ */
+STATIC int
+xfs_inobt_insert(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t newino,
+ xfs_agino_t newlen,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agino_t thisino;
+ int i;
+ int error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+ for (thisino = newino;
+ thisino < newino + newlen;
+ thisino += XFS_INODES_PER_CHUNK) {
+ error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 0);
+
+ error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+ XFS_INOBT_ALL_FREE, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 1);
+ }
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ return 0;
+}
+
+/*
* Verify that the number of free inodes in the AGI is correct.
*/
#ifdef DEBUG
@@ -220,10 +280,8 @@ xfs_ialloc_inode_init(
if (tp)
xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
mp->m_sb.sb_inodesize, length, gen);
- } else if (xfs_sb_version_hasnlink(&mp->m_sb))
+ } else
version = 2;
- else
- version = 1;
for (j = 0; j < nbufs; j++) {
/*
@@ -303,13 +361,10 @@ xfs_ialloc_ag_alloc(
{
xfs_agi_t *agi; /* allocation group header */
xfs_alloc_arg_t args; /* allocation argument structure */
- xfs_btree_cur_t *cur; /* inode btree cursor */
xfs_agnumber_t agno;
int error;
- int i;
xfs_agino_t newino; /* new first inode's number */
xfs_agino_t newlen; /* new number of inodes */
- xfs_agino_t thisino; /* current inode number, for loop */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
struct xfs_perag *pag;
@@ -459,29 +514,19 @@ xfs_ialloc_ag_alloc(
agi->agi_newino = cpu_to_be32(newino);
/*
- * Insert records describing the new inode chunk into the btree.
+ * Insert records describing the new inode chunk into the btrees.
*/
- cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
- for (thisino = newino;
- thisino < newino + newlen;
- thisino += XFS_INODES_PER_CHUNK) {
- cur->bc_rec.i.ir_startino = thisino;
- cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
- cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
- if (error) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
- }
- ASSERT(i == 0);
- error = xfs_btree_insert(cur, &i);
- if (error) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_FINO);
+ if (error)
return error;
- }
- ASSERT(i == 1);
}
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
/*
* Log allocation group header fields
*/
@@ -675,13 +720,10 @@ xfs_ialloc_get_rec(
}
/*
- * Allocate an inode.
- *
- * The caller selected an AG for us, and made sure that free inodes are
- * available.
+ * Allocate an inode using the inobt-only algorithm.
*/
STATIC int
-xfs_dialloc_ag(
+xfs_dialloc_ag_inobt(
struct xfs_trans *tp,
struct xfs_buf *agbp,
xfs_ino_t parent,
@@ -707,7 +749,7 @@ xfs_dialloc_ag(
ASSERT(pag->pagi_freecount > 0);
restart_pagno:
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -940,6 +982,294 @@ error0:
}
/*
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+ xfs_agino_t pagino,
+ struct xfs_btree_cur **ocur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ struct xfs_btree_cur *lcur = *ocur; /* left search cursor */
+ struct xfs_btree_cur *rcur; /* right search cursor */
+ struct xfs_inobt_rec_incore rrec;
+ int error;
+ int i, j;
+
+ error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ return error;
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(lcur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ /*
+ * See if we've landed in the parent inode record. The finobt
+ * only tracks chunks with at least one free inode, so record
+ * existence is enough.
+ */
+ if (pagino >= rec->ir_startino &&
+ pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+ return 0;
+ }
+
+ error = xfs_btree_dup_cursor(lcur, &rcur);
+ if (error)
+ return error;
+
+ error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+ if (error)
+ goto error_rcur;
+ if (j == 1) {
+ error = xfs_inobt_get_rec(rcur, &rrec, &j);
+ if (error)
+ goto error_rcur;
+ XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
+ }
+
+ XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
+ if (i == 1 && j == 1) {
+ /*
+ * Both the left and right records are valid. Choose the closer
+ * inode chunk to the target.
+ */
+ if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+ (rrec.ir_startino - pagino)) {
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else {
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+ } else if (j == 1) {
+ /* only the right record is valid */
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else if (i == 1) {
+ /* only the left record is valid */
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+
+ return 0;
+
+error_rcur:
+ xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+ struct xfs_agi *agi,
+ struct xfs_btree_cur *cur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int error;
+ int i;
+
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
+ &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ return 0;
+ }
+ }
+
+ /*
+ * Find the first inode available in the AG.
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ return 0;
+}
+
+/*
+ * Update the inobt based on a modification made to the finobt. Also ensure that
+ * the records from both trees are equivalent post-modification.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+ struct xfs_btree_cur *cur, /* inobt cursor */
+ struct xfs_inobt_rec_incore *frec, /* finobt record */
+ int offset) /* inode offset */
+{
+ struct xfs_inobt_rec_incore rec;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+
+ XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
+ (rec.ir_freecount == frec->ir_freecount));
+
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ return error;
+
+ return 0;
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ */
+STATIC int
+xfs_dialloc_ag(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur; /* finobt cursor */
+ struct xfs_btree_cur *icur; /* inobt cursor */
+ struct xfs_inobt_rec_incore rec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i;
+
+ if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+
+ pag = xfs_perag_get(mp, agno);
+
+ /*
+ * If pagino is 0 (this is the root inode allocation) use newino.
+ * This must work because we've just allocated some.
+ */
+ if (!pagino)
+ pagino = be32_to_cpu(agi->agi_newino);
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The search algorithm depends on whether we're in the same AG as the
+ * parent. If so, find the closest available inode to the parent. If
+ * not, consider the agi hint or find the first free inode in the AG.
+ */
+ if (agno == pagno)
+ error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+ else
+ error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+ if (error)
+ goto error_cur;
+
+ offset = xfs_lowbit64(rec.ir_free);
+ ASSERT(offset >= 0);
+ ASSERT(offset < XFS_INODES_PER_CHUNK);
+ ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+ ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+
+ /*
+ * Modify or remove the finobt record.
+ */
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+ if (rec.ir_freecount)
+ error = xfs_inobt_update(cur, &rec);
+ else
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The finobt has now been updated appropriately. We haven't updated the
+ * agi and superblock yet, so we can create an inobt cursor and validate
+ * the original freecount. If all is well, make the equivalent update to
+ * the inobt using the finobt record and offset information.
+ */
+ icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(icur, agi);
+ if (error)
+ goto error_icur;
+
+ error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+ if (error)
+ goto error_icur;
+
+ /*
+ * Both trees have now been updated. We must update the perag and
+ * superblock before we can check the freecount for each btree.
+ */
+ be32_add_cpu(&agi->agi_freecount, -1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount--;
+
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+ error = xfs_check_agi_freecount(icur, agi);
+ if (error)
+ goto error_icur;
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error_icur;
+
+ xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_perag_put(pag);
+ *inop = ino;
+ return 0;
+
+error_icur:
+ xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
* Allocate an inode on disk.
*
* Mode is used to tell whether the new inode will need space, and whether it
@@ -1098,78 +1428,34 @@ out_error:
return XFS_ERROR(error);
}
-/*
- * Free disk inode. Carefully avoids touching the incore inode, all
- * manipulations incore are the caller's responsibility.
- * The on-disk inode is not changed by this operation, only the
- * btree (free inode mask) is changed.
- */
-int
-xfs_difree(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t inode, /* inode to be freed */
- xfs_bmap_free_t *flist, /* extents to free */
- int *delete, /* set if inode cluster was deleted */
- xfs_ino_t *first_ino) /* first inode in deleted cluster */
+STATIC int
+xfs_difree_inobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t agino,
+ struct xfs_bmap_free *flist,
+ int *deleted,
+ xfs_ino_t *first_ino,
+ struct xfs_inobt_rec_incore *orec)
{
- /* REFERENCED */
- xfs_agblock_t agbno; /* block number containing inode */
- xfs_buf_t *agbp; /* buffer containing allocation group header */
- xfs_agino_t agino; /* inode number relative to allocation group */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_agi_t *agi; /* allocation group header */
- xfs_btree_cur_t *cur; /* inode btree cursor */
- int error; /* error return value */
- int i; /* result code */
- int ilen; /* inodes in an inode cluster */
- xfs_mount_t *mp; /* mount structure for filesystem */
- int off; /* offset of inode in inode chunk */
- xfs_inobt_rec_incore_t rec; /* btree record */
- struct xfs_perag *pag;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int ilen;
+ int error;
+ int i;
+ int off;
- mp = tp->t_mountp;
-
- /*
- * Break up inode number into its components.
- */
- agno = XFS_INO_TO_AGNO(mp, inode);
- if (agno >= mp->m_sb.sb_agcount) {
- xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
- __func__, agno, mp->m_sb.sb_agcount);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
- agino = XFS_INO_TO_AGINO(mp, inode);
- if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
- xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
- __func__, (unsigned long long)inode,
- (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (agbno >= mp->m_sb.sb_agblocks) {
- xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
- __func__, agbno, mp->m_sb.sb_agblocks);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
- /*
- * Get the allocation group header.
- */
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
- if (error) {
- xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
- __func__, error);
- return error;
- }
- agi = XFS_BUF_TO_AGI(agbp);
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
- ASSERT(agbno < be32_to_cpu(agi->agi_length));
+ ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
/*
* Initialize the cursor.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
error = xfs_check_agi_freecount(cur, agi);
if (error)
@@ -1209,7 +1495,7 @@ xfs_difree(
if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
(rec.ir_freecount == mp->m_ialloc_inos)) {
- *delete = 1;
+ *deleted = 1;
*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
/*
@@ -1237,7 +1523,7 @@ xfs_difree(
XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
mp->m_ialloc_blks, flist, mp);
} else {
- *delete = 0;
+ *deleted = 0;
error = xfs_inobt_update(cur, &rec);
if (error) {
@@ -1261,6 +1547,7 @@ xfs_difree(
if (error)
goto error0;
+ *orec = rec;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return 0;
@@ -1269,6 +1556,182 @@ error0:
return error;
}
+/*
+ * Free an inode in the free inode btree.
+ *
+ * Looks up the finobt record keyed by ibtrec->ir_startino. When no record is
+ * found, the inobt record passed in must show exactly one free inode and is
+ * inserted as a new finobt record. Otherwise the existing finobt record is
+ * modified independently, cross-checked against ibtrec, and then either
+ * deleted (all inodes in the chunk free and XFS_MOUNT_IKEEP not set) or
+ * written back.
+ *
+ * Returns 0 on success or a non-zero error. The cursor is torn down on both
+ * the success and the error path before returning.
+ */
+STATIC int
+xfs_difree_finobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t agino,
+ struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int offset = agino - ibtrec->ir_startino; /* distance of agino from the record's first inode */
+ int error;
+ int i; /* result code from the btree calls */
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+ error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ if (i == 0) {
+ /*
+ * If the record does not exist in the finobt, we must have just
+ * freed an inode in a previously fully allocated chunk. If not,
+ * something is out of sync.
+ */
+ XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
+
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+ ibtrec->ir_free, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+
+ goto out;
+ }
+
+ /*
+ * Read and update the existing record. We could just copy the ibtrec
+ * across here, but that would defeat the purpose of having redundant
+ * metadata. By making the modifications independently, we can catch
+ * corruptions that we wouldn't see if we just copied from one record
+ * to another.
+ */
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+
+ rec.ir_free |= XFS_INOBT_MASK(offset);
+ rec.ir_freecount++;
+
+ XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
+ (rec.ir_freecount == ibtrec->ir_freecount),
+ error);
+
+ /*
+ * The content of inobt records should always match between the inobt
+ * and finobt. The lifecycle of records in the finobt is different from
+ * the inobt in that the finobt only tracks records with at least one
+ * free inode. Hence, if all of the inodes are free and we aren't
+ * keeping inode chunks permanently on disk, remove the record.
+ * Otherwise, update the record with the new information.
+ */
+ if (rec.ir_freecount == mp->m_ialloc_inos &&
+ !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+ } else {
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ goto error;
+ }
+
+out:
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Free disk inode. Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ *
+ * The inode number is split into its AG number, AG inode number and AG block
+ * number, and each is validated; XFS_ERROR(EINVAL) is returned after a
+ * warning if any check fails. The AGI buffer is then read and the inode
+ * btree is updated via xfs_difree_inobt(), followed by the free inode btree
+ * via xfs_difree_finobt() when xfs_sb_version_hasfinobt() reports the feature.
+ * Returns 0 on success, otherwise the first error encountered.
+ */
+int
+xfs_difree(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t inode, /* inode to be freed */
+ struct xfs_bmap_free *flist, /* extents to free */
+ int *deleted,/* set if inode cluster was deleted */
+ xfs_ino_t *first_ino)/* first inode in deleted cluster */
+{
+ /* REFERENCED */
+ xfs_agblock_t agbno; /* block number containing inode */
+ struct xfs_buf *agbp; /* buffer for allocation group header */
+ xfs_agino_t agino; /* allocation group inode number */
+ xfs_agnumber_t agno; /* allocation group number */
+ int error; /* error return value */
+ struct xfs_mount *mp; /* mount structure for filesystem */
+ struct xfs_inobt_rec_incore rec;/* btree record */
+
+ mp = tp->t_mountp;
+
+ /*
+ * Break up inode number into its components.
+ */
+ agno = XFS_INO_TO_AGNO(mp, inode);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+ __func__, agno, mp->m_sb.sb_agcount);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agino = XFS_INO_TO_AGINO(mp, inode);
+ if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+ __func__, (unsigned long long)inode,
+ (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+ __func__, agbno, mp->m_sb.sb_agblocks);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ /*
+ * Get the allocation group header.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ /*
+ * Fix up the inode allocation btree.
+ */
+ error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
+ &rec);
+ if (error)
+ goto error0;
+
+ /*
+ * Fix up the free inode btree.
+ */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+ if (error)
+ goto error0;
+ }
+
+ return 0;
+
+error0:
+ return error;
+}
+
STATIC int
xfs_imap_lookup(
struct xfs_mount *mp,
@@ -1300,7 +1763,7 @@ xfs_imap_lookup(
* we have a record, we need to ensure it contains the inode number
* we are looking up.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (!error) {
if (i)
@@ -1488,7 +1951,16 @@ xfs_ialloc_compute_maxlevels(
}
/*
- * Log specified fields for the ag hdr (inode section)
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
*/
void
xfs_ialloc_log_agi(
@@ -1511,6 +1983,8 @@ xfs_ialloc_log_agi(
offsetof(xfs_agi_t, agi_newino),
offsetof(xfs_agi_t, agi_dirino),
offsetof(xfs_agi_t, agi_unlinked),
+ offsetof(xfs_agi_t, agi_free_root),
+ offsetof(xfs_agi_t, agi_free_level),
sizeof(xfs_agi_t)
};
#ifdef DEBUG
@@ -1519,15 +1993,30 @@ xfs_ialloc_log_agi(
agi = XFS_BUF_TO_AGI(bp);
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
+
/*
- * Compute byte offsets for the first and last fields.
+ * Compute byte offsets for the first and last fields in the first
+ * region and log the agi buffer. This only logs up through
+ * agi_unlinked.
*/
- xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
+ if (fields & XFS_AGI_ALL_BITS_R1) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+
/*
- * Log the allocation group inode header buffer.
+ * Mask off the bits in the first region and calculate the first and
+ * last field offsets for any bits in the second region.
*/
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
- xfs_trans_log_buf(tp, bp, first, last);
+ fields &= ~XFS_AGI_ALL_BITS_R1;
+ if (fields) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
}
#ifdef DEBUG
@@ -1640,7 +2129,6 @@ xfs_read_agi(
if (error)
return error;
- ASSERT(!xfs_buf_geterror(*bpp));
xfs_buf_set_ref(*bpp, XFS_AGI_REF);
return 0;
}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 812365d17e67..95ad1c002d60 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -90,7 +90,7 @@ xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
struct xfs_bmap_free *flist, /* extents to free */
- int *delete, /* set if inode cluster was deleted */
+ int *deleted, /* set if inode cluster was deleted */
xfs_ino_t *first_ino); /* first inode in deleted cluster */
/*
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 7e309b11e87d..726f83a681a5 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -49,7 +49,8 @@ xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno);
+ cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_btnum);
}
STATIC void
@@ -66,12 +67,26 @@ xfs_inobt_set_root(
xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
}
+STATIC void
+xfs_finobt_set_root(
+ struct xfs_btree_cur *cur, /* finobt cursor */
+ union xfs_btree_ptr *nptr, /* new root pointer to store in the AGI */
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+
+ agi->agi_free_root = nptr->s; /* record the new root in the AGI */
+ be32_add_cpu(&agi->agi_free_level, inc); /* adjust agi_free_level by inc */
+ xfs_ialloc_log_agi(cur->bc_tp, agbp,
+ XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); /* log only the two fields changed above */
+}
+
STATIC int
xfs_inobt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
- int length,
int *stat)
{
xfs_alloc_arg_t args; /* block allocation args */
@@ -173,6 +188,17 @@ xfs_inobt_init_ptr_from_cur(
ptr->s = agi->agi_root;
}
+STATIC void
+xfs_finobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur, /* finobt cursor */
+ union xfs_btree_ptr *ptr) /* out: receives the AGI's agi_free_root */
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+ ptr->s = agi->agi_free_root; /* copied as stored, no byte swap */
+}
+
STATIC __int64_t
xfs_inobt_key_diff(
struct xfs_btree_cur *cur,
@@ -203,6 +229,7 @@ xfs_inobt_verify(
*/
switch (block->bb_magic) {
case cpu_to_be32(XFS_IBT_CRC_MAGIC):
+ case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
@@ -214,6 +241,7 @@ xfs_inobt_verify(
return false;
/* fall through */
case cpu_to_be32(XFS_IBT_MAGIC):
+ case cpu_to_be32(XFS_FIBT_MAGIC):
break;
default:
return 0;
@@ -317,6 +345,28 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
#endif
};
+static const struct xfs_btree_ops xfs_finobt_ops = { /* btree ops installed on XFS_BTNUM_FINO cursors */
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_finobt_set_root, /* finobt-specific: updates agi_free_root/agi_free_level */
+ .alloc_block = xfs_inobt_alloc_block,
+ .free_block = xfs_inobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_rec_from_key = xfs_inobt_init_rec_from_key,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, /* finobt-specific: reads agi_free_root */
+ .key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
+#endif
+};
+
/*
* Allocate a new inode btree cursor.
*/
@@ -325,7 +375,8 @@ xfs_inobt_init_cursor(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *agbp, /* buffer for agi structure */
- xfs_agnumber_t agno) /* allocation group number */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_btnum_t btnum) /* ialloc or free ino btree */
{
struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
struct xfs_btree_cur *cur;
@@ -334,11 +385,17 @@ xfs_inobt_init_cursor(
cur->bc_tp = tp;
cur->bc_mp = mp;
- cur->bc_nlevels = be32_to_cpu(agi->agi_level);
- cur->bc_btnum = XFS_BTNUM_INO;
+ cur->bc_btnum = btnum;
+ if (btnum == XFS_BTNUM_INO) {
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ cur->bc_ops = &xfs_inobt_ops;
+ } else {
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur->bc_ops = &xfs_finobt_ops;
+ }
+
cur->bc_blocklog = mp->m_sb.sb_blocklog;
- cur->bc_ops = &xfs_inobt_ops;
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index f38b22011c4e..d7ebea72c2d0 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -58,7 +58,8 @@ struct xfs_mount;
((index) - 1) * sizeof(xfs_inobt_ptr_t)))
extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
- struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
+ struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
+ xfs_btnum_t);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 98d35244eecc..c48df5f25b9f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -507,8 +507,7 @@ STATIC int
xfs_inode_ag_walk(
struct xfs_mount *mp,
struct xfs_perag *pag,
- int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags,
+ int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
void *args,
@@ -582,7 +581,7 @@ restart:
for (i = 0; i < nr_found; i++) {
if (!batch[i])
continue;
- error = execute(batch[i], pag, flags, args);
+ error = execute(batch[i], flags, args);
IRELE(batch[i]);
if (error == EAGAIN) {
skipped++;
@@ -636,8 +635,7 @@ xfs_eofblocks_worker(
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags,
+ int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
void *args)
@@ -664,8 +662,7 @@ xfs_inode_ag_iterator(
int
xfs_inode_ag_iterator_tag(
struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags,
+ int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
void *args,
@@ -1209,7 +1206,6 @@ xfs_inode_match_id(
STATIC int
xfs_inode_free_eofblocks(
struct xfs_inode *ip,
- struct xfs_perag *pag,
int flags,
void *args)
{
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 9ed68bb750f5..9cf017b899be 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -60,12 +60,10 @@ int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
void xfs_eofblocks_worker(struct work_struct *);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
- int flags, void *args),
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args);
int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
- int flags, void *args),
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args, int tag);
static inline int
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 768087bedbac..a6115fe1ac94 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -655,7 +655,6 @@ xfs_ialloc(
uint flags;
int error;
timespec_t tv;
- int filestreams = 0;
/*
* Call the space management code to pick
@@ -682,6 +681,14 @@ xfs_ialloc(
return error;
ASSERT(ip != NULL);
+ /*
+ * We always convert v1 inodes to v2 now - we only support filesystems
+ * with >= v2 inode capability, so there is no reason for ever leaving
+ * an inode in v1 format.
+ */
+ if (ip->i_d.di_version == 1)
+ ip->i_d.di_version = 2;
+
ip->i_d.di_mode = mode;
ip->i_d.di_onlink = 0;
ip->i_d.di_nlink = nlink;
@@ -691,27 +698,6 @@ xfs_ialloc(
xfs_set_projid(ip, prid);
memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- /*
- * If the superblock version is up to where we support new format
- * inodes and this is currently an old format inode, then change
- * the inode version number now. This way we only do the conversion
- * here rather than here and in the flush/logging code.
- */
- if (xfs_sb_version_hasnlink(&mp->m_sb) &&
- ip->i_d.di_version == 1) {
- ip->i_d.di_version = 2;
- /*
- * We've already zeroed the old link count, the projid field,
- * and the pad field.
- */
- }
-
- /*
- * Project ids won't be stored on disk if we are using a version 1 inode.
- */
- if ((prid != 0) && (ip->i_d.di_version == 1))
- xfs_bump_ino_vers2(tp, ip);
-
if (pip && XFS_INHERIT_GID(pip)) {
ip->i_d.di_gid = pip->i_d.di_gid;
if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
@@ -772,13 +758,6 @@ xfs_ialloc(
flags |= XFS_ILOG_DEV;
break;
case S_IFREG:
- /*
- * we can't set up filestreams until after the VFS inode
- * is set up properly.
- */
- if (pip && xfs_inode_is_filestream(pip))
- filestreams = 1;
- /* fall through */
case S_IFDIR:
if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
uint di_flags = 0;
@@ -844,15 +823,6 @@ xfs_ialloc(
/* now that we have an i_mode we can setup inode ops and unlock */
xfs_setup_inode(ip);
- /* now we have set up the vfs inode we can associate the filestream */
- if (filestreams) {
- error = xfs_filestream_associate(pip, ip);
- if (error < 0)
- return -error;
- if (!error)
- xfs_iflags_set(ip, XFS_IFILESTREAM);
- }
-
*ipp = ip;
return 0;
}
@@ -1073,40 +1043,6 @@ xfs_droplink(
}
/*
- * This gets called when the inode's version needs to be changed from 1 to 2.
- * Currently this happens when the nlink field overflows the old 16-bit value
- * or when chproj is called to change the project for the first time.
- * As a side effect the superblock version will also get rev'd
- * to contain the NLINK bit.
- */
-void
-xfs_bump_ino_vers2(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_d.di_version == 1);
-
- ip->i_d.di_version = 2;
- ip->i_d.di_onlink = 0;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- mp = tp->t_mountp;
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- spin_lock(&mp->m_sb_lock);
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- xfs_sb_version_addnlink(&mp->m_sb);
- spin_unlock(&mp->m_sb_lock);
- xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
- } else {
- spin_unlock(&mp->m_sb_lock);
- }
- }
- /* Caller must log the inode */
-}
-
-/*
* Increment the link count on an inode & log the change.
*/
int
@@ -1116,22 +1052,10 @@ xfs_bumplink(
{
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+ ASSERT(ip->i_d.di_version > 1);
ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
ip->i_d.di_nlink++;
inc_nlink(VFS_I(ip));
- if ((ip->i_d.di_version == 1) &&
- (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
- /*
- * The inode has increased its number of links beyond
- * what can fit in an old format inode. It now needs
- * to be converted to a version 2 inode with a 32 bit
- * link count. If this is the first inode in the file
- * system to do this, then we need to bump the superblock
- * version number as well.
- */
- xfs_bump_ino_vers2(tp, ip);
- }
-
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
return 0;
}
@@ -1699,16 +1623,6 @@ xfs_release(
int truncated;
/*
- * If we are using filestreams, and we have an unlinked
- * file that we are processing the last close on, then nothing
- * will be able to reopen and write to this file. Purge this
- * inode from the filestreams cache so that it doesn't delay
- * teardown of the inode.
- */
- if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
- xfs_filestream_deassociate(ip);
-
- /*
* If we previously truncated this file and removed old data
* in the process, we want to initiate "early" writeout on
* the last close. This is an attempt to combat the notorious
@@ -1838,9 +1752,33 @@ xfs_inactive_ifree(
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
+
+ /*
+ * The ifree transaction might need to allocate blocks for record
+ * insertion to the finobt. We don't want to fail here at ENOSPC, so
+ * allow ifree to dip into the reserved block pool if necessary.
+ *
+ * Freeing large sets of inodes generally means freeing inode chunks,
+ * directory and file data blocks, so this should be relatively safe.
+ * Only under severe circumstances should it be possible to free enough
+ * inodes to exhaust the reserve block pool via finobt expansion while
+ * at the same time not creating free space in the filesystem.
+ *
+ * Send a warning if the reservation does happen to fail, as the inode
+ * now remains allocated and sits on the unlinked list until the fs is
+ * repaired.
+ */
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
+ XFS_IFREE_SPACE_RES(mp), 0);
if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ if (error == ENOSPC) {
+ xfs_warn_ratelimited(mp,
+ "Failed to remove inode(s) from unlinked list. "
+ "Please free space, unmount and run xfs_repair.");
+ } else {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ }
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
return error;
}
@@ -2664,13 +2602,7 @@ xfs_remove(
if (error)
goto std_return;
- /*
- * If we are using filestreams, kill the stream association.
- * If the file is still open it may get a new one but that
- * will get killed on last close in xfs_close() so we don't
- * have to worry about that.
- */
- if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
+ if (is_dir && xfs_inode_is_filestream(ip))
xfs_filestream_deassociate(ip);
return 0;
@@ -3258,6 +3190,7 @@ xfs_iflush_int(
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
ASSERT(iip != NULL && iip->ili_fields != 0);
+ ASSERT(ip->i_d.di_version > 1);
/* set *dip = inode's place in the buffer */
dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -3318,7 +3251,7 @@ xfs_iflush_int(
}
/*
- * Inode item log recovery for v1/v2 inodes are dependent on the
+ * Inode item log recovery for v2 inodes is dependent on the
* di_flushiter count for correct sequencing. We bump the flush
* iteration count so we can detect flushes which postdate a log record
* during recovery. This is redundant as we now log every change and
@@ -3341,40 +3274,9 @@ xfs_iflush_int(
if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
ip->i_d.di_flushiter = 0;
- /*
- * If this is really an old format inode and the superblock version
- * has not been updated to support only new format inodes, then
- * convert back to the old inode format. If the superblock version
- * has been updated, then make the conversion permanent.
- */
- ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
- if (ip->i_d.di_version == 1) {
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- /*
- * Convert it back.
- */
- ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
- dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
- } else {
- /*
- * The superblock version has already been bumped,
- * so just make the conversion to the new inode
- * format permanent.
- */
- ip->i_d.di_version = 2;
- dip->di_version = 2;
- ip->i_d.di_onlink = 0;
- dip->di_onlink = 0;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- memset(&(dip->di_pad[0]), 0,
- sizeof(dip->di_pad));
- ASSERT(xfs_get_projid(ip) == 0);
- }
- }
-
- xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
+ xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
if (XFS_IFORK_Q(ip))
- xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
+ xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
xfs_inobp_check(mp, bp);
/*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f2fcde52b66d..f72bffa67266 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -209,7 +209,6 @@ xfs_get_initial_prid(struct xfs_inode *dp)
#define XFS_ISTALE (1 << 1) /* inode has been staled */
#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
#define XFS_INEW (1 << 3) /* inode has just been allocated */
-#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
@@ -225,8 +224,7 @@ xfs_get_initial_prid(struct xfs_inode *dp)
*/
#define XFS_IRECLAIM_RESET_FLAGS \
(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
- XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \
- XFS_IFILESTREAM);
+ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED)
/*
* Synchronize processes attempting to flush the in-core inode back to disk.
@@ -379,7 +377,6 @@ int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
struct xfs_inode **, int *);
int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
-void xfs_bump_ino_vers2(struct xfs_trans *, struct xfs_inode *);
/* from xfs_file.c */
int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index 24e993996bdc..cb35ae41d4a1 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -437,17 +437,16 @@ xfs_iread(
}
/*
- * The inode format changed when we moved the link count and
- * made it 32 bits long. If this is an old format inode,
- * convert it in memory to look like a new one. If it gets
- * flushed to disk we will convert back before flushing or
- * logging it. We zero out the new projid field and the old link
- * count field. We'll handle clearing the pad field (the remains
- * of the old uuid field) when we actually convert the inode to
- * the new format. We don't change the version number so that we
- * can distinguish this from a real new format inode.
+ * Automatically convert version 1 inode formats in memory to version 2
+ * inode format. If the inode is modified, it will get logged and
+ * rewritten as a version 2 inode. We can do this because we set the
+ * superblock feature bit for v2 inodes unconditionally during mount
+ * and it means the rest of the code can assume the inode version is 2
+ * or higher.
*/
if (ip->i_d.di_version == 1) {
+ ip->i_d.di_version = 2;
+ memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
ip->i_d.di_nlink = ip->i_d.di_onlink;
ip->i_d.di_onlink = 0;
xfs_set_projid(ip, 0);
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index 73514c0486b7..b031e8d0d928 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -798,8 +798,7 @@ xfs_iflush_fork(
xfs_inode_t *ip,
xfs_dinode_t *dip,
xfs_inode_log_item_t *iip,
- int whichfork,
- xfs_buf_t *bp)
+ int whichfork)
{
char *cp;
xfs_ifork_t *ifp;
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h
index eb329a1ea888..7d3b1ed6dcbe 100644
--- a/fs/xfs/xfs_inode_fork.h
+++ b/fs/xfs/xfs_inode_fork.h
@@ -127,8 +127,7 @@ typedef struct xfs_ifork {
int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
- struct xfs_inode_log_item *, int,
- struct xfs_buf *);
+ struct xfs_inode_log_item *, int);
void xfs_idestroy_fork(struct xfs_inode *, int);
void xfs_idata_realloc(struct xfs_inode *, int, int);
void xfs_iroot_realloc(struct xfs_inode *, int, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 686889b4a1e5..a640137b3573 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -145,34 +145,6 @@ xfs_inode_item_size(
xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
}
-/*
- * If this is a v1 format inode, then we need to log it as such. This means
- * that we have to copy the link count from the new field to the old. We
- * don't have to worry about the new fields, because nothing trusts them as
- * long as the old inode version number is there.
- */
-STATIC void
-xfs_inode_item_format_v1_inode(
- struct xfs_inode *ip)
-{
- if (!xfs_sb_version_hasnlink(&ip->i_mount->m_sb)) {
- /*
- * Convert it back.
- */
- ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
- ip->i_d.di_onlink = ip->i_d.di_nlink;
- } else {
- /*
- * The superblock version has already been bumped,
- * so just make the conversion to the new inode
- * format permanent.
- */
- ip->i_d.di_version = 2;
- ip->i_d.di_onlink = 0;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- }
-}
-
STATIC void
xfs_inode_item_format_data_fork(
struct xfs_inode_log_item *iip,
@@ -370,6 +342,8 @@ xfs_inode_item_format(
struct xfs_inode_log_format *ilf;
struct xfs_log_iovec *vecp = NULL;
+ ASSERT(ip->i_d.di_version > 1);
+
ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
ilf->ilf_type = XFS_LI_INODE;
ilf->ilf_ino = ip->i_ino;
@@ -380,8 +354,6 @@ xfs_inode_item_format(
ilf->ilf_size = 2; /* format + core */
xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
- if (ip->i_d.di_version == 1)
- xfs_inode_item_format_v1_inode(ip);
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
&ip->i_d,
xfs_icdinode_size(ip->i_d.di_version));
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0b18776b075e..8bc1bbce7451 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -543,10 +543,11 @@ xfs_attrmulti_by_handle(
ops = memdup_user(am_hreq.ops, size);
if (IS_ERR(ops)) {
- error = PTR_ERR(ops);
+ error = -PTR_ERR(ops);
goto out_dput;
}
+ error = ENOMEM;
attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
if (!attr_name)
goto out_kfree_ops;
@@ -556,7 +557,7 @@ xfs_attrmulti_by_handle(
ops[i].am_error = strncpy_from_user((char *)attr_name,
ops[i].am_attrname, MAXNAMELEN);
if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
+ error = ERANGE;
if (ops[i].am_error < 0)
break;
@@ -1215,7 +1216,7 @@ xfs_ioctl_setattr(
* cleared upon successful return from chown()
*/
if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
- !inode_capable(VFS_I(ip), CAP_FSETID))
+ !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
/*
@@ -1227,15 +1228,8 @@ xfs_ioctl_setattr(
olddquot = xfs_qm_vop_chown(tp, ip,
&ip->i_pdquot, pdqp);
}
+ ASSERT(ip->i_d.di_version > 1);
xfs_set_projid(ip, fa->fsx_projid);
-
- /*
- * We may have to rev the inode as well as
- * the superblock version number since projids didn't
- * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
- */
- if (ip->i_d.di_version == 1)
- xfs_bump_ino_vers2(tp, ip);
}
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index a7992f8de9d3..944d5baa710a 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -424,10 +424,11 @@ xfs_compat_attrmulti_by_handle(
ops = memdup_user(compat_ptr(am_hreq.ops), size);
if (IS_ERR(ops)) {
- error = PTR_ERR(ops);
+ error = -PTR_ERR(ops);
goto out_dput;
}
+ error = ENOMEM;
attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
if (!attr_name)
goto out_kfree_ops;
@@ -438,7 +439,7 @@ xfs_compat_attrmulti_by_handle(
compat_ptr(ops[i].am_attrname),
MAXNAMELEN);
if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
+ error = ERANGE;
if (ops[i].am_error < 0)
break;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 3b80ebae05f5..6c5eb4c551e3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -730,7 +730,7 @@ xfs_iomap_write_allocate(
*/
nimaps = 1;
end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
- error = xfs_bmap_last_offset(NULL, ip, &last_block,
+ error = xfs_bmap_last_offset(ip, &last_block,
XFS_DATA_FORK);
if (error)
goto trans_cancel;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 36d630319a27..205613a06068 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -829,22 +829,34 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);
+ /*
+ * Do all the page cache truncate work outside the transaction context
+ * as the "lock" order is page lock->log space reservation. i.e.
+ * locking pages inside the transaction can ABBA deadlock with
+ * writeback. We have to do the VFS inode size update before we truncate
+ * the pagecache, however, to avoid racing with page faults beyond the
+ * new EOF, as they are not serialised against truncate operations except by
+ * page locks and size updates.
+ *
+ * Hence we are in a situation where a truncate can fail with ENOMEM
+ * from xfs_trans_reserve(), but having already truncated the in-memory
+ * version of the file (i.e. made user visible changes). There's not
+ * much we can do about this, except to hope that the caller sees ENOMEM
+ * and retries the truncate operation.
+ */
error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
if (error)
return error;
+ truncate_setsize(inode, newsize);
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error)
goto out_trans_cancel;
- truncate_setsize(inode, newsize);
-
commit_flags = XFS_TRANS_RELEASE_LOG_RES;
lock_flags |= XFS_ILOCK_EXCL;
-
xfs_ilock(ip, XFS_ILOCK_EXCL);
-
xfs_trans_ijoin(tp, ip, 0);
/*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f46338285152..cb64f222d607 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -270,7 +270,8 @@ xfs_bulkstat(
/*
* Allocate and initialize a btree cursor for ialloc btree.
*/
- cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
+ XFS_BTNUM_INO);
irbp = irbuf;
irbufend = irbuf + nirbuf;
end_of_ag = 0;
@@ -621,7 +622,8 @@ xfs_inumbers(
agino = 0;
continue;
}
- cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
+ XFS_BTNUM_INO);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
&tmp);
if (error) {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a5f8bd9899d3..292308dede6d 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1165,7 +1165,7 @@ xlog_iodone(xfs_buf_t *bp)
/*
* Race to shutdown the filesystem if we see an error.
*/
- if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
+ if (XFS_TEST_ERROR(bp->b_error, l->l_mp,
XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_stale(bp);
@@ -3952,11 +3952,14 @@ xfs_log_force_umount(
retval = xlog_state_ioerror(log);
spin_unlock(&log->l_icloglock);
}
+
/*
- * Wake up everybody waiting on xfs_log_force.
- * Callback all log item committed functions as if the
- * log writes were completed.
+ * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
+ * as if the log writes were completed. The abort handling in the log
+ * item committed callback functions will do this again under lock to
+ * avoid races.
*/
+ wake_up_all(&log->l_cilp->xc_commit_wait);
xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
#ifdef XFSERRORDEBUG
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 2c4004475e71..84e0deb95abd 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -24,7 +24,8 @@ struct xfs_log_vec {
struct xfs_log_iovec *lv_iovecp; /* iovec array */
struct xfs_log_item *lv_item; /* owner */
char *lv_buf; /* formatted buffer */
- int lv_buf_len; /* size of formatted buffer */
+ int lv_bytes; /* accounted space in buffer */
+ int lv_buf_len; /* aligned size of buffer */
int lv_size; /* size of allocated lv */
};
@@ -52,15 +53,21 @@ xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
return vec->i_addr;
}
+/*
+ * We need to make sure the next buffer is naturally aligned for the biggest
+ * basic data type we put into it. We already accounted for this padding when
+ * sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ */
static inline void
xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
{
- /*
- * We need to make sure the next buffer is naturally aligned for the
- * biggest basic data type we put into it. We already accounted for
- * this when sizing the buffer.
- */
lv->lv_buf_len += round_up(len, sizeof(uint64_t));
+ lv->lv_bytes += len;
vec->i_len = len;
}
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7e5455391176..b3425b34e3d5 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -97,7 +97,7 @@ xfs_cil_prepare_item(
{
/* Account for the new LV being passed in */
if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
- *diff_len += lv->lv_buf_len;
+ *diff_len += lv->lv_bytes;
*diff_iovecs += lv->lv_niovecs;
}
@@ -111,7 +111,7 @@ xfs_cil_prepare_item(
else if (old_lv != lv) {
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
- *diff_len -= old_lv->lv_buf_len;
+ *diff_len -= old_lv->lv_bytes;
*diff_iovecs -= old_lv->lv_niovecs;
kmem_free(old_lv);
}
@@ -239,7 +239,7 @@ xlog_cil_insert_format_items(
* that the space reservation accounting is correct.
*/
*diff_iovecs -= lv->lv_niovecs;
- *diff_len -= lv->lv_buf_len;
+ *diff_len -= lv->lv_bytes;
} else {
/* allocate new data chunk */
lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
@@ -259,6 +259,7 @@ xlog_cil_insert_format_items(
/* The allocated data region lies beyond the iovec region */
lv->lv_buf_len = 0;
+ lv->lv_bytes = 0;
lv->lv_buf = (char *)lv + buf_size - nbytes;
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
@@ -385,7 +386,15 @@ xlog_cil_committed(
xfs_extent_busy_clear(mp, &ctx->busy_extents,
(mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
+ /*
+ * If we are aborting the commit, wake up anyone waiting on the
+ * committing list. If we don't, then on a shutdown we can leave processes
+ * blocked in xlog_cil_force_lsn() waiting on a sequence commit that
+ * will never happen because we aborted it.
+ */
spin_lock(&ctx->cil->xc_push_lock);
+ if (abort)
+ wake_up_all(&ctx->cil->xc_commit_wait);
list_del(&ctx->committing);
spin_unlock(&ctx->cil->xc_push_lock);
@@ -564,8 +573,18 @@ restart:
spin_lock(&cil->xc_push_lock);
list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
/*
+ * Avoid getting stuck in this loop because we were woken by the
+ * shutdown, but then went back to sleep once already in the
+ * shutdown state.
+ */
+ if (XLOG_FORCED_SHUTDOWN(log)) {
+ spin_unlock(&cil->xc_push_lock);
+ goto out_abort_free_ticket;
+ }
+
+ /*
* Higher sequences will wait for this one so skip them.
- * Don't wait for own own sequence, either.
+ * Don't wait for our own sequence, either.
*/
if (new_ctx->sequence >= ctx->sequence)
continue;
@@ -810,6 +829,13 @@ restart:
*/
spin_lock(&cil->xc_push_lock);
list_for_each_entry(ctx, &cil->xc_committing, committing) {
+ /*
+ * Avoid getting stuck in this loop because we were woken by the
+ * shutdown, but then went back to sleep once already in the
+ * shutdown state.
+ */
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto out_shutdown;
if (ctx->sequence > sequence)
continue;
if (!ctx->commit_lsn) {
@@ -833,14 +859,12 @@ restart:
* push sequence after the above wait loop and the CIL still contains
* dirty objects.
*
- * When the push occurs, it will empty the CIL and
- * atomically increment the currect sequence past the push sequence and
- * move it into the committing list. Of course, if the CIL is clean at
- * the time of the push, it won't have pushed the CIL at all, so in that
- * case we should try the push for this sequence again from the start
- * just in case.
+ * When the push occurs, it will empty the CIL and atomically increment
+ * the current sequence past the push sequence and move it into the
+ * committing list. Of course, if the CIL is clean at the time of the
+ * push, it won't have pushed the CIL at all, so in that case we should
+ * try the push for this sequence again from the start just in case.
*/
-
if (sequence == cil->xc_current_sequence &&
!list_empty(&cil->xc_cil)) {
spin_unlock(&cil->xc_push_lock);
@@ -849,6 +873,17 @@ restart:
spin_unlock(&cil->xc_push_lock);
return commit_lsn;
+
+ /*
+ * We detected a shutdown in progress. We need to trigger the log force
+ * to pass through its iclog state machine error handling, even though
+ * we are already in a shutdown state. Hence we can't return
+ * NULLCOMMITLSN here as that has special meaning to log forces (i.e.
+ * LSN is already stable), so we return a zero LSN instead.
+ */
+out_shutdown:
+ spin_unlock(&cil->xc_push_lock);
+ return 0;
}
/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index bce53ac81096..981af0f6504b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2138,7 +2138,9 @@ xlog_recover_validate_buf_type(
bp->b_ops = &xfs_allocbt_buf_ops;
break;
case XFS_IBT_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
case XFS_IBT_MAGIC:
+ case XFS_FIBT_MAGIC:
bp->b_ops = &xfs_inobt_buf_ops;
break;
case XFS_BMAP_CRC_MAGIC:
@@ -3145,7 +3147,7 @@ xlog_recover_efd_pass2(
}
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
return 0;
@@ -3520,8 +3522,7 @@ out:
STATIC int
xlog_recover_unmount_trans(
- struct xlog *log,
- struct xlog_recover *trans)
+ struct xlog *log)
{
/* Do nothing now */
xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
@@ -3595,7 +3596,7 @@ xlog_recover_process_data(
trans, pass);
break;
case XLOG_UNMOUNT_TRANS:
- error = xlog_recover_unmount_trans(log, trans);
+ error = xlog_recover_unmount_trans(log);
break;
case XLOG_WAS_CONT_TRANS:
error = xlog_recover_add_to_cont_trans(log,
@@ -3757,7 +3758,7 @@ xlog_recover_process_efis(
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
out:
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
return error;
}
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
index 2af1a0a4d0f1..ee7e0e80246b 100644
--- a/fs/xfs/xfs_log_rlimit.c
+++ b/fs/xfs/xfs_log_rlimit.c
@@ -42,7 +42,7 @@ xfs_log_calc_max_attrsetm_res(
int size;
int nblks;
- size = xfs_attr_leaf_entsize_local_max(mp->m_sb.sb_blocksize) -
+ size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
MAXNAMELEN - 1;
nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
nblks += XFS_B_TO_FSB(mp, size);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 944f3d9456a8..3507cd0ec400 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -323,8 +323,19 @@ reread:
/*
* Initialize the mount structure from the superblock.
*/
- xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
- xfs_sb_quota_from_disk(&mp->m_sb);
+ xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_quota_from_disk(sbp);
+
+ /*
+ * If we haven't validated the superblock, do so now before we try
+ * to check the sector size and reread the superblock appropriately.
+ */
+ if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (loud)
+ xfs_warn(mp, "Invalid superblock magic number");
+ error = EINVAL;
+ goto release_buf;
+ }
/*
* We must be able to do sector-sized and sector-aligned IO.
@@ -337,11 +348,11 @@ reread:
goto release_buf;
}
- /*
- * Re-read the superblock so the buffer is correctly sized,
- * and properly verified.
- */
if (buf_ops == NULL) {
+ /*
+ * Re-read the superblock so the buffer is correctly sized,
+ * and properly verified.
+ */
xfs_buf_relse(bp);
sector_size = sbp->sb_sectsize;
buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
@@ -697,6 +708,12 @@ xfs_mountfs(
mp->m_update_flags |= XFS_SB_VERSIONNUM;
}
+ /* always use v2 inodes by default now */
+ if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
+ mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+ mp->m_update_flags |= XFS_SB_VERSIONNUM;
+ }
+
/*
* Check if sb_agblocks is aligned at stripe boundary
* If sb_agblocks is NOT aligned turn off m_dalign since
@@ -774,12 +791,11 @@ xfs_mountfs(
mp->m_dmevmask = 0; /* not persistent; set after each mount */
- xfs_dir_mount(mp);
-
- /*
- * Initialize the attribute manager's entries.
- */
- mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
+ error = xfs_da_mount(mp);
+ if (error) {
+ xfs_warn(mp, "Failed dir/attr init: %d", error);
+ goto out_remove_uuid;
+ }
/*
* Initialize the precomputed transaction reservations values.
@@ -794,7 +810,7 @@ xfs_mountfs(
error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed per-ag init: %d", error);
- goto out_remove_uuid;
+ goto out_free_dir;
}
if (!sbp->sb_logblocks) {
@@ -969,6 +985,8 @@ xfs_mountfs(
xfs_wait_buftarg(mp->m_ddev_targp);
out_free_perag:
xfs_free_perag(mp);
+ out_free_dir:
+ xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
out:
@@ -1046,6 +1064,7 @@ xfs_unmountfs(
"Freespace may not be correct on next mount.");
xfs_log_unmount(mp);
+ xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
#if defined(DEBUG)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a466c5e5826e..7295a0b7c343 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -27,6 +27,7 @@ struct xfs_nameops;
struct xfs_ail;
struct xfs_quotainfo;
struct xfs_dir_ops;
+struct xfs_da_geometry;
#ifdef HAVE_PERCPU_SB
@@ -96,6 +97,8 @@ typedef struct xfs_mount {
uint m_readio_blocks; /* min read size blocks */
uint m_writeio_log; /* min write size log bytes */
uint m_writeio_blocks; /* min write size blocks */
+ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
+ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
struct xlog *m_log; /* log specific stuff */
int m_logbufs; /* number of log buffers */
int m_logbsize; /* size of each log buffer */
@@ -131,8 +134,6 @@ typedef struct xfs_mount {
int m_fixedfsid[2]; /* unchanged for life of FS */
uint m_dmevmask; /* DMI events for this FS */
__uint64_t m_flags; /* global mount flags */
- uint m_dir_node_ents; /* #entries in a dir danode */
- uint m_attr_node_ents; /* #entries in attr danode */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
int m_inoalign_mask;/* mask sb_inoalignmt if used */
@@ -145,17 +146,10 @@ typedef struct xfs_mount {
int m_dalign; /* stripe unit */
int m_swidth; /* stripe width */
int m_sinoalign; /* stripe unit inode alignment */
- int m_attr_magicpct;/* 37% of the blocksize */
- int m_dir_magicpct; /* 37% of the dir blocksize */
__uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
- int m_dirblksize; /* directory block sz--bytes */
- int m_dirblkfsbs; /* directory block sz--fsbs */
- xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */
- xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
- xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
uint m_chsize; /* size of next field */
atomic_t m_active_trans; /* number trans frozen */
#ifdef HAVE_PERCPU_SB
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 4aff56395732..f99b4933dc22 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -100,14 +100,20 @@
* likely result in a loop in one of the lists. That's a sure-fire recipe for
* an infinite loop in the code.
*/
-typedef struct xfs_mru_cache_elem
-{
- struct list_head list_node;
- unsigned long key;
- void *value;
-} xfs_mru_cache_elem_t;
+struct xfs_mru_cache {
+ struct radix_tree_root store; /* Core storage data structure. */
+ struct list_head *lists; /* Array of lists, one per grp. */
+ struct list_head reap_list; /* Elements overdue for reaping. */
+ spinlock_t lock; /* Lock to protect this struct. */
+ unsigned int grp_count; /* Number of discrete groups. */
+ unsigned int grp_time; /* Time period spanned by grps. */
+ unsigned int lru_grp; /* Group containing time zero. */
+ unsigned long time_zero; /* Time first element was added. */
+ xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
+ struct delayed_work work; /* Workqueue data for reaping. */
+ unsigned int queued; /* work has been queued */
+};
-static kmem_zone_t *xfs_mru_elem_zone;
static struct workqueue_struct *xfs_mru_reap_wq;
/*
@@ -129,12 +135,12 @@ static struct workqueue_struct *xfs_mru_reap_wq;
*/
STATIC unsigned long
_xfs_mru_cache_migrate(
- xfs_mru_cache_t *mru,
- unsigned long now)
+ struct xfs_mru_cache *mru,
+ unsigned long now)
{
- unsigned int grp;
- unsigned int migrated = 0;
- struct list_head *lru_list;
+ unsigned int grp;
+ unsigned int migrated = 0;
+ struct list_head *lru_list;
/* Nothing to do if the data store is empty. */
if (!mru->time_zero)
@@ -193,11 +199,11 @@ _xfs_mru_cache_migrate(
*/
STATIC void
_xfs_mru_cache_list_insert(
- xfs_mru_cache_t *mru,
- xfs_mru_cache_elem_t *elem)
+ struct xfs_mru_cache *mru,
+ struct xfs_mru_cache_elem *elem)
{
- unsigned int grp = 0;
- unsigned long now = jiffies;
+ unsigned int grp = 0;
+ unsigned long now = jiffies;
/*
* If the data store is empty, initialise time zero, leave grp set to
@@ -231,10 +237,10 @@ _xfs_mru_cache_list_insert(
*/
STATIC void
_xfs_mru_cache_clear_reap_list(
- xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock)
-
+ struct xfs_mru_cache *mru)
+ __releases(mru->lock) __acquires(mru->lock)
{
- xfs_mru_cache_elem_t *elem, *next;
+ struct xfs_mru_cache_elem *elem, *next;
struct list_head tmp;
INIT_LIST_HEAD(&tmp);
@@ -252,15 +258,8 @@ _xfs_mru_cache_clear_reap_list(
spin_unlock(&mru->lock);
list_for_each_entry_safe(elem, next, &tmp, list_node) {
-
- /* Remove the element from the reap list. */
list_del_init(&elem->list_node);
-
- /* Call the client's free function with the key and value pointer. */
- mru->free_func(elem->key, elem->value);
-
- /* Free the element structure. */
- kmem_zone_free(xfs_mru_elem_zone, elem);
+ mru->free_func(elem);
}
spin_lock(&mru->lock);
@@ -277,7 +276,8 @@ STATIC void
_xfs_mru_cache_reap(
struct work_struct *work)
{
- xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work);
+ struct xfs_mru_cache *mru =
+ container_of(work, struct xfs_mru_cache, work.work);
unsigned long now, next;
ASSERT(mru && mru->lists);
@@ -304,28 +304,16 @@ _xfs_mru_cache_reap(
int
xfs_mru_cache_init(void)
{
- xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t),
- "xfs_mru_cache_elem");
- if (!xfs_mru_elem_zone)
- goto out;
-
xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
if (!xfs_mru_reap_wq)
- goto out_destroy_mru_elem_zone;
-
+ return -ENOMEM;
return 0;
-
- out_destroy_mru_elem_zone:
- kmem_zone_destroy(xfs_mru_elem_zone);
- out:
- return -ENOMEM;
}
void
xfs_mru_cache_uninit(void)
{
destroy_workqueue(xfs_mru_reap_wq);
- kmem_zone_destroy(xfs_mru_elem_zone);
}
/*
@@ -336,14 +324,14 @@ xfs_mru_cache_uninit(void)
*/
int
xfs_mru_cache_create(
- xfs_mru_cache_t **mrup,
+ struct xfs_mru_cache **mrup,
unsigned int lifetime_ms,
unsigned int grp_count,
xfs_mru_cache_free_func_t free_func)
{
- xfs_mru_cache_t *mru = NULL;
- int err = 0, grp;
- unsigned int grp_time;
+ struct xfs_mru_cache *mru = NULL;
+ int err = 0, grp;
+ unsigned int grp_time;
if (mrup)
*mrup = NULL;
@@ -400,7 +388,7 @@ exit:
*/
static void
xfs_mru_cache_flush(
- xfs_mru_cache_t *mru)
+ struct xfs_mru_cache *mru)
{
if (!mru || !mru->lists)
return;
@@ -420,7 +408,7 @@ xfs_mru_cache_flush(
void
xfs_mru_cache_destroy(
- xfs_mru_cache_t *mru)
+ struct xfs_mru_cache *mru)
{
if (!mru || !mru->lists)
return;
@@ -438,38 +426,30 @@ xfs_mru_cache_destroy(
*/
int
xfs_mru_cache_insert(
- xfs_mru_cache_t *mru,
- unsigned long key,
- void *value)
+ struct xfs_mru_cache *mru,
+ unsigned long key,
+ struct xfs_mru_cache_elem *elem)
{
- xfs_mru_cache_elem_t *elem;
+ int error;
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
return EINVAL;
- elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP);
- if (!elem)
+ if (radix_tree_preload(GFP_KERNEL))
return ENOMEM;
- if (radix_tree_preload(GFP_KERNEL)) {
- kmem_zone_free(xfs_mru_elem_zone, elem);
- return ENOMEM;
- }
-
INIT_LIST_HEAD(&elem->list_node);
elem->key = key;
- elem->value = value;
spin_lock(&mru->lock);
-
- radix_tree_insert(&mru->store, key, elem);
+ error = -radix_tree_insert(&mru->store, key, elem);
radix_tree_preload_end();
- _xfs_mru_cache_list_insert(mru, elem);
-
+ if (!error)
+ _xfs_mru_cache_list_insert(mru, elem);
spin_unlock(&mru->lock);
- return 0;
+ return error;
}
/*
@@ -478,13 +458,12 @@ xfs_mru_cache_insert(
* the client data pointer for the removed element is returned, otherwise this
* function will return a NULL pointer.
*/
-void *
+struct xfs_mru_cache_elem *
xfs_mru_cache_remove(
- xfs_mru_cache_t *mru,
- unsigned long key)
+ struct xfs_mru_cache *mru,
+ unsigned long key)
{
- xfs_mru_cache_elem_t *elem;
- void *value = NULL;
+ struct xfs_mru_cache_elem *elem;
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
@@ -492,17 +471,11 @@ xfs_mru_cache_remove(
spin_lock(&mru->lock);
elem = radix_tree_delete(&mru->store, key);
- if (elem) {
- value = elem->value;
+ if (elem)
list_del(&elem->list_node);
- }
-
spin_unlock(&mru->lock);
- if (elem)
- kmem_zone_free(xfs_mru_elem_zone, elem);
-
- return value;
+ return elem;
}
/*
@@ -511,13 +484,14 @@ xfs_mru_cache_remove(
*/
void
xfs_mru_cache_delete(
- xfs_mru_cache_t *mru,
- unsigned long key)
+ struct xfs_mru_cache *mru,
+ unsigned long key)
{
- void *value = xfs_mru_cache_remove(mru, key);
+ struct xfs_mru_cache_elem *elem;
- if (value)
- mru->free_func(key, value);
+ elem = xfs_mru_cache_remove(mru, key);
+ if (elem)
+ mru->free_func(elem);
}
/*
@@ -540,12 +514,12 @@ xfs_mru_cache_delete(
* status, we need to help it get it right by annotating the path that does
* not release the lock.
*/
-void *
+struct xfs_mru_cache_elem *
xfs_mru_cache_lookup(
- xfs_mru_cache_t *mru,
- unsigned long key)
+ struct xfs_mru_cache *mru,
+ unsigned long key)
{
- xfs_mru_cache_elem_t *elem;
+ struct xfs_mru_cache_elem *elem;
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
@@ -560,7 +534,7 @@ xfs_mru_cache_lookup(
} else
spin_unlock(&mru->lock);
- return elem ? elem->value : NULL;
+ return elem;
}
/*
@@ -570,7 +544,8 @@ xfs_mru_cache_lookup(
*/
void
xfs_mru_cache_done(
- xfs_mru_cache_t *mru) __releases(mru->lock)
+ struct xfs_mru_cache *mru)
+ __releases(mru->lock)
{
spin_unlock(&mru->lock);
}
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 36dd3ec8b4eb..fb5245ba5ff7 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -18,24 +18,15 @@
#ifndef __XFS_MRU_CACHE_H__
#define __XFS_MRU_CACHE_H__
+struct xfs_mru_cache;
-/* Function pointer type for callback to free a client's data pointer. */
-typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*);
+struct xfs_mru_cache_elem {
+ struct list_head list_node;
+ unsigned long key;
+};
-typedef struct xfs_mru_cache
-{
- struct radix_tree_root store; /* Core storage data structure. */
- struct list_head *lists; /* Array of lists, one per grp. */
- struct list_head reap_list; /* Elements overdue for reaping. */
- spinlock_t lock; /* Lock to protect this struct. */
- unsigned int grp_count; /* Number of discrete groups. */
- unsigned int grp_time; /* Time period spanned by grps. */
- unsigned int lru_grp; /* Group containing time zero. */
- unsigned long time_zero; /* Time first element was added. */
- xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
- struct delayed_work work; /* Workqueue data for reaping. */
- unsigned int queued; /* work has been queued */
-} xfs_mru_cache_t;
+/* Function pointer type for callback to free a client's data pointer. */
+typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem);
int xfs_mru_cache_init(void);
void xfs_mru_cache_uninit(void);
@@ -44,10 +35,12 @@ int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
xfs_mru_cache_free_func_t free_func);
void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
- void *value);
-void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
+ struct xfs_mru_cache_elem *elem);
+struct xfs_mru_cache_elem *
+xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
+struct xfs_mru_cache_elem *
+xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_done(struct xfs_mru_cache *mru);
#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index dc977b6e6a36..6d26759c779a 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -193,47 +193,6 @@ xfs_qm_dqpurge(
}
/*
- * Release the group or project dquot pointers the user dquots maybe carrying
- * around as a hint, and proceed to purge the user dquot cache if requested.
-*/
-STATIC int
-xfs_qm_dqpurge_hints(
- struct xfs_dquot *dqp,
- void *data)
-{
- struct xfs_dquot *gdqp = NULL;
- struct xfs_dquot *pdqp = NULL;
- uint flags = *((uint *)data);
-
- xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_FREEING) {
- xfs_dqunlock(dqp);
- return EAGAIN;
- }
-
- /* If this quota has a hint attached, prepare for releasing it now */
- gdqp = dqp->q_gdquot;
- if (gdqp)
- dqp->q_gdquot = NULL;
-
- pdqp = dqp->q_pdquot;
- if (pdqp)
- dqp->q_pdquot = NULL;
-
- xfs_dqunlock(dqp);
-
- if (gdqp)
- xfs_qm_dqrele(gdqp);
- if (pdqp)
- xfs_qm_dqrele(pdqp);
-
- if (flags & XFS_QMOPT_UQUOTA)
- return xfs_qm_dqpurge(dqp, NULL);
-
- return 0;
-}
-
-/*
* Purge the dquot cache.
*/
void
@@ -241,18 +200,8 @@ xfs_qm_dqpurge_all(
struct xfs_mount *mp,
uint flags)
{
- /*
- * We have to release group/project dquot hint(s) from the user dquot
- * at first if they are there, otherwise we would run into an infinite
- * loop while walking through radix tree to purge other type of dquots
- * since their refcount is not zero if the user dquot refers to them
- * as hint.
- *
- * Call the special xfs_qm_dqpurge_hints() will end up go through the
- * general xfs_qm_dqpurge() against user dquot cache if requested.
- */
- xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge_hints, &flags);
-
+ if (flags & XFS_QMOPT_UQUOTA)
+ xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
if (flags & XFS_QMOPT_GQUOTA)
xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
if (flags & XFS_QMOPT_PQUOTA)
@@ -409,7 +358,6 @@ xfs_qm_dqattach_one(
xfs_dqid_t id,
uint type,
uint doalloc,
- xfs_dquot_t *udqhint, /* hint */
xfs_dquot_t **IO_idqpp)
{
xfs_dquot_t *dqp;
@@ -419,9 +367,9 @@ xfs_qm_dqattach_one(
error = 0;
/*
- * See if we already have it in the inode itself. IO_idqpp is
- * &i_udquot or &i_gdquot. This made the code look weird, but
- * made the logic a lot simpler.
+ * See if we already have it in the inode itself. IO_idqpp is &i_udquot
+ * or &i_gdquot. This made the code look weird, but made the logic a lot
+ * simpler.
*/
dqp = *IO_idqpp;
if (dqp) {
@@ -430,49 +378,10 @@ xfs_qm_dqattach_one(
}
/*
- * udqhint is the i_udquot field in inode, and is non-NULL only
- * when the type arg is group/project. Its purpose is to save a
- * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
- * the user dquot.
- */
- if (udqhint) {
- ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
- xfs_dqlock(udqhint);
-
- /*
- * No need to take dqlock to look at the id.
- *
- * The ID can't change until it gets reclaimed, and it won't
- * be reclaimed as long as we have a ref from inode and we
- * hold the ilock.
- */
- if (type == XFS_DQ_GROUP)
- dqp = udqhint->q_gdquot;
- else
- dqp = udqhint->q_pdquot;
- if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
- ASSERT(*IO_idqpp == NULL);
-
- *IO_idqpp = xfs_qm_dqhold(dqp);
- xfs_dqunlock(udqhint);
- return 0;
- }
-
- /*
- * We can't hold a dquot lock when we call the dqget code.
- * We'll deadlock in no time, because of (not conforming to)
- * lock ordering - the inodelock comes before any dquot lock,
- * and we may drop and reacquire the ilock in xfs_qm_dqget().
- */
- xfs_dqunlock(udqhint);
- }
-
- /*
- * Find the dquot from somewhere. This bumps the
- * reference count of dquot and returns it locked.
- * This can return ENOENT if dquot didn't exist on
- * disk and we didn't ask it to allocate;
- * ESRCH if quotas got turned off suddenly.
+ * Find the dquot from somewhere. This bumps the reference count of
+ * dquot and returns it locked. This can return ENOENT if dquot didn't
+ * exist on disk and we didn't ask it to allocate; ESRCH if quotas got
+ * turned off suddenly.
*/
error = xfs_qm_dqget(ip->i_mount, ip, id, type,
doalloc | XFS_QMOPT_DOWARN, &dqp);
@@ -490,48 +399,6 @@ xfs_qm_dqattach_one(
return 0;
}
-
-/*
- * Given a udquot and group/project type, attach the group/project
- * dquot pointer to the udquot as a hint for future lookups.
- */
-STATIC void
-xfs_qm_dqattach_hint(
- struct xfs_inode *ip,
- int type)
-{
- struct xfs_dquot **dqhintp;
- struct xfs_dquot *dqp;
- struct xfs_dquot *udq = ip->i_udquot;
-
- ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
-
- xfs_dqlock(udq);
-
- if (type == XFS_DQ_GROUP) {
- dqp = ip->i_gdquot;
- dqhintp = &udq->q_gdquot;
- } else {
- dqp = ip->i_pdquot;
- dqhintp = &udq->q_pdquot;
- }
-
- if (*dqhintp) {
- struct xfs_dquot *tmp;
-
- if (*dqhintp == dqp)
- goto done;
-
- tmp = *dqhintp;
- *dqhintp = NULL;
- xfs_qm_dqrele(tmp);
- }
-
- *dqhintp = xfs_qm_dqhold(dqp);
-done:
- xfs_dqunlock(udq);
-}
-
static bool
xfs_qm_need_dqattach(
struct xfs_inode *ip)
@@ -562,7 +429,6 @@ xfs_qm_dqattach_locked(
uint flags)
{
xfs_mount_t *mp = ip->i_mount;
- uint nquotas = 0;
int error = 0;
if (!xfs_qm_need_dqattach(ip))
@@ -570,77 +436,39 @@ xfs_qm_dqattach_locked(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (XFS_IS_UQUOTA_ON(mp)) {
+ if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
flags & XFS_QMOPT_DQALLOC,
- NULL, &ip->i_udquot);
+ &ip->i_udquot);
if (error)
goto done;
- nquotas++;
+ ASSERT(ip->i_udquot);
}
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (XFS_IS_GQUOTA_ON(mp)) {
+ if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
flags & XFS_QMOPT_DQALLOC,
- ip->i_udquot, &ip->i_gdquot);
- /*
- * Don't worry about the udquot that we may have
- * attached above. It'll get detached, if not already.
- */
+ &ip->i_gdquot);
if (error)
goto done;
- nquotas++;
+ ASSERT(ip->i_gdquot);
}
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (XFS_IS_PQUOTA_ON(mp)) {
+ if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
flags & XFS_QMOPT_DQALLOC,
- ip->i_udquot, &ip->i_pdquot);
- /*
- * Don't worry about the udquot that we may have
- * attached above. It'll get detached, if not already.
- */
+ &ip->i_pdquot);
if (error)
goto done;
- nquotas++;
+ ASSERT(ip->i_pdquot);
}
+done:
/*
- * Attach this group/project quota to the user quota as a hint.
- * This WON'T, in general, result in a thrash.
+ * Don't worry about the dquots that we may have attached before any
+ * error - they'll get detached later if it has not already been done.
*/
- if (nquotas > 1 && ip->i_udquot) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_gdquot || !XFS_IS_GQUOTA_ON(mp));
- ASSERT(ip->i_pdquot || !XFS_IS_PQUOTA_ON(mp));
-
- /*
- * We do not have i_udquot locked at this point, but this check
- * is OK since we don't depend on the i_gdquot to be accurate
- * 100% all the time. It is just a hint, and this will
- * succeed in general.
- */
- if (ip->i_udquot->q_gdquot != ip->i_gdquot)
- xfs_qm_dqattach_hint(ip, XFS_DQ_GROUP);
-
- if (ip->i_udquot->q_pdquot != ip->i_pdquot)
- xfs_qm_dqattach_hint(ip, XFS_DQ_PROJ);
- }
-
- done:
-#ifdef DEBUG
- if (!error) {
- if (XFS_IS_UQUOTA_ON(mp))
- ASSERT(ip->i_udquot);
- if (XFS_IS_GQUOTA_ON(mp))
- ASSERT(ip->i_gdquot);
- if (XFS_IS_PQUOTA_ON(mp))
- ASSERT(ip->i_pdquot);
- }
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-#endif
return error;
}
@@ -865,8 +693,7 @@ xfs_qm_init_quotainfo(
/* Precalc some constants */
qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
- qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(mp,
- qinf->qi_dqchunklen);
+ qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen);
mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 3daf5ea1eb8d..bbc813caba4c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -278,9 +278,10 @@ xfs_qm_scall_trunc_qfiles(
xfs_mount_t *mp,
uint flags)
{
- int error;
+ int error = EINVAL;
- if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
+ if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
+ (flags & ~XFS_DQ_ALLTYPES)) {
xfs_debug(mp, "%s: flags=%x m_qflags=%x",
__func__, flags, mp->m_qflags);
return XFS_ERROR(EINVAL);
@@ -959,7 +960,6 @@ xfs_qm_export_flags(
STATIC int
xfs_dqrele_inode(
struct xfs_inode *ip,
- struct xfs_perag *pag,
int flags,
void *args)
{
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h
index b3b2b1065c0f..137e20937077 100644
--- a/fs/xfs/xfs_quota_defs.h
+++ b/fs/xfs/xfs_quota_defs.h
@@ -156,6 +156,6 @@ typedef __uint16_t xfs_qwarncnt_t;
extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
xfs_dqid_t id, uint type, uint flags, char *str);
-extern int xfs_calc_dquots_per_chunk(struct xfs_mount *mp, unsigned int nbblks);
+extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index af33cafe69b6..2ad1b9822e92 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -100,16 +100,36 @@ xfs_fs_set_xstate(
if (!XFS_IS_QUOTA_ON(mp))
return -EINVAL;
return -xfs_qm_scall_quotaoff(mp, flags);
- case Q_XQUOTARM:
- if (XFS_IS_QUOTA_ON(mp))
- return -EINVAL;
- return -xfs_qm_scall_trunc_qfiles(mp, flags);
}
return -EINVAL;
}
STATIC int
+xfs_fs_rm_xquota(
+ struct super_block *sb,
+ unsigned int uflags)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ unsigned int flags = 0;
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ if (XFS_IS_QUOTA_ON(mp))
+ return -EINVAL;
+ /* Map each requested FS_*_QUOTA bit to the matching XFS_DQ_* type. */
+ if (uflags & FS_USER_QUOTA)
+ flags |= XFS_DQ_USER;
+ if (uflags & FS_GROUP_QUOTA)
+ flags |= XFS_DQ_GROUP;
+ if (uflags & FS_PROJ_QUOTA)
+ flags |= XFS_DQ_PROJ;
+
+ return -xfs_qm_scall_trunc_qfiles(mp, flags);
+}
+
+STATIC int
xfs_fs_get_dqblk(
struct super_block *sb,
struct kqid qid,
@@ -149,6 +169,7 @@ const struct quotactl_ops xfs_quotactl_operations = {
.get_xstatev = xfs_fs_get_xstatev,
.get_xstate = xfs_fs_get_xstate,
.set_xstate = xfs_fs_set_xstate,
+ .rm_xquota = xfs_fs_rm_xquota,
.get_dqblk = xfs_fs_get_dqblk,
.set_dqblk = xfs_fs_set_dqblk,
};
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c
index b1f2fe8af4a8..f4dd697cac08 100644
--- a/fs/xfs/xfs_rtbitmap.c
+++ b/fs/xfs/xfs_rtbitmap.c
@@ -74,7 +74,6 @@ xfs_rtbuf_get(
mp->m_bsize, 0, &bp, NULL);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(bp));
*bpp = bp;
return 0;
}
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 8baf61afae1d..c3453b11f563 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -291,7 +291,8 @@ xfs_mount_validate_sb(
(sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
sbp->sb_dblocks == 0 ||
sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
- sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
+ sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) ||
+ sbp->sb_shared_vn != 0)) {
xfs_notice(mp, "SB sanity check failed");
return XFS_ERROR(EFSCORRUPTED);
}
@@ -333,15 +334,6 @@ xfs_mount_validate_sb(
xfs_warn(mp, "Offline file system operation in progress!");
return XFS_ERROR(EFSCORRUPTED);
}
-
- /*
- * Version 1 directory format has never worked on Linux.
- */
- if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
- xfs_warn(mp, "file system using version 1 directory format");
- return XFS_ERROR(ENOSYS);
- }
-
return 0;
}
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index f7b2fe77c5a5..c43c2d609a24 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -36,8 +36,6 @@ struct xfs_trans;
#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
#define XFS_SB_VERSION_NUMBITS 0x000f
#define XFS_SB_VERSION_ALLFBITS 0xfff0
-#define XFS_SB_VERSION_SASHFBITS 0xf000
-#define XFS_SB_VERSION_REALFBITS 0x0ff0
#define XFS_SB_VERSION_ATTRBIT 0x0010
#define XFS_SB_VERSION_NLINKBIT 0x0020
#define XFS_SB_VERSION_QUOTABIT 0x0040
@@ -50,24 +48,15 @@ struct xfs_trans;
#define XFS_SB_VERSION_DIRV2BIT 0x2000
#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
#define XFS_SB_VERSION_MOREBITSBIT 0x8000
-#define XFS_SB_VERSION_OKSASHFBITS \
- (XFS_SB_VERSION_EXTFLGBIT | \
- XFS_SB_VERSION_DIRV2BIT | \
- XFS_SB_VERSION_BORGBIT)
-#define XFS_SB_VERSION_OKREALFBITS \
- (XFS_SB_VERSION_ATTRBIT | \
- XFS_SB_VERSION_NLINKBIT | \
- XFS_SB_VERSION_QUOTABIT | \
- XFS_SB_VERSION_ALIGNBIT | \
- XFS_SB_VERSION_DALIGNBIT | \
- XFS_SB_VERSION_SHAREDBIT | \
- XFS_SB_VERSION_LOGV2BIT | \
- XFS_SB_VERSION_SECTORBIT | \
- XFS_SB_VERSION_MOREBITSBIT)
-#define XFS_SB_VERSION_OKREALBITS \
- (XFS_SB_VERSION_NUMBITS | \
- XFS_SB_VERSION_OKREALFBITS | \
- XFS_SB_VERSION_OKSASHFBITS)
+
+/*
+ * Supported feature bit list is just all bits in the versionnum field because
+ * we've used them all up and understand them all. Except, of course, for the
+ * shared superblock bit, which nobody knows what it does and so is unsupported.
+ */
+#define XFS_SB_VERSION_OKBITS \
+ ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
+ ~XFS_SB_VERSION_SHAREDBIT)
/*
* There are two words to hold XFS "feature" bits: the original
@@ -76,7 +65,6 @@ struct xfs_trans;
*
* These defines represent bits in sb_features2.
*/
-#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */
#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
@@ -86,16 +74,11 @@ struct xfs_trans;
#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
-#define XFS_SB_VERSION2_OKREALFBITS \
+#define XFS_SB_VERSION2_OKBITS \
(XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
XFS_SB_VERSION2_ATTR2BIT | \
XFS_SB_VERSION2_PROJID32BIT | \
XFS_SB_VERSION2_FTYPE)
-#define XFS_SB_VERSION2_OKSASHFBITS \
- (0)
-#define XFS_SB_VERSION2_OKREALBITS \
- (XFS_SB_VERSION2_OKREALFBITS | \
- XFS_SB_VERSION2_OKSASHFBITS )
/*
* Superblock - in core version. Must match the ondisk version below.
@@ -345,214 +328,140 @@ typedef enum {
#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-static inline int xfs_sb_good_version(xfs_sb_t *sbp)
-{
- /* We always support version 1-3 */
- if (sbp->sb_versionnum >= XFS_SB_VERSION_1 &&
- sbp->sb_versionnum <= XFS_SB_VERSION_3)
- return 1;
-
- /* We support version 4 if all feature bits are supported */
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) {
- if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) ||
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
- return 0;
-
- if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
- return 0;
- return 1;
- }
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
- return 1;
-
- return 0;
-}
-
/*
- * Detect a mismatched features2 field. Older kernels read/wrote
- * this into the wrong slot, so to be safe we keep them in sync.
+ * The first XFS version we support is a v4 superblock with V2 directories.
*/
-static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
+static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
{
- return (sbp->sb_bad_features2 != sbp->sb_features2);
-}
-
-static inline unsigned xfs_sb_version_tonew(unsigned v)
-{
- if (v == XFS_SB_VERSION_1)
- return XFS_SB_VERSION_4;
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+ return false;
- if (v == XFS_SB_VERSION_2)
- return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
+ /* check for unknown features in the fs */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
- return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT |
- XFS_SB_VERSION_NLINKBIT;
+ return true;
}
-static inline unsigned xfs_sb_version_toold(unsigned v)
+static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
{
- if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT))
- return 0;
- if (v & XFS_SB_VERSION_NLINKBIT)
- return XFS_SB_VERSION_3;
- if (v & XFS_SB_VERSION_ATTRBIT)
- return XFS_SB_VERSION_2;
- return XFS_SB_VERSION_1;
-}
-
-static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
-{
- return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
- sbp->sb_versionnum == XFS_SB_VERSION_3 ||
- (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+ return true;
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+ return xfs_sb_good_v4_features(sbp);
+ return false;
}
-static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
+/*
+ * Detect a mismatched features2 field. Older kernels read/wrote
+ * this into the wrong slot, so to be safe we keep them in sync.
+ */
+static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
{
- if (sbp->sb_versionnum == XFS_SB_VERSION_1)
- sbp->sb_versionnum = XFS_SB_VERSION_2;
- else if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4)
- sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
- else
- sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
+ return sbp->sb_bad_features2 != sbp->sb_features2;
}
-static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
{
- return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
- (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
+ return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
}
-static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
{
- if (sbp->sb_versionnum <= XFS_SB_VERSION_2)
- sbp->sb_versionnum = XFS_SB_VERSION_3;
- else
- sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+ sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
}
-static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
+ return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
}
-static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
{
- if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4)
- sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
- else
- sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
- XFS_SB_VERSION_QUOTABIT;
+ sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
}
-static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
(sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
}
-static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
-}
-
-static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
+ return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
}
-static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT));
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
}
-static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT));
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
}
-static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT));
+ return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
}
-static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
+ return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
}
-static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
-}
-
-static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT));
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
}
/*
* sb_features2 bit version macros.
- *
- * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro:
- *
- * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp)
- * ((xfs_sb_version_hasmorebits(sbp) &&
- * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT)
*/
-
-static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
{
return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
(xfs_sb_version_hasmorebits(sbp) &&
(sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
}
-static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
{
return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
(xfs_sb_version_hasmorebits(sbp) &&
(sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
}
-static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
{
sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+ sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
}
-static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
+static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
{
sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+ sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
if (!sbp->sb_features2)
sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
}
-static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
{
return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
(xfs_sb_version_hasmorebits(sbp) &&
(sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
}
-static inline void xfs_sb_version_addprojid32bit(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
{
sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
@@ -587,7 +496,9 @@ xfs_sb_has_compat_feature(
return (sbp->sb_features_compat & feature) != 0;
}
-#define XFS_SB_FEAT_RO_COMPAT_ALL 0
+#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_ALL \
+ (XFS_SB_FEAT_RO_COMPAT_FINOBT)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
@@ -623,12 +534,12 @@ xfs_sb_has_incompat_log_feature(
/*
* V5 superblock specific feature checks
*/
-static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
+static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
-static inline int xfs_sb_version_has_pquotino(xfs_sb_t *sbp)
+static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
@@ -641,6 +552,12 @@ static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
(sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
}
+static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
+}
+
/*
* end of superblock version macros
*/
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h
index 4484e5151395..82404da2ca67 100644
--- a/fs/xfs/xfs_shared.h
+++ b/fs/xfs/xfs_shared.h
@@ -238,7 +238,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
uint32_t size, struct xfs_buf *bp);
-bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
uint32_t size, struct xfs_buf *bp);
void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_inode *ip, struct xfs_ifork *ifp);
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index ce372b7d5644..f2240383d4bb 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -59,6 +59,7 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
{ "abtc2", XFSSTAT_END_ABTC_V2 },
{ "bmbt2", XFSSTAT_END_BMBT_V2 },
{ "ibt2", XFSSTAT_END_IBT_V2 },
+ { "fibt2", XFSSTAT_END_FIBT_V2 },
/* we print both series of quota information together */
{ "qm", XFSSTAT_END_QM },
};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index c03ad38ceaeb..c8f238b8299a 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -183,7 +183,23 @@ struct xfsstats {
__uint32_t xs_ibt_2_alloc;
__uint32_t xs_ibt_2_free;
__uint32_t xs_ibt_2_moves;
-#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6)
+#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15)
+ __uint32_t xs_fibt_2_lookup;
+ __uint32_t xs_fibt_2_compare;
+ __uint32_t xs_fibt_2_insrec;
+ __uint32_t xs_fibt_2_delrec;
+ __uint32_t xs_fibt_2_newroot;
+ __uint32_t xs_fibt_2_killroot;
+ __uint32_t xs_fibt_2_increment;
+ __uint32_t xs_fibt_2_decrement;
+ __uint32_t xs_fibt_2_lshift;
+ __uint32_t xs_fibt_2_rshift;
+ __uint32_t xs_fibt_2_split;
+ __uint32_t xs_fibt_2_join;
+ __uint32_t xs_fibt_2_alloc;
+ __uint32_t xs_fibt_2_free;
+ __uint32_t xs_fibt_2_moves;
+#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6)
__uint32_t xs_qm_dqreclaims;
__uint32_t xs_qm_dqreclaim_misses;
__uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3494eff8e4eb..8f0333b3f7a0 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -765,20 +765,18 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
- mp->m_fsname);
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
- mp->m_fsname);
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
@@ -811,8 +809,7 @@ xfs_setup_devices(
{
int error;
- error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
- mp->m_sb.sb_sectsize);
+ error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
if (error)
return error;
@@ -822,14 +819,12 @@ xfs_setup_devices(
if (xfs_sb_version_hassector(&mp->m_sb))
log_sector_size = mp->m_sb.sb_logsectsize;
error = xfs_setsize_buftarg(mp->m_logdev_targp,
- mp->m_sb.sb_blocksize,
log_sector_size);
if (error)
return error;
}
if (mp->m_rtdev_targp) {
error = xfs_setsize_buftarg(mp->m_rtdev_targp,
- mp->m_sb.sb_blocksize,
mp->m_sb.sb_sectsize);
if (error)
return error;
@@ -1754,13 +1749,9 @@ init_xfs_fs(void)
if (error)
goto out_destroy_wq;
- error = xfs_filestream_init();
- if (error)
- goto out_mru_cache_uninit;
-
error = xfs_buf_init();
if (error)
- goto out_filestream_uninit;
+ goto out_mru_cache_uninit;
error = xfs_init_procfs();
if (error)
@@ -1787,8 +1778,6 @@ init_xfs_fs(void)
xfs_cleanup_procfs();
out_buf_terminate:
xfs_buf_terminate();
- out_filestream_uninit:
- xfs_filestream_uninit();
out_mru_cache_uninit:
xfs_mru_cache_uninit();
out_destroy_wq:
@@ -1807,7 +1796,6 @@ exit_xfs_fs(void)
xfs_sysctl_unregister();
xfs_cleanup_procfs();
xfs_buf_terminate();
- xfs_filestream_uninit();
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_zones();
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 52979aa90986..d69363c833e1 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -27,6 +27,7 @@
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
@@ -92,7 +93,7 @@ xfs_readlink_bmap(
cur_chunk = bp->b_addr;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (!xfs_symlink_hdr_ok(mp, ip->i_ino, offset,
+ if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
byte_cnt, bp)) {
error = EFSCORRUPTED;
xfs_alert(mp,
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
index 9b32052ff65e..23c2f2577c8d 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -80,7 +80,6 @@ xfs_symlink_hdr_set(
*/
bool
xfs_symlink_hdr_ok(
- struct xfs_mount *mp,
xfs_ino_t ino,
uint32_t offset,
uint32_t size,
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index dee3279c095e..1e85bcd0e418 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -46,6 +46,7 @@
#include "xfs_log_recover.h"
#include "xfs_inode_item.h"
#include "xfs_bmap_btree.h"
+#include "xfs_filestream.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 65d8c793a25c..152f82782630 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -538,6 +538,64 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
+DECLARE_EVENT_CLASS(xfs_filestream_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
+ TP_ARGS(ip, agno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(int, streams)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->agno = agno;
+ __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno %u streams %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->streams)
+)
+#define DEFINE_FILESTREAM_EVENT(name) \
+DEFINE_EVENT(xfs_filestream_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \
+ TP_ARGS(ip, agno))
+DEFINE_FILESTREAM_EVENT(xfs_filestream_free);
+DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
+DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
+
+TRACE_EVENT(xfs_filestream_pick,
+ TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno,
+ xfs_extlen_t free, int nscan),
+ TP_ARGS(ip, agno, free, nscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(int, streams)
+ __field(xfs_extlen_t, free)
+ __field(int, nscan)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->agno = agno;
+ __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+ __entry->free = free;
+ __entry->nscan = nscan;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->streams,
+ __entry->free,
+ __entry->nscan)
+);
+
DECLARE_EVENT_CLASS(xfs_lock_class,
TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
unsigned long caller_ip),
@@ -1060,7 +1118,6 @@ DEFINE_RW_EVENT(xfs_file_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_splice_read);
-DEFINE_RW_EVENT(xfs_file_splice_write);
DECLARE_EVENT_CLASS(xfs_page_class,
TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 54a57326d85b..d03932564ccb 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -827,7 +827,7 @@ xfs_trans_committed_bulk(
xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn);
spin_lock(&ailp->xa_lock);
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
}
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index a7287354e535..cb0f3a84cc68 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -173,7 +173,6 @@ xfs_trans_ail_cursor_next(
*/
void
xfs_trans_ail_cursor_done(
- struct xfs_ail *ailp,
struct xfs_ail_cursor *cur)
{
cur->item = NULL;
@@ -368,7 +367,7 @@ xfsaild_push(
* If the AIL is empty or our push has reached the end we are
* done now.
*/
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
goto out_done;
}
@@ -453,7 +452,7 @@ xfsaild_push(
break;
lsn = lip->li_lsn;
}
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 12e86af9d9b9..bd1281862ad7 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -133,8 +133,7 @@ struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp,
xfs_lsn_t lsn);
struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
struct xfs_ail_cursor *cur);
-void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
- struct xfs_ail_cursor *cur);
+void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur);
#if BITS_PER_LONG != 64
static inline void
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index ae368165244d..f2bda7c76b8a 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -26,6 +26,7 @@
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_bmap_btree.h"
#include "xfs_ialloc.h"
@@ -106,6 +107,47 @@ xfs_calc_inode_res(
}
/*
+ * The free inode btree is a conditional feature and the log reservation
+ * requirements differ slightly from that of the traditional inode allocation
+ * btree. The finobt tracks records for inode chunks with at least one free
+ * inode. A record can be removed from the tree for an inode allocation
+ * or free and thus the finobt reservation is unconditional across:
+ *
+ * - inode allocation
+ * - inode free
+ * - inode chunk allocation
+ *
+ * The 'modify' param indicates to include the record modification scenario. The
+ * 'alloc' param indicates to include the reservation for free space btree
+ * modifications on behalf of finobt modifications. This is required only for
+ * transactions that do not already account for free space btree modifications.
+ *
+ * the free inode btree: max depth * block size
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the free inode btree entry: block size
+ */
+STATIC uint
+xfs_calc_finobt_res(
+ struct xfs_mount *mp,
+ int alloc,
+ int modify)
+{
+ uint res;
+
+ if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ return 0;
+
+ res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
+ if (alloc)
+ res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+ if (modify)
+ res += (uint)XFS_FSB_TO_B(mp, 1);
+
+ return res;
+}
+
+/*
* Various log reservation values.
*
* These are based on the size of the file system block because that is what
@@ -302,6 +344,7 @@ xfs_calc_remove_reservation(
* the superblock for the nlink flag: sector size
* the directory btree: (max depth + v2) * dir block size
* the directory inode's bmap btree: (max depth + v2) * block size
+ * the finobt (record modification and allocation btrees)
*/
STATIC uint
xfs_calc_create_resv_modify(
@@ -310,7 +353,8 @@ xfs_calc_create_resv_modify(
return xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
(uint)XFS_FSB_TO_B(mp, 1) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp, 1, 1);
}
/*
@@ -348,6 +392,7 @@ __xfs_calc_create_reservation(
* the superblock for the nlink flag: sector size
* the inode btree: max depth * blocksize
* the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the finobt (record insertion)
*/
STATIC uint
xfs_calc_icreate_resv_alloc(
@@ -357,7 +402,8 @@ xfs_calc_icreate_resv_alloc(
mp->m_sb.sb_sectsize +
xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
- XFS_FSB_TO_B(mp, 1));
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp, 0, 0);
}
STATIC uint
@@ -425,6 +471,7 @@ xfs_calc_symlink_reservation(
* the on disk inode before ours in the agi hash list: inode cluster size
* the inode btree: max depth * blocksize
* the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the finobt (record insertion, removal or modification)
*/
STATIC uint
xfs_calc_ifree_reservation(
@@ -439,7 +486,8 @@ xfs_calc_ifree_reservation(
xfs_calc_buf_res(2 + mp->m_ialloc_blks +
mp->m_in_maxlevels, 0) +
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
- XFS_FSB_TO_B(mp, 1));
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp, 0, 1);
}
/*
@@ -562,7 +610,7 @@ xfs_calc_addafork_reservation(
return XFS_DQUOT_LOGRES(mp) +
xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(1, mp->m_dirblksize) +
+ xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index af5dbe06cb65..bf9c4579334d 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -28,7 +28,8 @@
(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
XFS_EXTENTADD_SPACE_RES(mp,w))
-#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
+/* Data fork: the directory geometry's fsbcount; any other fork: 1. */
+#define XFS_DAENTER_1B(mp,w) \
+ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
#define XFS_DAENTER_DBS(mp,w) \
(XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
#define XFS_DAENTER_BLOCKS(mp,w) \
@@ -47,13 +48,15 @@
#define XFS_DIRREMOVE_SPACE_RES(mp) \
XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
+/*
+ * Inode allocation space reservation: m_ialloc_blks plus
+ * (m_in_maxlevels - 1) blocks, counted twice when the superblock reports
+ * the finobt feature and once otherwise.
+ *
+ * The conditional must be parenthesized: '*' binds tighter than '?:', so an
+ * unparenthesized "c ? 2 : 1 * x" evaluates to 2 when c is true and to x
+ * otherwise, instead of scaling x by 2 or 1.
+ */
#define XFS_IALLOC_SPACE_RES(mp) \
- ((mp)->m_ialloc_blks + (mp)->m_in_maxlevels - 1)
+ ((mp)->m_ialloc_blks + \
+ ((xfs_sb_version_hasfinobt(&(mp)->m_sb) ? 2 : 1) * \
+ ((mp)->m_in_maxlevels - 1)))
/*
* Space reservation values for various transactions.
*/
+/* Directory geometry fsbcount plus XFS_DAENTER_BMAP1B() for the data fork. */
#define XFS_ADDAFORK_SPACE_RES(mp) \
- ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
+ ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
#define XFS_ATTRRM_SPACE_RES(mp) \
XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
/* This macro is not used - see inline code in xfs_attr_set */
@@ -82,5 +85,8 @@
(XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
+/*
+ * Space reservation for XFS_IFREE (inode free, going by the macro name):
+ * m_in_maxlevels blocks when the superblock reports the finobt feature,
+ * otherwise zero. Every use of the macro argument is parenthesized so that
+ * any pointer-valued expression can be passed as mp.
+ */
+#define XFS_IFREE_SPACE_RES(mp) \
+ (xfs_sb_version_hasfinobt(&(mp)->m_sb) ? (mp)->m_in_maxlevels : 0)
+
#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 82bbc34d54a3..65c6e6650b1a 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -134,7 +134,7 @@ typedef enum {
typedef enum {
XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
- XFS_BTNUM_MAX
+ /* FINOi: presumably the free inode btree (finobt); XFS_BTNUM_MAX stays last. */
+ XFS_BTNUM_FINOi, XFS_BTNUM_MAX
} xfs_btnum_t;
struct xfs_name {