summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_file.c2
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/affs.h20
-rw-r--r--fs/affs/amigaffs.c23
-rw-r--r--fs/affs/dir.c28
-rw-r--r--fs/affs/namei.c32
-rw-r--r--fs/affs/super.c8
-rw-r--r--fs/aio.c120
-rw-r--r--fs/autofs4/dev-ioctl.c3
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/bio-integrity.c22
-rw-r--r--fs/bio.c10
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/btrfs/async-thread.c2
-rw-r--r--fs/btrfs/backref.c33
-rw-r--r--fs/btrfs/ctree.c94
-rw-r--r--fs/btrfs/ctree.h13
-rw-r--r--fs/btrfs/disk-io.c23
-rw-r--r--fs/btrfs/extent-tree.c35
-rw-r--r--fs/btrfs/extent_io.c8
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/file.c22
-rw-r--r--fs/btrfs/inode-map.c14
-rw-r--r--fs/btrfs/inode.c36
-rw-r--r--fs/btrfs/ioctl.c35
-rw-r--r--fs/btrfs/relocation.c21
-rw-r--r--fs/btrfs/scrub.c108
-rw-r--r--fs/btrfs/send.c117
-rw-r--r--fs/btrfs/super.c22
-rw-r--r--fs/btrfs/transaction.c48
-rw-r--r--fs/btrfs/transaction.h3
-rw-r--r--fs/btrfs/volumes.c35
-rw-r--r--fs/buffer.c6
-rw-r--r--fs/cachefiles/bind.c1
-rw-r--r--fs/cachefiles/namei.c3
-rw-r--r--fs/ceph/cache.c1
-rw-r--r--fs/ceph/cache.h10
-rw-r--r--fs/ceph/caps.c9
-rw-r--r--fs/ceph/debugfs.c5
-rw-r--r--fs/ceph/dir.c53
-rw-r--r--fs/ceph/export.c267
-rw-r--r--fs/ceph/file.c20
-rw-r--r--fs/ceph/inode.c76
-rw-r--r--fs/ceph/ioctl.c8
-rw-r--r--fs/ceph/locks.c98
-rw-r--r--fs/ceph/mds_client.c97
-rw-r--r--fs/ceph/mds_client.h4
-rw-r--r--fs/ceph/strings.c1
-rw-r--r--fs/ceph/super.c1
-rw-r--r--fs/ceph/super.h3
-rw-r--r--fs/ceph/xattr.c48
-rw-r--r--fs/cifs/cifsfs.c15
-rw-r--r--fs/cifs/cifsglob.h8
-rw-r--r--fs/cifs/cifsproto.h3
-rw-r--r--fs/cifs/cifssmb.c3
-rw-r--r--fs/cifs/file.c162
-rw-r--r--fs/cifs/misc.c74
-rw-r--r--fs/cifs/smb1ops.c11
-rw-r--r--fs/cifs/smb2misc.c18
-rw-r--r--fs/cifs/smb2ops.c14
-rw-r--r--fs/cifs/smb2pdu.c2
-rw-r--r--fs/coredump.c7
-rw-r--r--fs/dcache.c1
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/exec.c28
-rw-r--r--fs/exofs/ore_raid.c4
-rw-r--r--fs/exofs/super.c2
-rw-r--r--fs/ext2/acl.c1
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/xattr_security.c4
-rw-r--r--fs/ext3/balloc.c5
-rw-r--r--fs/ext3/dir.c2
-rw-r--r--fs/ext3/ialloc.c2
-rw-r--r--fs/ext3/inode.c86
-rw-r--r--fs/ext3/super.c2
-rw-r--r--fs/ext3/xattr_security.c5
-rw-r--r--fs/ext4/file.c3
-rw-r--r--fs/f2fs/acl.c8
-rw-r--r--fs/f2fs/checkpoint.c208
-rw-r--r--fs/f2fs/data.c106
-rw-r--r--fs/f2fs/debug.c12
-rw-r--r--fs/f2fs/dir.c85
-rw-r--r--fs/f2fs/f2fs.h105
-rw-r--r--fs/f2fs/file.c32
-rw-r--r--fs/f2fs/gc.c16
-rw-r--r--fs/f2fs/inline.c4
-rw-r--r--fs/f2fs/inode.c27
-rw-r--r--fs/f2fs/namei.c9
-rw-r--r--fs/f2fs/node.c334
-rw-r--r--fs/f2fs/node.h25
-rw-r--r--fs/f2fs/recovery.c37
-rw-r--r--fs/f2fs/segment.c222
-rw-r--r--fs/f2fs/segment.h75
-rw-r--r--fs/f2fs/super.c97
-rw-r--r--fs/f2fs/xattr.c7
-rw-r--r--fs/file.c11
-rw-r--r--fs/file_table.c43
-rw-r--r--fs/fuse/cuse.c4
-rw-r--r--fs/fuse/dev.c14
-rw-r--r--fs/fuse/file.c6
-rw-r--r--fs/gfs2/file.c1
-rw-r--r--fs/isofs/inode.c2
-rw-r--r--fs/jffs2/compr_rtime.c4
-rw-r--r--fs/jffs2/fs.c9
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/nodemgmt.c14
-rw-r--r--fs/kernfs/inode.c14
-rw-r--r--fs/lockd/svc.c1
-rw-r--r--fs/mount.h5
-rw-r--r--fs/namei.c67
-rw-r--r--fs/namespace.c56
-rw-r--r--fs/ncpfs/dir.c69
-rw-r--r--fs/ncpfs/file.c24
-rw-r--r--fs/ncpfs/getopt.c12
-rw-r--r--fs/ncpfs/inode.c82
-rw-r--r--fs/ncpfs/ioctl.c17
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/ncpfs/ncp_fs.h30
-rw-r--r--fs/ncpfs/ncp_fs_sb.h6
-rw-r--r--fs/ncpfs/ncplib_kernel.c28
-rw-r--r--fs/ncpfs/sock.c53
-rw-r--r--fs/ncpfs/symlink.c2
-rw-r--r--fs/nfs/callback_proc.c19
-rw-r--r--fs/nfs/dir.c62
-rw-r--r--fs/nfs/file.c1
-rw-r--r--fs/nfs/inode.c34
-rw-r--r--fs/nfs/internal.h8
-rw-r--r--fs/nfs/nfs3proc.c36
-rw-r--r--fs/nfs/nfs4_fs.h11
-rw-r--r--fs/nfs/nfs4client.c7
-rw-r--r--fs/nfs/nfs4proc.c197
-rw-r--r--fs/nfs/nfs4state.c6
-rw-r--r--fs/nfs/nfs4xdr.c3
-rw-r--r--fs/nfs/pnfs.c17
-rw-r--r--fs/nfs/proc.c25
-rw-r--r--fs/nfs/unlink.c35
-rw-r--r--fs/nfsd/acl.h10
-rw-r--r--fs/nfsd/nfs4acl.c13
-rw-r--r--fs/nfsd/nfs4callback.c19
-rw-r--r--fs/nfsd/nfs4proc.c39
-rw-r--r--fs/nfsd/nfs4state.c28
-rw-r--r--fs/nfsd/nfs4xdr.c30
-rw-r--r--fs/nfsd/nfsctl.c5
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfsfh.h14
-rw-r--r--fs/nfsd/nfsxdr.c2
-rw-r--r--fs/nfsd/vfs.c15
-rw-r--r--fs/nfsd/xdr4.h2
-rw-r--r--fs/nilfs2/file.c1
-rw-r--r--fs/ntfs/debug.c58
-rw-r--r--fs/ntfs/debug.h7
-rw-r--r--fs/ntfs/inode.c2
-rw-r--r--fs/ntfs/super.c28
-rw-r--r--fs/ocfs2/cluster/sys.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c64
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h2
-rw-r--r--fs/ocfs2/file.c9
-rw-r--r--fs/ocfs2/stackglue.c8
-rw-r--r--fs/open.c68
-rw-r--r--fs/pipe.c133
-rw-r--r--fs/pnode.c198
-rw-r--r--fs/pnode.h3
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/base.c55
-rw-r--r--fs/proc/fd.c6
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/meminfo.c2
-rw-r--r--fs/proc/namespaces.c14
-rw-r--r--fs/proc/self.c2
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/proc/vmcore.c3
-rw-r--r--fs/proc_namespace.c1
-rw-r--r--fs/quota/Kconfig7
-rw-r--r--fs/reiserfs/dir.c6
-rw-r--r--fs/splice.c126
-rw-r--r--fs/super.c5
-rw-r--r--fs/sysfs/file.c92
-rw-r--r--fs/ubifs/file.c1
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/super.c8
-rw-r--r--fs/ufs/balloc.c12
-rw-r--r--fs/ufs/ialloc.c4
-rw-r--r--fs/ufs/super.c8
-rw-r--r--fs/xfs/xfs_aops.c51
-rw-r--r--fs/xfs/xfs_bmap.c17
-rw-r--r--fs/xfs/xfs_bmap_util.c13
-rw-r--r--fs/xfs/xfs_buf.c16
-rw-r--r--fs/xfs/xfs_file.c16
-rw-r--r--fs/xfs/xfs_inode.c5
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_ioctl.c28
-rw-r--r--fs/xfs/xfs_iops.c20
-rw-r--r--fs/xfs/xfs_log.c53
-rw-r--r--fs/xfs/xfs_trace.h1
196 files changed, 3269 insertions, 2653 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a16b0ff497ca..d8223209d4b1 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -832,6 +832,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
static const struct vm_operations_struct v9fs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = v9fs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -839,6 +840,7 @@ static const struct vm_operations_struct v9fs_file_vm_ops = {
static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
.close = v9fs_mmap_vm_close,
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = v9fs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 952aeb048349..9852bdf34d76 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -266,7 +266,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
sizeof(struct adfs_inode_info),
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 3952121f2f28..25b23b1e7f22 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -5,14 +5,6 @@
#include <linux/mutex.h>
#include <linux/workqueue.h>
-/* AmigaOS allows file names with up to 30 characters length.
- * Names longer than that will be silently truncated. If you
- * want to disallow this, comment out the following #define.
- * Creating filesystem objects with longer names will then
- * result in an error (ENAMETOOLONG).
- */
-/*#define AFFS_NO_TRUNCATE */
-
/* Ugly macros make the code more pretty. */
#define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st))))
@@ -28,7 +20,6 @@
#define AFFS_CACHE_SIZE PAGE_SIZE
-#define AFFS_MAX_PREALLOC 32
#define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2)
#define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
#define AFFS_AC_MASK (AFFS_AC_SIZE-1)
@@ -118,6 +109,7 @@ struct affs_sb_info {
#define SF_OFS 0x0200 /* Old filesystem */
#define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */
#define SF_VERBOSE 0x0800 /* Talk about fs when mounting */
+#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */
/* short cut to get to the affs specific sb data */
static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
@@ -137,9 +129,13 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
extern umode_t prot_to_mode(u32 prot);
extern void mode_to_prot(struct inode *inode);
-extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...);
-extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...);
-extern int affs_check_name(const unsigned char *name, int len);
+extern void affs_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
+extern void affs_warning(struct super_block *sb, const char *function,
+ const char *fmt, ...);
+extern bool affs_nofilenametruncate(const struct dentry *dentry);
+extern int affs_check_name(const unsigned char *name, int len,
+ bool notruncate);
extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry);
/* bitmap. c */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index d9a43674cb94..533a322c41c0 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -471,20 +471,27 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
function,ErrorBuffer);
}
+bool
+affs_nofilenametruncate(const struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE;
+
+}
+
/* Check if the name is valid for a affs object. */
int
-affs_check_name(const unsigned char *name, int len)
+affs_check_name(const unsigned char *name, int len, bool notruncate)
{
int i;
- if (len > 30)
-#ifdef AFFS_NO_TRUNCATE
- return -ENAMETOOLONG;
-#else
- len = 30;
-#endif
-
+ if (len > 30) {
+ if (notruncate)
+ return -ENAMETOOLONG;
+ else
+ len = 30;
+ }
for (i = 0; i < len; i++) {
if (name[i] < ' ' || name[i] == ':'
|| (name[i] > 0x7e && name[i] < 0xa0))
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index f1eba8c3644e..cbbda476a805 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -52,8 +52,10 @@ affs_readdir(struct file *file, struct dir_context *ctx)
int hash_pos;
int chain_pos;
u32 ino;
+ int error = 0;
- pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);
+ pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",
+ inode->i_ino, (unsigned long)ctx->pos);
if (ctx->pos < 2) {
file->private_data = (void *)0;
@@ -72,7 +74,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
}
dir_bh = affs_bread(sb, inode->i_ino);
if (!dir_bh)
- goto readdir_out;
+ goto out_unlock_dir;
/* If the directory hasn't changed since the last call to readdir(),
* we can jump directly to where we left off.
@@ -88,7 +90,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
fh_bh = affs_bread(sb, ino);
if (!fh_bh) {
affs_error(sb, "readdir","Cannot read block %d", i);
- return -EIO;
+ error = -EIO;
+ goto out_brelse_dir;
}
ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
affs_brelse(fh_bh);
@@ -107,29 +110,34 @@ inside:
do {
fh_bh = affs_bread(sb, ino);
if (!fh_bh) {
- affs_error(sb, "readdir","Cannot read block %d", ino);
+ affs_error(sb, "readdir",
+ "Cannot read block %d", ino);
break;
}
namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
name = AFFS_TAIL(sb, fh_bh)->name + 1;
- pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
+ pr_debug("AFFS: readdir(): dir_emit(\"%.*s\", "
+ "ino=%u), hash=%d, f_pos=%x\n",
namelen, name, ino, hash_pos, (u32)ctx->pos);
+
if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
- goto readdir_done;
+ goto done;
ctx->pos++;
ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
affs_brelse(fh_bh);
fh_bh = NULL;
} while (ino);
}
-readdir_done:
+done:
file->f_version = inode->i_version;
file->private_data = (void *)(long)ino;
+ affs_brelse(fh_bh);
-readdir_out:
+out_brelse_dir:
affs_brelse(dir_bh);
- affs_brelse(fh_bh);
+
+out_unlock_dir:
affs_unlock_dir(inode);
- return 0;
+ return error;
}
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index c36cbb4537a2..6dae1ccd176d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -60,13 +60,13 @@ affs_get_toupper(struct super_block *sb)
* Note: the dentry argument is the parent dentry.
*/
static inline int
-__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
{
const u8 *name = qstr->name;
unsigned long hash;
int i;
- i = affs_check_name(qstr->name, qstr->len);
+ i = affs_check_name(qstr->name, qstr->len, notruncate);
if (i)
return i;
@@ -82,16 +82,22 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
static int
affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
- return __affs_hash_dentry(qstr, affs_toupper);
+ return __affs_hash_dentry(qstr, affs_toupper,
+ affs_nofilenametruncate(dentry));
+
}
+
static int
affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
- return __affs_hash_dentry(qstr, affs_intl_toupper);
+ return __affs_hash_dentry(qstr, affs_intl_toupper,
+ affs_nofilenametruncate(dentry));
+
}
static inline int __affs_compare_dentry(unsigned int len,
- const char *str, const struct qstr *name, toupper_t toupper)
+ const char *str, const struct qstr *name, toupper_t toupper,
+ bool notruncate)
{
const u8 *aname = str;
const u8 *bname = name->name;
@@ -101,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len,
* must be valid. 'name' must be validated first.
*/
- if (affs_check_name(name->name, name->len))
+ if (affs_check_name(name->name, name->len, notruncate))
return 1;
/*
@@ -126,13 +132,18 @@ static int
affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- return __affs_compare_dentry(len, str, name, affs_toupper);
+
+ return __affs_compare_dentry(len, str, name, affs_toupper,
+ affs_nofilenametruncate(parent));
}
+
static int
affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- return __affs_compare_dentry(len, str, name, affs_intl_toupper);
+ return __affs_compare_dentry(len, str, name, affs_intl_toupper,
+ affs_nofilenametruncate(parent));
+
}
/*
@@ -411,7 +422,10 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
(u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
(u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
- retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len);
+ retval = affs_check_name(new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ affs_nofilenametruncate(old_dentry));
+
if (retval)
return retval;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 307453086c3f..6d589f28bf9b 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -128,7 +128,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
affs_inode_cachep = kmem_cache_create("affs_inode_cache",
sizeof(struct affs_inode_info),
@@ -163,7 +163,7 @@ static const struct super_operations affs_sops = {
};
enum {
- Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect,
+ Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
};
@@ -172,6 +172,7 @@ static const match_table_t tokens = {
{Opt_bs, "bs=%u"},
{Opt_mode, "mode=%o"},
{Opt_mufs, "mufs"},
+ {Opt_notruncate, "nofilenametruncate"},
{Opt_prefix, "prefix=%s"},
{Opt_protect, "protect"},
{Opt_reserved, "reserved=%u"},
@@ -233,6 +234,9 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
case Opt_mufs:
*mount_opts |= SF_MUFS;
break;
+ case Opt_notruncate:
+ *mount_opts |= SF_NO_TRUNCATE;
+ break;
case Opt_prefix:
*prefix = match_strdup(&args[0]);
if (!*prefix)
diff --git a/fs/aio.c b/fs/aio.c
index 062a5f6a1448..12a3de0ee6da 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -52,7 +52,8 @@
struct aio_ring {
unsigned id; /* kernel internal index number */
unsigned nr; /* number of io_events */
- unsigned head;
+ unsigned head; /* Written to by userland or under ring_lock
+ * mutex by aio_read_events_ring(). */
unsigned tail;
unsigned magic;
@@ -243,6 +244,11 @@ static void aio_free_ring(struct kioctx *ctx)
{
int i;
+ /* Disconnect the kiotx from the ring file. This prevents future
+ * accesses to the kioctx from page migration.
+ */
+ put_aio_ring_file(ctx);
+
for (i = 0; i < ctx->nr_pages; i++) {
struct page *page;
pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
@@ -254,8 +260,6 @@ static void aio_free_ring(struct kioctx *ctx)
put_page(page);
}
- put_aio_ring_file(ctx);
-
if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
kfree(ctx->ring_pages);
ctx->ring_pages = NULL;
@@ -283,29 +287,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
{
struct kioctx *ctx;
unsigned long flags;
+ pgoff_t idx;
int rc;
rc = 0;
- /* Make sure the old page hasn't already been changed */
+ /* mapping->private_lock here protects against the kioctx teardown. */
spin_lock(&mapping->private_lock);
ctx = mapping->private_data;
- if (ctx) {
- pgoff_t idx;
- spin_lock_irqsave(&ctx->completion_lock, flags);
- idx = old->index;
- if (idx < (pgoff_t)ctx->nr_pages) {
- if (ctx->ring_pages[idx] != old)
- rc = -EAGAIN;
- } else
- rc = -EINVAL;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ if (!ctx) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* The ring_lock mutex. The prevents aio_read_events() from writing
+ * to the ring's head, and prevents page migration from mucking in
+ * a partially initialized kiotx.
+ */
+ if (!mutex_trylock(&ctx->ring_lock)) {
+ rc = -EAGAIN;
+ goto out;
+ }
+
+ idx = old->index;
+ if (idx < (pgoff_t)ctx->nr_pages) {
+ /* Make sure the old page hasn't already been changed */
+ if (ctx->ring_pages[idx] != old)
+ rc = -EAGAIN;
} else
rc = -EINVAL;
- spin_unlock(&mapping->private_lock);
if (rc != 0)
- return rc;
+ goto out_unlock;
/* Writeback must be complete */
BUG_ON(PageWriteback(old));
@@ -314,38 +327,26 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
if (rc != MIGRATEPAGE_SUCCESS) {
put_page(new);
- return rc;
+ goto out_unlock;
}
- /* We can potentially race against kioctx teardown here. Use the
- * address_space's private data lock to protect the mapping's
- * private_data.
+ /* Take completion_lock to prevent other writes to the ring buffer
+ * while the old page is copied to the new. This prevents new
+ * events from being lost.
*/
- spin_lock(&mapping->private_lock);
- ctx = mapping->private_data;
- if (ctx) {
- pgoff_t idx;
- spin_lock_irqsave(&ctx->completion_lock, flags);
- migrate_page_copy(new, old);
- idx = old->index;
- if (idx < (pgoff_t)ctx->nr_pages) {
- /* And only do the move if things haven't changed */
- if (ctx->ring_pages[idx] == old)
- ctx->ring_pages[idx] = new;
- else
- rc = -EAGAIN;
- } else
- rc = -EINVAL;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- } else
- rc = -EBUSY;
- spin_unlock(&mapping->private_lock);
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ migrate_page_copy(new, old);
+ BUG_ON(ctx->ring_pages[idx] != old);
+ ctx->ring_pages[idx] = new;
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
- if (rc == MIGRATEPAGE_SUCCESS)
- put_page(old);
- else
- put_page(new);
+ /* The old page is no longer accessible. */
+ put_page(old);
+out_unlock:
+ mutex_unlock(&ctx->ring_lock);
+out:
+ spin_unlock(&mapping->private_lock);
return rc;
}
#endif
@@ -380,7 +381,7 @@ static int aio_setup_ring(struct kioctx *ctx)
file = aio_private_file(ctx, nr_pages);
if (IS_ERR(file)) {
ctx->aio_ring_file = NULL;
- return -EAGAIN;
+ return -ENOMEM;
}
ctx->aio_ring_file = file;
@@ -415,7 +416,7 @@ static int aio_setup_ring(struct kioctx *ctx)
if (unlikely(i != nr_pages)) {
aio_free_ring(ctx);
- return -EAGAIN;
+ return -ENOMEM;
}
ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -429,7 +430,7 @@ static int aio_setup_ring(struct kioctx *ctx)
if (IS_ERR((void *)ctx->mmap_base)) {
ctx->mmap_size = 0;
aio_free_ring(ctx);
- return -EAGAIN;
+ return -ENOMEM;
}
pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
@@ -556,6 +557,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
rcu_read_unlock();
spin_unlock(&mm->ioctx_lock);
+ /* While kioctx setup is in progress,
+ * we are protected from page migration
+ * changes ring_pages by ->ring_lock.
+ */
ring = kmap_atomic(ctx->ring_pages[0]);
ring->id = ctx->id;
kunmap_atomic(ring);
@@ -640,24 +645,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
ctx->max_reqs = nr_events;
- if (percpu_ref_init(&ctx->users, free_ioctx_users))
- goto err;
-
- if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
- goto err;
-
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->completion_lock);
mutex_init(&ctx->ring_lock);
+ /* Protect against page migration throughout kiotx setup by keeping
+ * the ring_lock mutex held until setup is complete. */
+ mutex_lock(&ctx->ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
+ if (percpu_ref_init(&ctx->users, free_ioctx_users))
+ goto err;
+
+ if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+ goto err;
+
ctx->cpu = alloc_percpu(struct kioctx_cpu);
if (!ctx->cpu)
goto err;
- if (aio_setup_ring(ctx) < 0)
+ err = aio_setup_ring(ctx);
+ if (err < 0)
goto err;
atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
@@ -683,6 +692,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
if (err)
goto err_cleanup;
+ /* Release the ring_lock mutex now that all setup is complete. */
+ mutex_unlock(&ctx->ring_lock);
+
pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
ctx, ctx->user_id, mm, ctx->nr_events);
return ctx;
@@ -692,6 +704,7 @@ err_cleanup:
err_ctx:
aio_free_ring(ctx);
err:
+ mutex_unlock(&ctx->ring_lock);
free_percpu(ctx->cpu);
free_percpu(ctx->reqs.pcpu_count);
free_percpu(ctx->users.pcpu_count);
@@ -1024,6 +1037,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
mutex_lock(&ctx->ring_lock);
+ /* Access to ->ring_pages here is protected by ctx->ring_lock. */
ring = kmap_atomic(ctx->ring_pages[0]);
head = ring->head;
tail = ring->tail;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 3182c0e68b42..232e03d4780d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -103,6 +103,9 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
if (tmp.size < sizeof(tmp))
return ERR_PTR(-EINVAL);
+ if (tmp.size > (PATH_MAX + sizeof(tmp)))
+ return ERR_PTR(-ENAMETOOLONG);
+
return memdup_user(in, tmp.size);
}
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 29aa5cf6639b..7041ac35ace8 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -266,7 +266,7 @@ static void init_once(void *foo)
inode_init_once(&bi->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
sizeof(struct bfs_inode_info),
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0f59799fa105..aa3cb626671e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -584,7 +584,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
unsigned long start_code, end_code, start_data, end_data;
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
- unsigned long def_flags = 0;
struct pt_regs *regs = current_pt_regs();
struct {
struct elfhdr elf_ex;
@@ -724,9 +723,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
if (retval)
goto out_free_dentry;
- /* OK, This is the point of no return */
- current->mm->def_flags = def_flags;
-
/* Do this immediately, since STACK_TOP as used in setup_arg_pages
may depend on the personality. */
SET_PERSONALITY(loc->elf_ex);
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 29696b78d1f4..1c2ce0c87711 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -182,6 +182,9 @@ static int bdev_integrity_enabled(struct block_device *bdev, int rw)
*/
int bio_integrity_enabled(struct bio *bio)
{
+ if (!bio_is_rw(bio))
+ return 0;
+
/* Already protected? */
if (bio_integrity(bio))
return 0;
@@ -309,10 +312,9 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate)
{
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
struct blk_integrity_exchg bix;
- struct bio_vec bv;
- struct bvec_iter iter;
+ struct bio_vec *bv;
sector_t sector;
- unsigned int sectors, ret = 0;
+ unsigned int sectors, ret = 0, i;
void *prot_buf = bio->bi_integrity->bip_buf;
if (operate)
@@ -323,16 +325,16 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate)
bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
bix.sector_size = bi->sector_size;
- bio_for_each_segment(bv, bio, iter) {
- void *kaddr = kmap_atomic(bv.bv_page);
- bix.data_buf = kaddr + bv.bv_offset;
- bix.data_size = bv.bv_len;
+ bio_for_each_segment_all(bv, bio, i) {
+ void *kaddr = kmap_atomic(bv->bv_page);
+ bix.data_buf = kaddr + bv->bv_offset;
+ bix.data_size = bv->bv_len;
bix.prot_buf = prot_buf;
bix.sector = sector;
- if (operate) {
+ if (operate)
bi->generate_fn(&bix);
- } else {
+ else {
ret = bi->verify_fn(&bix);
if (ret) {
kunmap_atomic(kaddr);
@@ -340,7 +342,7 @@ static int bio_integrity_generate_verify(struct bio *bio, int operate)
}
}
- sectors = bv.bv_len / bi->sector_size;
+ sectors = bv->bv_len / bi->sector_size;
sector += sectors;
prot_buf += sectors * bi->tuple_size;
diff --git a/fs/bio.c b/fs/bio.c
index b1bc722b89aa..6f0362b77806 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1002,7 +1002,7 @@ struct bio_map_data {
};
static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
- struct sg_iovec *iov, int iov_count,
+ const struct sg_iovec *iov, int iov_count,
int is_our_pages)
{
memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
@@ -1022,7 +1022,7 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs,
sizeof(struct sg_iovec) * iov_count, gfp_mask);
}
-static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
+static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
int to_user, int from_user, int do_free_page)
{
int ret = 0, i;
@@ -1120,7 +1120,7 @@ EXPORT_SYMBOL(bio_uncopy_user);
*/
struct bio *bio_copy_user_iov(struct request_queue *q,
struct rq_map_data *map_data,
- struct sg_iovec *iov, int iov_count,
+ const struct sg_iovec *iov, int iov_count,
int write_to_vm, gfp_t gfp_mask)
{
struct bio_map_data *bmd;
@@ -1259,7 +1259,7 @@ EXPORT_SYMBOL(bio_copy_user);
static struct bio *__bio_map_user_iov(struct request_queue *q,
struct block_device *bdev,
- struct sg_iovec *iov, int iov_count,
+ const struct sg_iovec *iov, int iov_count,
int write_to_vm, gfp_t gfp_mask)
{
int i, j;
@@ -1407,7 +1407,7 @@ EXPORT_SYMBOL(bio_map_user);
* device. Returns an error pointer in case of error.
*/
struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
- struct sg_iovec *iov, int iov_count,
+ const struct sg_iovec *iov, int iov_count,
int write_to_vm, gfp_t gfp_mask)
{
struct bio *bio;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ba0d2b05bb78..552a8d13bc32 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1518,7 +1518,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
BUG_ON(iocb->ki_pos != pos);
blk_start_plug(&plug);
- ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ ret = __generic_file_aio_write(iocb, iov, nr_segs);
if (ret > 0) {
ssize_t err;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index ecb5832c0967..5a201d81049c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -323,6 +323,8 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
{
+ if (!wq)
+ return;
wq->normal->max_active = max;
if (wq->high)
wq->high->max_active = max;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aad7201ad11b..10db21fa0926 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -330,7 +330,10 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
- root_level = btrfs_old_root_level(root, time_seq);
+ if (path->search_commit_root)
+ root_level = btrfs_header_level(root->commit_root);
+ else
+ root_level = btrfs_old_root_level(root, time_seq);
if (root_level + 1 == level) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -1099,9 +1102,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
*
* returns 0 on success, < 0 on error.
*/
-int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots)
+static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
{
struct ulist *tmp;
struct ulist_node *node = NULL;
@@ -1137,6 +1140,20 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return 0;
}
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
+{
+ int ret;
+
+ if (!trans)
+ down_read(&fs_info->commit_root_sem);
+ ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+ return ret;
+}
+
/*
* this makes the path point to (inum INODE_ITEM ioff)
*/
@@ -1516,6 +1533,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
if (IS_ERR(trans))
return PTR_ERR(trans);
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ } else {
+ down_read(&fs_info->commit_root_sem);
}
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
@@ -1526,8 +1545,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
- ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
- tree_mod_seq_elem.seq, &roots);
+ ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
+ tree_mod_seq_elem.seq, &roots);
if (ret)
break;
ULIST_ITER_INIT(&root_uiter);
@@ -1549,6 +1568,8 @@ out:
if (!search_commit_root) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
btrfs_end_transaction(trans, fs_info->extent_root);
+ } else {
+ up_read(&fs_info->commit_root_sem);
}
return ret;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 88d1b1eedc9c..1bcfcdb23cf4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2769,9 +2769,13 @@ again:
* the commit roots are read only
* so we always do read locks
*/
+ if (p->need_commit_sem)
+ down_read(&root->fs_info->commit_root_sem);
b = root->commit_root;
extent_buffer_get(b);
level = btrfs_header_level(b);
+ if (p->need_commit_sem)
+ up_read(&root->fs_info->commit_root_sem);
if (!p->skip_locking)
btrfs_tree_read_lock(b);
} else {
@@ -5360,7 +5364,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
{
int ret;
int cmp;
- struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *left_path = NULL;
struct btrfs_path *right_path = NULL;
struct btrfs_key left_key;
@@ -5378,9 +5381,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
u64 right_blockptr;
u64 left_gen;
u64 right_gen;
- u64 left_start_ctransid;
- u64 right_start_ctransid;
- u64 ctransid;
left_path = btrfs_alloc_path();
if (!left_path) {
@@ -5404,21 +5404,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_path->search_commit_root = 1;
right_path->skip_locking = 1;
- spin_lock(&left_root->root_item_lock);
- left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
- spin_unlock(&left_root->root_item_lock);
-
- spin_lock(&right_root->root_item_lock);
- right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
- spin_unlock(&right_root->root_item_lock);
-
- trans = btrfs_join_transaction(left_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
/*
* Strategy: Go to the first items of both trees. Then do
*
@@ -5455,6 +5440,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
* the right if possible or go up and right.
*/
+ down_read(&left_root->fs_info->commit_root_sem);
left_level = btrfs_header_level(left_root->commit_root);
left_root_level = left_level;
left_path->nodes[left_level] = left_root->commit_root;
@@ -5464,6 +5450,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_root_level = right_level;
right_path->nodes[right_level] = right_root->commit_root;
extent_buffer_get(right_path->nodes[right_level]);
+ up_read(&left_root->fs_info->commit_root_sem);
if (left_level == 0)
btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -5482,67 +5469,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
advance_left = advance_right = 0;
while (1) {
- /*
- * We need to make sure the transaction does not get committed
- * while we do anything on commit roots. This means, we need to
- * join and leave transactions for every item that we process.
- */
- if (trans && btrfs_should_end_transaction(trans, left_root)) {
- btrfs_release_path(left_path);
- btrfs_release_path(right_path);
-
- ret = btrfs_end_transaction(trans, left_root);
- trans = NULL;
- if (ret < 0)
- goto out;
- }
- /* now rejoin the transaction */
- if (!trans) {
- trans = btrfs_join_transaction(left_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
- spin_lock(&left_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&left_root->root_item);
- spin_unlock(&left_root->root_item_lock);
- if (ctransid != left_start_ctransid)
- left_start_ctransid = 0;
-
- spin_lock(&right_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&right_root->root_item);
- spin_unlock(&right_root->root_item_lock);
- if (ctransid != right_start_ctransid)
- right_start_ctransid = 0;
-
- if (!left_start_ctransid || !right_start_ctransid) {
- WARN(1, KERN_WARNING
- "BTRFS: btrfs_compare_tree detected "
- "a change in one of the trees while "
- "iterating. This is probably a "
- "bug.\n");
- ret = -EIO;
- goto out;
- }
-
- /*
- * the commit root may have changed, so start again
- * where we stopped
- */
- left_path->lowest_level = left_level;
- right_path->lowest_level = right_level;
- ret = btrfs_search_slot(NULL, left_root,
- &left_key, left_path, 0, 0);
- if (ret < 0)
- goto out;
- ret = btrfs_search_slot(NULL, right_root,
- &right_key, right_path, 0, 0);
- if (ret < 0)
- goto out;
- }
-
if (advance_left && !left_end_reached) {
ret = tree_advance(left_root, left_path, &left_level,
left_root_level,
@@ -5672,14 +5598,6 @@ out:
btrfs_free_path(left_path);
btrfs_free_path(right_path);
kfree(tmp_buf);
-
- if (trans) {
- if (!ret)
- ret = btrfs_end_transaction(trans, left_root);
- else
- btrfs_end_transaction(trans, left_root);
- }
-
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index bc96c03dd259..4c48df572bd6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -609,6 +609,7 @@ struct btrfs_path {
unsigned int skip_locking:1;
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
+ unsigned int need_commit_sem:1;
};
/*
@@ -986,7 +987,8 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
-#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
+ BTRFS_SPACE_INFO_GLOBAL_RSV)
enum btrfs_raid_types {
BTRFS_RAID_RAID10,
@@ -1018,6 +1020,12 @@ enum btrfs_raid_types {
*/
#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
+/*
+ * A fake block group type that is used to communicate global block reserve
+ * size to userspace via the SPACE_INFO ioctl.
+ */
+#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
+
#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
BTRFS_AVAIL_ALLOC_BIT_SINGLE)
@@ -1440,7 +1448,7 @@ struct btrfs_fs_info {
*/
struct mutex ordered_extent_flush_mutex;
- struct rw_semaphore extent_commit_sem;
+ struct rw_semaphore commit_root_sem;
struct rw_semaphore cleanup_work_sem;
@@ -1711,7 +1719,6 @@ struct btrfs_root {
struct btrfs_block_rsv *block_rsv;
/* free ino cache stuff */
- struct mutex fs_commit_mutex;
struct btrfs_free_space_ctl *free_ino_ctl;
enum btrfs_caching_type cached;
spinlock_t cache_lock;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bd0f752b797b..029d46c2e170 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -329,6 +329,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
{
struct extent_state *cached_state = NULL;
int ret;
+ bool need_lock = (current->journal_info ==
+ (void *)BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
@@ -336,6 +338,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
+ if (need_lock) {
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ }
+
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
0, &cached_state);
if (extent_buffer_uptodate(eb) &&
@@ -347,10 +354,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
"found %llu\n",
eb->start, parent_transid, btrfs_header_generation(eb));
ret = 1;
- clear_extent_buffer_uptodate(eb);
+
+ /*
+ * Things reading via commit roots that don't have normal protection,
+ * like send, can have a really old block in cache that may point at a
+ * block that has been free'd and re-allocated. So don't clear uptodate
+ * if we find an eb that is under IO (dirty/writeback) because we could
+ * end up reading in the stale data and then writing it back out and
+ * making everybody very sad.
+ */
+ if (!extent_buffer_under_io(eb))
+ clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state, GFP_NOFS);
+ btrfs_tree_read_unlock_blocking(eb);
return ret;
}
@@ -1546,7 +1564,6 @@ int btrfs_init_fs_root(struct btrfs_root *root)
root->subv_writers = writers;
btrfs_init_free_ino_ctl(root);
- mutex_init(&root->fs_commit_mutex);
spin_lock_init(&root->cache_lock);
init_waitqueue_head(&root->cache_wait);
@@ -2324,7 +2341,7 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
- init_rwsem(&fs_info->extent_commit_sem);
+ init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
sema_init(&fs_info->uuid_tree_rescan_sem, 1);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6b6a6e3e735..1306487c82cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -419,7 +419,7 @@ static noinline void caching_thread(struct btrfs_work *work)
again:
mutex_lock(&caching_ctl->mutex);
/* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->extent_commit_sem);
+ down_read(&fs_info->commit_root_sem);
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -443,10 +443,10 @@ next:
break;
if (need_resched() ||
- rwsem_is_contended(&fs_info->extent_commit_sem)) {
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
caching_ctl->progress = last;
btrfs_release_path(path);
- up_read(&fs_info->extent_commit_sem);
+ up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
cond_resched();
goto again;
@@ -513,7 +513,7 @@ next:
err:
btrfs_free_path(path);
- up_read(&fs_info->extent_commit_sem);
+ up_read(&fs_info->commit_root_sem);
free_excluded_extents(extent_root, block_group);
@@ -633,10 +633,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
return 0;
}
- down_write(&fs_info->extent_commit_sem);
+ down_write(&fs_info->commit_root_sem);
atomic_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- up_write(&fs_info->extent_commit_sem);
+ up_write(&fs_info->commit_root_sem);
btrfs_get_block_group(cache);
@@ -2444,7 +2444,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&locked_ref->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&locked_ref->lock);
- if (rb_first(&locked_ref->ref_root)) {
+ if (rb_first(&locked_ref->ref_root) ||
+ locked_ref->extent_op) {
spin_unlock(&locked_ref->lock);
spin_unlock(&delayed_refs->lock);
continue;
@@ -5470,7 +5471,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *cache;
struct btrfs_space_info *space_info;
- down_write(&fs_info->extent_commit_sem);
+ down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
@@ -5489,7 +5490,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
else
fs_info->pinned_extents = &fs_info->freed_extents[0];
- up_write(&fs_info->extent_commit_sem);
+ up_write(&fs_info->commit_root_sem);
list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
percpu_counter_set(&space_info->total_bytes_pinned, 0);
@@ -5744,6 +5745,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
bytenr, parent, root_objectid, owner_objectid,
owner_offset);
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
} else {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -8255,14 +8258,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- down_write(&info->extent_commit_sem);
+ down_write(&info->commit_root_sem);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
put_caching_control(caching_ctl);
}
- up_write(&info->extent_commit_sem);
+ up_write(&info->commit_root_sem);
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
@@ -8336,9 +8339,15 @@ static void __link_block_group(struct btrfs_space_info *space_info,
struct btrfs_block_group_cache *cache)
{
int index = get_block_group_index(cache);
+ bool first = false;
down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index])) {
+ if (list_empty(&space_info->block_groups[index]))
+ first = true;
+ list_add_tail(&cache->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
+
+ if (first) {
struct kobject *kobj = &space_info->block_group_kobjs[index];
int ret;
@@ -8350,8 +8359,6 @@ static void __link_block_group(struct btrfs_space_info *space_info,
kobject_put(&space_info->kobj);
}
}
- list_add_tail(&cache->list, &space_info->block_groups[index]);
- up_write(&space_info->groups_sem);
}
static struct btrfs_block_group_cache *
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ae69a00387e7..3955e475ceec 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -749,6 +749,7 @@ again:
* our range starts
*/
node = tree_search(tree, start);
+process_node:
if (!node)
break;
@@ -769,7 +770,10 @@ again:
if (start > end)
break;
- cond_resched_lock(&tree->lock);
+ if (!cond_resched_lock(&tree->lock)) {
+ node = rb_next(node);
+ goto process_node;
+ }
}
out:
spin_unlock(&tree->lock);
@@ -4306,7 +4310,7 @@ static void __free_extent_buffer(struct extent_buffer *eb)
kmem_cache_free(extent_buffer_cache, eb);
}
-static int extent_buffer_under_io(struct extent_buffer *eb)
+int extent_buffer_under_io(struct extent_buffer *eb)
{
return (atomic_read(&eb->io_pages) ||
test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 58b27e5ab521..c488b45237bf 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -320,6 +320,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_uptodate(struct extent_buffer *eb);
int clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e1ffb1e22898..eb742c07e7a4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
struct page *page = prepared_pages[pg];
/*
* Copy data from userspace to the current page
- *
- * Disable pagefault to avoid recursive lock since
- * the pages are already locked
*/
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
- pagefault_enable();
/* Flush processor's dcache for this page */
flush_dcache_page(page);
@@ -1665,7 +1660,7 @@ again:
static ssize_t __btrfs_direct_write(struct kiocb *iocb,
const struct iovec *iov,
unsigned long nr_segs, loff_t pos,
- loff_t *ppos, size_t count, size_t ocount)
+ size_t count, size_t ocount)
{
struct file *file = iocb->ki_filp;
struct iov_iter i;
@@ -1674,7 +1669,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
+ written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
count, ocount);
if (written < 0 || written == count)
@@ -1693,7 +1688,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
if (err)
goto out;
written += written_buffered;
- *ppos = pos + written_buffered;
+ iocb->ki_pos = pos + written_buffered;
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
endbyte >> PAGE_CACHE_SHIFT);
out:
@@ -1725,8 +1720,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
- loff_t *ppos = &iocb->ki_pos;
u64 start_pos;
+ u64 end_pos;
ssize_t num_written = 0;
ssize_t err = 0;
size_t count, ocount;
@@ -1781,7 +1776,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
- err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
+ /* Expand hole size to cover write data, preventing empty gap */
+ end_pos = round_up(pos + iov->iov_len, root->sectorsize);
+ err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
@@ -1793,7 +1790,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
if (unlikely(file->f_flags & O_DIRECT)) {
num_written = __btrfs_direct_write(iocb, iov, nr_segs,
- pos, ppos, count, ocount);
+ pos, count, ocount);
} else {
struct iov_iter i;
@@ -1801,7 +1798,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
num_written = __btrfs_buffered_write(file, &i, pos);
if (num_written > 0)
- *ppos = pos + num_written;
+ iocb->ki_pos = pos + num_written;
}
mutex_unlock(&inode->i_mutex);
@@ -2025,6 +2022,7 @@ out:
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = btrfs_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ab485e57b6fe..cc8ca193d830 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -55,7 +55,7 @@ static int caching_kthread(void *data)
key.type = BTRFS_INODE_ITEM_KEY;
again:
/* need to make sure the commit_root doesn't disappear */
- mutex_lock(&root->fs_commit_mutex);
+ down_read(&fs_info->commit_root_sem);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -88,7 +88,7 @@ again:
btrfs_item_key_to_cpu(leaf, &key, 0);
btrfs_release_path(path);
root->cache_progress = last;
- mutex_unlock(&root->fs_commit_mutex);
+ up_read(&fs_info->commit_root_sem);
schedule_timeout(1);
goto again;
} else
@@ -127,7 +127,7 @@ next:
btrfs_unpin_free_ino(root);
out:
wake_up(&root->cache_wait);
- mutex_unlock(&root->fs_commit_mutex);
+ up_read(&fs_info->commit_root_sem);
btrfs_free_path(path);
@@ -223,11 +223,11 @@ again:
* or the caching work is done.
*/
- mutex_lock(&root->fs_commit_mutex);
+ down_write(&root->fs_info->commit_root_sem);
spin_lock(&root->cache_lock);
if (root->cached == BTRFS_CACHE_FINISHED) {
spin_unlock(&root->cache_lock);
- mutex_unlock(&root->fs_commit_mutex);
+ up_write(&root->fs_info->commit_root_sem);
goto again;
}
spin_unlock(&root->cache_lock);
@@ -240,7 +240,7 @@ again:
else
__btrfs_add_free_space(pinned, objectid, 1);
- mutex_unlock(&root->fs_commit_mutex);
+ up_write(&root->fs_info->commit_root_sem);
}
}
@@ -250,7 +250,7 @@ again:
* and others will just be dropped, because the commit root we were
* searching has changed.
*
- * Must be called with root->fs_commit_mutex held
+ * Must be called with root->fs_info->commit_root_sem held
*/
void btrfs_unpin_free_ino(struct btrfs_root *root)
{
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06e9a4152b14..5f805bc944fa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -394,6 +394,14 @@ static noinline int compress_file_range(struct inode *inode,
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
+ /*
+ * skip compression for a small file range(<=blocksize) that
+ * isn't an inline extent, since it dosen't save disk space at all.
+ */
+ if ((end - start + 1) <= blocksize &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ goto cleanup_and_bail_uncompressed;
+
actual_end = min_t(u64, isize, end + 1);
again:
will_compress = 0;
@@ -1271,6 +1279,15 @@ next_slot:
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
/*
+ * if there are pending snapshots for this root,
+ * we fall into common COW way.
+ */
+ if (!nolock) {
+ err = btrfs_start_nocow_write(root);
+ if (!err)
+ goto out_check;
+ }
+ /*
* force cow if csum exists in the range.
* this ensure that csum for a given extent are
* either valid or do not exist.
@@ -1289,6 +1306,8 @@ next_slot:
out_check:
if (extent_end <= start) {
path->slots[0]++;
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
goto next_slot;
}
if (!nocow) {
@@ -1306,8 +1325,11 @@ out_check:
ret = cow_file_range(inode, locked_page,
cow_start, found_key.offset - 1,
page_started, nr_written, 1);
- if (ret)
+ if (ret) {
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
goto error;
+ }
cow_start = (u64)-1;
}
@@ -1354,8 +1376,11 @@ out_check:
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, cur_offset,
num_bytes);
- if (ret)
+ if (ret) {
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
goto error;
+ }
}
extent_clear_unlock_delalloc(inode, cur_offset,
@@ -1363,6 +1388,8 @@ out_check:
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC, PAGE_UNLOCK |
PAGE_SET_PRIVATE2);
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
cur_offset = extent_end;
if (cur_offset > end)
break;
@@ -8476,19 +8503,20 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
else
iput(inode);
ret = -ENOMEM;
- break;
+ goto out;
}
list_add_tail(&work->list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
ret++;
if (nr != -1 && ret >= nr)
- break;
+ goto out;
cond_resched();
spin_lock(&root->delalloc_lock);
}
spin_unlock(&root->delalloc_lock);
+out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0401397b5c92..e79ff6b90cb7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1472,6 +1472,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
char *sizestr;
+ char *retptr;
char *devstr = NULL;
int ret = 0;
int mod = 0;
@@ -1539,8 +1540,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
mod = 1;
sizestr++;
}
- new_size = memparse(sizestr, NULL);
- if (new_size == 0) {
+ new_size = memparse(sizestr, &retptr);
+ if (*retptr != '\0' || new_size == 0) {
ret = -EINVAL;
goto out_free;
}
@@ -3140,8 +3141,9 @@ process_slot:
new_key.offset + datal,
1);
if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ if (ret != -EINVAL)
+ btrfs_abort_transaction(trans,
+ root, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3538,6 +3540,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
up_read(&info->groups_sem);
}
+ /*
+ * Global block reserve, exported as a space_info
+ */
+ slot_count++;
+
/* space_slots == 0 means they are asking for a count */
if (space_args.space_slots == 0) {
space_args.total_spaces = slot_count;
@@ -3596,6 +3603,21 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
up_read(&info->groups_sem);
}
+ /*
+ * Add global block reserve
+ */
+ if (slot_count) {
+ struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
+
+ spin_lock(&block_rsv->lock);
+ space.total_bytes = block_rsv->size;
+ space.used_bytes = block_rsv->size - block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+ space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
+ memcpy(dest, &space, sizeof(space));
+ space_args.total_spaces++;
+ }
+
user_dest = (struct btrfs_ioctl_space_info __user *)
(arg + sizeof(struct btrfs_ioctl_space_args));
@@ -4531,9 +4553,8 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
}
args64 = kmalloc(sizeof(*args64), GFP_NOFS);
- if (IS_ERR(args64)) {
- ret = PTR_ERR(args64);
- args64 = NULL;
+ if (!args64) {
+ ret = -ENOMEM;
goto out;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index def428a25b2a..7f92ab1daa87 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2317,7 +2317,6 @@ void free_reloc_roots(struct list_head *list)
static noinline_for_stack
int merge_reloc_roots(struct reloc_control *rc)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_root *reloc_root;
u64 last_snap;
@@ -2375,26 +2374,6 @@ again:
list_add_tail(&reloc_root->root_list,
&reloc_roots);
goto out;
- } else if (!ret) {
- /*
- * recover the last snapshot tranid to avoid
- * the space balance break NOCOW.
- */
- root = read_fs_root(rc->extent_root->fs_info,
- objectid);
- if (IS_ERR(root))
- continue;
-
- trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
-
- /* Check if the fs/file tree was snapshoted or not. */
- if (btrfs_root_last_snapshot(&root->root_item) ==
- otransid - 1)
- btrfs_set_root_last_snapshot(&root->root_item,
- last_snap);
-
- btrfs_end_transaction(trans, root);
}
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 93e6d7172844..0be77993378e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2235,6 +2235,47 @@ behind_scrub_pages:
return 0;
}
+/*
+ * Given a physical address, this will calculate it's
+ * logical offset. if this is a parity stripe, it will return
+ * the most left data stripe's logical offset.
+ *
+ * return 0 if it is a data stripe, 1 means parity stripe.
+ */
+static int get_raid56_logic_offset(u64 physical, int num,
+ struct map_lookup *map, u64 *offset)
+{
+ int i;
+ int j = 0;
+ u64 stripe_nr;
+ u64 last_offset;
+ int stripe_index;
+ int rot;
+
+ last_offset = (physical - map->stripes[num].physical) *
+ nr_data_stripes(map);
+ *offset = last_offset;
+ for (i = 0; i < nr_data_stripes(map); i++) {
+ *offset = last_offset + i * map->stripe_len;
+
+ stripe_nr = *offset;
+ do_div(stripe_nr, map->stripe_len);
+ do_div(stripe_nr, nr_data_stripes(map));
+
+ /* Work out the disk rotation on this stripe-set */
+ rot = do_div(stripe_nr, map->num_stripes);
+ /* calculate which stripe this data locates */
+ rot += i;
+ stripe_index = rot % map->num_stripes;
+ if (stripe_index == num)
+ return 0;
+ if (stripe_index < num)
+ j++;
+ }
+ *offset = last_offset + j * map->stripe_len;
+ return 1;
+}
+
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
@@ -2256,6 +2297,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 physical;
u64 logical;
u64 logic_end;
+ u64 physical_end;
u64 generation;
int mirror_num;
struct reada_control *reada1;
@@ -2269,16 +2311,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 extent_len;
struct btrfs_device *extent_dev;
int extent_mirror_num;
- int stop_loop;
-
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
- if (num >= nr_data_stripes(map)) {
- return 0;
- }
- }
+ int stop_loop = 0;
nstripes = length;
+ physical = map->stripes[num].physical;
offset = 0;
do_div(nstripes, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
@@ -2296,6 +2332,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ get_raid56_logic_offset(physical, num, map, &offset);
+ increment = map->stripe_len * nr_data_stripes(map);
+ mirror_num = 1;
} else {
increment = map->stripe_len;
mirror_num = 1;
@@ -2319,7 +2360,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* to not hold off transaction commits
*/
logical = base + offset;
-
+ physical_end = physical + nstripes * map->stripe_len;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ get_raid56_logic_offset(physical_end, num,
+ map, &logic_end);
+ logic_end += base;
+ } else {
+ logic_end = logical + increment * nstripes;
+ }
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
scrub_blocked_if_needed(fs_info);
@@ -2328,7 +2377,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
key_start.objectid = logical;
key_start.type = BTRFS_EXTENT_ITEM_KEY;
key_start.offset = (u64)0;
- key_end.objectid = base + offset + nstripes * increment;
+ key_end.objectid = logic_end;
key_end.type = BTRFS_METADATA_ITEM_KEY;
key_end.offset = (u64)-1;
reada1 = btrfs_reada_add(root, &key_start, &key_end);
@@ -2338,7 +2387,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
key_start.offset = logical;
key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key_end.type = BTRFS_EXTENT_CSUM_KEY;
- key_end.offset = base + offset + nstripes * increment;
+ key_end.offset = logic_end;
reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
if (!IS_ERR(reada1))
@@ -2356,11 +2405,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/*
* now find all extents for each stripe and scrub them
*/
- logical = base + offset;
- physical = map->stripes[num].physical;
- logic_end = logical + increment * nstripes;
ret = 0;
- while (logical < logic_end) {
+ while (physical < physical_end) {
+ /* for raid56, we skip parity stripe */
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ ret = get_raid56_logic_offset(physical, num,
+ map, &logical);
+ logical += base;
+ if (ret)
+ goto skip;
+ }
/*
* canceled?
*/
@@ -2504,15 +2559,29 @@ again:
scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
- logical += increment;
- physical += map->stripe_len;
-
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ /*
+ * loop until we find next data stripe
+ * or we have finished all stripes.
+ */
+ do {
+ physical += map->stripe_len;
+ ret = get_raid56_logic_offset(
+ physical, num,
+ map, &logical);
+ logical += base;
+ } while (physical < physical_end && ret);
+ } else {
+ physical += map->stripe_len;
+ logical += increment;
+ }
if (logical < key.objectid + bytes) {
cond_resched();
goto again;
}
- if (logical >= logic_end) {
+ if (physical >= physical_end) {
stop_loop = 1;
break;
}
@@ -2521,6 +2590,7 @@ next:
path->slots[0]++;
}
btrfs_release_path(path);
+skip:
logical += increment;
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9b6da9d55f9a..1ac3ca98c429 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -493,6 +493,7 @@ static struct btrfs_path *alloc_path_for_send(void)
return NULL;
path->search_commit_root = 1;
path->skip_locking = 1;
+ path->need_commit_sem = 1;
return path;
}
@@ -771,29 +772,22 @@ out:
/*
* Helper function to retrieve some fields from an inode item.
*/
-static int get_inode_info(struct btrfs_root *root,
- u64 ino, u64 *size, u64 *gen,
- u64 *mode, u64 *uid, u64 *gid,
- u64 *rdev)
+static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
+ u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
+ u64 *gid, u64 *rdev)
{
int ret;
struct btrfs_inode_item *ii;
struct btrfs_key key;
- struct btrfs_path *path;
-
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
if (ret) {
- ret = -ENOENT;
- goto out;
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
}
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -811,7 +805,22 @@ static int get_inode_info(struct btrfs_root *root,
if (rdev)
*rdev = btrfs_inode_rdev(path->nodes[0], ii);
-out:
+ return ret;
+}
+
+static int get_inode_info(struct btrfs_root *root,
+ u64 ino, u64 *size, u64 *gen,
+ u64 *mode, u64 *uid, u64 *gid,
+ u64 *rdev)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+ ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
+ rdev);
btrfs_free_path(path);
return ret;
}
@@ -1085,6 +1094,7 @@ out:
struct backref_ctx {
struct send_ctx *sctx;
+ struct btrfs_path *path;
/* number of total found references */
u64 found;
@@ -1155,8 +1165,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
* There are inodes that have extents that lie behind its i_size. Don't
* accept clones from these extents.
*/
- ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
- NULL);
+ ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
+ NULL, NULL, NULL);
+ btrfs_release_path(bctx->path);
if (ret < 0)
return ret;
@@ -1235,12 +1246,17 @@ static int find_extent_clone(struct send_ctx *sctx,
if (!tmp_path)
return -ENOMEM;
+ /* We only use this path under the commit sem */
+ tmp_path->need_commit_sem = 0;
+
backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
if (!backref_ctx) {
ret = -ENOMEM;
goto out;
}
+ backref_ctx->path = tmp_path;
+
if (data_offset >= ino_size) {
/*
* There may be extents that lie behind the file's size.
@@ -1268,8 +1284,10 @@ static int find_extent_clone(struct send_ctx *sctx,
}
logical = disk_byte + btrfs_file_extent_offset(eb, fi);
+ down_read(&sctx->send_root->fs_info->commit_root_sem);
ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
&found_key, &flags);
+ up_read(&sctx->send_root->fs_info->commit_root_sem);
btrfs_release_path(tmp_path);
if (ret < 0)
@@ -4418,6 +4436,9 @@ static int send_hole(struct send_ctx *sctx, u64 end)
p = fs_path_alloc();
if (!p)
return -ENOMEM;
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto tlv_put_failure;
memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
while (offset < end) {
len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
@@ -4425,9 +4446,6 @@ static int send_hole(struct send_ctx *sctx, u64 end)
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
break;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- break;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
@@ -4968,7 +4986,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (S_ISREG(sctx->cur_inode_mode)) {
if (need_send_hole(sctx)) {
- if (sctx->cur_inode_last_extent == (u64)-1) {
+ if (sctx->cur_inode_last_extent == (u64)-1 ||
+ sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
ret = get_last_extent(sctx, (u64)-1);
if (ret)
goto out;
@@ -5367,57 +5387,21 @@ out:
static int full_send_tree(struct send_ctx *sctx)
{
int ret;
- struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
struct extent_buffer *eb;
int slot;
- u64 start_ctransid;
- u64 ctransid;
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
- spin_lock(&send_root->root_item_lock);
- start_ctransid = btrfs_root_ctransid(&send_root->root_item);
- spin_unlock(&send_root->root_item_lock);
-
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
-join_trans:
- /*
- * We need to make sure the transaction does not get committed
- * while we do anything on commit roots. Join a transaction to prevent
- * this.
- */
- trans = btrfs_join_transaction(send_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
- /*
- * Make sure the tree has not changed after re-joining. We detect this
- * by comparing start_ctransid and ctransid. They should always match.
- */
- spin_lock(&send_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&send_root->root_item);
- spin_unlock(&send_root->root_item_lock);
-
- if (ctransid != start_ctransid) {
- WARN(1, KERN_WARNING "BTRFS: the root that you're trying to "
- "send was modified in between. This is "
- "probably a bug.\n");
- ret = -EIO;
- goto out;
- }
-
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
goto out;
@@ -5425,19 +5409,6 @@ join_trans:
goto out_finish;
while (1) {
- /*
- * When someone want to commit while we iterate, end the
- * joined transaction and rejoin.
- */
- if (btrfs_should_end_transaction(trans, send_root)) {
- ret = btrfs_end_transaction(trans, send_root);
- trans = NULL;
- if (ret < 0)
- goto out;
- btrfs_release_path(path);
- goto join_trans;
- }
-
eb = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -5465,12 +5436,6 @@ out_finish:
out:
btrfs_free_path(path);
- if (trans) {
- if (!ret)
- ret = btrfs_end_transaction(trans, send_root);
- else
- btrfs_end_transaction(trans, send_root);
- }
return ret;
}
@@ -5718,7 +5683,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
NULL);
sort_clone_roots = 1;
+ current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
+ current->journal_info = NULL;
if (ret < 0)
goto out;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9dbf42395153..5011aadacab8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,6 +66,8 @@
static const struct super_operations btrfs_super_ops;
static struct file_system_type btrfs_fs_type;
+static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+
static const char *btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -1185,6 +1187,26 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
newargs);
kfree(newargs);
+
+ if (PTR_RET(mnt) == -EBUSY) {
+ if (flags & MS_RDONLY) {
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
+ newargs);
+ } else {
+ int r;
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
+ newargs);
+ if (IS_ERR(mnt))
+ return ERR_CAST(mnt);
+
+ r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+ if (r < 0) {
+ /* FIXME: release vfsmount mnt ??*/
+ return ERR_PTR(r);
+ }
+ }
+ }
+
if (IS_ERR(mnt))
return ERR_CAST(mnt);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a04707f740d6..7579f6d0b854 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,10 +75,21 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
}
}
-static noinline void switch_commit_root(struct btrfs_root *root)
+static noinline void switch_commit_roots(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info)
{
- free_extent_buffer(root->commit_root);
- root->commit_root = btrfs_root_node(root);
+ struct btrfs_root *root, *tmp;
+
+ down_write(&fs_info->commit_root_sem);
+ list_for_each_entry_safe(root, tmp, &trans->switch_commits,
+ dirty_list) {
+ list_del_init(&root->dirty_list);
+ free_extent_buffer(root->commit_root);
+ root->commit_root = btrfs_root_node(root);
+ if (is_fstree(root->objectid))
+ btrfs_unpin_free_ino(root);
+ }
+ up_write(&fs_info->commit_root_sem);
}
static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
@@ -208,6 +219,7 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->ordered_operations);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
+ INIT_LIST_HEAD(&cur_trans->switch_commits);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -375,7 +387,8 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
- if (current->journal_info) {
+ if (current->journal_info &&
+ current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) {
WARN_ON(type & TRANS_EXTWRITERS);
h = current->journal_info;
h->use_count++;
@@ -919,9 +932,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
return ret;
}
- if (root != root->fs_info->extent_root)
- switch_commit_root(root);
-
return 0;
}
@@ -977,15 +987,16 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
+ if (root != fs_info->extent_root)
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
}
- down_write(&fs_info->extent_commit_sem);
- switch_commit_root(fs_info->extent_root);
- up_write(&fs_info->extent_commit_sem);
-
+ list_add_tail(&fs_info->extent_root->dirty_list,
+ &trans->transaction->switch_commits);
btrfs_after_dev_replace_commit(fs_info);
return 0;
@@ -1042,11 +1053,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
smp_wmb();
if (root->commit_root != root->node) {
- mutex_lock(&root->fs_commit_mutex);
- switch_commit_root(root);
- btrfs_unpin_free_ino(root);
- mutex_unlock(&root->fs_commit_mutex);
-
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
btrfs_set_root_node(&root->root_item,
root->node);
}
@@ -1857,11 +1865,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&root->fs_info->tree_root->root_item,
root->fs_info->tree_root->node);
- switch_commit_root(root->fs_info->tree_root);
+ list_add_tail(&root->fs_info->tree_root->dirty_list,
+ &cur_trans->switch_commits);
btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
root->fs_info->chunk_root->node);
- switch_commit_root(root->fs_info->chunk_root);
+ list_add_tail(&root->fs_info->chunk_root->dirty_list,
+ &cur_trans->switch_commits);
+
+ switch_commit_roots(cur_trans, root->fs_info);
assert_qgroups_uptodate(trans);
update_super_roots(root);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6ac037e9f9f0..b57b924e8e03 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -57,6 +57,7 @@ struct btrfs_transaction {
struct list_head pending_snapshots;
struct list_head ordered_operations;
struct list_head pending_chunks;
+ struct list_head switch_commits;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
@@ -78,6 +79,8 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
__TRANS_ATTACH)
+#define BTRFS_SEND_TRANS_STUB 1
+
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d241130a32fd..49d7fab73360 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -448,6 +448,14 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+/*
+ * Add new device to list of registered devices
+ *
+ * Returns:
+ * 1 - first time device is seen
+ * 0 - device already known
+ * < 0 - error
+ */
static noinline int device_list_add(const char *path,
struct btrfs_super_block *disk_super,
u64 devid, struct btrfs_fs_devices **fs_devices_ret)
@@ -455,6 +463,7 @@ static noinline int device_list_add(const char *path,
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
struct rcu_string *name;
+ int ret = 0;
u64 found_transid = btrfs_super_generation(disk_super);
fs_devices = find_fsid(disk_super->fsid);
@@ -495,6 +504,7 @@ static noinline int device_list_add(const char *path,
fs_devices->num_devices++;
mutex_unlock(&fs_devices->device_list_mutex);
+ ret = 1;
device->fs_devices = fs_devices;
} else if (!device->name || strcmp(device->name->str, path)) {
name = rcu_string_strdup(path, GFP_NOFS);
@@ -513,7 +523,8 @@ static noinline int device_list_add(const char *path,
fs_devices->latest_trans = found_transid;
}
*fs_devices_ret = fs_devices;
- return 0;
+
+ return ret;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -910,17 +921,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
- if (disk_super->label[0]) {
- if (disk_super->label[BTRFS_LABEL_SIZE - 1])
- disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
- printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
- } else {
- printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
- }
-
- printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
-
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+ if (ret > 0) {
+ if (disk_super->label[0]) {
+ if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
+ printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
+ } else {
+ printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
+ }
+
+ printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
+ ret = 0;
+ }
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
diff --git a/fs/buffer.c b/fs/buffer.c
index 8c53a2b15ecb..9ddb9fc7d923 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2114,8 +2114,8 @@ EXPORT_SYMBOL(generic_write_end);
* Returns true if all buffers which correspond to a file portion
* we want to read are uptodate.
*/
-int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
- unsigned long from)
+int block_is_partially_uptodate(struct page *page, unsigned long from,
+ unsigned long count)
{
unsigned block_start, block_end, blocksize;
unsigned to;
@@ -2127,7 +2127,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
head = page_buffers(page);
blocksize = head->b_size;
- to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
+ to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
to = from + to;
if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
return 0;
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 622f4696e484..5b99bafc31d1 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
/* check parameters */
ret = -EOPNOTSUPP;
if (!root->d_inode ||
- !root->d_inode->i_op ||
!root->d_inode->i_op->lookup ||
!root->d_inode->i_op->mkdir ||
!root->d_inode->i_op->setxattr ||
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 6494d9f673aa..c0a681705104 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
}
ret = -EPERM;
- if (!subdir->d_inode->i_op ||
- !subdir->d_inode->i_op->setxattr ||
+ if (!subdir->d_inode->i_op->setxattr ||
!subdir->d_inode->i_op->getxattr ||
!subdir->d_inode->i_op->lookup ||
!subdir->d_inode->i_op->mkdir ||
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 8c44fdd4e1c3..834f9f3723fb 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -205,6 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
ci->fscache = fscache_acquire_cookie(fsc->fscache,
&ceph_fscache_inode_object_def,
ci, true);
+ fscache_check_consistency(ci->fscache);
done:
mutex_unlock(&inode->i_mutex);
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index da95f61b7a09..5ac591bd012b 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
void ceph_queue_revalidate(struct inode *inode);
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ fscache_attr_changed(ci->fscache);
+}
+
static inline void ceph_fscache_invalidate(struct inode *inode)
{
fscache_invalidate(ceph_inode(inode)->fscache);
@@ -135,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
{
}
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+}
+
static inline void ceph_fscache_invalidate(struct inode *inode)
{
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 17543383545c..2e5e648eb5c3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -622,8 +622,10 @@ retry:
if (flags & CEPH_CAP_FLAG_AUTH) {
if (ci->i_auth_cap == NULL ||
- ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
ci->i_auth_cap = cap;
+ cap->mds_wanted = wanted;
+ }
ci->i_cap_exporting_issued = 0;
} else {
WARN_ON(ci->i_auth_cap == cap);
@@ -885,7 +887,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
cap = rb_entry(p, struct ceph_cap, ci_node);
if (!__cap_is_valid(cap))
continue;
- mds_wanted |= cap->mds_wanted;
+ if (cap == ci->i_auth_cap)
+ mds_wanted |= cap->mds_wanted;
+ else
+ mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
}
return mds_wanted;
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6d59006bfa27..16b54aa31f08 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p)
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
+ } else {
+ seq_printf(s, " #%llx", req->r_ino1.ino);
}
if (req->r_old_dentry) {
@@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p)
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
seq_printf(s, " #%llx/%.*s (%s)",
- ceph_ino(req->r_old_dentry_dir),
+ req->r_old_dentry_dir ?
+ ceph_ino(req->r_old_dentry_dir) : 0,
req->r_old_dentry->d_name.len,
req->r_old_dentry->d_name.name,
path ? path : "");
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 45eda6d7a40c..766410a12c2c 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -119,7 +119,8 @@ static int fpos_cmp(loff_t l, loff_t r)
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
-static int __dcache_readdir(struct file *file, struct dir_context *ctx)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx,
+ u32 shared_gen)
{
struct ceph_file_info *fi = file->private_data;
struct dentry *parent = file->f_dentry;
@@ -133,8 +134,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx)
last = fi->dentry;
fi->dentry = NULL;
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
- last);
+ dout("__dcache_readdir %p v%u at %llu (last %p)\n",
+ dir, shared_gen, ctx->pos, last);
spin_lock(&parent->d_lock);
@@ -161,7 +162,8 @@ more:
goto out_unlock;
}
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!d_unhashed(dentry) && dentry->d_inode &&
+ if (di->lease_shared_gen == shared_gen &&
+ !d_unhashed(dentry) && dentry->d_inode &&
ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
fpos_cmp(ctx->pos, di->offset) <= 0)
@@ -190,7 +192,7 @@ more:
if (last) {
/* remember our position */
fi->dentry = last;
- fi->next_offset = di->offset;
+ fi->next_offset = fpos_off(di->offset);
}
dput(dentry);
return 0;
@@ -252,8 +254,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
- const int max_entries = fsc->mount_options->max_readdir;
- const int max_bytes = fsc->mount_options->max_readdir_bytes;
dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
if (fi->flags & CEPH_F_ATEND)
@@ -291,8 +291,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ceph_snap(inode) != CEPH_SNAPDIR &&
__ceph_dir_is_complete(ci) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ u32 shared_gen = ci->i_shared_gen;
spin_unlock(&ci->i_ceph_lock);
- err = __dcache_readdir(file, ctx);
+ err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN)
return err;
} else {
@@ -322,14 +323,16 @@ more:
fi->last_readdir = NULL;
}
- /* requery frag tree, as the frag topology may have changed */
- frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
-
dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
ceph_vinop(inode), frag, fi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ err = ceph_alloc_readdir_reply_buffer(req, inode);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
req->r_inode = inode;
ihold(inode);
req->r_dentry = dget(file->f_dentry);
@@ -340,9 +343,6 @@ more:
req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
- req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
- req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
- req->r_num_caps = max_entries + 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0) {
ceph_mdsc_put_request(req);
@@ -369,9 +369,9 @@ more:
fi->next_offset = 0;
off = fi->next_offset;
}
+ fi->frag = frag;
fi->offset = fi->next_offset;
fi->last_readdir = req;
- fi->frag = frag;
if (req->r_reply_info.dir_end) {
kfree(fi->last_name);
@@ -454,7 +454,7 @@ more:
return 0;
}
-static void reset_readdir(struct ceph_file_info *fi)
+static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
{
if (fi->last_readdir) {
ceph_mdsc_put_request(fi->last_readdir);
@@ -462,7 +462,10 @@ static void reset_readdir(struct ceph_file_info *fi)
}
kfree(fi->last_name);
fi->last_name = NULL;
- fi->next_offset = 2; /* compensate for . and .. */
+ if (ceph_frag_is_leftmost(frag))
+ fi->next_offset = 2; /* compensate for . and .. */
+ else
+ fi->next_offset = 0;
if (fi->dentry) {
dput(fi->dentry);
fi->dentry = NULL;
@@ -474,7 +477,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
- loff_t old_offset = offset;
+ loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
loff_t retval;
mutex_lock(&inode->i_mutex);
@@ -491,7 +494,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
goto out;
}
- if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+ if (offset >= 0) {
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
@@ -504,14 +507,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
* seek to new frag, or seek prior to current chunk.
*/
if (offset == 0 ||
- fpos_frag(offset) != fpos_frag(old_offset) ||
+ fpos_frag(offset) != fi->frag ||
fpos_off(offset) < fi->offset) {
dout("dir_llseek dropping %p content\n", file);
- reset_readdir(fi);
+ reset_readdir(fi, fpos_frag(offset));
}
/* bump dir_release_count if we did a forward seek */
- if (offset > old_offset)
+ if (fpos_cmp(offset, old_offset) > 0)
fi->dir_release_count--;
}
out:
@@ -812,8 +815,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+ req->r_old_dentry = dget(old_dentry);
req->r_locked_dir = dir;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -911,10 +913,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ ihold(old_dir);
req->r_dentry = dget(new_dentry);
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+ req->r_old_dentry_dir = old_dir;
req->r_locked_dir = new_dir;
req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 16796be53ca5..00d6af6a32ec 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -8,23 +8,6 @@
#include "mds_client.h"
/*
- * NFS export support
- *
- * NFS re-export of a ceph mount is, at present, only semireliable.
- * The basic issue is that the Ceph architectures doesn't lend itself
- * well to generating filehandles that will remain valid forever.
- *
- * So, we do our best. If you're lucky, your inode will be in the
- * client's cache. If it's not, and you have a connectable fh, then
- * the MDS server may be able to find it for you. Otherwise, you get
- * ESTALE.
- *
- * There are ways to this more reliable, but in the non-connectable fh
- * case, we won't every work perfectly, and in the connectable case,
- * some changes are needed on the MDS side to work better.
- */
-
-/*
* Basic fh
*/
struct ceph_nfs_fh {
@@ -32,22 +15,12 @@ struct ceph_nfs_fh {
} __attribute__ ((packed));
/*
- * Larger 'connectable' fh that includes parent ino and name hash.
- * Use this whenever possible, as it works more reliably.
+ * Larger fh that includes parent ino.
*/
struct ceph_nfs_confh {
u64 ino, parent_ino;
- u32 parent_name_hash;
} __attribute__ ((packed));
-/*
- * The presence of @parent_inode here tells us whether NFS wants a
- * connectable file handle. However, we want to make a connectionable
- * file handle unconditionally so that the MDS gets as much of a hint
- * as possible. That means we only use @parent_dentry to indicate
- * whether nfsd wants a connectable fh, and whether we should indicate
- * failure from a too-small @max_len.
- */
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
@@ -56,54 +29,36 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct ceph_nfs_confh *cfh = (void *)rawfh;
int connected_handle_length = sizeof(*cfh)/4;
int handle_length = sizeof(*fh)/4;
- struct dentry *dentry;
- struct dentry *parent;
/* don't re-export snaps */
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EINVAL;
- dentry = d_find_alias(inode);
+ if (parent_inode && (*max_len < connected_handle_length)) {
+ *max_len = connected_handle_length;
+ return FILEID_INVALID;
+ } else if (*max_len < handle_length) {
+ *max_len = handle_length;
+ return FILEID_INVALID;
+ }
- /* if we found an alias, generate a connectable fh */
- if (*max_len >= connected_handle_length && dentry) {
- dout("encode_fh %p connectable\n", dentry);
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent;
+ if (parent_inode) {
+ dout("encode_fh %llx with parent %llx\n",
+ ceph_ino(inode), ceph_ino(parent_inode));
cfh->ino = ceph_ino(inode);
- cfh->parent_ino = ceph_ino(parent->d_inode);
- cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
- dentry);
+ cfh->parent_ino = ceph_ino(parent_inode);
*max_len = connected_handle_length;
- type = 2;
- spin_unlock(&dentry->d_lock);
- } else if (*max_len >= handle_length) {
- if (parent_inode) {
- /* nfsd wants connectable */
- *max_len = connected_handle_length;
- type = FILEID_INVALID;
- } else {
- dout("encode_fh %p\n", dentry);
- fh->ino = ceph_ino(inode);
- *max_len = handle_length;
- type = 1;
- }
+ type = FILEID_INO32_GEN_PARENT;
} else {
+ dout("encode_fh %llx\n", ceph_ino(inode));
+ fh->ino = ceph_ino(inode);
*max_len = handle_length;
- type = FILEID_INVALID;
+ type = FILEID_INO32_GEN;
}
- if (dentry)
- dput(dentry);
return type;
}
-/*
- * convert regular fh to dentry
- *
- * FIXME: we should try harder by querying the mds for the ino.
- */
-static struct dentry *__fh_to_dentry(struct super_block *sb,
- struct ceph_nfs_fh *fh, int fh_len)
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
@@ -111,11 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
struct ceph_vino vino;
int err;
- if (fh_len < sizeof(*fh) / 4)
- return ERR_PTR(-ESTALE);
-
- dout("__fh_to_dentry %llx\n", fh->ino);
- vino.ino = fh->ino;
+ vino.ino = ino;
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
if (!inode) {
@@ -139,139 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
- pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
- fh->ino, inode);
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
- iput(inode);
+ dput(dentry);
return ERR_PTR(err);
}
- dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
+ dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
return dentry;
}
/*
- * convert connectable fh to dentry
+ * convert regular fh to dentry
*/
-static struct dentry *__cfh_to_dentry(struct super_block *sb,
- struct ceph_nfs_confh *cfh, int fh_len)
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct ceph_nfs_fh *fh = (void *)fid->raw;
+
+ if (fh_type != FILEID_INO32_GEN &&
+ fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
+ if (fh_len < sizeof(*fh) / 4)
+ return NULL;
+
+ dout("fh_to_dentry %llx\n", fh->ino);
+ return __fh_to_dentry(sb, fh->ino);
+}
+
+static struct dentry *__get_parent(struct super_block *sb,
+ struct dentry *child, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_request *req;
struct inode *inode;
struct dentry *dentry;
- struct ceph_vino vino;
int err;
- if (fh_len < sizeof(*cfh) / 4)
- return ERR_PTR(-ESTALE);
-
- dout("__cfh_to_dentry %llx (%llx/%x)\n",
- cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
-
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode) {
- struct ceph_mds_request *req;
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
- USE_ANY_MDS);
- if (IS_ERR(req))
- return ERR_CAST(req);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
- req->r_ino1 = vino;
- req->r_ino2.ino = cfh->parent_ino;
- req->r_ino2.snap = CEPH_NOSNAP;
- req->r_path2 = kmalloc(16, GFP_NOFS);
- snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
- req->r_num_caps = 1;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- inode = req->r_target_inode;
- if (inode)
- ihold(inode);
- ceph_mdsc_put_request(req);
- if (!inode)
- return ERR_PTR(err ? err : -ESTALE);
+ if (child) {
+ req->r_inode = child->d_inode;
+ ihold(child->d_inode);
+ } else {
+ req->r_ino1 = (struct ceph_vino) {
+ .ino = ino,
+ .snap = CEPH_NOSNAP,
+ };
}
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ ihold(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
- pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
- iput(inode);
+ dput(dentry);
return ERR_PTR(err);
}
- dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
+ dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
+ child ? ceph_ino(child->d_inode) : ino,
+ dentry, ceph_vinop(inode));
return dentry;
}
-static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+struct dentry *ceph_get_parent(struct dentry *child)
{
- if (fh_type == 1)
- return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw,
- fh_len);
- else
- return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw,
- fh_len);
+ /* don't re-export snaps */
+ if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
+ return ERR_PTR(-EINVAL);
+
+ dout("get_parent %p ino %llx.%llx\n",
+ child, ceph_vinop(child->d_inode));
+ return __get_parent(child->d_sb, child, 0);
}
/*
- * get parent, if possible.
- *
- * FIXME: we could do better by querying the mds to discover the
- * parent.
+ * convert regular fh to parent
*/
static struct dentry *ceph_fh_to_parent(struct super_block *sb,
- struct fid *fid,
+ struct fid *fid,
int fh_len, int fh_type)
{
struct ceph_nfs_confh *cfh = (void *)fid->raw;
- struct ceph_vino vino;
- struct inode *inode;
struct dentry *dentry;
- int err;
- if (fh_type == 1)
- return ERR_PTR(-ESTALE);
+ if (fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
if (fh_len < sizeof(*cfh) / 4)
- return ERR_PTR(-ESTALE);
+ return NULL;
- pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
- cfh->parent_name_hash);
+ dout("fh_to_parent %llx\n", cfh->parent_ino);
+ dentry = __get_parent(sb, NULL, cfh->ino);
+ if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
+ dentry = __fh_to_dentry(sb, cfh->parent_ino);
+ return dentry;
+}
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode)
- return ERR_PTR(-ESTALE);
+static int ceph_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct ceph_mds_client *mdsc;
+ struct ceph_mds_request *req;
+ int err;
- dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry)) {
- pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
- iput(inode);
- return dentry;
- }
- err = ceph_init_dentry(dentry);
- if (err < 0) {
- iput(inode);
- return ERR_PTR(err);
+ mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ mutex_lock(&parent->d_inode->i_mutex);
+
+ req->r_inode = child->d_inode;
+ ihold(child->d_inode);
+ req->r_ino2 = ceph_vino(parent->d_inode);
+ req->r_locked_dir = parent->d_inode;
+ req->r_num_caps = 2;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+
+ mutex_unlock(&parent->d_inode->i_mutex);
+
+ if (!err) {
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ memcpy(name, rinfo->dname, rinfo->dname_len);
+ name[rinfo->dname_len] = 0;
+ dout("get_name %p ino %llx.%llx name %s\n",
+ child, ceph_vinop(child->d_inode), name);
+ } else {
+ dout("get_name %p ino %llx.%llx err %d\n",
+ child, ceph_vinop(child->d_inode), err);
}
- dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
- return dentry;
+
+ ceph_mdsc_put_request(req);
+ return err;
}
const struct export_operations ceph_export_ops = {
.encode_fh = ceph_encode_fh,
.fh_to_dentry = ceph_fh_to_dentry,
.fh_to_parent = ceph_fh_to_parent,
+ .get_parent = ceph_get_parent,
+ .get_name = ceph_get_name,
};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 596e6cc9f9c4..88a6df4cbe6d 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -210,7 +210,7 @@ int ceph_open(struct inode *inode, struct file *file)
ihold(inode);
req->r_num_caps = 1;
- if (flags & (O_CREAT|O_TRUNC))
+ if (flags & O_CREAT)
parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
err = ceph_mdsc_do_request(mdsc, parent_inode, req);
iput(parent_inode);
@@ -291,8 +291,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
}
err = finish_open(file, dentry, ceph_open, opened);
}
-
out_err:
+ if (!req->r_err && req->r_target_inode)
+ ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
dout("atomic_open result=%d\n", err);
return err;
@@ -600,7 +601,7 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
false);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
- goto out;
+ break;
}
num_pages = calc_pages_for(page_align, len);
@@ -718,7 +719,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
false);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
- goto out;
+ break;
}
/*
@@ -970,6 +971,8 @@ retry_snap:
goto retry_snap;
}
} else {
+ loff_t old_size = inode->i_size;
+ struct iov_iter from;
/*
* No need to acquire the i_truncate_mutex. Because
* the MDS revokes Fwb caps before sending truncate
@@ -977,9 +980,12 @@ retry_snap:
* are pending vmtruncate. So write and vmtruncate
* can not run at the same time
*/
- written = generic_file_buffered_write(iocb, iov, nr_segs,
- pos, &iocb->ki_pos,
- count, 0);
+ iov_iter_init(&from, iov, nr_segs, count, 0);
+ written = generic_perform_write(file, &from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
+ if (inode->i_size > old_size)
+ ceph_fscache_update_objectsize(inode);
mutex_unlock(&inode->i_mutex);
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 32d519d8a2e2..0b0728e5be2d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -659,14 +659,6 @@ static int fill_inode(struct inode *inode,
le32_to_cpu(info->time_warp_seq),
&ctime, &mtime, &atime);
- /* only update max_size on auth cap */
- if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
- ci->i_max_size != le64_to_cpu(info->max_size)) {
- dout("max_size %lld -> %llu\n", ci->i_max_size,
- le64_to_cpu(info->max_size));
- ci->i_max_size = le64_to_cpu(info->max_size);
- }
-
ci->i_layout = info->layout;
inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
@@ -755,6 +747,14 @@ static int fill_inode(struct inode *inode,
ci->i_max_offset = 2;
}
no_change:
+ /* only update max_size on auth cap */
+ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ ci->i_max_size != le64_to_cpu(info->max_size)) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size,
+ le64_to_cpu(info->max_size));
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ }
+
spin_unlock(&ci->i_ceph_lock);
/* queue truncate if we saw i_size decrease */
@@ -1044,10 +1044,59 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
session, req->r_request_started, -1,
&req->r_caps_reservation);
if (err < 0)
- return err;
+ goto done;
} else {
WARN_ON_ONCE(1);
}
+
+ if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+ struct qstr dname;
+ struct dentry *dn, *parent;
+
+ BUG_ON(!rinfo->head->is_target);
+ BUG_ON(req->r_dentry);
+
+ parent = d_find_any_alias(dir);
+ BUG_ON(!parent);
+
+ dname.name = rinfo->dname;
+ dname.len = rinfo->dname_len;
+ dname.hash = full_name_hash(dname.name, dname.len);
+ vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+retry_lookup:
+ dn = d_lookup(parent, &dname);
+ dout("d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
+
+ if (!dn) {
+ dn = d_alloc(parent, &dname);
+ dout("d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
+ if (dn == NULL) {
+ dput(parent);
+ err = -ENOMEM;
+ goto done;
+ }
+ err = ceph_init_dentry(dn);
+ if (err < 0) {
+ dput(dn);
+ dput(parent);
+ goto done;
+ }
+ } else if (dn->d_inode &&
+ (ceph_ino(dn->d_inode) != vino.ino ||
+ ceph_snap(dn->d_inode) != vino.snap)) {
+ dout(" dn %p points to wrong inode %p\n",
+ dn, dn->d_inode);
+ d_delete(dn);
+ dput(dn);
+ goto retry_lookup;
+ }
+
+ req->r_dentry = dn;
+ dput(parent);
+ }
}
if (rinfo->head->is_target) {
@@ -1063,7 +1112,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
err = fill_inode(in, &rinfo->targeti, NULL,
session, req->r_request_started,
- (le32_to_cpu(rinfo->head->result) == 0) ?
+ (!req->r_aborted && rinfo->head->result == 0) ?
req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
@@ -1616,8 +1665,6 @@ static const struct inode_operations ceph_symlink_iops = {
.getxattr = ceph_getxattr,
.listxattr = ceph_listxattr,
.removexattr = ceph_removexattr,
- .get_acl = ceph_get_acl,
- .set_acl = ceph_set_acl,
};
/*
@@ -1627,7 +1674,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode;
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
@@ -1819,9 +1865,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
req->r_inode_drop = release;
req->r_args.setattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1;
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
}
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
ceph_cap_string(dirtied), mask);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index dc66c9e023e4..fdf941b44ff1 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,9 +1,8 @@
+#include <linux/ceph/ceph_debug.h>
#include <linux/in.h>
#include "super.h"
#include "mds_client.h"
-#include <linux/ceph/ceph_debug.h>
-
#include "ioctl.h"
@@ -64,7 +63,6 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct inode *parent_inode;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
@@ -121,9 +119,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
cpu_to_le32(l.object_size);
req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
- parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae6d14e82b0f..d94ba0df9f4d 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -2,11 +2,31 @@
#include <linux/file.h>
#include <linux/namei.h>
+#include <linux/random.h>
#include "super.h"
#include "mds_client.h"
#include <linux/ceph/pagelist.h>
+static u64 lock_secret;
+
+static inline u64 secure_addr(void *addr)
+{
+ u64 v = lock_secret ^ (u64)(unsigned long)addr;
+ /*
+ * Set the most significant bit, so that MDS knows the 'owner'
+ * is sufficient to identify the owner of lock. (old code uses
+ * both 'owner' and 'pid')
+ */
+ v |= (1ULL << 63);
+ return v;
+}
+
+void __init ceph_flock_init(void)
+{
+ get_random_bytes(&lock_secret, sizeof(lock_secret));
+}
+
/**
* Implement fcntl and flock locking functions.
*/
@@ -14,11 +34,11 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
int cmd, u8 wait, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
int err;
u64 length = 0;
+ u64 owner;
req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
if (IS_ERR(req))
@@ -32,25 +52,27 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
else
length = fl->fl_end - fl->fl_start + 1;
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type);
+ if (lock_type == CEPH_LOCK_FCNTL)
+ owner = secure_addr(fl->fl_owner);
+ else
+ owner = secure_addr(fl->fl_file);
+
+ dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
+ "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
+ (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
+ wait, fl->fl_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
+ req->r_args.filelock_change.owner = cpu_to_le64(owner);
req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
- /* This should be adjusted, but I'm not sure if
- namespaces actually get id numbers*/
- req->r_args.filelock_change.pid_namespace =
- cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
err = ceph_mdsc_do_request(mdsc, inode, req);
- if ( operation == CEPH_MDS_OP_GETFILELOCK){
+ if (operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
fl->fl_type = F_RDLCK;
@@ -87,14 +109,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
u8 wait = 0;
u16 op = CEPH_MDS_OP_SETFILELOCK;
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_lock, fl_pid:%d", fl->fl_pid);
+ if (!(fl->fl_flags & FL_POSIX))
+ return -ENOLCK;
+ /* No mandatory locks */
+ if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+ return -ENOLCK;
+
+ dout("ceph_lock, fl_owner: %p", fl->fl_owner);
/* set wait bit as appropriate, then make command as Ceph expects it*/
- if (F_SETLKW == cmd)
- wait = 1;
- if (F_GETLK == cmd)
+ if (IS_GETLK(cmd))
op = CEPH_MDS_OP_GETFILELOCK;
+ else if (IS_SETLKW(cmd))
+ wait = 1;
if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
@@ -105,7 +132,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
if (!err) {
- if ( op != CEPH_MDS_OP_GETFILELOCK ){
+ if (op != CEPH_MDS_OP_GETFILELOCK) {
dout("mds locked, locking locally");
err = posix_lock_file(file, fl, NULL);
if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
@@ -131,20 +158,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
u8 lock_cmd;
int err;
- u8 wait = 1;
-
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_flock, fl_pid:%d", fl->fl_pid);
-
- /* set wait bit, then clear it out of cmd*/
- if (cmd & LOCK_NB)
- wait = 0;
- cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
- /* set command sequence that Ceph wants to see:
- shared lock, exclusive lock, or unlock */
- if (LOCK_SH == cmd)
+ u8 wait = 0;
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
+ /* No mandatory locks */
+ if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+ return -ENOLCK;
+
+ dout("ceph_flock, fl_file: %p", fl->fl_file);
+
+ if (IS_SETLKW(cmd))
+ wait = 1;
+
+ if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
- else if (LOCK_EX == cmd)
+ else if (F_WRLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
@@ -280,13 +309,14 @@ int lock_to_ceph_filelock(struct file_lock *lock,
struct ceph_filelock *cephlock)
{
int err = 0;
-
cephlock->start = cpu_to_le64(lock->fl_start);
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
- cephlock->pid = cpu_to_le64(lock->fl_pid);
- cephlock->pid_namespace =
- cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
+ cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
+ if (lock->fl_flags & FL_POSIX)
+ cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
+ else
+ cephlock->owner = cpu_to_le64(secure_addr(lock->fl_file));
switch (lock->fl_type) {
case F_RDLCK:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f4f050a69a48..2b4d093d0563 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
#include <linux/fs.h>
#include <linux/wait.h>
#include <linux/slab.h>
+#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -165,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end,
if (num == 0)
goto done;
- /* alloc large array */
- info->dir_nr = num;
- info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
- sizeof(*info->dir_dname) +
- sizeof(*info->dir_dname_len) +
- sizeof(*info->dir_dlease),
- GFP_NOFS);
- if (info->dir_in == NULL) {
- err = -ENOMEM;
- goto out_bad;
- }
+ BUG_ON(!info->dir_in);
info->dir_dname = (void *)(info->dir_in + num);
info->dir_dname_len = (void *)(info->dir_dname + num);
info->dir_dlease = (void *)(info->dir_dname_len + num);
+ if ((unsigned long)(info->dir_dlease + num) >
+ (unsigned long)info->dir_in + info->dir_buf_size) {
+ pr_err("dir contents are larger than expected\n");
+ WARN_ON(1);
+ goto bad;
+ }
+ info->dir_nr = num;
while (num) {
/* dentry */
ceph_decode_need(p, end, sizeof(u32)*2, bad);
@@ -327,7 +325,9 @@ out_bad:
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{
- kfree(info->dir_in);
+ if (!info->dir_in)
+ return;
+ free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
}
@@ -512,12 +512,11 @@ void ceph_mdsc_release_request(struct kref *kref)
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
+ destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
- if (req->r_reply) {
+ if (req->r_reply)
ceph_msg_put(req->r_reply);
- destroy_reply_info(&req->r_reply_info);
- }
if (req->r_inode) {
ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
iput(req->r_inode);
@@ -528,7 +527,9 @@ void ceph_mdsc_release_request(struct kref *kref)
iput(req->r_target_inode);
if (req->r_dentry)
dput(req->r_dentry);
- if (req->r_old_dentry) {
+ if (req->r_old_dentry)
+ dput(req->r_old_dentry);
+ if (req->r_old_dentry_dir) {
/*
* track (and drop pins for) r_old_dentry_dir
* separately, since r_old_dentry's d_parent may have
@@ -537,7 +538,6 @@ void ceph_mdsc_release_request(struct kref *kref)
*/
ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- dput(req->r_old_dentry);
iput(req->r_old_dentry_dir);
}
kfree(req->r_path1);
@@ -1311,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
trim_caps - session->s_trim_caps);
session->s_trim_caps = 0;
}
+
+ ceph_add_cap_releases(mdsc, session);
+ ceph_send_cap_releases(mdsc, session);
return 0;
}
@@ -1461,15 +1464,18 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
dout("discard_cap_releases mds%d\n", session->s_mds);
- /* zero out the in-progress message */
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- session->s_num_cap_releases += num;
+ if (!list_empty(&session->s_cap_releases)) {
+ /* zero out the in-progress message */
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+ head = msg->front.iov_base;
+ num = le32_to_cpu(head->num);
+ dout("discard_cap_releases mds%d %p %u\n",
+ session->s_mds, msg, num);
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+ session->s_num_cap_releases += num;
+ }
/* requeue completed messages */
while (!list_empty(&session->s_cap_releases_done)) {
@@ -1492,6 +1498,43 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
* requests
*/
+int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
+ size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
+ sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
+ int order, num_entries;
+
+ spin_lock(&ci->i_ceph_lock);
+ num_entries = ci->i_files + ci->i_subdirs;
+ spin_unlock(&ci->i_ceph_lock);
+ num_entries = max(num_entries, 1);
+ num_entries = min(num_entries, opt->max_readdir);
+
+ order = get_order(size * num_entries);
+ while (order >= 0) {
+ rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
+ order);
+ if (rinfo->dir_in)
+ break;
+ order--;
+ }
+ if (!rinfo->dir_in)
+ return -ENOMEM;
+
+ num_entries = (PAGE_SIZE << order) / size;
+ num_entries = min(num_entries, opt->max_readdir);
+
+ rinfo->dir_buf_size = PAGE_SIZE << order;
+ req->r_num_caps = num_entries + 1;
+ req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
+ req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
+ return 0;
+}
+
/*
* Create an mds request.
*/
@@ -2053,7 +2096,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
if (req->r_locked_dir)
ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
- if (req->r_old_dentry)
+ if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 68288917c737..e90cfccf93bd 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -67,6 +67,7 @@ struct ceph_mds_reply_info_parsed {
/* for readdir results */
struct {
struct ceph_mds_reply_dirfrag *dir_dir;
+ size_t dir_buf_size;
int dir_nr;
char **dir_dname;
u32 *dir_dname_len;
@@ -346,7 +347,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
struct dentry *dn);
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
-
+extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir);
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 4440f447fd3f..51cc23e48111 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -54,6 +54,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
+ case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 10a4ccbf38da..06150fd745ac 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1026,6 +1026,7 @@ static int __init init_ceph(void)
if (ret)
goto out;
+ ceph_flock_init();
ceph_xattr_init();
ret = register_filesystem(&ceph_fs_type);
if (ret)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index d8801a95b685..7866cd05a6bb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -577,7 +577,7 @@ struct ceph_file_info {
/* readdir: position within a frag */
unsigned offset; /* offset of last chunk, adjusted for . and .. */
- u64 next_offset; /* offset of next chunk (last_name's + 1) */
+ unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
struct dentry *dentry; /* next dentry (for dcache readdir) */
int dir_release_count;
@@ -871,6 +871,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
extern const struct export_operations ceph_export_ops;
/* locks.c */
+extern __init void ceph_flock_init(void);
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a55ec37378c6..c9c2b887381e 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -64,32 +64,48 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
}
static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
- size_t size)
+ size_t size)
{
int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
const char *pool_name;
+ char buf[128];
dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
down_read(&osdc->map_sem);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
- if (pool_name)
- ret = snprintf(val, size,
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
+ if (pool_name) {
+ size_t len = strlen(pool_name);
+ ret = snprintf(buf, sizeof(buf),
+ "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
(unsigned long long)ceph_file_layout_su(ci->i_layout),
(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
- pool_name);
- else
- ret = snprintf(val, size,
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ if (!size) {
+ ret += len;
+ } else if (ret + len > size) {
+ ret = -ERANGE;
+ } else {
+ memcpy(val, buf, ret);
+ memcpy(val + ret, pool_name, len);
+ ret += len;
+ }
+ } else {
+ ret = snprintf(buf, sizeof(buf),
"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
(unsigned long long)ceph_file_layout_su(ci->i_layout),
(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
(unsigned long long)ceph_file_layout_object_size(ci->i_layout),
(unsigned long long)pool);
-
+ if (size) {
+ if (ret <= size)
+ memcpy(val, buf, ret);
+ else
+ ret = -ERANGE;
+ }
+ }
up_read(&osdc->map_sem);
return ret;
}
@@ -215,7 +231,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
.name_size = sizeof("ceph.dir.layout"),
.getxattr_cb = ceph_vxattrcb_layout,
.readonly = false,
- .hidden = false,
+ .hidden = true,
.exists_cb = ceph_vxattrcb_layout_exists,
},
XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
@@ -242,7 +258,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
.name_size = sizeof("ceph.file.layout"),
.getxattr_cb = ceph_vxattrcb_layout,
.readonly = false,
- .hidden = false,
+ .hidden = true,
.exists_cb = ceph_vxattrcb_layout_exists,
},
XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
@@ -842,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
int err;
@@ -893,9 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
req->r_data_len = size;
dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
@@ -1019,7 +1032,6 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = dentry->d_inode;
- struct inode *parent_inode;
struct ceph_mds_request *req;
int err;
@@ -1033,9 +1045,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
req->r_num_caps = 1;
req->r_path2 = kstrdup(name, GFP_NOFS);
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2c70cbe35d39..5be1f997ecde 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb)
cifs_set_oplock_level(cifs_inode, 0);
cifs_inode->delete_pending = false;
cifs_inode->invalid_mapping = false;
+ clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags);
+ clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags);
+ clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags);
+ spin_lock_init(&cifs_inode->writers_lock);
+ cifs_inode->writers = 0;
cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->server_eof = 0;
cifs_inode->uniqueid = 0;
@@ -732,19 +737,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
ssize_t written;
int rc;
+ written = cifs_get_writer(cinode);
+ if (written)
+ return written;
+
written = generic_file_aio_write(iocb, iov, nr_segs, pos);
if (CIFS_CACHE_WRITE(CIFS_I(inode)))
- return written;
+ goto out;
rc = filemap_fdatawrite(inode->i_mapping);
if (rc)
cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n",
rc, inode);
+out:
+ cifs_put_writer(cinode);
return written;
}
@@ -850,7 +862,6 @@ const struct inode_operations cifs_file_inode_ops = {
/* revalidate:cifs_revalidate, */
.setattr = cifs_setattr,
.getattr = cifs_getattr, /* do we need this anymore? */
- .rename = cifs_rename,
.permission = cifs_permission,
#ifdef CONFIG_CIFS_XATTR
.setxattr = cifs_setxattr,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c0f3718b77a8..30f6e9251a4a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -228,6 +228,8 @@ struct smb_version_operations {
/* verify the message */
int (*check_message)(char *, unsigned int);
bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
+ void (*downgrade_oplock)(struct TCP_Server_Info *,
+ struct cifsInodeInfo *, bool);
/* process transaction2 response */
bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
char *, int);
@@ -1113,6 +1115,12 @@ struct cifsInodeInfo {
unsigned int epoch; /* used to track lease state changes */
bool delete_pending; /* DELETE_ON_CLOSE is set */
bool invalid_mapping; /* pagecache is invalid */
+ unsigned long flags;
+#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */
+#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */
+#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */
+ spinlock_t writers_lock;
+ unsigned int writers; /* Number of writers on this inode */
unsigned long time; /* jiffies of last update of inode */
u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index acc4ee8ed075..ca7980a1e303 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
int offset);
extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
+extern int cifs_get_writer(struct cifsInodeInfo *cinode);
+extern void cifs_put_writer(struct cifsInodeInfo *cinode);
+extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode);
extern int cifs_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f3264bd7a83d..6ce4e0954b98 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -6197,6 +6197,9 @@ QAllEAsRetry:
cifs_dbg(FYI, "ea length %d\n", list_len);
if (list_len <= 8) {
cifs_dbg(FYI, "empty EA list returned from server\n");
+ /* didn't find the named attribute */
+ if (ea_name)
+ rc = -ENODATA;
goto QAllEAsOut;
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 834fce759d80..5ed03e0b8b40 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2579,19 +2579,32 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
ssize_t rc = -EACCES;
- loff_t lock_pos = pos;
+ loff_t lock_pos = iocb->ki_pos;
- if (file->f_flags & O_APPEND)
- lock_pos = i_size_read(inode);
/*
* We need to hold the sem to be sure nobody modifies lock list
* with a brlock that prevents writing.
*/
down_read(&cinode->lock_sem);
+ mutex_lock(&inode->i_mutex);
+ if (file->f_flags & O_APPEND)
+ lock_pos = i_size_read(inode);
if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs),
server->vals->exclusive_lock_type, NULL,
- CIFS_WRITE_OP))
- rc = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ CIFS_WRITE_OP)) {
+ rc = __generic_file_aio_write(iocb, iov, nr_segs);
+ mutex_unlock(&inode->i_mutex);
+
+ if (rc > 0) {
+ ssize_t err;
+
+ err = generic_write_sync(file, iocb->ki_pos - rc, rc);
+ if (err < 0)
+ rc = err;
+ }
+ } else {
+ mutex_unlock(&inode->i_mutex);
+ }
up_read(&cinode->lock_sem);
return rc;
}
@@ -2608,12 +2621,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
ssize_t written;
+ written = cifs_get_writer(cinode);
+ if (written)
+ return written;
+
if (CIFS_CACHE_WRITE(cinode)) {
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
- && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
- return cifs_writev(iocb, iov, nr_segs, pos);
+ && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+ written = generic_file_aio_write(
+ iocb, iov, nr_segs, pos);
+ goto out;
+ }
+ written = cifs_writev(iocb, iov, nr_segs, pos);
+ goto out;
}
/*
* For non-oplocked files in strict cache mode we need to write the data
@@ -2633,6 +2654,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
inode);
cinode->oplock = 0;
}
+out:
+ cifs_put_writer(cinode);
return written;
}
@@ -2727,56 +2750,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata)
/**
* cifs_readdata_to_iov - copy data from pages in response to an iovec
* @rdata: the readdata response with list of pages holding data
- * @iov: vector in which we should copy the data
- * @nr_segs: number of segments in vector
- * @offset: offset into file of the first iovec
- * @copied: used to return the amount of data copied to the iov
+ * @iter: destination for our data
*
* This function copies data from a list of pages in a readdata response into
* an array of iovecs. It will first calculate where the data should go
* based on the info in the readdata and then copy the data into that spot.
*/
-static ssize_t
-cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov,
- unsigned long nr_segs, loff_t offset, ssize_t *copied)
+static int
+cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
{
- int rc = 0;
- struct iov_iter ii;
- size_t pos = rdata->offset - offset;
- ssize_t remaining = rdata->bytes;
- unsigned char *pdata;
+ size_t remaining = rdata->bytes;
unsigned int i;
- /* set up iov_iter and advance to the correct offset */
- iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0);
- iov_iter_advance(&ii, pos);
-
- *copied = 0;
for (i = 0; i < rdata->nr_pages; i++) {
- ssize_t copy;
struct page *page = rdata->pages[i];
-
- /* copy a whole page or whatever's left */
- copy = min_t(ssize_t, remaining, PAGE_SIZE);
-
- /* ...but limit it to whatever space is left in the iov */
- copy = min_t(ssize_t, copy, iov_iter_count(&ii));
-
- /* go while there's data to be copied and no errors */
- if (copy && !rc) {
- pdata = kmap(page);
- rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset,
- (int)copy);
- kunmap(page);
- if (!rc) {
- *copied += copy;
- remaining -= copy;
- iov_iter_advance(&ii, copy);
- }
- }
+ size_t copy = min_t(size_t, remaining, PAGE_SIZE);
+ size_t written = copy_page_to_iter(page, 0, copy, iter);
+ remaining -= written;
+ if (written < copy && iov_iter_count(iter) > 0)
+ break;
}
-
- return rc;
+ return remaining ? -EFAULT : 0;
}
static void
@@ -2837,20 +2831,21 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
return total_read > 0 ? total_read : result;
}
-static ssize_t
-cifs_iovec_read(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *poffset)
+ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
+ struct file *file = iocb->ki_filp;
ssize_t rc;
size_t len, cur_len;
ssize_t total_read = 0;
- loff_t offset = *poffset;
+ loff_t offset = pos;
unsigned int npages;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
struct cifsFileInfo *open_file;
struct cifs_readdata *rdata, *tmp;
struct list_head rdata_list;
+ struct iov_iter to;
pid_t pid;
if (!nr_segs)
@@ -2860,6 +2855,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
if (!len)
return 0;
+ iov_iter_init(&to, iov, nr_segs, len, 0);
+
INIT_LIST_HEAD(&rdata_list);
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
open_file = file->private_data;
@@ -2885,7 +2882,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
cifs_uncached_readv_complete);
if (!rdata) {
rc = -ENOMEM;
- goto error;
+ break;
}
rc = cifs_read_allocate_pages(rdata, npages);
@@ -2917,55 +2914,44 @@ error:
if (!list_empty(&rdata_list))
rc = 0;
+ len = iov_iter_count(&to);
/* the loop below should proceed in the order of increasing offsets */
-restart_loop:
list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+ again:
if (!rc) {
- ssize_t copied;
-
/* FIXME: freezable sleep too? */
rc = wait_for_completion_killable(&rdata->done);
if (rc)
rc = -EINTR;
- else if (rdata->result)
+ else if (rdata->result) {
rc = rdata->result;
- else {
- rc = cifs_readdata_to_iov(rdata, iov,
- nr_segs, *poffset,
- &copied);
- total_read += copied;
+ /* resend call if it's a retryable error */
+ if (rc == -EAGAIN) {
+ rc = cifs_retry_async_readv(rdata);
+ goto again;
+ }
+ } else {
+ rc = cifs_readdata_to_iov(rdata, &to);
}
- /* resend call if it's a retryable error */
- if (rc == -EAGAIN) {
- rc = cifs_retry_async_readv(rdata);
- goto restart_loop;
- }
}
list_del_init(&rdata->list);
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
+ total_read = len - iov_iter_count(&to);
+
cifs_stats_bytes_read(tcon, total_read);
- *poffset += total_read;
/* mask nodata case */
if (rc == -ENODATA)
rc = 0;
- return total_read ? total_read : rc;
-}
-
-ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- ssize_t read;
-
- read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
- if (read > 0)
- iocb->ki_pos = pos;
-
- return read;
+ if (total_read) {
+ iocb->ki_pos = pos + total_read;
+ return total_read;
+ }
+ return rc;
}
ssize_t
@@ -3113,6 +3099,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static struct vm_operations_struct cifs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = cifs_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -3644,6 +3631,13 @@ static int cifs_launder_page(struct page *page)
return rc;
}
+static int
+cifs_pending_writers_wait(void *unused)
+{
+ schedule();
+ return 0;
+}
+
void cifs_oplock_break(struct work_struct *work)
{
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -3651,8 +3645,15 @@ void cifs_oplock_break(struct work_struct *work)
struct inode *inode = cfile->dentry->d_inode;
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
int rc = 0;
+ wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
+ cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE);
+
+ server->ops->downgrade_oplock(server, cinode,
+ test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
+
if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
cifs_has_mand_locks(cinode)) {
cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
@@ -3689,6 +3690,7 @@ void cifs_oplock_break(struct work_struct *work)
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
+ cifs_done_oplock_break(cinode);
}
/*
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 2f9f3790679d..3b0c62e622da 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
cifs_dbg(FYI, "file id match, oplock break\n");
pCifsInode = CIFS_I(netfile->dentry->d_inode);
- cifs_set_oplock_level(pCifsInode,
- pSMB->OplockLevel ? OPLOCK_READ : 0);
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+ &pCifsInode->flags);
+
+ /*
+ * Set flag if the server downgrades the oplock
+ * to L2 else clear.
+ */
+ if (pSMB->OplockLevel)
+ set_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &pCifsInode->flags);
+ else
+ clear_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &pCifsInode->flags);
+
queue_work(cifsiod_wq,
&netfile->oplock_break);
netfile->oplock_break_cancelled = false;
@@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
cinode->oplock = 0;
}
+static int
+cifs_oplock_break_wait(void *unused)
+{
+ schedule();
+ return signal_pending(current) ? -ERESTARTSYS : 0;
+}
+
+/*
+ * We wait for oplock breaks to be processed before we attempt to perform
+ * writes.
+ */
+int cifs_get_writer(struct cifsInodeInfo *cinode)
+{
+ int rc;
+
+start:
+ rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK,
+ cifs_oplock_break_wait, TASK_KILLABLE);
+ if (rc)
+ return rc;
+
+ spin_lock(&cinode->writers_lock);
+ if (!cinode->writers)
+ set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ cinode->writers++;
+ /* Check to see if we have started servicing an oplock break */
+ if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) {
+ cinode->writers--;
+ if (cinode->writers == 0) {
+ clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+ }
+ spin_unlock(&cinode->writers_lock);
+ goto start;
+ }
+ spin_unlock(&cinode->writers_lock);
+ return 0;
+}
+
+void cifs_put_writer(struct cifsInodeInfo *cinode)
+{
+ spin_lock(&cinode->writers_lock);
+ cinode->writers--;
+ if (cinode->writers == 0) {
+ clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+ }
+ spin_unlock(&cinode->writers_lock);
+}
+
+void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
+{
+ clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK);
+}
+
bool
backup_cred(struct cifs_sb_info *cifs_sb)
{
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 526fb89f9230..d1fdfa848703 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
return 0;
}
+static void
+cifs_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ if (set_level2)
+ cifs_set_oplock_level(cinode, OPLOCK_READ);
+ else
+ cifs_set_oplock_level(cinode, 0);
+}
+
static bool
cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server,
char *buf, int malformed)
@@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = {
.clear_stats = cifs_clear_stats,
.print_stats = cifs_print_stats,
.is_oplock_break = is_valid_oplock_break,
+ .downgrade_oplock = cifs_downgrade_oplock,
.check_trans2 = cifs_check_trans2,
.need_neg = cifs_need_neg,
.negotiate = cifs_negotiate,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index fb3966265b6e..b8021fde987d 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
else
cfile->oplock_break_cancelled = false;
- server->ops->set_oplock_level(cinode,
- rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0,
- 0, NULL);
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+ &cinode->flags);
+
+ /*
+ * Set flag if the server downgrades the oplock
+ * to L2 else clear.
+ */
+ if (rsp->OplockLevel)
+ set_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+ else
+ clear_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
queue_work(cifsiod_wq, &cfile->oplock_break);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 192f51a12cf1..35ddc3ed119d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -905,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
}
static void
+smb2_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ if (set_level2)
+ server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II,
+ 0, NULL);
+ else
+ server->ops->set_oplock_level(cinode, 0, 0, NULL);
+}
+
+static void
smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int epoch, bool *purge_cache)
{
@@ -1110,6 +1121,7 @@ struct smb_version_operations smb20_operations = {
.clear_stats = smb2_clear_stats,
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -1184,6 +1196,7 @@ struct smb_version_operations smb21_operations = {
.clear_stats = smb2_clear_stats,
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -1259,6 +1272,7 @@ struct smb_version_operations smb30_operations = {
.print_stats = smb2_print_stats,
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 860344701067..3802f8c94acc 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1352,7 +1352,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid)
{
int rc;
- char *res_key = NULL;
struct compress_ioctl fsctl_input;
char *ret_data = NULL;
@@ -1365,7 +1364,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
2 /* in data len */, &ret_data /* out data */, NULL);
cifs_dbg(FYI, "set compression rc %d\n", rc);
- kfree(res_key);
return rc;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index e3ad709a4232..0b2528fb640e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -73,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size)
static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
{
int free, need;
+ va_list arg_copy;
again:
free = cn->size - cn->used;
- need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
+
+ va_copy(arg_copy, arg);
+ need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
+ va_end(arg_copy);
+
if (need < free) {
cn->used += need;
return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 66cba5a8a346..40707d88a945 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3144,6 +3144,7 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
end = ERR_PTR(-ENAMETOOLONG);
return end;
}
+EXPORT_SYMBOL(simple_dname);
/*
* Write full pathname from the root of the filesystem into the buffer.
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3190ca973dd6..1e5b45359509 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -424,7 +424,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
}
/* Data available on socket or listen socket received a connect */
-static void lowcomms_data_ready(struct sock *sk, int count_unused)
+static void lowcomms_data_ready(struct sock *sk)
{
struct connection *con = sock2con(sk);
if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
diff --git a/fs/exec.c b/fs/exec.c
index 25dfeba6d55f..476f3ebf437e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
@@ -812,7 +813,7 @@ EXPORT_SYMBOL(kernel_read);
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
- ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos);
+ ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
if (res > 0)
flush_icache_range(addr, addr + len);
return res;
@@ -822,7 +823,7 @@ EXPORT_SYMBOL(read_code);
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
- struct mm_struct * old_mm, *active_mm;
+ struct mm_struct *old_mm, *active_mm;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
@@ -848,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm)
tsk->mm = mm;
tsk->active_mm = mm;
activate_mm(active_mm, mm);
+ tsk->mm->vmacache_seqnum = 0;
+ vmacache_flush(tsk);
task_unlock(tsk);
if (old_mm) {
up_read(&old_mm->mmap_sem);
@@ -1043,7 +1046,7 @@ EXPORT_SYMBOL_GPL(get_task_comm);
* so that a new one can be started
*/
-void set_task_comm(struct task_struct *tsk, char *buf)
+void set_task_comm(struct task_struct *tsk, const char *buf)
{
task_lock(tsk);
trace_task_rename(tsk, buf);
@@ -1052,21 +1055,6 @@ void set_task_comm(struct task_struct *tsk, char *buf)
perf_event_comm(tsk);
}
-static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
-{
- int i, ch;
-
- /* Copies the binary name from after last slash */
- for (i = 0; (ch = *(fn++)) != '\0';) {
- if (ch == '/')
- i = 0; /* overwrite what we wrote */
- else
- if (i < len - 1)
- tcomm[i++] = ch;
- }
- tcomm[i] = '\0';
-}
-
int flush_old_exec(struct linux_binprm * bprm)
{
int retval;
@@ -1080,8 +1068,6 @@ int flush_old_exec(struct linux_binprm * bprm)
goto out;
set_mm_exe_file(bprm->mm, bprm->file);
-
- filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
/*
* Release all of the old mmap stuff
*/
@@ -1124,7 +1110,7 @@ void setup_new_exec(struct linux_binprm * bprm)
else
set_dumpable(current->mm, suid_dumpable);
- set_task_comm(current, bprm->tcomm);
+ set_task_comm(current, kbasename(bprm->filename));
/* Set the new mm task size. We have to do that late because it may
* depend on TIF_32BIT which is only updated in flush_thread() on
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 7682b970d0f1..4e2c032ab8a1 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -21,12 +21,12 @@
#undef ORE_DBGMSG2
#define ORE_DBGMSG2 ORE_DBGMSG
-struct page *_raid_page_alloc(void)
+static struct page *_raid_page_alloc(void)
{
return alloc_page(GFP_KERNEL);
}
-void _raid_page_free(struct page *p)
+static void _raid_page_free(struct page *p)
{
__free_page(p);
}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9d9763328734..ed73ed8ebbee 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -543,7 +543,7 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
return !(odi->systemid_len || odi->osdname_len);
}
-int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
struct exofs_dev **peds)
{
struct __alloc_ore_devs_and_exofs_devs {
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 1b8001bbe947..27695e6f4e46 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -4,7 +4,6 @@
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*/
-#include <linux/capability.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7cadd823bb31..7d66fb0e4cca 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -284,7 +284,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
int best_ndir = inodes_per_group;
int best_group = -1;
- get_random_bytes(&group, sizeof(group));
+ group = prandom_u32();
parent_group = (unsigned)group % ngroups;
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d260115c0350..3750031cfa2f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -192,7 +192,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
sizeof(struct ext2_inode_info),
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index cfedb2cb0d8c..c0ebc4db8849 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -42,8 +42,8 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
value, size, flags);
}
-int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
const struct xattr *xattr;
int err = 0;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 22548f56197b..158b5d4ce067 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1727,10 +1727,7 @@ allocated:
percpu_counter_sub(&sbi->s_freeblocks_counter, num);
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
- err = ext3_journal_dirty_metadata(handle, gdp_bh);
- if (!fatal)
- fatal = err;
-
+ fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
if (fatal)
goto out;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e66e4808719f..17742eed2c16 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -275,7 +275,7 @@ static inline loff_t ext3_get_htree_eof(struct file *filp)
* NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
* will be invalid once the directory was converted into a dx directory
*/
-loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
+static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int dx_dir = is_dx_dir(inode);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 082afd78b107..a1b810230cc5 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -215,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
int best_ndir = inodes_per_group;
int best_group = -1;
- get_random_bytes(&group, sizeof(group));
+ group = prandom_u32();
parent_group = (unsigned)group % ngroups;
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index efce2bbfb5e5..f5157d0d1b43 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1559,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
}
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext3_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
+ * Note that whenever we need to map blocks we start a transaction even if
+ * we're not journalling data. This is to preserve ordering: any hole
+ * instantiation within __block_write_full_page -> ext3_get_block() should be
+ * journalled along with the data so we don't crash and then get metadata which
* refers to old data.
*
* In all journalling modes block_write_full_page() will start the I/O.
*
- * Problem:
- *
- * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- * ext3_writepage()
- *
- * Similar for:
- *
- * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext3_get_block(). We will deadlock on various things like
- * lock_journal and i_truncate_mutex.
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- * non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- * In journalled data mode, a data buffer may be metadata against the
- * current transaction. But the same file is part of a shared mapping
- * and someone does a writepage() on it.
- *
- * We will move the buffer onto the async_data list, but *after* it has
- * been dirtied. So there's a small window where we have dirty data on
- * BJ_Metadata.
- *
- * Note that this only applies to the last partial page in the file. The
- * bit which block_write_full_page() uses prepare/commit for. (That's
- * broken code anyway: it's wrong for msync()).
- *
- * It's a rare case: affects the final partial page, for journalled data
- * where the file is subject to bith write() and writepage() in the same
- * transction. To fix it we'll need a custom block_write_full_page().
- * We'll probably need that anyway for journalling writepage() output.
- *
* We don't honour synchronous mounts for writepage(). That would be
* disastrous. Any write() or metadata operation will sync the fs for
* us.
- *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
static int ext3_ordered_writepage(struct page *page,
struct writeback_control *wbc)
@@ -1673,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page,
* block_write_full_page() succeeded. Otherwise they are unmapped,
* and generally junk.
*/
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+ if (ret == 0)
+ ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
NULL, journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bput_one);
err = ext3_journal_stop(handle);
@@ -1925,6 +1883,8 @@ retry:
* and pretend the write failed... */
ext3_truncate_failed_direct_write(inode);
ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext3_orphan_del(NULL, inode);
goto out;
}
if (inode->i_nlink)
@@ -3212,21 +3172,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
* transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (for sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -3238,13 +3197,13 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (current->flags & PF_MEMALLOC)
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (ext3_journal_current_handle()) {
@@ -3253,7 +3212,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
return -EIO;
}
- if (wbc->sync_mode != WB_SYNC_ALL)
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext3_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
return ext3_force_commit(inode->i_sb);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 95c6c5a6d0c5..08cdfe5461e3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -527,7 +527,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
sizeof(struct ext3_inode_info),
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3387664ad70e..722c2bf9645d 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -43,8 +43,9 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int ext3_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
{
const struct xattr *xattr;
handle_t *handle = fs_info;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bc765591101a..063fc1538355 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -146,7 +146,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
overwrite = 1;
}
- ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ ret = __generic_file_aio_write(iocb, iov, nr_segs);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {
@@ -200,6 +200,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index fa8da4cb8c4b..e93e4ec7d165 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -174,7 +174,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
if (retval > 0) {
- value = kmalloc(retval, GFP_KERNEL);
+ value = kmalloc(retval, GFP_F2FS_ZERO);
if (!value)
return ERR_PTR(-ENOMEM);
retval = f2fs_getxattr(inode, name_index, "", value, retval);
@@ -203,6 +203,12 @@ static int __f2fs_set_acl(struct inode *inode, int type,
size_t size = 0;
int error;
+ if (acl) {
+ error = posix_acl_valid(acl);
+ if (error < 0)
+ return error;
+ }
+
switch (type) {
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 293d0486a40f..4aa521aa9bc3 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -33,14 +33,12 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
struct address_space *mapping = META_MAPPING(sbi);
struct page *page = NULL;
repeat:
- page = grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
if (!page) {
cond_resched();
goto repeat;
}
- /* We wait writeback only inside grab_meta_page() */
- wait_on_page_writeback(page);
SetPageUptodate(page);
return page;
}
@@ -75,23 +73,102 @@ out:
return page;
}
+inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+{
+ switch (type) {
+ case META_NAT:
+ return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK;
+ case META_SIT:
+ return SIT_BLK_CNT(sbi);
+ case META_SSA:
+ case META_CP:
+ return 0;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Readahead CP/NAT/SIT/SSA pages
+ */
+int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
+{
+ block_t prev_blk_addr = 0;
+ struct page *page;
+ int blkno = start;
+ int max_blks = get_max_meta_blks(sbi, type);
+
+ struct f2fs_io_info fio = {
+ .type = META,
+ .rw = READ_SYNC | REQ_META | REQ_PRIO
+ };
+
+ for (; nrpages-- > 0; blkno++) {
+ block_t blk_addr;
+
+ switch (type) {
+ case META_NAT:
+ /* get nat block addr */
+ if (unlikely(blkno >= max_blks))
+ blkno = 0;
+ blk_addr = current_nat_addr(sbi,
+ blkno * NAT_ENTRY_PER_BLOCK);
+ break;
+ case META_SIT:
+ /* get sit block addr */
+ if (unlikely(blkno >= max_blks))
+ goto out;
+ blk_addr = current_sit_addr(sbi,
+ blkno * SIT_ENTRY_PER_BLOCK);
+ if (blkno != start && prev_blk_addr + 1 != blk_addr)
+ goto out;
+ prev_blk_addr = blk_addr;
+ break;
+ case META_SSA:
+ case META_CP:
+ /* get ssa/cp block addr */
+ blk_addr = blkno;
+ break;
+ default:
+ BUG();
+ }
+
+ page = grab_cache_page(META_MAPPING(sbi), blk_addr);
+ if (!page)
+ continue;
+ if (PageUptodate(page)) {
+ mark_page_accessed(page);
+ f2fs_put_page(page, 1);
+ continue;
+ }
+
+ f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
+ mark_page_accessed(page);
+ f2fs_put_page(page, 0);
+ }
+out:
+ f2fs_submit_merged_bio(sbi, META, READ);
+ return blkno - start;
+}
+
static int f2fs_write_meta_page(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- /* Should not write any meta pages, if any IO error was occurred */
- if (unlikely(sbi->por_doing ||
- is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+ if (unlikely(sbi->por_doing))
goto redirty_out;
-
if (wbc->for_reclaim)
goto redirty_out;
- wait_on_page_writeback(page);
+ /* Should not write any meta pages, if any IO error was occurred */
+ if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+ goto no_write;
+ f2fs_wait_on_page_writeback(page, META);
write_meta_page(sbi, page);
+no_write:
dec_page_count(sbi, F2FS_DIRTY_META);
unlock_page(page);
return 0;
@@ -99,6 +176,7 @@ static int f2fs_write_meta_page(struct page *page,
redirty_out:
dec_page_count(sbi, F2FS_DIRTY_META);
wbc->pages_skipped++;
+ account_page_redirty(page);
set_page_dirty(page);
return AOP_WRITEPAGE_ACTIVATE;
}
@@ -107,21 +185,23 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
- int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
- long written;
-
- if (wbc->for_kupdate)
- return 0;
+ long diff, written;
/* collect a number of dirty meta pages and write together */
- if (get_pages(sbi, F2FS_DIRTY_META) < nrpages)
- return 0;
+ if (wbc->for_kupdate ||
+ get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
+ goto skip_write;
/* if mounting is failed, skip writing node pages */
mutex_lock(&sbi->cp_mutex);
- written = sync_meta_pages(sbi, META, nrpages);
+ diff = nr_pages_to_write(sbi, META, wbc);
+ written = sync_meta_pages(sbi, META, wbc->nr_to_write);
mutex_unlock(&sbi->cp_mutex);
- wbc->nr_to_write -= written;
+ wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
+ return 0;
+
+skip_write:
+ wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
return 0;
}
@@ -148,10 +228,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+
lock_page(page);
- f2fs_bug_on(page->mapping != mapping);
- f2fs_bug_on(!PageDirty(page));
- clear_page_dirty_for_io(page);
+
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
if (f2fs_write_meta_page(page, &wbc)) {
unlock_page(page);
break;
@@ -216,16 +308,15 @@ void release_orphan_inode(struct f2fs_sb_info *sbi)
void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct list_head *head, *this;
- struct orphan_inode_entry *new = NULL, *orphan = NULL;
+ struct list_head *head;
+ struct orphan_inode_entry *new, *orphan;
new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
new->ino = ino;
spin_lock(&sbi->orphan_inode_lock);
head = &sbi->orphan_inode_list;
- list_for_each(this, head) {
- orphan = list_entry(this, struct orphan_inode_entry, list);
+ list_for_each_entry(orphan, head, list) {
if (orphan->ino == ino) {
spin_unlock(&sbi->orphan_inode_lock);
kmem_cache_free(orphan_entry_slab, new);
@@ -234,14 +325,10 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
if (orphan->ino > ino)
break;
- orphan = NULL;
}
- /* add new_oentry into list which is sorted by inode number */
- if (orphan)
- list_add(&new->list, this->prev);
- else
- list_add_tail(&new->list, head);
+ /* add new orphan entry into list which is sorted by inode number */
+ list_add_tail(&new->list, &orphan->list);
spin_unlock(&sbi->orphan_inode_lock);
}
@@ -255,10 +342,11 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
list_for_each_entry(orphan, head, list) {
if (orphan->ino == ino) {
list_del(&orphan->list);
- kmem_cache_free(orphan_entry_slab, orphan);
f2fs_bug_on(sbi->n_orphans == 0);
sbi->n_orphans--;
- break;
+ spin_unlock(&sbi->orphan_inode_lock);
+ kmem_cache_free(orphan_entry_slab, orphan);
+ return;
}
}
spin_unlock(&sbi->orphan_inode_lock);
@@ -285,6 +373,8 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
start_blk = __start_cp_addr(sbi) + 1;
orphan_blkaddr = __start_sum_addr(sbi) - 1;
+ ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
+
for (i = 0; i < orphan_blkaddr; i++) {
struct page *page = get_meta_page(sbi, start_blk + i);
struct f2fs_orphan_block *orphan_blk;
@@ -466,14 +556,12 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct list_head *head = &sbi->dir_inode_list;
- struct list_head *this;
+ struct dir_inode_entry *entry;
- list_for_each(this, head) {
- struct dir_inode_entry *entry;
- entry = list_entry(this, struct dir_inode_entry, list);
+ list_for_each_entry(entry, head, list)
if (unlikely(entry->inode == inode))
return -EEXIST;
- }
+
list_add_tail(&new->list, head);
stat_inc_dirty_dir(sbi);
return 0;
@@ -483,6 +571,7 @@ void set_dirty_dir_page(struct inode *inode, struct page *page)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct dir_inode_entry *new;
+ int ret = 0;
if (!S_ISDIR(inode->i_mode))
return;
@@ -492,13 +581,13 @@ void set_dirty_dir_page(struct inode *inode, struct page *page)
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
- if (__add_dirty_inode(inode, new))
- kmem_cache_free(inode_entry_slab, new);
-
- inc_page_count(sbi, F2FS_DIRTY_DENTS);
+ ret = __add_dirty_inode(inode, new);
inode_inc_dirty_dents(inode);
SetPagePrivate(page);
spin_unlock(&sbi->dir_inode_lock);
+
+ if (ret)
+ kmem_cache_free(inode_entry_slab, new);
}
void add_dirty_dir_inode(struct inode *inode)
@@ -506,44 +595,47 @@ void add_dirty_dir_inode(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct dir_inode_entry *new =
f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+ int ret = 0;
new->inode = inode;
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
- if (__add_dirty_inode(inode, new))
- kmem_cache_free(inode_entry_slab, new);
+ ret = __add_dirty_inode(inode, new);
spin_unlock(&sbi->dir_inode_lock);
+
+ if (ret)
+ kmem_cache_free(inode_entry_slab, new);
}
void remove_dirty_dir_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-
- struct list_head *this, *head;
+ struct list_head *head;
+ struct dir_inode_entry *entry;
if (!S_ISDIR(inode->i_mode))
return;
spin_lock(&sbi->dir_inode_lock);
- if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
+ if (get_dirty_dents(inode)) {
spin_unlock(&sbi->dir_inode_lock);
return;
}
head = &sbi->dir_inode_list;
- list_for_each(this, head) {
- struct dir_inode_entry *entry;
- entry = list_entry(this, struct dir_inode_entry, list);
+ list_for_each_entry(entry, head, list) {
if (entry->inode == inode) {
list_del(&entry->list);
- kmem_cache_free(inode_entry_slab, entry);
stat_dec_dirty_dir(sbi);
- break;
+ spin_unlock(&sbi->dir_inode_lock);
+ kmem_cache_free(inode_entry_slab, entry);
+ goto done;
}
}
spin_unlock(&sbi->dir_inode_lock);
+done:
/* Only from the recovery routine */
if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
@@ -554,15 +646,14 @@ void remove_dirty_dir_inode(struct inode *inode)
struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct list_head *this, *head;
+ struct list_head *head;
struct inode *inode = NULL;
+ struct dir_inode_entry *entry;
spin_lock(&sbi->dir_inode_lock);
head = &sbi->dir_inode_list;
- list_for_each(this, head) {
- struct dir_inode_entry *entry;
- entry = list_entry(this, struct dir_inode_entry, list);
+ list_for_each_entry(entry, head, list) {
if (entry->inode->i_ino == ino) {
inode = entry->inode;
break;
@@ -589,7 +680,7 @@ retry:
inode = igrab(entry->inode);
spin_unlock(&sbi->dir_inode_lock);
if (inode) {
- filemap_flush(inode->i_mapping);
+ filemap_fdatawrite(inode->i_mapping);
iput(inode);
} else {
/*
@@ -824,6 +915,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
unblock_operations(sbi);
mutex_unlock(&sbi->cp_mutex);
+ stat_inc_cp_count(sbi->stat_info);
trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
}
@@ -845,11 +937,11 @@ void init_orphan_info(struct f2fs_sb_info *sbi)
int __init create_checkpoint_caches(void)
{
orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
- sizeof(struct orphan_inode_entry), NULL);
+ sizeof(struct orphan_inode_entry));
if (!orphan_entry_slab)
return -ENOMEM;
inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
- sizeof(struct dir_inode_entry), NULL);
+ sizeof(struct dir_inode_entry));
if (!inode_entry_slab) {
kmem_cache_destroy(orphan_entry_slab);
return -ENOMEM;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2261ccdd0b5f..45abd60e2bff 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -45,7 +45,7 @@ static void f2fs_read_end_io(struct bio *bio, int err)
static void f2fs_write_end_io(struct bio *bio, int err)
{
- struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb);
+ struct f2fs_sb_info *sbi = bio->bi_private;
struct bio_vec *bvec;
int i;
@@ -55,15 +55,16 @@ static void f2fs_write_end_io(struct bio *bio, int err)
if (unlikely(err)) {
SetPageError(page);
set_bit(AS_EIO, &page->mapping->flags);
- set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
- sbi->sb->s_flags |= MS_RDONLY;
+ f2fs_stop_checkpoint(sbi);
}
end_page_writeback(page);
dec_page_count(sbi, F2FS_WRITEBACK);
}
- if (bio->bi_private)
- complete(bio->bi_private);
+ if (sbi->wait_io) {
+ complete(sbi->wait_io);
+ sbi->wait_io = NULL;
+ }
if (!get_pages(sbi, F2FS_WRITEBACK) &&
!list_empty(&sbi->cp_wait.task_list))
@@ -86,6 +87,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
bio->bi_bdev = sbi->sb->s_bdev;
bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
+ bio->bi_private = sbi;
return bio;
}
@@ -113,7 +115,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
*/
if (fio->type == META_FLUSH) {
DECLARE_COMPLETION_ONSTACK(wait);
- io->bio->bi_private = &wait;
+ io->sbi->wait_io = &wait;
submit_bio(rw, io->bio);
wait_for_completion(&wait);
} else {
@@ -132,7 +134,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
- mutex_lock(&io->io_mutex);
+ down_write(&io->io_rwsem);
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
@@ -140,7 +142,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
}
__submit_merged_bio(io);
- mutex_unlock(&io->io_mutex);
+ up_write(&io->io_rwsem);
}
/*
@@ -178,7 +180,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
verify_block_addr(sbi, blk_addr);
- mutex_lock(&io->io_mutex);
+ down_write(&io->io_rwsem);
if (!is_read)
inc_page_count(sbi, F2FS_WRITEBACK);
@@ -202,7 +204,7 @@ alloc_new:
io->last_block_in_bio = blk_addr;
- mutex_unlock(&io->io_mutex);
+ up_write(&io->io_rwsem);
trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
}
@@ -797,48 +799,36 @@ static int f2fs_write_data_page(struct page *page,
*/
offset = i_size & (PAGE_CACHE_SIZE - 1);
if ((page->index >= end_index + 1) || !offset) {
- if (S_ISDIR(inode->i_mode)) {
- dec_page_count(sbi, F2FS_DIRTY_DENTS);
- inode_dec_dirty_dents(inode);
- }
+ inode_dec_dirty_dents(inode);
goto out;
}
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
write:
- if (unlikely(sbi->por_doing)) {
- err = AOP_WRITEPAGE_ACTIVATE;
+ if (unlikely(sbi->por_doing))
goto redirty_out;
- }
/* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
- dec_page_count(sbi, F2FS_DIRTY_DENTS);
inode_dec_dirty_dents(inode);
err = do_write_data_page(page, &fio);
- } else {
- f2fs_lock_op(sbi);
-
- if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
- err = f2fs_write_inline_data(inode, page, offset);
- f2fs_unlock_op(sbi);
- goto out;
- } else {
- err = do_write_data_page(page, &fio);
- }
+ goto done;
+ }
- f2fs_unlock_op(sbi);
+ if (!wbc->for_reclaim)
need_balance_fs = true;
- }
- if (err == -ENOENT)
- goto out;
- else if (err)
+ else if (has_not_enough_free_secs(sbi, 0))
goto redirty_out;
- if (wbc->for_reclaim) {
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
- need_balance_fs = false;
- }
+ f2fs_lock_op(sbi);
+ if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode))
+ err = f2fs_write_inline_data(inode, page, offset);
+ else
+ err = do_write_data_page(page, &fio);
+ f2fs_unlock_op(sbi);
+done:
+ if (err && err != -ENOENT)
+ goto redirty_out;
clear_cold_data(page);
out:
@@ -849,12 +839,11 @@ out:
redirty_out:
wbc->pages_skipped++;
+ account_page_redirty(page);
set_page_dirty(page);
- return err;
+ return AOP_WRITEPAGE_ACTIVATE;
}
-#define MAX_DESIRED_PAGES_WP 4096
-
static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
void *data)
{
@@ -871,17 +860,17 @@ static int f2fs_write_data_pages(struct address_space *mapping,
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
bool locked = false;
int ret;
- long excess_nrtw = 0, desired_nrtw;
+ long diff;
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
return 0;
- if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
- desired_nrtw = MAX_DESIRED_PAGES_WP;
- excess_nrtw = desired_nrtw - wbc->nr_to_write;
- wbc->nr_to_write = desired_nrtw;
- }
+ if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
+ get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA))
+ goto skip_write;
+
+ diff = nr_pages_to_write(sbi, DATA, wbc);
if (!S_ISDIR(inode->i_mode)) {
mutex_lock(&sbi->writepages);
@@ -895,8 +884,12 @@ static int f2fs_write_data_pages(struct address_space *mapping,
remove_dirty_dir_inode(inode);
- wbc->nr_to_write -= excess_nrtw;
+ wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return ret;
+
+skip_write:
+ wbc->pages_skipped += get_dirty_dents(inode);
+ return 0;
}
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
@@ -949,13 +942,19 @@ inline_data:
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
} else {
- if (f2fs_has_inline_data(inode))
+ if (f2fs_has_inline_data(inode)) {
err = f2fs_read_inline_data(inode, page);
- else
+ if (err) {
+ page_cache_release(page);
+ return err;
+ }
+ } else {
err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
READ_SYNC);
- if (err)
- return err;
+ if (err)
+ return err;
+ }
+
lock_page(page);
if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 1);
@@ -1031,11 +1030,8 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
unsigned int length)
{
struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- if (S_ISDIR(inode->i_mode) && PageDirty(page)) {
- dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ if (PageDirty(page))
inode_dec_dirty_dents(inode);
- }
ClearPagePrivate(page);
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 3de9d20d0c14..b52c12cf5873 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -86,7 +86,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
- struct sit_info *sit_i = SIT_I(sbi);
unsigned int segno, vblocks;
int ndirty = 0;
@@ -94,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
total_vblocks = 0;
blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
hblks_per_sec = blks_per_sec / 2;
- mutex_lock(&sit_i->sentry_lock);
for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
dist = abs(vblocks - hblks_per_sec);
@@ -105,7 +103,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
ndirty++;
}
}
- mutex_unlock(&sit_i->sentry_lock);
dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
si->bimodal = bimodal / dist;
if (si->dirty_count)
@@ -236,6 +233,7 @@ static int stat_show(struct seq_file *s, void *v)
si->dirty_count);
seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
si->prefree_count, si->free_segs, si->free_secs);
+ seq_printf(s, "CP calls: %d\n", si->cp_count);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d\n", si->data_segs);
@@ -252,10 +250,10 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_dent, si->ndirty_dirs);
seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
- seq_printf(s, " - NATs: %5d > %lu\n",
- si->nats, NM_WOUT_THRESHOLD);
- seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
- si->sits, si->fnids);
+ seq_printf(s, " - NATs: %9d\n - SITs: %9d\n",
+ si->nats, si->sits);
+ seq_printf(s, " - free_nids: %9d\n",
+ si->fnids);
seq_puts(s, "\nDistribution of User Blocks:");
seq_puts(s, " [ valid | invalid | free ]\n");
seq_puts(s, " [");
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 2b7c255bcbdf..972fd0ef230f 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -21,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode)
>> PAGE_CACHE_SHIFT;
}
-static unsigned int dir_buckets(unsigned int level)
+static unsigned int dir_buckets(unsigned int level, int dir_level)
{
if (level < MAX_DIR_HASH_DEPTH / 2)
- return 1 << level;
+ return 1 << (level + dir_level);
else
- return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
+ return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1);
}
static unsigned int bucket_blocks(unsigned int level)
@@ -65,13 +65,14 @@ static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}
-static unsigned long dir_block_index(unsigned int level, unsigned int idx)
+static unsigned long dir_block_index(unsigned int level,
+ int dir_level, unsigned int idx)
{
unsigned long i;
unsigned long bidx = 0;
for (i = 0; i < level; i++)
- bidx += dir_buckets(i) * bucket_blocks(i);
+ bidx += dir_buckets(i, dir_level) * bucket_blocks(i);
bidx += idx * bucket_blocks(level);
return bidx;
}
@@ -93,16 +94,21 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
f2fs_hash_t namehash, struct page **res_page)
{
struct f2fs_dir_entry *de;
- unsigned long bit_pos, end_pos, next_pos;
+ unsigned long bit_pos = 0;
struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
- int slots;
+ const void *dentry_bits = &dentry_blk->dentry_bitmap;
+ int max_len = 0;
- bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
- NR_DENTRY_IN_BLOCK, 0);
while (bit_pos < NR_DENTRY_IN_BLOCK) {
+ if (!test_bit_le(bit_pos, dentry_bits)) {
+ if (bit_pos == 0)
+ max_len = 1;
+ else if (!test_bit_le(bit_pos - 1, dentry_bits))
+ max_len++;
+ bit_pos++;
+ continue;
+ }
de = &dentry_blk->dentry[bit_pos];
- slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
-
if (early_match_name(name, namelen, namehash, de)) {
if (!memcmp(dentry_blk->filename[bit_pos],
name, namelen)) {
@@ -110,20 +116,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
goto found;
}
}
- next_pos = bit_pos + slots;
- bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
- NR_DENTRY_IN_BLOCK, next_pos);
- if (bit_pos >= NR_DENTRY_IN_BLOCK)
- end_pos = NR_DENTRY_IN_BLOCK;
- else
- end_pos = bit_pos;
- if (*max_slots < end_pos - next_pos)
- *max_slots = end_pos - next_pos;
+ if (max_len > *max_slots) {
+ *max_slots = max_len;
+ max_len = 0;
+ }
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
}
de = NULL;
kunmap(dentry_page);
found:
+ if (max_len > *max_slots)
+ *max_slots = max_len;
return de;
}
@@ -141,10 +145,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
- nbucket = dir_buckets(level);
+ nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
- bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket);
+ bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+ le32_to_cpu(namehash) % nbucket);
end_block = bidx + nblock;
for (; bidx < end_block; bidx++) {
@@ -248,7 +253,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode)
{
lock_page(page);
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, DATA);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode);
kunmap(page);
@@ -347,14 +352,11 @@ static struct page *init_inode_metadata(struct inode *inode,
err = f2fs_init_security(inode, dir, name, page);
if (err)
goto put_error;
-
- wait_on_page_writeback(page);
} else {
page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
if (IS_ERR(page))
return page;
- wait_on_page_writeback(page);
set_cold_node(inode, page);
}
@@ -372,6 +374,10 @@ static struct page *init_inode_metadata(struct inode *inode,
put_error:
f2fs_put_page(page, 1);
+ /* once the failed inode becomes a bad inode, i_mode is S_IFREG */
+ truncate_inode_pages(&inode->i_data, 0);
+ truncate_blocks(inode, 0);
+ remove_dirty_dir_inode(inode);
error:
remove_inode_page(inode);
return ERR_PTR(err);
@@ -395,9 +401,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
- if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
- update_inode_page(dir);
-
if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
}
@@ -464,10 +467,11 @@ start:
if (level == current_depth)
++current_depth;
- nbucket = dir_buckets(level);
+ nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
- bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
+ bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+ (le32_to_cpu(dentry_hash) % nbucket));
for (block = bidx; block <= (bidx + nblock - 1); block++) {
dentry_page = get_new_data_page(dir, NULL, block, true);
@@ -487,8 +491,9 @@ start:
++level;
goto start;
add_dentry:
- wait_on_page_writeback(dentry_page);
+ f2fs_wait_on_page_writeback(dentry_page, DATA);
+ down_write(&F2FS_I(inode)->i_sem);
page = init_inode_metadata(inode, dir, name);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -511,7 +516,12 @@ add_dentry:
update_parent_metadata(dir, inode, current_depth);
fail:
- clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+ up_write(&F2FS_I(inode)->i_sem);
+
+ if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
+ update_inode_page(dir);
+ clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+ }
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
return err;
@@ -528,13 +538,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
unsigned int bit_pos;
struct address_space *mapping = page->mapping;
struct inode *dir = mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
void *kaddr = page_address(page);
int i;
lock_page(page);
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, DATA);
dentry_blk = (struct f2fs_dentry_block *)kaddr;
bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
@@ -551,6 +560,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
if (inode) {
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+
+ down_write(&F2FS_I(inode)->i_sem);
+
if (S_ISDIR(inode->i_mode)) {
drop_nlink(dir);
update_inode_page(dir);
@@ -561,6 +574,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
drop_nlink(inode);
i_size_write(inode, 0);
}
+ up_write(&F2FS_I(inode)->i_sem);
update_inode_page(inode);
if (inode->i_nlink == 0)
@@ -573,7 +587,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
truncate_hole(dir, page->index, page->index + 1);
clear_page_dirty_for_io(page);
ClearPageUptodate(page);
- dec_page_count(sbi, F2FS_DIRTY_DENTS);
inode_dec_dirty_dents(dir);
}
f2fs_put_page(page, 1);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index fc3c558cb4f3..2ecac8312359 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -40,6 +40,7 @@
#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
#define F2FS_MOUNT_INLINE_XATTR 0x00000080
#define F2FS_MOUNT_INLINE_DATA 0x00000100
+#define F2FS_MOUNT_FLUSH_MERGE 0x00000200
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -88,6 +89,16 @@ enum {
SIT_BITMAP
};
+/*
+ * For CP/NAT/SIT/SSA readahead
+ */
+enum {
+ META_CP,
+ META_NAT,
+ META_SIT,
+ META_SSA
+};
+
/* for the list of orphan inodes */
struct orphan_inode_entry {
struct list_head list; /* list head */
@@ -187,16 +198,20 @@ struct extent_info {
#define FADVISE_COLD_BIT 0x01
#define FADVISE_LOST_PINO_BIT 0x02
+#define DEF_DIR_LEVEL 0
+
struct f2fs_inode_info {
struct inode vfs_inode; /* serve a vfs inode */
unsigned long i_flags; /* keep an inode flags for ioctl */
unsigned char i_advise; /* use to give file attribute hints */
+ unsigned char i_dir_level; /* use for dentry level for large dir */
unsigned int i_current_depth; /* use only in directory structure */
unsigned int i_pino; /* parent inode number */
umode_t i_acl_mode; /* keep file acl mode temporarily */
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
+ struct rw_semaphore i_sem; /* protect fi info */
atomic_t dirty_dents; /* # of dirty dentry pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
@@ -229,6 +244,7 @@ struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
+ unsigned int ram_thresh; /* control the memory footprint */
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -238,6 +254,7 @@ struct f2fs_nm_info {
struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
/* free node ids management */
+ struct radix_tree_root free_nid_root;/* root of the free_nid cache */
struct list_head free_nid_list; /* a list for free nids */
spinlock_t free_nid_list_lock; /* protect free nid list */
unsigned int fcnt; /* the number of free node id */
@@ -300,6 +317,12 @@ enum {
NO_CHECK_TYPE
};
+struct flush_cmd {
+ struct flush_cmd *next;
+ struct completion wait;
+ int ret;
+};
+
struct f2fs_sm_info {
struct sit_info *sit_info; /* whole segment information */
struct free_segmap_info *free_info; /* free segment information */
@@ -328,6 +351,14 @@ struct f2fs_sm_info {
unsigned int ipu_policy; /* in-place-update policy */
unsigned int min_ipu_util; /* in-place-update threshold */
+
+ /* for flush command control */
+ struct task_struct *f2fs_issue_flush; /* flush thread */
+ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
+ struct flush_cmd *issue_list; /* list for command issue */
+ struct flush_cmd *dispatch_list; /* list for command dispatch */
+ spinlock_t issue_lock; /* for issue list lock */
+ struct flush_cmd *issue_tail; /* list tail of issue list */
};
/*
@@ -378,7 +409,7 @@ struct f2fs_bio_info {
struct bio *bio; /* bios to merge */
sector_t last_block_in_bio; /* last block number */
struct f2fs_io_info fio; /* store buffered io info. */
- struct mutex io_mutex; /* mutex for bio */
+ struct rw_semaphore io_rwsem; /* blocking op for bio */
};
struct f2fs_sb_info {
@@ -398,6 +429,7 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info read_io; /* for read bios */
struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
+ struct completion *wait_io; /* for completion bios */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
@@ -407,7 +439,6 @@ struct f2fs_sb_info {
struct mutex node_write; /* locking node writes */
struct mutex writepages; /* mutex for writepages() */
bool por_doing; /* recovery is doing or not */
- bool on_build_free_nids; /* build_free_nids is doing */
wait_queue_head_t cp_wait;
/* for orphan inode management */
@@ -436,6 +467,7 @@ struct f2fs_sb_info {
unsigned int total_valid_node_count; /* valid node block count */
unsigned int total_valid_inode_count; /* valid inode count */
int active_logs; /* # of active logs */
+ int dir_level; /* directory level */
block_t user_block_count; /* # of user blocks */
block_t total_valid_block_count; /* # of valid blocks */
@@ -622,6 +654,11 @@ static inline int F2FS_HAS_BLOCKS(struct inode *inode)
return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
}
+static inline bool f2fs_has_xattr_block(unsigned int ofs)
+{
+ return ofs == XATTR_NODE_OFFSET;
+}
+
static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
struct inode *inode, blkcnt_t count)
{
@@ -661,6 +698,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_inc_dirty_dents(struct inode *inode)
{
+ inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
atomic_inc(&F2FS_I(inode)->dirty_dents);
}
@@ -671,6 +709,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_dec_dirty_dents(struct inode *inode)
{
+ if (!S_ISDIR(inode->i_mode))
+ return;
+
+ dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
atomic_dec(&F2FS_I(inode)->dirty_dents);
}
@@ -679,6 +721,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
return atomic_read(&sbi->nr_pages[count_type]);
}
+static inline int get_dirty_dents(struct inode *inode)
+{
+ return atomic_read(&F2FS_I(inode)->dirty_dents);
+}
+
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
{
unsigned int pages_per_sec = sbi->segs_per_sec *
@@ -689,11 +736,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
{
- block_t ret;
- spin_lock(&sbi->stat_lock);
- ret = sbi->total_valid_block_count;
- spin_unlock(&sbi->stat_lock);
- return ret;
+ return sbi->total_valid_block_count;
}
static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
@@ -789,11 +832,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
{
- unsigned int ret;
- spin_lock(&sbi->stat_lock);
- ret = sbi->total_valid_node_count;
- spin_unlock(&sbi->stat_lock);
- return ret;
+ return sbi->total_valid_node_count;
}
static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
@@ -814,11 +853,7 @@ static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
{
- unsigned int ret;
- spin_lock(&sbi->stat_lock);
- ret = sbi->total_valid_inode_count;
- spin_unlock(&sbi->stat_lock);
- return ret;
+ return sbi->total_valid_inode_count;
}
static inline void f2fs_put_page(struct page *page, int unlock)
@@ -844,9 +879,9 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn)
}
static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
- size_t size, void (*ctor)(void *))
+ size_t size)
{
- return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
+ return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL);
}
static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
@@ -983,24 +1018,28 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
ri->i_inline |= F2FS_INLINE_DATA;
}
+static inline int f2fs_has_inline_xattr(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
+}
+
static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
{
- if (is_inode_flag_set(fi, FI_INLINE_XATTR))
+ if (f2fs_has_inline_xattr(&fi->vfs_inode))
return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
return DEF_ADDRS_PER_INODE;
}
static inline void *inline_xattr_addr(struct page *page)
{
- struct f2fs_inode *ri;
- ri = (struct f2fs_inode *)page_address(page);
+ struct f2fs_inode *ri = F2FS_INODE(page);
return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
F2FS_INLINE_XATTR_ADDRS]);
}
static inline int inline_xattr_size(struct inode *inode)
{
- if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR))
+ if (f2fs_has_inline_xattr(inode))
return F2FS_INLINE_XATTR_ADDRS << 2;
else
return 0;
@@ -1013,8 +1052,7 @@ static inline int f2fs_has_inline_data(struct inode *inode)
static inline void *inline_data_addr(struct page *page)
{
- struct f2fs_inode *ri;
- ri = (struct f2fs_inode *)page_address(page);
+ struct f2fs_inode *ri = F2FS_INODE(page);
return (void *)&(ri->i_addr[1]);
}
@@ -1023,6 +1061,12 @@ static inline int f2fs_readonly(struct super_block *sb)
return sb->s_flags & MS_RDONLY;
}
+static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
+{
+ set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ sbi->sb->s_flags |= MS_RDONLY;
+}
+
#define get_inode_mode(i) \
((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -1048,7 +1092,7 @@ void f2fs_set_inode_flags(struct inode *);
struct inode *f2fs_iget(struct super_block *, unsigned long);
int try_to_free_nats(struct f2fs_sb_info *, int);
void update_inode(struct inode *, struct page *);
-int update_inode_page(struct inode *);
+void update_inode_page(struct inode *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
void f2fs_evict_inode(struct inode *);
@@ -1097,6 +1141,7 @@ struct dnode_of_data;
struct node_info;
int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+bool fsync_mark_done(struct f2fs_sb_info *, nid_t);
void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1115,6 +1160,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
void recover_node_page(struct f2fs_sb_info *, struct page *,
struct f2fs_summary *, struct node_info *, block_t);
+bool recover_xattr_data(struct inode *, struct page *, block_t);
int recover_inode_page(struct f2fs_sb_info *, struct page *);
int restore_node_summary(struct f2fs_sb_info *, unsigned int,
struct f2fs_summary_block *);
@@ -1129,7 +1175,9 @@ void destroy_node_manager_caches(void);
*/
void f2fs_balance_fs(struct f2fs_sb_info *);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
+int f2fs_issue_flush(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
+void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
void clear_prefree_segments(struct f2fs_sb_info *);
int npages_for_summary_flush(struct f2fs_sb_info *);
void allocate_new_segments(struct f2fs_sb_info *);
@@ -1162,6 +1210,7 @@ void destroy_segment_manager_caches(void);
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
@@ -1231,7 +1280,7 @@ struct f2fs_stat_info {
int util_free, util_valid, util_invalid;
int rsvd_segs, overp_segs;
int dirty_count, node_pages, meta_pages;
- int prefree_count, call_count;
+ int prefree_count, call_count, cp_count;
int tot_segs, node_segs, data_segs, free_segs, free_secs;
int tot_blks, data_blks, node_blks;
int curseg[NR_CURSEG_TYPE];
@@ -1248,6 +1297,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
return (struct f2fs_stat_info *)sbi->stat_info;
}
+#define stat_inc_cp_count(si) ((si)->cp_count++)
#define stat_inc_call_count(si) ((si)->call_count++)
#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++)
@@ -1302,6 +1352,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *);
void __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
+#define stat_inc_cp_count(si)
#define stat_inc_call_count(si)
#define stat_inc_bggc_count(si)
#define stat_inc_dirty_dir(sbi)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0dfcef53a6ed..60e7d5448a1d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -76,7 +76,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
trace_f2fs_vm_page_mkwrite(page, DATA);
mapped:
/* fill the page */
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, DATA);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(err);
@@ -84,6 +84,7 @@ out:
static const struct vm_operations_struct f2fs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = f2fs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -111,11 +112,12 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
int ret = 0;
bool need_cp = false;
struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
+ .sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
.for_reclaim = 0,
};
@@ -133,7 +135,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
/* guarantee free sections for fsync */
f2fs_balance_fs(sbi);
- mutex_lock(&inode->i_mutex);
+ down_read(&fi->i_sem);
/*
* Both of fdatasync() and fsync() are able to be recovered from
@@ -150,25 +152,33 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
need_cp = true;
+ up_read(&fi->i_sem);
+
if (need_cp) {
nid_t pino;
- F2FS_I(inode)->xattr_ver = 0;
-
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs(inode->i_sb, 1);
+
+ down_write(&fi->i_sem);
+ F2FS_I(inode)->xattr_ver = 0;
if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
get_parent_ino(inode, &pino)) {
F2FS_I(inode)->i_pino = pino;
file_got_pino(inode);
+ up_write(&fi->i_sem);
mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
goto out;
+ } else {
+ up_write(&fi->i_sem);
}
} else {
/* if there is no written node page, write its inode page */
while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+ if (fsync_mark_done(sbi, inode->i_ino))
+ goto out;
mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
@@ -177,10 +187,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
if (ret)
goto out;
- ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ ret = f2fs_issue_flush(F2FS_SB(inode->i_sb));
}
out:
- mutex_unlock(&inode->i_mutex);
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
return ret;
}
@@ -245,7 +254,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
f2fs_put_page(page, 1);
return;
}
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, DATA);
zero_user(page, offset, PAGE_CACHE_SIZE - offset);
set_page_dirty(page);
f2fs_put_page(page, 1);
@@ -422,7 +431,7 @@ static void fill_zero(struct inode *inode, pgoff_t index,
f2fs_unlock_op(sbi);
if (!IS_ERR(page)) {
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, DATA);
zero_user(page, start, len);
set_page_dirty(page);
f2fs_put_page(page, 1);
@@ -560,6 +569,8 @@ static long f2fs_fallocate(struct file *file, int mode,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
+ mutex_lock(&inode->i_mutex);
+
if (mode & FALLOC_FL_PUNCH_HOLE)
ret = punch_hole(inode, offset, len);
else
@@ -569,6 +580,9 @@ static long f2fs_fallocate(struct file *file, int mode,
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
}
+
+ mutex_unlock(&inode->i_mutex);
+
trace_f2fs_fallocate(inode, mode, offset, len, ret);
return ret;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ea0371e854b4..b90dbe55403a 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -531,15 +531,10 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
set_page_dirty(page);
set_cold_data(page);
} else {
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-
f2fs_wait_on_page_writeback(page, DATA);
- if (clear_page_dirty_for_io(page) &&
- S_ISDIR(inode->i_mode)) {
- dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ if (clear_page_dirty_for_io(page))
inode_dec_dirty_dents(inode);
- }
set_cold_data(page);
do_write_data_page(page, &fio);
clear_cold_data(page);
@@ -701,6 +696,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
gc_more:
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
+ if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+ goto stop;
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
gc_type = FG_GC;
@@ -711,6 +708,11 @@ gc_more:
goto stop;
ret = 0;
+ /* readahead multi ssa blocks those have contiguous address */
+ if (sbi->segs_per_sec > 1)
+ ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
+ META_SSA);
+
for (i = 0; i < sbi->segs_per_sec; i++)
do_garbage_collect(sbi, segno + i, &ilist, gc_type);
@@ -740,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
int __init create_gc_caches(void)
{
winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
- sizeof(struct inode_entry), NULL);
+ sizeof(struct inode_entry));
if (!winode_slab)
return -ENOMEM;
return 0;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 31ee5b164ff9..383db1fabcf4 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -45,8 +45,10 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
}
ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage))
+ if (IS_ERR(ipage)) {
+ unlock_page(page);
return PTR_ERR(ipage);
+ }
zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 28cea76d78c6..ee829d360468 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -107,6 +107,7 @@ static int do_read_inode(struct inode *inode)
fi->flags = 0;
fi->i_advise = ri->i_advise;
fi->i_pino = le32_to_cpu(ri->i_pino);
+ fi->i_dir_level = ri->i_dir_level;
get_extent_info(&fi->ext, ri->i_ext);
get_inline_info(fi, ri);
@@ -204,6 +205,7 @@ void update_inode(struct inode *inode, struct page *node_page)
ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
ri->i_generation = cpu_to_le32(inode->i_generation);
+ ri->i_dir_level = F2FS_I(inode)->i_dir_level;
__set_inode_rdev(inode, ri);
set_cold_node(inode, node_page);
@@ -212,24 +214,29 @@ void update_inode(struct inode *inode, struct page *node_page)
clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
}
-int update_inode_page(struct inode *inode)
+void update_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *node_page;
-
+retry:
node_page = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(node_page))
- return PTR_ERR(node_page);
-
+ if (IS_ERR(node_page)) {
+ int err = PTR_ERR(node_page);
+ if (err == -ENOMEM) {
+ cond_resched();
+ goto retry;
+ } else if (err != -ENOENT) {
+ f2fs_stop_checkpoint(sbi);
+ }
+ return;
+ }
update_inode(inode, node_page);
f2fs_put_page(node_page, 1);
- return 0;
}
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- int ret;
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
inode->i_ino == F2FS_META_INO(sbi))
@@ -243,13 +250,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
* during the urgent cleaning time when runing out of free sections.
*/
f2fs_lock_op(sbi);
- ret = update_inode_page(inode);
+ update_inode_page(inode);
f2fs_unlock_op(sbi);
if (wbc)
f2fs_balance_fs(sbi);
- return ret;
+ return 0;
}
/*
@@ -266,7 +273,7 @@ void f2fs_evict_inode(struct inode *inode)
inode->i_ino == F2FS_META_INO(sbi))
goto no_delete;
- f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents));
+ f2fs_bug_on(get_dirty_dents(inode));
remove_dirty_dir_inode(inode);
if (inode->i_nlink || is_bad_inode(inode))
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 397d459e97bf..a9409d19dfd4 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -207,6 +207,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
inode = f2fs_iget(dir->i_sb, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
+
+ stat_inc_inline_inode(inode);
}
return d_splice_alias(inode, dentry);
@@ -424,12 +426,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+ down_write(&F2FS_I(old_inode)->i_sem);
F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+ up_write(&F2FS_I(old_inode)->i_sem);
new_inode->i_ctime = CURRENT_TIME;
+ down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
drop_nlink(new_inode);
drop_nlink(new_inode);
+ up_write(&F2FS_I(new_inode)->i_sem);
+
mark_inode_dirty(new_inode);
if (!new_inode->i_nlink)
@@ -459,7 +466,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dir != new_dir) {
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
+ down_write(&F2FS_I(old_inode)->i_sem);
F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+ up_write(&F2FS_I(old_inode)->i_sem);
update_inode_page(old_inode);
} else {
kunmap(old_dir_page);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b0649b76eb4f..a161e955c4c8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -21,9 +21,27 @@
#include "segment.h"
#include <trace/events/f2fs.h>
+#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
+
static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
+static inline bool available_free_memory(struct f2fs_nm_info *nm_i, int type)
+{
+ struct sysinfo val;
+ unsigned long mem_size = 0;
+
+ si_meminfo(&val);
+ if (type == FREE_NIDS)
+ mem_size = nm_i->fcnt * sizeof(struct free_nid);
+ else if (type == NAT_ENTRIES)
+ mem_size += nm_i->nat_cnt * sizeof(struct nat_entry);
+ mem_size >>= 12;
+
+ /* give 50:50 memory for free nids and nat caches respectively */
+ return (mem_size < ((val.totalram * nm_i->ram_thresh) >> 11));
+}
+
static void clear_node_page_dirty(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -82,42 +100,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
return dst_page;
}
-/*
- * Readahead NAT pages
- */
-static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
-{
- struct address_space *mapping = META_MAPPING(sbi);
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct page *page;
- pgoff_t index;
- int i;
- struct f2fs_io_info fio = {
- .type = META,
- .rw = READ_SYNC | REQ_META | REQ_PRIO
- };
-
-
- for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
- if (unlikely(nid >= nm_i->max_nid))
- nid = 0;
- index = current_nat_addr(sbi, nid);
-
- page = grab_cache_page(mapping, index);
- if (!page)
- continue;
- if (PageUptodate(page)) {
- mark_page_accessed(page);
- f2fs_put_page(page, 1);
- continue;
- }
- f2fs_submit_page_mbio(sbi, page, index, &fio);
- mark_page_accessed(page);
- f2fs_put_page(page, 0);
- }
- f2fs_submit_merged_bio(sbi, META, READ);
-}
-
static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
{
return radix_tree_lookup(&nm_i->nat_root, n);
@@ -151,6 +133,20 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
return is_cp;
}
+bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct nat_entry *e;
+ bool fsync_done = false;
+
+ read_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, nid);
+ if (e)
+ fsync_done = e->fsync_done;
+ read_unlock(&nm_i->nat_tree_lock);
+ return fsync_done;
+}
+
static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
{
struct nat_entry *new;
@@ -164,6 +160,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
}
memset(new, 0, sizeof(struct nat_entry));
nat_set_nid(new, nid);
+ new->checkpointed = true;
list_add_tail(&new->list, &nm_i->nat_entries);
nm_i->nat_cnt++;
return new;
@@ -185,13 +182,12 @@ retry:
nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
nat_set_ino(e, le32_to_cpu(ne->ino));
nat_set_version(e, ne->version);
- e->checkpointed = true;
}
write_unlock(&nm_i->nat_tree_lock);
}
static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
- block_t new_blkaddr)
+ block_t new_blkaddr, bool fsync_done)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
@@ -205,7 +201,6 @@ retry:
goto retry;
}
e->ni = *ni;
- e->checkpointed = true;
f2fs_bug_on(ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
/*
@@ -217,9 +212,6 @@ retry:
f2fs_bug_on(ni->blk_addr != NULL_ADDR);
}
- if (new_blkaddr == NEW_ADDR)
- e->checkpointed = false;
-
/* sanity check */
f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
@@ -239,6 +231,11 @@ retry:
/* change address */
nat_set_blkaddr(e, new_blkaddr);
__set_nat_cache_dirty(nm_i, e);
+
+ /* update fsync_mark if its inode nat entry is still alive */
+ e = __lookup_nat_cache(nm_i, ni->ino);
+ if (e)
+ e->fsync_done = fsync_done;
write_unlock(&nm_i->nat_tree_lock);
}
@@ -246,7 +243,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD)
+ if (available_free_memory(nm_i, NAT_ENTRIES))
return 0;
write_lock(&nm_i->nat_tree_lock);
@@ -505,7 +502,7 @@ static void truncate_node(struct dnode_of_data *dn)
/* Deallocate node address */
invalidate_blocks(sbi, ni.blk_addr);
dec_valid_node_count(sbi, dn->inode);
- set_node_addr(sbi, &ni, NULL_ADDR);
+ set_node_addr(sbi, &ni, NULL_ADDR, false);
if (dn->nid == dn->inode->i_ino) {
remove_orphan_inode(sbi, dn->nid);
@@ -763,7 +760,7 @@ skip_partial:
f2fs_put_page(page, 1);
goto restart;
}
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, NODE);
ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
set_page_dirty(page);
unlock_page(page);
@@ -852,7 +849,8 @@ struct page *new_node_page(struct dnode_of_data *dn,
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return ERR_PTR(-EPERM);
- page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
+ page = grab_cache_page_write_begin(NODE_MAPPING(sbi),
+ dn->nid, AOP_FLAG_NOFS);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -867,14 +865,14 @@ struct page *new_node_page(struct dnode_of_data *dn,
f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
new_ni = old_ni;
new_ni.ino = dn->inode->i_ino;
- set_node_addr(sbi, &new_ni, NEW_ADDR);
+ set_node_addr(sbi, &new_ni, NEW_ADDR, false);
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(dn->inode, page);
SetPageUptodate(page);
set_page_dirty(page);
- if (ofs == XATTR_NODE_OFFSET)
+ if (f2fs_has_xattr_block(ofs))
F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
dn->node_page = page;
@@ -948,7 +946,8 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
struct page *page;
int err;
repeat:
- page = grab_cache_page(NODE_MAPPING(sbi), nid);
+ page = grab_cache_page_write_begin(NODE_MAPPING(sbi),
+ nid, AOP_FLAG_NOFS);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -959,7 +958,7 @@ repeat:
goto got_it;
lock_page(page);
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
@@ -968,7 +967,6 @@ repeat:
goto repeat;
}
got_it:
- f2fs_bug_on(nid != nid_of_node(page));
mark_page_accessed(page);
return page;
}
@@ -1168,7 +1166,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
continue;
if (ino && ino_of_node(page) == ino) {
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, NODE);
if (TestClearPageError(page))
ret = -EIO;
}
@@ -1201,7 +1199,7 @@ static int f2fs_write_node_page(struct page *page,
if (unlikely(sbi->por_doing))
goto redirty_out;
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, NODE);
/* get old block addr of this node page */
nid = nid_of_node(page);
@@ -1222,7 +1220,7 @@ static int f2fs_write_node_page(struct page *page,
mutex_lock(&sbi->node_write);
set_page_writeback(page);
write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
- set_node_addr(sbi, &ni, new_addr);
+ set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
mutex_unlock(&sbi->node_write);
unlock_page(page);
@@ -1231,35 +1229,32 @@ static int f2fs_write_node_page(struct page *page,
redirty_out:
dec_page_count(sbi, F2FS_DIRTY_NODES);
wbc->pages_skipped++;
+ account_page_redirty(page);
set_page_dirty(page);
return AOP_WRITEPAGE_ACTIVATE;
}
-/*
- * It is very important to gather dirty pages and write at once, so that we can
- * submit a big bio without interfering other data writes.
- * Be default, 512 pages (2MB) * 3 node types, is more reasonable.
- */
-#define COLLECT_DIRTY_NODES 1536
static int f2fs_write_node_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
- long nr_to_write = wbc->nr_to_write;
+ long diff;
/* balancing f2fs's metadata in background */
f2fs_balance_fs_bg(sbi);
/* collect a number of dirty node pages and write together */
- if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES)
- return 0;
+ if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
+ goto skip_write;
- /* if mounting is failed, skip writing node pages */
- wbc->nr_to_write = 3 * max_hw_blocks(sbi);
+ diff = nr_pages_to_write(sbi, NODE, wbc);
wbc->sync_mode = WB_SYNC_NONE;
sync_node_pages(sbi, 0, wbc);
- wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) -
- wbc->nr_to_write);
+ wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
+ return 0;
+
+skip_write:
+ wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
return 0;
}
@@ -1307,22 +1302,17 @@ const struct address_space_operations f2fs_node_aops = {
.releasepage = f2fs_release_node_page,
};
-static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
+static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
+ nid_t n)
{
- struct list_head *this;
- struct free_nid *i;
- list_for_each(this, head) {
- i = list_entry(this, struct free_nid, list);
- if (i->nid == n)
- return i;
- }
- return NULL;
+ return radix_tree_lookup(&nm_i->free_nid_root, n);
}
-static void __del_from_free_nid_list(struct free_nid *i)
+static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
+ struct free_nid *i)
{
list_del(&i->list);
- kmem_cache_free(free_nid_slab, i);
+ radix_tree_delete(&nm_i->free_nid_root, i->nid);
}
static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
@@ -1331,7 +1321,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
struct nat_entry *ne;
bool allocated = false;
- if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
+ if (!available_free_memory(nm_i, FREE_NIDS))
return -1;
/* 0 nid should not be used */
@@ -1342,7 +1332,8 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
/* do not add allocated nids */
read_lock(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
- if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
+ if (ne &&
+ (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR))
allocated = true;
read_unlock(&nm_i->nat_tree_lock);
if (allocated)
@@ -1354,7 +1345,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
i->state = NID_NEW;
spin_lock(&nm_i->free_nid_list_lock);
- if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) {
+ if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
spin_unlock(&nm_i->free_nid_list_lock);
kmem_cache_free(free_nid_slab, i);
return 0;
@@ -1368,13 +1359,19 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
{
struct free_nid *i;
+ bool need_free = false;
+
spin_lock(&nm_i->free_nid_list_lock);
- i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+ i = __lookup_free_nid_list(nm_i, nid);
if (i && i->state == NID_NEW) {
- __del_from_free_nid_list(i);
+ __del_from_free_nid_list(nm_i, i);
nm_i->fcnt--;
+ need_free = true;
}
spin_unlock(&nm_i->free_nid_list_lock);
+
+ if (need_free)
+ kmem_cache_free(free_nid_slab, i);
}
static void scan_nat_page(struct f2fs_nm_info *nm_i,
@@ -1413,7 +1410,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
return;
/* readahead nat pages to be scanned */
- ra_nat_pages(sbi, nid);
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
@@ -1454,7 +1451,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i = NULL;
- struct list_head *this;
retry:
if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid))
return false;
@@ -1462,13 +1458,11 @@ retry:
spin_lock(&nm_i->free_nid_list_lock);
/* We should not use stale free nids created by build_free_nids */
- if (nm_i->fcnt && !sbi->on_build_free_nids) {
+ if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
f2fs_bug_on(list_empty(&nm_i->free_nid_list));
- list_for_each(this, &nm_i->free_nid_list) {
- i = list_entry(this, struct free_nid, list);
+ list_for_each_entry(i, &nm_i->free_nid_list, list)
if (i->state == NID_NEW)
break;
- }
f2fs_bug_on(i->state != NID_NEW);
*nid = i->nid;
@@ -1481,9 +1475,7 @@ retry:
/* Let's scan nat pages and its caches to get free nids */
mutex_lock(&nm_i->build_lock);
- sbi->on_build_free_nids = true;
build_free_nids(sbi);
- sbi->on_build_free_nids = false;
mutex_unlock(&nm_i->build_lock);
goto retry;
}
@@ -1497,10 +1489,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
struct free_nid *i;
spin_lock(&nm_i->free_nid_list_lock);
- i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+ i = __lookup_free_nid_list(nm_i, nid);
f2fs_bug_on(!i || i->state != NID_ALLOC);
- __del_from_free_nid_list(i);
+ __del_from_free_nid_list(nm_i, i);
spin_unlock(&nm_i->free_nid_list_lock);
+
+ kmem_cache_free(free_nid_slab, i);
}
/*
@@ -1510,20 +1504,25 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
+ bool need_free = false;
if (!nid)
return;
spin_lock(&nm_i->free_nid_list_lock);
- i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+ i = __lookup_free_nid_list(nm_i, nid);
f2fs_bug_on(!i || i->state != NID_ALLOC);
- if (nm_i->fcnt > 2 * MAX_FREE_NIDS) {
- __del_from_free_nid_list(i);
+ if (!available_free_memory(nm_i, FREE_NIDS)) {
+ __del_from_free_nid_list(nm_i, i);
+ need_free = true;
} else {
i->state = NID_NEW;
nm_i->fcnt++;
}
spin_unlock(&nm_i->free_nid_list_lock);
+
+ if (need_free)
+ kmem_cache_free(free_nid_slab, i);
}
void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -1531,10 +1530,83 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
block_t new_blkaddr)
{
rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
- set_node_addr(sbi, ni, new_blkaddr);
+ set_node_addr(sbi, ni, new_blkaddr, false);
clear_node_page_dirty(page);
}
+void recover_inline_xattr(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ void *src_addr, *dst_addr;
+ size_t inline_size;
+ struct page *ipage;
+ struct f2fs_inode *ri;
+
+ if (!f2fs_has_inline_xattr(inode))
+ return;
+
+ if (!IS_INODE(page))
+ return;
+
+ ri = F2FS_INODE(page);
+ if (!(ri->i_inline & F2FS_INLINE_XATTR))
+ return;
+
+ ipage = get_node_page(sbi, inode->i_ino);
+ f2fs_bug_on(IS_ERR(ipage));
+
+ dst_addr = inline_xattr_addr(ipage);
+ src_addr = inline_xattr_addr(page);
+ inline_size = inline_xattr_size(inode);
+
+ memcpy(dst_addr, src_addr, inline_size);
+
+ update_inode(inode, ipage);
+ f2fs_put_page(ipage, 1);
+}
+
+bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
+ nid_t new_xnid = nid_of_node(page);
+ struct node_info ni;
+
+ recover_inline_xattr(inode, page);
+
+ if (!f2fs_has_xattr_block(ofs_of_node(page)))
+ return false;
+
+ /* 1: invalidate the previous xattr nid */
+ if (!prev_xnid)
+ goto recover_xnid;
+
+ /* Deallocate node address */
+ get_node_info(sbi, prev_xnid, &ni);
+ f2fs_bug_on(ni.blk_addr == NULL_ADDR);
+ invalidate_blocks(sbi, ni.blk_addr);
+ dec_valid_node_count(sbi, inode);
+ set_node_addr(sbi, &ni, NULL_ADDR, false);
+
+recover_xnid:
+ /* 2: allocate new xattr nid */
+ if (unlikely(!inc_valid_node_count(sbi, inode)))
+ f2fs_bug_on(1);
+
+ remove_free_nid(NM_I(sbi), new_xnid);
+ get_node_info(sbi, new_xnid, &ni);
+ ni.ino = inode->i_ino;
+ set_node_addr(sbi, &ni, NEW_ADDR, false);
+ F2FS_I(inode)->i_xattr_nid = new_xnid;
+
+ /* 3: update xattr blkaddr */
+ refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
+ set_node_addr(sbi, &ni, blkaddr, false);
+
+ update_inode_page(inode);
+ return true;
+}
+
int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
{
struct f2fs_inode *src, *dst;
@@ -1567,7 +1639,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
if (unlikely(!inc_valid_node_count(sbi, NULL)))
WARN_ON(1);
- set_node_addr(sbi, &new_ni, NEW_ADDR);
+ set_node_addr(sbi, &new_ni, NEW_ADDR, false);
inc_valid_inode_count(sbi);
f2fs_put_page(ipage, 1);
return 0;
@@ -1590,15 +1662,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
for (; page_idx < start + nrpages; page_idx++) {
/* alloc temporal page for read node summary info*/
page = alloc_page(GFP_F2FS_ZERO);
- if (!page) {
- struct page *tmp;
- list_for_each_entry_safe(page, tmp, pages, lru) {
- list_del(&page->lru);
- unlock_page(page);
- __free_pages(page, 0);
- }
- return -ENOMEM;
- }
+ if (!page)
+ break;
lock_page(page);
page->index = page_idx;
@@ -1609,7 +1674,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
f2fs_submit_page_mbio(sbi, page, page->index, &fio);
f2fs_submit_merged_bio(sbi, META, READ);
- return 0;
+
+ return page_idx - start;
}
int restore_node_summary(struct f2fs_sb_info *sbi,
@@ -1628,15 +1694,17 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
addr = START_BLOCK(sbi, segno);
sum_entry = &sum->entries[0];
- for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
+ for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
nrpages = min(last_offset - i, bio_blocks);
/* read ahead node pages */
- err = ra_sum_pages(sbi, &page_list, addr, nrpages);
- if (err)
- return err;
+ nrpages = ra_sum_pages(sbi, &page_list, addr, nrpages);
+ if (!nrpages)
+ return -ENOMEM;
list_for_each_entry_safe(page, tmp, &page_list, lru) {
+ if (err)
+ goto skip;
lock_page(page);
if (unlikely(!PageUptodate(page))) {
@@ -1648,9 +1716,9 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
sum_entry->ofs_in_node = 0;
sum_entry++;
}
-
- list_del(&page->lru);
unlock_page(page);
+skip:
+ list_del(&page->lru);
__free_pages(page, 0);
}
}
@@ -1709,7 +1777,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
- struct list_head *cur, *n;
+ struct nat_entry *ne, *cur;
struct page *page = NULL;
struct f2fs_nat_block *nat_blk = NULL;
nid_t start_nid = 0, end_nid = 0;
@@ -1721,18 +1789,17 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
mutex_lock(&curseg->curseg_mutex);
/* 1) flush dirty nat caches */
- list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) {
- struct nat_entry *ne;
+ list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) {
nid_t nid;
struct f2fs_nat_entry raw_ne;
int offset = -1;
block_t new_blkaddr;
- ne = list_entry(cur, struct nat_entry, list);
- nid = nat_get_nid(ne);
-
if (nat_get_blkaddr(ne) == NEW_ADDR)
continue;
+
+ nid = nat_get_nid(ne);
+
if (flushed)
goto to_nat_page;
@@ -1783,16 +1850,12 @@ flush_now:
} else {
write_lock(&nm_i->nat_tree_lock);
__clear_nat_cache_dirty(nm_i, ne);
- ne->checkpointed = true;
write_unlock(&nm_i->nat_tree_lock);
}
}
if (!flushed)
mutex_unlock(&curseg->curseg_mutex);
f2fs_put_page(page, 1);
-
- /* 2) shrink nat caches if necessary */
- try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
}
static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1807,10 +1870,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
/* segment_count_nat includes pair segment so divide to 2. */
nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
- nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
+
+ /* not used nids: 0, node, meta, (and root counted as valid node) */
+ nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3;
nm_i->fcnt = 0;
nm_i->nat_cnt = 0;
+ nm_i->ram_thresh = DEF_RAM_THRESHOLD;
+ INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->nat_entries);
@@ -1864,8 +1931,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
spin_lock(&nm_i->free_nid_list_lock);
list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
f2fs_bug_on(i->state == NID_ALLOC);
- __del_from_free_nid_list(i);
+ __del_from_free_nid_list(nm_i, i);
nm_i->fcnt--;
+ spin_unlock(&nm_i->free_nid_list_lock);
+ kmem_cache_free(free_nid_slab, i);
+ spin_lock(&nm_i->free_nid_list_lock);
}
f2fs_bug_on(nm_i->fcnt);
spin_unlock(&nm_i->free_nid_list_lock);
@@ -1875,11 +1945,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
- for (idx = 0; idx < found; idx++) {
- struct nat_entry *e = natvec[idx];
- nid = nat_get_nid(e) + 1;
- __del_from_nat_cache(nm_i, e);
- }
+ nid = nat_get_nid(natvec[found - 1]) + 1;
+ for (idx = 0; idx < found; idx++)
+ __del_from_nat_cache(nm_i, natvec[idx]);
}
f2fs_bug_on(nm_i->nat_cnt);
write_unlock(&nm_i->nat_tree_lock);
@@ -1892,12 +1960,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
int __init create_node_manager_caches(void)
{
nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
- sizeof(struct nat_entry), NULL);
+ sizeof(struct nat_entry));
if (!nat_entry_slab)
return -ENOMEM;
free_nid_slab = f2fs_kmem_cache_create("free_nid",
- sizeof(struct free_nid), NULL);
+ sizeof(struct free_nid));
if (!free_nid_slab) {
kmem_cache_destroy(nat_entry_slab);
return -ENOMEM;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index c4c79885c993..5decc1a375f0 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -17,14 +17,11 @@
/* # of pages to perform readahead before building free nids */
#define FREE_NID_PAGES 4
-/* maximum # of free node ids to produce during build_free_nids */
-#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
-
/* maximum readahead size for node during getting data blocks */
#define MAX_RA_NODE 128
-/* maximum cached nat entries to manage memory footprint */
-#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK)
+/* control the memory footprint threshold (10MB per 1GB ram) */
+#define DEF_RAM_THRESHOLD 10
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
@@ -45,6 +42,7 @@ struct node_info {
struct nat_entry {
struct list_head list; /* for clean or dirty nat list */
bool checkpointed; /* whether it is checkpointed or not */
+ bool fsync_done; /* whether the latest node has fsync mark */
struct node_info ni; /* in-memory node information */
};
@@ -58,9 +56,15 @@ struct nat_entry {
#define nat_set_version(nat, v) (nat->ni.version = v)
#define __set_nat_cache_dirty(nm_i, ne) \
- list_move_tail(&ne->list, &nm_i->dirty_nat_entries);
+ do { \
+ ne->checkpointed = false; \
+ list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
+ } while (0);
#define __clear_nat_cache_dirty(nm_i, ne) \
- list_move_tail(&ne->list, &nm_i->nat_entries);
+ do { \
+ ne->checkpointed = true; \
+ list_move_tail(&ne->list, &nm_i->nat_entries); \
+ } while (0);
#define inc_node_version(version) (++version)
static inline void node_info_from_raw_nat(struct node_info *ni,
@@ -71,6 +75,11 @@ static inline void node_info_from_raw_nat(struct node_info *ni,
ni->version = raw_ne->version;
}
+enum nid_type {
+ FREE_NIDS, /* indicates the free nid list */
+ NAT_ENTRIES /* indicates the cached nat entry */
+};
+
/*
* For free nid mangement
*/
@@ -236,7 +245,7 @@ static inline bool IS_DNODE(struct page *node_page)
{
unsigned int ofs = ofs_of_node(node_page);
- if (ofs == XATTR_NODE_OFFSET)
+ if (f2fs_has_xattr_block(ofs))
return false;
if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 976a7a934db5..b1ae89f0f44e 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -27,14 +27,12 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi)
static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
nid_t ino)
{
- struct list_head *this;
struct fsync_inode_entry *entry;
- list_for_each(this, head) {
- entry = list_entry(this, struct fsync_inode_entry, list);
+ list_for_each_entry(entry, head, list)
if (entry->inode->i_ino == ino)
return entry;
- }
+
return NULL;
}
@@ -136,7 +134,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
- blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff;
+ blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
/* read node page */
page = alloc_page(GFP_F2FS_ZERO);
@@ -218,13 +216,12 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
{
struct seg_entry *sentry;
unsigned int segno = GET_SEGNO(sbi, blkaddr);
- unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
- (sbi->blocks_per_seg - 1);
+ unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+ struct f2fs_summary_block *sum_node;
struct f2fs_summary sum;
+ struct page *sum_page, *node_page;
nid_t ino, nid;
- void *kaddr;
struct inode *inode;
- struct page *node_page;
unsigned int offset;
block_t bidx;
int i;
@@ -238,18 +235,15 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
struct curseg_info *curseg = CURSEG_I(sbi, i);
if (curseg->segno == segno) {
sum = curseg->sum_blk->entries[blkoff];
- break;
+ goto got_it;
}
}
- if (i > CURSEG_COLD_DATA) {
- struct page *sum_page = get_sum_page(sbi, segno);
- struct f2fs_summary_block *sum_node;
- kaddr = page_address(sum_page);
- sum_node = (struct f2fs_summary_block *)kaddr;
- sum = sum_node->entries[blkoff];
- f2fs_put_page(sum_page, 1);
- }
+ sum_page = get_sum_page(sbi, segno);
+ sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+ sum = sum_node->entries[blkoff];
+ f2fs_put_page(sum_page, 1);
+got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
if (dn->inode->i_ino == nid) {
@@ -301,6 +295,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (recover_inline_data(inode, page))
goto out;
+ if (recover_xattr_data(inode, page, blkaddr))
+ goto out;
+
start = start_bidx_of_node(ofs_of_node(page), fi);
if (IS_INODE(page))
end = start + ADDRS_PER_INODE(fi);
@@ -317,7 +314,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
goto out;
}
- wait_on_page_writeback(dn.node_page);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE);
get_node_info(sbi, dn.nid, &ni);
f2fs_bug_on(ni.ino != ino_of_node(page));
@@ -437,7 +434,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
bool need_writecp = false;
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
- sizeof(struct fsync_inode_entry), NULL);
+ sizeof(struct fsync_inode_entry));
if (!fsync_entry_slab)
return -ENOMEM;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7caac5f2ca9e..085f548be7a3 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -13,6 +13,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/prefetch.h>
+#include <linux/kthread.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
@@ -24,6 +25,7 @@
#define __reverse_ffz(x) __reverse_ffs(~(x))
static struct kmem_cache *discard_entry_slab;
+static struct kmem_cache *flush_cmd_slab;
/*
* __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -195,6 +197,73 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
f2fs_sync_fs(sbi->sb, true);
}
+static int issue_flush_thread(void *data)
+{
+ struct f2fs_sb_info *sbi = data;
+ struct f2fs_sm_info *sm_i = SM_I(sbi);
+ wait_queue_head_t *q = &sm_i->flush_wait_queue;
+repeat:
+ if (kthread_should_stop())
+ return 0;
+
+ spin_lock(&sm_i->issue_lock);
+ if (sm_i->issue_list) {
+ sm_i->dispatch_list = sm_i->issue_list;
+ sm_i->issue_list = sm_i->issue_tail = NULL;
+ }
+ spin_unlock(&sm_i->issue_lock);
+
+ if (sm_i->dispatch_list) {
+ struct bio *bio = bio_alloc(GFP_NOIO, 0);
+ struct flush_cmd *cmd, *next;
+ int ret;
+
+ bio->bi_bdev = sbi->sb->s_bdev;
+ ret = submit_bio_wait(WRITE_FLUSH, bio);
+
+ for (cmd = sm_i->dispatch_list; cmd; cmd = next) {
+ cmd->ret = ret;
+ next = cmd->next;
+ complete(&cmd->wait);
+ }
+ sm_i->dispatch_list = NULL;
+ }
+
+ wait_event_interruptible(*q, kthread_should_stop() || sm_i->issue_list);
+ goto repeat;
+}
+
+int f2fs_issue_flush(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_sm_info *sm_i = SM_I(sbi);
+ struct flush_cmd *cmd;
+ int ret;
+
+ if (!test_opt(sbi, FLUSH_MERGE))
+ return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
+
+ cmd = f2fs_kmem_cache_alloc(flush_cmd_slab, GFP_ATOMIC);
+ cmd->next = NULL;
+ cmd->ret = 0;
+ init_completion(&cmd->wait);
+
+ spin_lock(&sm_i->issue_lock);
+ if (sm_i->issue_list)
+ sm_i->issue_tail->next = cmd;
+ else
+ sm_i->issue_list = cmd;
+ sm_i->issue_tail = cmd;
+ spin_unlock(&sm_i->issue_lock);
+
+ if (!sm_i->dispatch_list)
+ wake_up(&sm_i->flush_wait_queue);
+
+ wait_for_completion(&cmd->wait);
+ ret = cmd->ret;
+ kmem_cache_free(flush_cmd_slab, cmd);
+ return ret;
+}
+
static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
enum dirty_type dirty_type)
{
@@ -340,8 +409,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
void clear_prefree_segments(struct f2fs_sb_info *sbi)
{
struct list_head *head = &(SM_I(sbi)->discard_list);
- struct list_head *this, *next;
- struct discard_entry *entry;
+ struct discard_entry *entry, *this;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
unsigned int total_segs = TOTAL_SEGS(sbi);
@@ -370,8 +438,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
mutex_unlock(&dirty_i->seglist_lock);
/* send small discards */
- list_for_each_safe(this, next, head) {
- entry = list_entry(this, struct discard_entry, list);
+ list_for_each_entry_safe(entry, this, head, list) {
f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
list_del(&entry->list);
SM_I(sbi)->nr_discards -= entry->len;
@@ -405,7 +472,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
se = get_seg_entry(sbi, segno);
new_vblocks = se->valid_blocks + del;
- offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1);
+ offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) ||
(new_vblocks > sbi->blocks_per_seg)));
@@ -434,12 +501,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
get_sec_entry(sbi, segno)->valid_blocks += del;
}
-static void refresh_sit_entry(struct f2fs_sb_info *sbi,
- block_t old_blkaddr, block_t new_blkaddr)
+void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new)
{
- update_sit_entry(sbi, new_blkaddr, 1);
- if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
- update_sit_entry(sbi, old_blkaddr, -1);
+ update_sit_entry(sbi, new, 1);
+ if (GET_SEGNO(sbi, old) != NULL_SEGNO)
+ update_sit_entry(sbi, old, -1);
+
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old));
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, new));
}
void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
@@ -881,17 +950,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
stat_inc_block_count(sbi, curseg);
+ if (!__has_curseg_space(sbi, type))
+ sit_i->s_ops->allocate_segment(sbi, type, false);
/*
* SIT information should be updated before segment allocation,
* since SSR needs latest valid block information.
*/
refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
-
- if (!__has_curseg_space(sbi, type))
- sit_i->s_ops->allocate_segment(sbi, type, false);
-
locate_dirty_segment(sbi, old_cursegno);
- locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+
mutex_unlock(&sit_i->sentry_lock);
if (page && IS_NODESEG(type))
@@ -987,14 +1054,11 @@ void recover_data_page(struct f2fs_sb_info *sbi,
change_curseg(sbi, type, true);
}
- curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
- (sbi->blocks_per_seg - 1);
+ curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
-
locate_dirty_segment(sbi, old_cursegno);
- locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
mutex_unlock(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
@@ -1028,8 +1092,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
curseg->next_segno = segno;
change_curseg(sbi, type, true);
}
- curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
- (sbi->blocks_per_seg - 1);
+ curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
/* change the current log to the next block addr in advance */
@@ -1037,28 +1100,50 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
curseg->next_segno = next_segno;
change_curseg(sbi, type, true);
}
- curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) &
- (sbi->blocks_per_seg - 1);
+ curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
/* rewrite node page */
set_page_writeback(page);
f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
f2fs_submit_merged_bio(sbi, NODE, WRITE);
refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
-
locate_dirty_segment(sbi, old_cursegno);
- locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
mutex_unlock(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
}
+static inline bool is_merged_page(struct f2fs_sb_info *sbi,
+ struct page *page, enum page_type type)
+{
+ enum page_type btype = PAGE_TYPE_OF_BIO(type);
+ struct f2fs_bio_info *io = &sbi->write_io[btype];
+ struct bio_vec *bvec;
+ int i;
+
+ down_read(&io->io_rwsem);
+ if (!io->bio)
+ goto out;
+
+ bio_for_each_segment_all(bvec, io->bio, i) {
+ if (page == bvec->bv_page) {
+ up_read(&io->io_rwsem);
+ return true;
+ }
+ }
+
+out:
+ up_read(&io->io_rwsem);
+ return false;
+}
+
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type)
{
struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
if (PageWriteback(page)) {
- f2fs_submit_merged_bio(sbi, type, WRITE);
+ if (is_merged_page(sbi, page, type))
+ f2fs_submit_merged_bio(sbi, type, WRITE);
wait_on_page_writeback(page);
}
}
@@ -1167,9 +1252,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
ns->ofs_in_node = 0;
}
} else {
- if (restore_node_summary(sbi, segno, sum)) {
+ int err;
+
+ err = restore_node_summary(sbi, segno, sum);
+ if (err) {
f2fs_put_page(new, 1);
- return -EINVAL;
+ return err;
}
}
}
@@ -1190,6 +1278,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
{
int type = CURSEG_HOT_DATA;
+ int err;
if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
/* restore for compacted data summary */
@@ -1198,9 +1287,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
type = CURSEG_HOT_NODE;
}
- for (; type <= CURSEG_COLD_NODE; type++)
- if (read_normal_summaries(sbi, type))
- return -EINVAL;
+ for (; type <= CURSEG_COLD_NODE; type++) {
+ err = read_normal_summaries(sbi, type);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -1583,47 +1675,6 @@ static int build_curseg(struct f2fs_sb_info *sbi)
return restore_curseg_summaries(sbi);
}
-static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages)
-{
- struct address_space *mapping = META_MAPPING(sbi);
- struct page *page;
- block_t blk_addr, prev_blk_addr = 0;
- int sit_blk_cnt = SIT_BLK_CNT(sbi);
- int blkno = start;
- struct f2fs_io_info fio = {
- .type = META,
- .rw = READ_SYNC | REQ_META | REQ_PRIO
- };
-
- for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) {
-
- blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK);
-
- if (blkno != start && prev_blk_addr + 1 != blk_addr)
- break;
- prev_blk_addr = blk_addr;
-repeat:
- page = grab_cache_page(mapping, blk_addr);
- if (!page) {
- cond_resched();
- goto repeat;
- }
- if (PageUptodate(page)) {
- mark_page_accessed(page);
- f2fs_put_page(page, 1);
- continue;
- }
-
- f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
-
- mark_page_accessed(page);
- f2fs_put_page(page, 0);
- }
-
- f2fs_submit_merged_bio(sbi, META, READ);
- return blkno - start;
-}
-
static void build_sit_entries(struct f2fs_sb_info *sbi)
{
struct sit_info *sit_i = SIT_I(sbi);
@@ -1635,7 +1686,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
do {
- readed = ra_sit_pages(sbi, start_blk, nrpages);
+ readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
start = start_blk * sit_i->sents_per_block;
end = (start_blk + readed) * sit_i->sents_per_block;
@@ -1781,6 +1832,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
struct f2fs_sm_info *sm_info;
int err;
@@ -1799,7 +1851,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
- sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS;
+ sm_info->rec_prefree_segments = sm_info->main_segments *
+ DEF_RECLAIM_PREFREE_SEGMENTS / 100;
sm_info->ipu_policy = F2FS_IPU_DISABLE;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
@@ -1807,6 +1860,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->nr_discards = 0;
sm_info->max_discards = 0;
+ if (test_opt(sbi, FLUSH_MERGE)) {
+ spin_lock_init(&sm_info->issue_lock);
+ init_waitqueue_head(&sm_info->flush_wait_queue);
+
+ sm_info->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
+ "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
+ if (IS_ERR(sm_info->f2fs_issue_flush))
+ return PTR_ERR(sm_info->f2fs_issue_flush);
+ }
+
err = build_sit_info(sbi);
if (err)
return err;
@@ -1915,6 +1978,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
struct f2fs_sm_info *sm_info = SM_I(sbi);
if (!sm_info)
return;
+ if (sm_info->f2fs_issue_flush)
+ kthread_stop(sm_info->f2fs_issue_flush);
destroy_dirty_segmap(sbi);
destroy_curseg(sbi);
destroy_free_segmap(sbi);
@@ -1926,13 +1991,20 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
int __init create_segment_manager_caches(void)
{
discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
- sizeof(struct discard_entry), NULL);
+ sizeof(struct discard_entry));
if (!discard_entry_slab)
return -ENOMEM;
+ flush_cmd_slab = f2fs_kmem_cache_create("flush_command",
+ sizeof(struct flush_cmd));
+ if (!flush_cmd_slab) {
+ kmem_cache_destroy(discard_entry_slab);
+ return -ENOMEM;
+ }
return 0;
}
void destroy_segment_manager_caches(void)
{
kmem_cache_destroy(discard_entry_slab);
+ kmem_cache_destroy(flush_cmd_slab);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5731682d7516..7091204680f4 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -14,7 +14,7 @@
#define NULL_SEGNO ((unsigned int)(~0))
#define NULL_SECNO ((unsigned int)(~0))
-#define DEF_RECLAIM_PREFREE_SEGMENTS 100 /* 200MB of prefree segments */
+#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */
/* L: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
@@ -57,6 +57,9 @@
((blk_addr) - SM_I(sbi)->seg0_blkaddr)
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
+#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
+
#define GET_SEGNO(sbi, blk_addr) \
(((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \
NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
@@ -377,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
static inline block_t written_block_count(struct f2fs_sb_info *sbi)
{
- struct sit_info *sit_i = SIT_I(sbi);
- block_t vblocks;
-
- mutex_lock(&sit_i->sentry_lock);
- vblocks = sit_i->written_valid_blocks;
- mutex_unlock(&sit_i->sentry_lock);
-
- return vblocks;
+ return SIT_I(sbi)->written_valid_blocks;
}
static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
{
- struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int free_segs;
-
- read_lock(&free_i->segmap_lock);
- free_segs = free_i->free_segments;
- read_unlock(&free_i->segmap_lock);
-
- return free_segs;
+ return FREE_I(sbi)->free_segments;
}
static inline int reserved_segments(struct f2fs_sb_info *sbi)
@@ -406,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi)
static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
{
- struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int free_secs;
-
- read_lock(&free_i->segmap_lock);
- free_secs = free_i->free_sections;
- read_unlock(&free_i->segmap_lock);
-
- return free_secs;
+ return FREE_I(sbi)->free_sections;
}
static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
@@ -682,3 +664,46 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
struct request_queue *q = bdev_get_queue(bdev);
return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
}
+
+/*
+ * It is very important to gather dirty pages and write at once, so that we can
+ * submit a big bio without interfering other data writes.
+ * By default, 512 pages for directory data,
+ * 512 pages (2MB) * 3 for three types of nodes, and
+ * max_bio_blocks for meta are set.
+ */
+static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
+{
+ if (type == DATA)
+ return sbi->blocks_per_seg;
+ else if (type == NODE)
+ return 3 * sbi->blocks_per_seg;
+ else if (type == META)
+ return MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ else
+ return 0;
+}
+
+/*
+ * When writing pages, it'd better align nr_to_write for segment size.
+ */
+static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
+ struct writeback_control *wbc)
+{
+ long nr_to_write, desired;
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ return 0;
+
+ nr_to_write = wbc->nr_to_write;
+
+ if (type == DATA)
+ desired = 4096;
+ else if (type == NODE)
+ desired = 3 * max_hw_blocks(sbi);
+ else
+ desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+
+ wbc->nr_to_write = desired;
+ return desired - nr_to_write;
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 856bdf994c0a..c756923a7302 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -51,6 +51,7 @@ enum {
Opt_disable_ext_identify,
Opt_inline_xattr,
Opt_inline_data,
+ Opt_flush_merge,
Opt_err,
};
@@ -67,6 +68,7 @@ static match_table_t f2fs_tokens = {
{Opt_disable_ext_identify, "disable_ext_identify"},
{Opt_inline_xattr, "inline_xattr"},
{Opt_inline_data, "inline_data"},
+ {Opt_flush_merge, "flush_merge"},
{Opt_err, NULL},
};
@@ -74,6 +76,7 @@ static match_table_t f2fs_tokens = {
enum {
GC_THREAD, /* struct f2fs_gc_thread */
SM_INFO, /* struct f2fs_sm_info */
+ NM_INFO, /* struct f2fs_nm_info */
F2FS_SBI, /* struct f2fs_sb_info */
};
@@ -92,6 +95,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
return (unsigned char *)sbi->gc_thread;
else if (struct_type == SM_INFO)
return (unsigned char *)SM_I(sbi);
+ else if (struct_type == NM_INFO)
+ return (unsigned char *)NM_I(sbi);
else if (struct_type == F2FS_SBI)
return (unsigned char *)sbi;
return NULL;
@@ -183,7 +188,9 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -196,6 +203,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
ATTR_LIST(max_victim_search),
+ ATTR_LIST(dir_level),
+ ATTR_LIST(ram_thresh),
NULL,
};
@@ -256,9 +265,9 @@ static int parse_options(struct super_block *sb, char *options)
if (!name)
return -ENOMEM;
- if (!strncmp(name, "on", 2))
+ if (strlen(name) == 2 && !strncmp(name, "on", 2))
set_opt(sbi, BG_GC);
- else if (!strncmp(name, "off", 3))
+ else if (strlen(name) == 3 && !strncmp(name, "off", 3))
clear_opt(sbi, BG_GC);
else {
kfree(name);
@@ -327,6 +336,9 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_inline_data:
set_opt(sbi, INLINE_DATA);
break;
+ case Opt_flush_merge:
+ set_opt(sbi, FLUSH_MERGE);
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -353,12 +365,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
fi->i_current_depth = 1;
fi->i_advise = 0;
rwlock_init(&fi->ext.ext_lock);
+ init_rwsem(&fi->i_sem);
set_inode_flag(fi, FI_NEW_INODE);
if (test_opt(F2FS_SB(sb), INLINE_XATTR))
set_inode_flag(fi, FI_INLINE_XATTR);
+ /* Will be used by directory only */
+ fi->i_dir_level = F2FS_SB(sb)->dir_level;
+
return &fi->vfs_inode;
}
@@ -526,6 +542,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",disable_ext_identify");
if (test_opt(sbi, INLINE_DATA))
seq_puts(seq, ",inline_data");
+ if (test_opt(sbi, FLUSH_MERGE))
+ seq_puts(seq, ",flush_merge");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
return 0;
@@ -539,13 +557,22 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
le32_to_cpu(sbi->raw_super->segment_count_main);
int i;
+ seq_puts(seq, "format: segment_type|valid_blocks\n"
+ "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
+
for (i = 0; i < total_segs; i++) {
- seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1));
- if (i != 0 && (i % 10) == 0)
- seq_puts(seq, "\n");
+ struct seg_entry *se = get_seg_entry(sbi, i);
+
+ if ((i % 10) == 0)
+ seq_printf(seq, "%-5d", i);
+ seq_printf(seq, "%d|%-3u", se->type,
+ get_valid_blocks(sbi, i, 1));
+ if ((i % 10) == 9 || i == (total_segs - 1))
+ seq_putc(seq, '\n');
else
- seq_puts(seq, " ");
+ seq_putc(seq, ' ');
}
+
return 0;
}
@@ -640,6 +667,8 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
if (unlikely(ino < F2FS_ROOT_INO(sbi)))
return ERR_PTR(-ESTALE);
+ if (unlikely(ino >= NM_I(sbi)->max_nid))
+ return ERR_PTR(-ESTALE);
/*
* f2fs_iget isn't quite right if the inode is currently unallocated!
@@ -787,6 +816,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
for (i = 0; i < NR_COUNT_TYPE; i++)
atomic_set(&sbi->nr_pages[i], 0);
+
+ sbi->dir_level = DEF_DIR_LEVEL;
}
/*
@@ -898,11 +929,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
sbi->por_doing = false;
spin_lock_init(&sbi->stat_lock);
- mutex_init(&sbi->read_io.io_mutex);
+ init_rwsem(&sbi->read_io.io_rwsem);
sbi->read_io.sbi = sbi;
sbi->read_io.bio = NULL;
for (i = 0; i < NR_PAGE_TYPE; i++) {
- mutex_init(&sbi->write_io[i].io_mutex);
+ init_rwsem(&sbi->write_io[i].io_rwsem);
sbi->write_io[i].sbi = sbi;
sbi->write_io[i].bio = NULL;
}
@@ -991,28 +1022,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
goto free_root_inode;
}
- /* recover fsynced data */
- if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
- err = recover_fsync_data(sbi);
- if (err)
- f2fs_msg(sb, KERN_ERR,
- "Cannot recover all fsync data errno=%ld", err);
- }
-
- /*
- * If filesystem is not mounted as read-only then
- * do start the gc_thread.
- */
- if (!(sb->s_flags & MS_RDONLY)) {
- /* After POR, we can run background GC thread.*/
- err = start_gc_thread(sbi);
- if (err)
- goto free_gc;
- }
-
err = f2fs_build_stats(sbi);
if (err)
- goto free_gc;
+ goto free_root_inode;
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -1034,17 +1046,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
"%s", sb->s_id);
if (err)
- goto fail;
+ goto free_proc;
+
+ /* recover fsynced data */
+ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+ err = recover_fsync_data(sbi);
+ if (err)
+ f2fs_msg(sb, KERN_ERR,
+ "Cannot recover all fsync data errno=%ld", err);
+ }
+ /*
+ * If filesystem is not mounted as read-only then
+ * do start the gc_thread.
+ */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ /* After POR, we can run background GC thread.*/
+ err = start_gc_thread(sbi);
+ if (err)
+ goto free_kobj;
+ }
return 0;
-fail:
+
+free_kobj:
+ kobject_del(&sbi->s_kobj);
+free_proc:
if (sbi->s_proc) {
remove_proc_entry("segment_info", sbi->s_proc);
remove_proc_entry(sb->s_id, f2fs_proc_root);
}
f2fs_destroy_stats(sbi);
-free_gc:
- stop_gc_thread(sbi);
free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
@@ -1084,7 +1115,7 @@ MODULE_ALIAS_FS("f2fs");
static int __init init_inodecache(void)
{
f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
- sizeof(struct f2fs_inode_info), NULL);
+ sizeof(struct f2fs_inode_info));
if (!f2fs_inode_cachep)
return -ENOMEM;
return 0;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 89d0422a91a8..503c2451131e 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -275,7 +275,7 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
inline_size = inline_xattr_size(inode);
- txattr_addr = kzalloc(inline_size + size, GFP_KERNEL);
+ txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
if (!txattr_addr)
return NULL;
@@ -407,6 +407,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
if (name == NULL)
return -EINVAL;
name_len = strlen(name);
+ if (name_len > F2FS_NAME_LEN)
+ return -ERANGE;
base_addr = read_all_xattrs(inode, NULL);
if (!base_addr)
@@ -590,7 +592,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
f2fs_balance_fs(sbi);
f2fs_lock_op(sbi);
+ /* protect xattr_ver */
+ down_write(&F2FS_I(inode)->i_sem);
err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage);
+ up_write(&F2FS_I(inode)->i_sem);
f2fs_unlock_op(sbi);
return err;
diff --git a/fs/file.c b/fs/file.c
index b61293badfb1..8f294cfac697 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -25,7 +25,10 @@
int sysctl_nr_open __read_mostly = 1024*1024;
int sysctl_nr_open_min = BITS_PER_LONG;
-int sysctl_nr_open_max = 1024 * 1024; /* raised later */
+/* our max() is unusable in constant expressions ;-/ */
+#define __const_max(x, y) ((x) < (y) ? (x) : (y))
+int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
+ -BITS_PER_LONG;
static void *alloc_fdmem(size_t size)
{
@@ -429,12 +432,6 @@ void exit_files(struct task_struct *tsk)
}
}
-void __init files_defer_init(void)
-{
- sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
- -BITS_PER_LONG;
-}
-
struct files_struct init_files = {
.count = ATOMIC_INIT(1),
.fdt = &init_files.fdtab,
diff --git a/fs/file_table.c b/fs/file_table.c
index 01071c4d752e..a374f5033e97 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -52,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head)
static inline void file_free(struct file *f)
{
percpu_counter_dec(&nr_files);
- file_check_state(f);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}
@@ -178,47 +177,12 @@ struct file *alloc_file(struct path *path, fmode_t mode,
file->f_mapping = path->dentry->d_inode->i_mapping;
file->f_mode = mode;
file->f_op = fop;
-
- /*
- * These mounts don't really matter in practice
- * for r/o bind mounts. They aren't userspace-
- * visible. We do this for consistency, and so
- * that we can do debugging checks at __fput()
- */
- if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) {
- file_take_write(file);
- WARN_ON(mnt_clone_write(path->mnt));
- }
if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(path->dentry->d_inode);
return file;
}
EXPORT_SYMBOL(alloc_file);
-/**
- * drop_file_write_access - give up ability to write to a file
- * @file: the file to which we will stop writing
- *
- * This is a central place which will give up the ability
- * to write to @file, along with access to write through
- * its vfsmount.
- */
-static void drop_file_write_access(struct file *file)
-{
- struct vfsmount *mnt = file->f_path.mnt;
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
-
- put_write_access(inode);
-
- if (special_file(inode->i_mode))
- return;
- if (file_check_writeable(file) != 0)
- return;
- __mnt_drop_write(mnt);
- file_release_write(file);
-}
-
/* the real guts of fput() - releasing the last reference to file
*/
static void __fput(struct file *file)
@@ -253,8 +217,10 @@ static void __fput(struct file *file)
put_pid(file->f_owner.pid);
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_dec(inode);
- if (file->f_mode & FMODE_WRITE)
- drop_file_write_access(file);
+ if (file->f_mode & FMODE_WRITER) {
+ put_write_access(inode);
+ __mnt_drop_write(mnt);
+ }
file->f_path.dentry = NULL;
file->f_path.mnt = NULL;
file->f_inode = NULL;
@@ -359,6 +325,5 @@ void __init files_init(unsigned long mempages)
n = (mempages * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
- files_defer_init();
percpu_counter_init(&nr_files, 0);
}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 23e363f38302..13b691a8a7d2 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -569,7 +569,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev,
return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
}
-static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL);
+static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL);
static ssize_t cuse_class_abort_store(struct device *dev,
struct device_attribute *attr,
@@ -580,7 +580,7 @@ static ssize_t cuse_class_abort_store(struct device *dev,
fuse_abort_conn(&cc->fc);
return count;
}
-static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store);
+static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
static struct attribute *cuse_class_dev_attrs[] = {
&dev_attr_waiting.attr,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0a648bb455ae..aac71ce373e4 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -667,15 +667,15 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
struct pipe_buffer *buf = cs->currbuf;
if (!cs->write) {
- buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
+ kunmap_atomic(cs->mapaddr);
} else {
- kunmap(buf->page);
+ kunmap_atomic(cs->mapaddr);
buf->len = PAGE_SIZE - cs->len;
}
cs->currbuf = NULL;
cs->mapaddr = NULL;
} else if (cs->mapaddr) {
- kunmap(cs->pg);
+ kunmap_atomic(cs->mapaddr);
if (cs->write) {
flush_dcache_page(cs->pg);
set_page_dirty_lock(cs->pg);
@@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
BUG_ON(!cs->nr_segs);
cs->currbuf = buf;
- cs->mapaddr = buf->ops->map(cs->pipe, buf, 0);
+ cs->mapaddr = kmap_atomic(buf->page);
cs->len = buf->len;
cs->buf = cs->mapaddr + buf->offset;
cs->pipebufs++;
@@ -726,7 +726,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
buf->len = 0;
cs->currbuf = buf;
- cs->mapaddr = kmap(page);
+ cs->mapaddr = kmap_atomic(page);
cs->buf = cs->mapaddr;
cs->len = PAGE_SIZE;
cs->pipebufs++;
@@ -745,7 +745,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
return err;
BUG_ON(err != 1);
offset = cs->addr % PAGE_SIZE;
- cs->mapaddr = kmap(cs->pg);
+ cs->mapaddr = kmap_atomic(cs->pg);
cs->buf = cs->mapaddr + offset;
cs->len = min(PAGE_SIZE - offset, cs->seglen);
cs->seglen -= cs->len;
@@ -874,7 +874,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
out_fallback_unlock:
unlock_page(newpage);
out_fallback:
- cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+ cs->mapaddr = kmap_atomic(buf->page);
cs->buf = cs->mapaddr + buf->offset;
err = lock_request(cs->fc, cs->req);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 65df7d8be4f5..13f8bdec5110 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1086,9 +1086,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- pagefault_disable();
tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
- pagefault_enable();
flush_dcache_page(page);
mark_page_accessed(page);
@@ -1237,8 +1235,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
goto out;
if (file->f_flags & O_DIRECT) {
- written = generic_file_direct_write(iocb, iov, &nr_segs,
- pos, &iocb->ki_pos,
+ written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
count, ocount);
if (written < 0 || written == count)
goto out;
@@ -2117,6 +2114,7 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static const struct vm_operations_struct fuse_file_vm_ops = {
.close = fuse_vma_close,
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = fuse_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6c794085abac..80d67253623c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -494,6 +494,7 @@ out:
static const struct vm_operations_struct gfs2_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = gfs2_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6af66ee56390..4556ce1af5b0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -93,7 +93,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
sizeof(struct iso_inode_info),
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 16a5047903a6..406d9cc84ba8 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -33,7 +33,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
unsigned char *cpage_out,
uint32_t *sourcelen, uint32_t *dstlen)
{
- short positions[256];
+ unsigned short positions[256];
int outpos = 0;
int pos=0;
@@ -74,7 +74,7 @@ static int jffs2_rtime_decompress(unsigned char *data_in,
unsigned char *cpage_out,
uint32_t srclen, uint32_t destlen)
{
- short positions[256];
+ unsigned short positions[256];
int outpos = 0;
int pos=0;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index f73991522672..601afd1afddf 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -457,12 +457,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
The umask is only applied if there's no default ACL */
ret = jffs2_init_acl_pre(dir_i, inode, &mode);
if (ret) {
- make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(ret);
+ mutex_unlock(&f->sem);
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(ret);
}
ret = jffs2_do_new_inode (c, f, mode, ri);
if (ret) {
+ mutex_unlock(&f->sem);
make_bad_inode(inode);
iput(inode);
return ERR_PTR(ret);
@@ -479,6 +481,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
inode->i_size = 0;
if (insert_inode_locked(inode) < 0) {
+ mutex_unlock(&f->sem);
make_bad_inode(inode);
iput(inode);
return ERR_PTR(-EINVAL);
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index e4619b00f7c5..fa35ff79ab35 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -231,7 +231,7 @@ struct jffs2_tmp_dnode_info
uint32_t version;
uint32_t data_crc;
uint32_t partial_crc;
- uint16_t csize;
+ uint32_t csize;
uint16_t overlapped;
};
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 03310721712f..b6bd4affd9ad 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -179,6 +179,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
spin_unlock(&c->erase_completion_lock);
schedule();
+ remove_wait_queue(&c->erase_wait, &wait);
} else
spin_unlock(&c->erase_completion_lock);
} else if (ret)
@@ -211,20 +212,25 @@ out:
int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
uint32_t *len, uint32_t sumsize)
{
- int ret = -EAGAIN;
+ int ret;
minsize = PAD(minsize);
jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize);
- spin_lock(&c->erase_completion_lock);
- while(ret == -EAGAIN) {
+ while (true) {
+ spin_lock(&c->erase_completion_lock);
ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
if (ret) {
jffs2_dbg(1, "%s(): looping, ret is %d\n",
__func__, ret);
}
+ spin_unlock(&c->erase_completion_lock);
+
+ if (ret == -EAGAIN)
+ cond_resched();
+ else
+ break;
}
- spin_unlock(&c->erase_completion_lock);
if (!ret)
ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index abb0f1f53d93..985217626e66 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -48,14 +48,18 @@ void __init kernfs_inode_init(void)
static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
{
+ static DEFINE_MUTEX(iattr_mutex);
+ struct kernfs_iattrs *ret;
struct iattr *iattrs;
+ mutex_lock(&iattr_mutex);
+
if (kn->iattr)
- return kn->iattr;
+ goto out_unlock;
kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
if (!kn->iattr)
- return NULL;
+ goto out_unlock;
iattrs = &kn->iattr->ia_iattr;
/* assign default attributes */
@@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
simple_xattrs_init(&kn->iattr->xattrs);
-
- return kn->iattr;
+out_unlock:
+ ret = kn->iattr;
+ mutex_unlock(&iattr_mutex);
+ return ret;
}
static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 10d6c41aecad..6bf06a07f3e0 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -235,6 +235,7 @@ out_err:
if (warned++ == 0)
printk(KERN_WARNING
"lockd_up: makesock failed, error=%d\n", err);
+ svc_shutdown_net(serv, net);
return err;
}
diff --git a/fs/mount.h b/fs/mount.h
index b29e42f05f34..d55297f2fa05 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,7 +10,7 @@ struct mnt_namespace {
struct user_namespace *user_ns;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
- int event;
+ u64 event;
};
struct mnt_pcp {
@@ -104,6 +104,9 @@ struct proc_mounts {
struct mnt_namespace *ns;
struct path root;
int (*show)(struct seq_file *, struct vfsmount *);
+ void *cached_mount;
+ u64 cached_event;
+ loff_t cached_index;
};
#define proc_mounts(p) (container_of((p), struct proc_mounts, m))
diff --git a/fs/namei.c b/fs/namei.c
index 88339f59efb5..c6157c894fce 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -358,6 +358,7 @@ int generic_permission(struct inode *inode, int mask)
return -EACCES;
}
+EXPORT_SYMBOL(generic_permission);
/*
* We _really_ want to just do "generic_permission()" without
@@ -455,6 +456,7 @@ int inode_permission(struct inode *inode, int mask)
return retval;
return __inode_permission(inode, mask);
}
+EXPORT_SYMBOL(inode_permission);
/**
* path_get - get a reference to a path
@@ -924,6 +926,7 @@ int follow_up(struct path *path)
path->mnt = &parent->mnt;
return 1;
}
+EXPORT_SYMBOL(follow_up);
/*
* Perform an automount
@@ -1085,6 +1088,7 @@ int follow_down_one(struct path *path)
}
return 0;
}
+EXPORT_SYMBOL(follow_down_one);
static inline bool managed_dentry_might_block(struct dentry *dentry)
{
@@ -1223,6 +1227,7 @@ int follow_down(struct path *path)
}
return 0;
}
+EXPORT_SYMBOL(follow_down);
/*
* Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
@@ -2025,6 +2030,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path)
*path = nd.path;
return res;
}
+EXPORT_SYMBOL(kern_path);
/**
* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
@@ -2049,6 +2055,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
*path = nd.path;
return err;
}
+EXPORT_SYMBOL(vfs_path_lookup);
/*
* Restricted form of lookup. Doesn't follow links, single-component only,
@@ -2111,6 +2118,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
return __lookup_hash(&this, base, 0);
}
+EXPORT_SYMBOL(lookup_one_len);
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
@@ -2135,6 +2143,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
{
return user_path_at_empty(dfd, name, flags, path, NULL);
}
+EXPORT_SYMBOL(user_path_at);
/*
* NB: most callers don't do anything directly with the reference to the
@@ -2477,6 +2486,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
return NULL;
}
+EXPORT_SYMBOL(lock_rename);
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
@@ -2486,6 +2496,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
}
}
+EXPORT_SYMBOL(unlock_rename);
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool want_excl)
@@ -2506,6 +2517,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_create);
static int may_open(struct path *path, int acc_mode, int flag)
{
@@ -3375,6 +3387,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_mknod);
static int may_mknod(umode_t mode)
{
@@ -3464,6 +3477,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
fsnotify_mkdir(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_mkdir);
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
@@ -3518,6 +3532,7 @@ void dentry_unhash(struct dentry *dentry)
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
}
+EXPORT_SYMBOL(dentry_unhash);
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
@@ -3555,6 +3570,7 @@ out:
d_delete(dentry);
return error;
}
+EXPORT_SYMBOL(vfs_rmdir);
static long do_rmdir(int dfd, const char __user *pathname)
{
@@ -3672,6 +3688,7 @@ out:
return error;
}
+EXPORT_SYMBOL(vfs_unlink);
/*
* Make sure that the actual truncation of the file will occur outside its
@@ -3785,6 +3802,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_symlink);
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
int, newdfd, const char __user *, newname)
@@ -3893,6 +3911,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
fsnotify_link(dir, inode, new_dentry);
return error;
}
+EXPORT_SYMBOL(vfs_link);
/*
* Hardlinks are often used in delicate situations. We avoid
@@ -4152,6 +4171,7 @@ out:
return error;
}
+EXPORT_SYMBOL(vfs_rename);
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname, unsigned int, flags)
@@ -4304,11 +4324,9 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
-int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
+int readlink_copy(char __user *buffer, int buflen, const char *link)
{
- int len;
-
- len = PTR_ERR(link);
+ int len = PTR_ERR(link);
if (IS_ERR(link))
goto out;
@@ -4320,6 +4338,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c
out:
return len;
}
+EXPORT_SYMBOL(readlink_copy);
/*
* A helper for ->readlink(). This should be used *ONLY* for symlinks that
@@ -4337,11 +4356,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
if (IS_ERR(cookie))
return PTR_ERR(cookie);
- res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+ res = readlink_copy(buffer, buflen, nd_get_link(&nd));
if (dentry->d_inode->i_op->put_link)
dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
return res;
}
+EXPORT_SYMBOL(generic_readlink);
/* get the link contents into pagecache */
static char *page_getlink(struct dentry * dentry, struct page **ppage)
@@ -4361,14 +4381,14 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct page *page = NULL;
- char *s = page_getlink(dentry, &page);
- int res = vfs_readlink(dentry,buffer,buflen,s);
+ int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
if (page) {
kunmap(page);
page_cache_release(page);
}
return res;
}
+EXPORT_SYMBOL(page_readlink);
void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
{
@@ -4376,6 +4396,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
nd_set_link(nd, page_getlink(dentry, &page));
return page;
}
+EXPORT_SYMBOL(page_follow_link_light);
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
@@ -4386,6 +4407,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
page_cache_release(page);
}
}
+EXPORT_SYMBOL(page_put_link);
/*
* The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
@@ -4423,45 +4445,18 @@ retry:
fail:
return err;
}
+EXPORT_SYMBOL(__page_symlink);
int page_symlink(struct inode *inode, const char *symname, int len)
{
return __page_symlink(inode, symname, len,
!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
}
+EXPORT_SYMBOL(page_symlink);
const struct inode_operations page_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
};
-
-EXPORT_SYMBOL(user_path_at);
-EXPORT_SYMBOL(follow_down_one);
-EXPORT_SYMBOL(follow_down);
-EXPORT_SYMBOL(follow_up);
-EXPORT_SYMBOL(get_write_access); /* nfsd */
-EXPORT_SYMBOL(lock_rename);
-EXPORT_SYMBOL(lookup_one_len);
-EXPORT_SYMBOL(page_follow_link_light);
-EXPORT_SYMBOL(page_put_link);
-EXPORT_SYMBOL(page_readlink);
-EXPORT_SYMBOL(__page_symlink);
-EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(kern_path);
-EXPORT_SYMBOL(vfs_path_lookup);
-EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(unlock_rename);
-EXPORT_SYMBOL(vfs_create);
-EXPORT_SYMBOL(vfs_link);
-EXPORT_SYMBOL(vfs_mkdir);
-EXPORT_SYMBOL(vfs_mknod);
-EXPORT_SYMBOL(generic_permission);
-EXPORT_SYMBOL(vfs_readlink);
-EXPORT_SYMBOL(vfs_rename);
-EXPORT_SYMBOL(vfs_rmdir);
-EXPORT_SYMBOL(vfs_symlink);
-EXPORT_SYMBOL(vfs_unlink);
-EXPORT_SYMBOL(dentry_unhash);
-EXPORT_SYMBOL(generic_readlink);
diff --git a/fs/namespace.c b/fs/namespace.c
index 2ffc5a2905d4..182bc41cd887 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -52,7 +52,7 @@ static int __init set_mphash_entries(char *str)
}
__setup("mphash_entries=", set_mphash_entries);
-static int event;
+static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
@@ -414,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
*/
int __mnt_want_write_file(struct file *file)
{
- struct inode *inode = file_inode(file);
-
- if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
+ if (!(file->f_mode & FMODE_WRITER))
return __mnt_want_write(file->f_path.mnt);
else
return mnt_clone_write(file->f_path.mnt);
@@ -570,13 +568,17 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
kfree(mnt->mnt_devname);
- mnt_free_id(mnt);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
#endif
kmem_cache_free(mnt_cache, mnt);
}
+static void delayed_free_vfsmnt(struct rcu_head *head)
+{
+ free_vfsmnt(container_of(head, struct mount, mnt_rcu));
+}
+
/* call under rcu_read_lock */
bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
@@ -848,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
root = mount_fs(type, flags, name, data);
if (IS_ERR(root)) {
+ mnt_free_id(mnt);
free_vfsmnt(mnt);
return ERR_CAST(root);
}
@@ -885,7 +888,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
goto out_free;
}
- mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
+ mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
/* Don't allow unprivileged users to change mount flags */
if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
@@ -928,20 +931,11 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
return mnt;
out_free:
+ mnt_free_id(mnt);
free_vfsmnt(mnt);
return ERR_PTR(err);
}
-static void delayed_free(struct rcu_head *head)
-{
- struct mount *mnt = container_of(head, struct mount, mnt_rcu);
- kfree(mnt->mnt_devname);
-#ifdef CONFIG_SMP
- free_percpu(mnt->mnt_pcp);
-#endif
- kmem_cache_free(mnt_cache, mnt);
-}
-
static void mntput_no_expire(struct mount *mnt)
{
put_again:
@@ -991,7 +985,7 @@ put_again:
dput(mnt->mnt.mnt_root);
deactivate_super(mnt->mnt.mnt_sb);
mnt_free_id(mnt);
- call_rcu(&mnt->mnt_rcu, delayed_free);
+ call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}
void mntput(struct vfsmount *mnt)
@@ -1100,14 +1094,29 @@ static void *m_start(struct seq_file *m, loff_t *pos)
struct proc_mounts *p = proc_mounts(m);
down_read(&namespace_sem);
- return seq_list_start(&p->ns->list, *pos);
+ if (p->cached_event == p->ns->event) {
+ void *v = p->cached_mount;
+ if (*pos == p->cached_index)
+ return v;
+ if (*pos == p->cached_index + 1) {
+ v = seq_list_next(v, &p->ns->list, &p->cached_index);
+ return p->cached_mount = v;
+ }
+ }
+
+ p->cached_event = p->ns->event;
+ p->cached_mount = seq_list_start(&p->ns->list, *pos);
+ p->cached_index = *pos;
+ return p->cached_mount;
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct proc_mounts *p = proc_mounts(m);
- return seq_list_next(v, &p->ns->list, pos);
+ p->cached_mount = seq_list_next(v, &p->ns->list, pos);
+ p->cached_index = *pos;
+ return p->cached_mount;
}
static void m_stop(struct seq_file *m, void *v)
@@ -1661,9 +1670,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
if (err)
goto out;
err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
+ lock_mount_hash();
if (err)
goto out_cleanup_ids;
- lock_mount_hash();
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
} else {
@@ -1690,6 +1699,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
return 0;
out_cleanup_ids:
+ while (!hlist_empty(&tree_list)) {
+ child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+ umount_tree(child, 0);
+ }
+ unlock_mount_hash();
cleanup_group_ids(source_mnt, NULL);
out:
return err;
@@ -2044,7 +2058,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
struct mount *parent;
int err;
- mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
+ mnt_flags &= ~MNT_INTERNAL_FLAGS;
mp = lock_mount(path);
if (IS_ERR(mp))
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index c320ac52353e..08b8ea8c353e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -339,7 +339,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
if (val)
goto finished;
- DDPRINTK("ncp_lookup_validate: %pd2 not valid, age=%ld, server lookup\n",
+ ncp_dbg(2, "%pd2 not valid, age=%ld, server lookup\n",
dentry, NCP_GET_AGE(dentry));
len = sizeof(__name);
@@ -358,7 +358,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
res = ncp_obtain_info(server, dir, __name, &(finfo.i));
}
finfo.volume = finfo.i.volNumber;
- DDPRINTK("ncp_lookup_validate: looked for %pd/%s, res=%d\n",
+ ncp_dbg(2, "looked for %pd/%s, res=%d\n",
dentry->d_parent, __name, res);
/*
* If we didn't find it, or if it has a different dirEntNum to
@@ -372,14 +372,14 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
ncp_new_dentry(dentry);
val=1;
} else
- DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
+ ncp_dbg(2, "found, but dirEntNum changed\n");
ncp_update_inode2(inode, &finfo);
mutex_unlock(&inode->i_mutex);
}
finished:
- DDPRINTK("ncp_lookup_validate: result=%d\n", val);
+ ncp_dbg(2, "result=%d\n", val);
dput(parent);
return val;
}
@@ -453,8 +453,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
ctl.page = NULL;
ctl.cache = NULL;
- DDPRINTK("ncp_readdir: reading %pD2, pos=%d\n", file,
- (int) ctx->pos);
+ ncp_dbg(2, "reading %pD2, pos=%d\n", file, (int)ctx->pos);
result = -EIO;
/* Do not generate '.' and '..' when server is dead. */
@@ -697,8 +696,7 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx,
struct ncp_entry_info entry;
int i;
- DPRINTK("ncp_read_volume_list: pos=%ld\n",
- (unsigned long) ctx->pos);
+ ncp_dbg(1, "pos=%ld\n", (unsigned long)ctx->pos);
for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
int inval_dentry;
@@ -708,12 +706,11 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx,
if (!strlen(info.volume_name))
continue;
- DPRINTK("ncp_read_volume_list: found vol: %s\n",
- info.volume_name);
+ ncp_dbg(1, "found vol: %s\n", info.volume_name);
if (ncp_lookup_volume(server, info.volume_name,
&entry.i)) {
- DPRINTK("ncpfs: could not lookup vol %s\n",
+ ncp_dbg(1, "could not lookup vol %s\n",
info.volume_name);
continue;
}
@@ -738,14 +735,13 @@ ncp_do_readdir(struct file *file, struct dir_context *ctx,
int more;
size_t bufsize;
- DPRINTK("ncp_do_readdir: %pD2, fpos=%ld\n", file,
- (unsigned long) ctx->pos);
- PPRINTK("ncp_do_readdir: init %pD, volnum=%d, dirent=%u\n",
- file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum);
+ ncp_dbg(1, "%pD2, fpos=%ld\n", file, (unsigned long)ctx->pos);
+ ncp_vdbg("init %pD, volnum=%d, dirent=%u\n",
+ file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum);
err = ncp_initialize_search(server, dir, &seq);
if (err) {
- DPRINTK("ncp_do_readdir: init failed, err=%d\n", err);
+ ncp_dbg(1, "init failed, err=%d\n", err);
return;
}
/* We MUST NOT use server->buffer_size handshaked with server if we are
@@ -808,8 +804,7 @@ int ncp_conn_logged_in(struct super_block *sb)
goto out;
result = -ENOENT;
if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) {
- PPRINTK("ncp_conn_logged_in: %s not found\n",
- server->m.mounted_vol);
+ ncp_vdbg("%s not found\n", server->m.mounted_vol);
goto out;
}
dent = sb->s_root;
@@ -822,10 +817,10 @@ int ncp_conn_logged_in(struct super_block *sb)
NCP_FINFO(ino)->DosDirNum = DosDirNum;
result = 0;
} else {
- DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
+ ncp_dbg(1, "sb->s_root->d_inode == NULL!\n");
}
} else {
- DPRINTK("ncpfs: sb->s_root == NULL!\n");
+ ncp_dbg(1, "sb->s_root == NULL!\n");
}
} else
result = 0;
@@ -846,7 +841,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
if (!ncp_conn_valid(server))
goto finished;
- PPRINTK("ncp_lookup: server lookup for %pd2\n", dentry);
+ ncp_vdbg("server lookup for %pd2\n", dentry);
len = sizeof(__name);
if (ncp_is_server_root(dir)) {
@@ -854,15 +849,15 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
dentry->d_name.len, 1);
if (!res)
res = ncp_lookup_volume(server, __name, &(finfo.i));
- if (!res)
- ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
+ if (!res)
+ ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
} else {
res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
dentry->d_name.len, !ncp_preserve_case(dir));
if (!res)
res = ncp_obtain_info(server, dir, __name, &(finfo.i));
}
- PPRINTK("ncp_lookup: looked for %pd2, res=%d\n", dentry, res);
+ ncp_vdbg("looked for %pd2, res=%d\n", dentry, res);
/*
* If we didn't find an entry, make a negative dentry.
*/
@@ -886,7 +881,7 @@ add_entry:
}
finished:
- PPRINTK("ncp_lookup: result=%d\n", error);
+ ncp_vdbg("result=%d\n", error);
return ERR_PTR(error);
}
@@ -909,7 +904,7 @@ out:
return error;
out_close:
- PPRINTK("ncp_instantiate: %pd2 failed, closing file\n", dentry);
+ ncp_vdbg("%pd2 failed, closing file\n", dentry);
ncp_close_file(NCP_SERVER(dir), finfo->file_handle);
goto out;
}
@@ -923,7 +918,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
int opmode;
__u8 __name[NCP_MAXPATHLEN + 1];
- PPRINTK("ncp_create_new: creating %pd2, mode=%hx\n", dentry, mode);
+ ncp_vdbg("creating %pd2, mode=%hx\n", dentry, mode);
ncp_age_dentry(server, dentry);
len = sizeof(__name);
@@ -952,7 +947,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
error = -ENAMETOOLONG;
else if (result < 0)
error = result;
- DPRINTK("ncp_create: %pd2 failed\n", dentry);
+ ncp_dbg(1, "%pd2 failed\n", dentry);
goto out;
}
opmode = O_WRONLY;
@@ -985,7 +980,7 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
int error, len;
__u8 __name[NCP_MAXPATHLEN + 1];
- DPRINTK("ncp_mkdir: making %pd2\n", dentry);
+ ncp_dbg(1, "making %pd2\n", dentry);
ncp_age_dentry(server, dentry);
len = sizeof(__name);
@@ -1022,7 +1017,7 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
int error, result, len;
__u8 __name[NCP_MAXPATHLEN + 1];
- DPRINTK("ncp_rmdir: removing %pd2\n", dentry);
+ ncp_dbg(1, "removing %pd2\n", dentry);
len = sizeof(__name);
error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -1067,13 +1062,13 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
int error;
server = NCP_SERVER(dir);
- DPRINTK("ncp_unlink: unlinking %pd2\n", dentry);
+ ncp_dbg(1, "unlinking %pd2\n", dentry);
/*
* Check whether to close the file ...
*/
if (inode) {
- PPRINTK("ncp_unlink: closing file\n");
+ ncp_vdbg("closing file\n");
ncp_make_closed(inode);
}
@@ -1087,7 +1082,7 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
#endif
switch (error) {
case 0x00:
- DPRINTK("ncp: removed %pd2\n", dentry);
+ ncp_dbg(1, "removed %pd2\n", dentry);
break;
case 0x85:
case 0x8A:
@@ -1120,7 +1115,7 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
int old_len, new_len;
__u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1];
- DPRINTK("ncp_rename: %pd2 to %pd2\n", old_dentry, new_dentry);
+ ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry);
ncp_age_dentry(server, old_dentry);
ncp_age_dentry(server, new_dentry);
@@ -1150,8 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
#endif
switch (error) {
case 0x00:
- DPRINTK("ncp renamed %pd -> %pd.\n",
- old_dentry, new_dentry);
+ ncp_dbg(1, "renamed %pd -> %pd\n",
+ old_dentry, new_dentry);
break;
case 0x9E:
error = -ENAMETOOLONG;
@@ -1173,7 +1168,7 @@ static int ncp_mknod(struct inode * dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
- DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%ho\n", mode);
+ ncp_dbg(1, "mode = 0%ho\n", mode);
return ncp_create_new(dir, dentry, mode, rdev, 0);
}
return -EPERM; /* Strange, but true */
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 8f5074e1ecb9..77640a8bfb87 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -6,6 +6,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <asm/uaccess.h>
#include <linux/time.h>
@@ -34,11 +36,11 @@ int ncp_make_open(struct inode *inode, int right)
error = -EINVAL;
if (!inode) {
- printk(KERN_ERR "ncp_make_open: got NULL inode\n");
+ pr_err("%s: got NULL inode\n", __func__);
goto out;
}
- DPRINTK("ncp_make_open: opened=%d, volume # %u, dir entry # %u\n",
+ ncp_dbg(1, "opened=%d, volume # %u, dir entry # %u\n",
atomic_read(&NCP_FINFO(inode)->opened),
NCP_FINFO(inode)->volNumber,
NCP_FINFO(inode)->dirEntNum);
@@ -71,7 +73,7 @@ int ncp_make_open(struct inode *inode, int right)
break;
}
if (result) {
- PPRINTK("ncp_make_open: failed, result=%d\n", result);
+ ncp_vdbg("failed, result=%d\n", result);
goto out_unlock;
}
/*
@@ -83,7 +85,7 @@ int ncp_make_open(struct inode *inode, int right)
}
access = NCP_FINFO(inode)->access;
- PPRINTK("ncp_make_open: file open, access=%x\n", access);
+ ncp_vdbg("file open, access=%x\n", access);
if (access == right || access == O_RDWR) {
atomic_inc(&NCP_FINFO(inode)->opened);
error = 0;
@@ -107,7 +109,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
void* freepage;
size_t freelen;
- DPRINTK("ncp_file_read: enter %pd2\n", dentry);
+ ncp_dbg(1, "enter %pd2\n", dentry);
pos = *ppos;
@@ -124,7 +126,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
error = ncp_make_open(inode, O_RDONLY);
if (error) {
- DPRINTK(KERN_ERR "ncp_file_read: open failed, error=%d\n", error);
+ ncp_dbg(1, "open failed, error=%d\n", error);
return error;
}
@@ -165,7 +167,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
file_accessed(file);
- DPRINTK("ncp_file_read: exit %pd2\n", dentry);
+ ncp_dbg(1, "exit %pd2\n", dentry);
outrel:
ncp_inode_close(inode);
return already_read ? already_read : error;
@@ -182,7 +184,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
int errno;
void* bouncebuffer;
- DPRINTK("ncp_file_write: enter %pd2\n", dentry);
+ ncp_dbg(1, "enter %pd2\n", dentry);
if ((ssize_t) count < 0)
return -EINVAL;
pos = *ppos;
@@ -211,7 +213,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
return 0;
errno = ncp_make_open(inode, O_WRONLY);
if (errno) {
- DPRINTK(KERN_ERR "ncp_file_write: open failed, error=%d\n", errno);
+ ncp_dbg(1, "open failed, error=%d\n", errno);
return errno;
}
bufsize = NCP_SERVER(inode)->buffer_size;
@@ -261,7 +263,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
i_size_write(inode, pos);
mutex_unlock(&inode->i_mutex);
}
- DPRINTK("ncp_file_write: exit %pd2\n", dentry);
+ ncp_dbg(1, "exit %pd2\n", dentry);
outrel:
ncp_inode_close(inode);
return already_written ? already_written : errno;
@@ -269,7 +271,7 @@ outrel:
static int ncp_release(struct inode *inode, struct file *file) {
if (ncp_make_closed(inode)) {
- DPRINTK("ncp_release: failed to close\n");
+ ncp_dbg(1, "failed to close\n");
}
return 0;
}
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 0af3349de851..03ffde1f44d6 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -2,6 +2,8 @@
* getopt.c
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/string.h>
@@ -46,8 +48,8 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts
if (opts->has_arg & OPT_NOPARAM) {
return opts->val;
}
- printk(KERN_INFO "%s: the %s option requires an argument\n",
- caller, token);
+ pr_info("%s: the %s option requires an argument\n",
+ caller, token);
return -EINVAL;
}
if (opts->has_arg & OPT_INT) {
@@ -57,18 +59,18 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts
if (!*v) {
return opts->val;
}
- printk(KERN_INFO "%s: invalid numeric value in %s=%s\n",
+ pr_info("%s: invalid numeric value in %s=%s\n",
caller, token, val);
return -EDOM;
}
if (opts->has_arg & OPT_STRING) {
return opts->val;
}
- printk(KERN_INFO "%s: unexpected argument %s to the %s option\n",
+ pr_info("%s: unexpected argument %s to the %s option\n",
caller, val, token);
return -EINVAL;
}
}
- printk(KERN_INFO "%s: Unrecognized mount option %s\n", caller, token);
+ pr_info("%s: Unrecognized mount option %s\n", caller, token);
return -EOPNOTSUPP;
}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 647d86d2db39..e31e589369a4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <asm/uaccess.h>
@@ -133,7 +135,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo)
NCP_FINFO(inode)->access = nwinfo->access;
memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle,
sizeof(nwinfo->file_handle));
- DPRINTK("ncp_update_inode: updated %s, volnum=%d, dirent=%u\n",
+ ncp_dbg(1, "updated %s, volnum=%d, dirent=%u\n",
nwinfo->i.entryName, NCP_FINFO(inode)->volNumber,
NCP_FINFO(inode)->dirEntNum);
}
@@ -141,8 +143,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo)
static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
{
/* NFS namespace mode overrides others if it's set. */
- DPRINTK(KERN_DEBUG "ncp_update_dates_and_mode: (%s) nfs.mode=0%o\n",
- nwi->entryName, nwi->nfs.mode);
+ ncp_dbg(1, "(%s) nfs.mode=0%o\n", nwi->entryName, nwi->nfs.mode);
if (nwi->nfs.mode) {
/* XXX Security? */
inode->i_mode = nwi->nfs.mode;
@@ -230,7 +231,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
ncp_update_attrs(inode, nwinfo);
- DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode);
+ ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode);
set_nlink(inode, 1);
inode->i_uid = server->m.uid;
@@ -258,7 +259,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
struct inode *inode;
if (info == NULL) {
- printk(KERN_ERR "ncp_iget: info is NULL\n");
+ pr_err("%s: info is NULL\n", __func__);
return NULL;
}
@@ -290,7 +291,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
}
insert_inode_hash(inode);
} else
- printk(KERN_ERR "ncp_iget: iget failed!\n");
+ pr_err("%s: iget failed!\n", __func__);
return inode;
}
@@ -301,12 +302,12 @@ ncp_evict_inode(struct inode *inode)
clear_inode(inode);
if (S_ISDIR(inode->i_mode)) {
- DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino);
+ ncp_dbg(2, "put directory %ld\n", inode->i_ino);
}
if (ncp_make_closed(inode) != 0) {
/* We can't do anything but complain. */
- printk(KERN_ERR "ncp_evict_inode: could not close\n");
+ pr_err("%s: could not close\n", __func__);
}
}
@@ -469,9 +470,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
{
struct ncp_mount_data_kernel data;
struct ncp_server *server;
- struct file *ncp_filp;
struct inode *root_inode;
- struct inode *sock_inode;
struct socket *sock;
int error;
int default_bufsize;
@@ -540,18 +539,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) ||
!gid_valid(data.gid))
goto out;
- error = -EBADF;
- ncp_filp = fget(data.ncp_fd);
- if (!ncp_filp)
- goto out;
- error = -ENOTSOCK;
- sock_inode = file_inode(ncp_filp);
- if (!S_ISSOCK(sock_inode->i_mode))
- goto out_fput;
- sock = SOCKET_I(sock_inode);
+ sock = sockfd_lookup(data.ncp_fd, &error);
if (!sock)
- goto out_fput;
-
+ goto out;
+
if (sock->type == SOCK_STREAM)
default_bufsize = 0xF000;
else
@@ -573,27 +564,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (error)
goto out_fput;
- server->ncp_filp = ncp_filp;
server->ncp_sock = sock;
if (data.info_fd != -1) {
- struct socket *info_sock;
-
- error = -EBADF;
- server->info_filp = fget(data.info_fd);
- if (!server->info_filp)
- goto out_bdi;
- error = -ENOTSOCK;
- sock_inode = file_inode(server->info_filp);
- if (!S_ISSOCK(sock_inode->i_mode))
- goto out_fput2;
- info_sock = SOCKET_I(sock_inode);
+ struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
if (!info_sock)
- goto out_fput2;
+ goto out_bdi;
+ server->info_sock = info_sock;
error = -EBADFD;
if (info_sock->type != SOCK_STREAM)
goto out_fput2;
- server->info_sock = info_sock;
}
/* server->lock = 0; */
@@ -621,7 +601,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
now because of PATH_MAX changes.. */
if (server->m.time_out < 1) {
server->m.time_out = 10;
- printk(KERN_INFO "You need to recompile your ncpfs utils..\n");
+ pr_info("You need to recompile your ncpfs utils..\n");
}
server->m.time_out = server->m.time_out * HZ / 100;
server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG;
@@ -682,7 +662,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
ncp_unlock_server(server);
if (error < 0)
goto out_rxbuf;
- DPRINTK("ncp_fill_super: NCP_SBP(sb) = %x\n", (int) NCP_SBP(sb));
+ ncp_dbg(1, "NCP_SBP(sb) = %p\n", NCP_SBP(sb));
error = -EMSGSIZE; /* -EREMOTESIDEINCOMPATIBLE */
#ifdef CONFIG_NCPFS_PACKET_SIGNING
@@ -710,7 +690,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (ncp_negotiate_buffersize(server, default_bufsize,
&(server->buffer_size)) != 0)
goto out_disconnect;
- DPRINTK("ncpfs: bufsize = %d\n", server->buffer_size);
+ ncp_dbg(1, "bufsize = %d\n", server->buffer_size);
memset(&finfo, 0, sizeof(finfo));
finfo.i.attributes = aDIR;
@@ -739,7 +719,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
root_inode = ncp_iget(sb, &finfo);
if (!root_inode)
goto out_disconnect;
- DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber);
+ ncp_dbg(1, "root vol=%d\n", NCP_FINFO(root_inode)->volNumber);
sb->s_root = d_make_root(root_inode);
if (!sb->s_root)
goto out_disconnect;
@@ -765,17 +745,12 @@ out_nls:
mutex_destroy(&server->root_setup_lock);
mutex_destroy(&server->mutex);
out_fput2:
- if (server->info_filp)
- fput(server->info_filp);
+ if (server->info_sock)
+ sockfd_put(server->info_sock);
out_bdi:
bdi_destroy(&server->bdi);
out_fput:
- /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
- *
- * The previously used put_filp(ncp_filp); was bogus, since
- * it doesn't perform proper unlocking.
- */
- fput(ncp_filp);
+ sockfd_put(sock);
out:
put_pid(data.wdog_pid);
sb->s_fs_info = NULL;
@@ -808,9 +783,9 @@ static void ncp_put_super(struct super_block *sb)
mutex_destroy(&server->root_setup_lock);
mutex_destroy(&server->mutex);
- if (server->info_filp)
- fput(server->info_filp);
- fput(server->ncp_filp);
+ if (server->info_sock)
+ sockfd_put(server->info_sock);
+ sockfd_put(server->ncp_sock);
kill_pid(server->m.wdog_pid, SIGTERM, 1);
put_pid(server->m.wdog_pid);
@@ -985,8 +960,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) != 0) {
int written;
- DPRINTK("ncpfs: trying to change size to %ld\n",
- attr->ia_size);
+ ncp_dbg(1, "trying to change size to %llu\n", attr->ia_size);
if ((result = ncp_make_open(inode, O_WRONLY)) < 0) {
result = -EACCES;
@@ -1072,7 +1046,7 @@ MODULE_ALIAS_FS("ncpfs");
static int __init init_ncp_fs(void)
{
int err;
- DPRINTK("ncpfs: init_ncp_fs called\n");
+ ncp_dbg(1, "called\n");
err = init_inodecache();
if (err)
@@ -1089,7 +1063,7 @@ out1:
static void __exit exit_ncp_fs(void)
{
- DPRINTK("ncpfs: exit_ncp_fs called\n");
+ ncp_dbg(1, "called\n");
unregister_filesystem(&ncp_fs_type);
destroy_inodecache();
}
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60426ccb3b65..d5659d96ee7f 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -41,7 +41,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
return -EFAULT;
if (info.version != NCP_GET_FS_INFO_VERSION) {
- DPRINTK("info.version invalid: %d\n", info.version);
+ ncp_dbg(1, "info.version invalid: %d\n", info.version);
return -EINVAL;
}
/* TODO: info.addr = server->m.serv_addr; */
@@ -66,7 +66,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
return -EFAULT;
if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
- DPRINTK("info.version invalid: %d\n", info2.version);
+ ncp_dbg(1, "info.version invalid: %d\n", info2.version);
return -EINVAL;
}
info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
@@ -132,7 +132,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
return -EFAULT;
if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
- DPRINTK("info.version invalid: %d\n", info2.version);
+ ncp_dbg(1, "info.version invalid: %d\n", info2.version);
return -EINVAL;
}
info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
@@ -308,8 +308,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
else
result = server->reply_size;
ncp_unlock_server(server);
- DPRINTK("ncp_ioctl: copy %d bytes\n",
- result);
+ ncp_dbg(1, "copy %d bytes\n", result);
if (result >= 0)
if (copy_to_user(request.data, bouncebuffer, result))
result = -EFAULT;
@@ -385,9 +384,9 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
sr.namespace = server->name_space[sr.volNumber];
result = 0;
} else
- DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+ ncp_dbg(1, "s_root->d_inode==NULL\n");
} else
- DPRINTK("ncpfs: s_root==NULL\n");
+ ncp_dbg(1, "s_root==NULL\n");
} else {
sr.volNumber = -1;
sr.namespace = 0;
@@ -440,11 +439,11 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
NCP_FINFO(s_inode)->DosDirNum = dosde;
server->root_setuped = 1;
} else {
- DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+ ncp_dbg(1, "s_root->d_inode==NULL\n");
result = -EIO;
}
} else {
- DPRINTK("ncpfs: s_root==NULL\n");
+ ncp_dbg(1, "s_root==NULL\n");
result = -EIO;
}
}
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 3c5dd55d284c..b359d12eb359 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -107,7 +107,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
- DPRINTK("ncp_mmap: called\n");
+ ncp_dbg(1, "called\n");
if (!ncp_conn_valid(NCP_SERVER(inode)))
return -EIO;
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
index 31831afe1c3b..b9f69e1b1f43 100644
--- a/fs/ncpfs/ncp_fs.h
+++ b/fs/ncpfs/ncp_fs.h
@@ -2,30 +2,32 @@
#include "ncp_fs_i.h"
#include "ncp_fs_sb.h"
-/* define because it is easy to change PRINTK to {*}PRINTK */
-#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
-
#undef NCPFS_PARANOIA
#ifdef NCPFS_PARANOIA
-#define PPRINTK(format, args...) PRINTK(format , ## args)
+#define ncp_vdbg(fmt, ...) \
+ pr_debug(fmt, ##__VA_ARGS__)
#else
-#define PPRINTK(format, args...)
+#define ncp_vdbg(fmt, ...) \
+do { \
+ if (0) \
+ pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
#endif
#ifndef DEBUG_NCP
#define DEBUG_NCP 0
#endif
-#if DEBUG_NCP > 0
-#define DPRINTK(format, args...) PRINTK(format , ## args)
-#else
-#define DPRINTK(format, args...)
-#endif
-#if DEBUG_NCP > 1
-#define DDPRINTK(format, args...) PRINTK(format , ## args)
-#else
-#define DDPRINTK(format, args...)
+
+#if DEBUG_NCP > 0 && !defined(DEBUG)
+#define DEBUG
#endif
+#define ncp_dbg(level, fmt, ...) \
+do { \
+ if (level <= DEBUG_NCP) \
+ pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
+
#define NCP_MAX_RPC_TIMEOUT (6*HZ)
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index b81e97adc5a9..55e26fd80886 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -45,9 +45,7 @@ struct ncp_server {
__u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
- struct file *ncp_filp; /* File pointer to ncp socket */
struct socket *ncp_sock;/* ncp socket */
- struct file *info_filp;
struct socket *info_sock;
u8 sequence;
@@ -111,7 +109,7 @@ struct ncp_server {
spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */
- void (*data_ready)(struct sock* sk, int len);
+ void (*data_ready)(struct sock* sk);
void (*error_report)(struct sock* sk);
void (*write_space)(struct sock* sk); /* STREAM mode only */
struct {
@@ -153,7 +151,7 @@ extern void ncp_tcp_tx_proc(struct work_struct *work);
extern void ncpdgram_rcv_proc(struct work_struct *work);
extern void ncpdgram_timeout_proc(struct work_struct *work);
extern void ncpdgram_timeout_call(unsigned long server);
-extern void ncp_tcp_data_ready(struct sock* sk, int len);
+extern void ncp_tcp_data_ready(struct sock* sk);
extern void ncp_tcp_write_space(struct sock* sk);
extern void ncp_tcp_error_report(struct sock* sk);
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 981a95617fc9..482387532f54 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -9,14 +9,14 @@
*
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "ncp_fs.h"
static inline void assert_server_locked(struct ncp_server *server)
{
if (server->lock == 0) {
- DPRINTK("ncpfs: server not locked!\n");
+ ncp_dbg(1, "server not locked!\n");
}
}
@@ -75,7 +75,7 @@ static void ncp_add_pstring(struct ncp_server *server, const char *s)
int len = strlen(s);
assert_server_locked(server);
if (len > 255) {
- DPRINTK("ncpfs: string too long: %s\n", s);
+ ncp_dbg(1, "string too long: %s\n", s);
len = 255;
}
ncp_add_byte(server, len);
@@ -225,7 +225,7 @@ int ncp_get_volume_info_with_number(struct ncp_server* server,
result = -EIO;
len = ncp_reply_byte(server, 29);
if (len > NCP_VOLNAME_LEN) {
- DPRINTK("ncpfs: volume name too long: %d\n", len);
+ ncp_dbg(1, "volume name too long: %d\n", len);
goto out;
}
memcpy(&(target->volume_name), ncp_reply_data(server, 30), len);
@@ -259,7 +259,7 @@ int ncp_get_directory_info(struct ncp_server* server, __u8 n,
result = -EIO;
len = ncp_reply_byte(server, 21);
if (len > NCP_VOLNAME_LEN) {
- DPRINTK("ncpfs: volume name too long: %d\n", len);
+ ncp_dbg(1, "volume name too long: %d\n", len);
goto out;
}
memcpy(&(target->volume_name), ncp_reply_data(server, 22), len);
@@ -295,9 +295,9 @@ ncp_make_closed(struct inode *inode)
err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle);
if (!err)
- PPRINTK("ncp_make_closed: volnum=%d, dirent=%u, error=%d\n",
- NCP_FINFO(inode)->volNumber,
- NCP_FINFO(inode)->dirEntNum, err);
+ ncp_vdbg("volnum=%d, dirent=%u, error=%d\n",
+ NCP_FINFO(inode)->volNumber,
+ NCP_FINFO(inode)->dirEntNum, err);
}
mutex_unlock(&NCP_FINFO(inode)->open_mutex);
return err;
@@ -394,8 +394,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
if ((result = ncp_request(server, 87)) == 0) {
ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs);
- DPRINTK(KERN_DEBUG
- "ncp_obtain_nfs_info: (%s) mode=0%o, rdev=0x%x\n",
+ ncp_dbg(1, "(%s) mode=0%o, rdev=0x%x\n",
target->entryName, target->nfs.mode,
target->nfs.rdev);
} else {
@@ -425,7 +424,7 @@ int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *pa
int result;
if (target == NULL) {
- printk(KERN_ERR "ncp_obtain_info: invalid call\n");
+ pr_err("%s: invalid call\n", __func__);
return -EINVAL;
}
ncp_init_request(server);
@@ -498,7 +497,7 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
namespace = ncp_reply_data(server, 2);
while (no_namespaces > 0) {
- DPRINTK("get_namespaces: found %d on %d\n", *namespace, volume);
+ ncp_dbg(1, "found %d on %d\n", *namespace, volume);
#ifdef CONFIG_NCPFS_NFS_NS
if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS))
@@ -531,8 +530,7 @@ ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
if (ret_ns)
*ret_ns = ns;
- DPRINTK("lookup_vol: namespace[%d] = %d\n",
- volume, server->name_space[volume]);
+ ncp_dbg(1, "namespace[%d] = %d\n", volume, server->name_space[volume]);
if (server->name_space[volume] == ns)
return 0;
@@ -596,7 +594,7 @@ ncp_get_volume_root(struct ncp_server *server,
{
int result;
- DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
+ ncp_dbg(1, "looking up vol %s\n", volname);
ncp_init_request(server);
ncp_add_byte(server, 22); /* Subfunction: Generate dir handle */
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 3a1587222c8a..471bc3d1139e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -8,6 +8,7 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/time.h>
#include <linux/errno.h>
@@ -96,11 +97,11 @@ static void ncp_req_put(struct ncp_request_reply *req)
kfree(req);
}
-void ncp_tcp_data_ready(struct sock *sk, int len)
+void ncp_tcp_data_ready(struct sock *sk)
{
struct ncp_server *server = sk->sk_user_data;
- server->data_ready(sk, len);
+ server->data_ready(sk);
schedule_work(&server->rcv.tq);
}
@@ -231,7 +232,7 @@ static void __ncptcp_try_send(struct ncp_server *server)
return;
if (result < 0) {
- printk(KERN_ERR "ncpfs: tcp: Send failed: %d\n", result);
+ pr_err("tcp: Send failed: %d\n", result);
__ncp_abort_request(server, rq, result);
return;
}
@@ -332,7 +333,7 @@ static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *
mutex_lock(&server->rcv.creq_mutex);
if (!ncp_conn_valid(server)) {
mutex_unlock(&server->rcv.creq_mutex);
- printk(KERN_ERR "ncpfs: tcp: Server died\n");
+ pr_err("tcp: Server died\n");
return -EIO;
}
ncp_req_get(req);
@@ -405,15 +406,15 @@ void ncpdgram_rcv_proc(struct work_struct *work)
}
result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT);
if (result < 0) {
- DPRINTK("recv failed with %d\n", result);
+ ncp_dbg(1, "recv failed with %d\n", result);
continue;
}
if (result < 10) {
- DPRINTK("too short (%u) watchdog packet\n", result);
+ ncp_dbg(1, "too short (%u) watchdog packet\n", result);
continue;
}
if (buf[9] != '?') {
- DPRINTK("bad signature (%02X) in watchdog packet\n", buf[9]);
+ ncp_dbg(1, "bad signature (%02X) in watchdog packet\n", buf[9]);
continue;
}
buf[9] = 'Y';
@@ -448,7 +449,7 @@ void ncpdgram_rcv_proc(struct work_struct *work)
result -= 8;
hdrl = sock->sk->sk_family == AF_INET ? 8 : 6;
if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) {
- printk(KERN_INFO "ncpfs: Signature violation\n");
+ pr_info("Signature violation\n");
result = -EIO;
}
}
@@ -524,7 +525,7 @@ static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len)
return result;
}
if (result > len) {
- printk(KERN_ERR "ncpfs: tcp: bug in recvmsg (%u > %Zu)\n", result, len);
+ pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len);
return -EIO;
}
return result;
@@ -552,9 +553,9 @@ static int __ncptcp_rcv_proc(struct ncp_server *server)
__ncptcp_abort(server);
}
if (result < 0) {
- printk(KERN_ERR "ncpfs: tcp: error in recvmsg: %d\n", result);
+ pr_err("tcp: error in recvmsg: %d\n", result);
} else {
- DPRINTK(KERN_ERR "ncpfs: tcp: EOF\n");
+ ncp_dbg(1, "tcp: EOF\n");
}
return -EIO;
}
@@ -566,20 +567,20 @@ static int __ncptcp_rcv_proc(struct ncp_server *server)
switch (server->rcv.state) {
case 0:
if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) {
- printk(KERN_ERR "ncpfs: tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic));
+ pr_err("tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic));
__ncptcp_abort(server);
return -EIO;
}
datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF;
if (datalen < 10) {
- printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen);
+ pr_err("tcp: Unexpected reply len %d\n", datalen);
__ncptcp_abort(server);
return -EIO;
}
#ifdef CONFIG_NCPFS_PACKET_SIGNING
if (server->sign_active) {
if (datalen < 18) {
- printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen);
+ pr_err("tcp: Unexpected reply len %d\n", datalen);
__ncptcp_abort(server);
return -EIO;
}
@@ -604,7 +605,7 @@ cont:;
server->rcv.len = datalen - 10;
break;
}
- DPRINTK("ncpfs: tcp: Unexpected NCP type %02X\n", type);
+ ncp_dbg(1, "tcp: Unexpected NCP type %02X\n", type);
skipdata2:;
server->rcv.state = 2;
skipdata:;
@@ -614,11 +615,11 @@ skipdata:;
}
req = server->rcv.creq;
if (!req) {
- DPRINTK(KERN_ERR "ncpfs: Reply without appropriate request\n");
+ ncp_dbg(1, "Reply without appropriate request\n");
goto skipdata2;
}
if (datalen > req->datalen + 8) {
- printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8);
+ pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8);
server->rcv.state = 3;
goto skipdata;
}
@@ -638,12 +639,12 @@ skipdata:;
req = server->rcv.creq;
if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) {
if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) {
- printk(KERN_ERR "ncpfs: tcp: Bad sequence number\n");
+ pr_err("tcp: Bad sequence number\n");
__ncp_abort_request(server, req, -EIO);
return -EIO;
}
if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) {
- printk(KERN_ERR "ncpfs: tcp: Connection number mismatch\n");
+ pr_err("tcp: Connection number mismatch\n");
__ncp_abort_request(server, req, -EIO);
return -EIO;
}
@@ -651,7 +652,7 @@ skipdata:;
#ifdef CONFIG_NCPFS_PACKET_SIGNING
if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) {
if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) {
- printk(KERN_ERR "ncpfs: tcp: Signature violation\n");
+ pr_err("tcp: Signature violation\n");
__ncp_abort_request(server, req, -EIO);
return -EIO;
}
@@ -742,7 +743,7 @@ static int ncp_do_request(struct ncp_server *server, int size,
int result;
if (server->lock == 0) {
- printk(KERN_ERR "ncpfs: Server not locked!\n");
+ pr_err("Server not locked!\n");
return -EIO;
}
if (!ncp_conn_valid(server)) {
@@ -781,7 +782,7 @@ static int ncp_do_request(struct ncp_server *server, int size,
spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
- DDPRINTK("do_ncp_rpc_call returned %d\n", result);
+ ncp_dbg(2, "do_ncp_rpc_call returned %d\n", result);
return result;
}
@@ -811,7 +812,7 @@ int ncp_request2(struct ncp_server *server, int function,
result = ncp_do_request(server, server->current_size, reply, size);
if (result < 0) {
- DPRINTK("ncp_request_error: %d\n", result);
+ ncp_dbg(1, "ncp_request_error: %d\n", result);
goto out;
}
server->completion = reply->completion_code;
@@ -822,7 +823,7 @@ int ncp_request2(struct ncp_server *server, int function,
result = reply->completion_code;
if (result != 0)
- PPRINTK("ncp_request: completion code=%x\n", result);
+ ncp_vdbg("completion code=%x\n", result);
out:
return result;
}
@@ -865,14 +866,14 @@ void ncp_lock_server(struct ncp_server *server)
{
mutex_lock(&server->mutex);
if (server->lock)
- printk(KERN_WARNING "ncp_lock_server: was locked!\n");
+ pr_warn("%s: was locked!\n", __func__);
server->lock = 1;
}
void ncp_unlock_server(struct ncp_server *server)
{
if (!server->lock) {
- printk(KERN_WARNING "ncp_unlock_server: was not locked!\n");
+ pr_warn("%s: was not locked!\n", __func__);
return;
}
server->lock = 0;
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index 52439ddc8de0..1a63bfdb4a65 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -112,7 +112,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) {
__le32 attr;
unsigned int hdr;
- DPRINTK("ncp_symlink(dir=%p,dentry=%p,symname=%s)\n",dir,dentry,symname);
+ ncp_dbg(1, "dir=%p, dentry=%p, symname=%s\n", dir, dentry, symname);
if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber))
kludge = 0;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index ae2e87b95453..41db5258e7a7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -112,7 +112,8 @@ out:
* TODO: keep track of all layouts (and delegations) in a hash table
* hashed by filehandle.
*/
-static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh)
+static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
+ struct nfs_fh *fh, nfs4_stateid *stateid)
{
struct nfs_server *server;
struct inode *ino;
@@ -120,17 +121,19 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
+ continue;
if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
continue;
ino = igrab(lo->plh_inode);
if (!ino)
- continue;
+ break;
spin_lock(&ino->i_lock);
/* Is this layout in the process of being freed? */
if (NFS_I(ino)->layout != lo) {
spin_unlock(&ino->i_lock);
iput(ino);
- continue;
+ break;
}
pnfs_get_layout_hdr(lo);
spin_unlock(&ino->i_lock);
@@ -141,13 +144,14 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
return NULL;
}
-static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh)
+static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
+ struct nfs_fh *fh, nfs4_stateid *stateid)
{
struct pnfs_layout_hdr *lo;
spin_lock(&clp->cl_lock);
rcu_read_lock();
- lo = get_layout_by_fh_locked(clp, fh);
+ lo = get_layout_by_fh_locked(clp, fh, stateid);
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
@@ -162,9 +166,9 @@ static u32 initiate_file_draining(struct nfs_client *clp,
u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
LIST_HEAD(free_me_list);
- lo = get_layout_by_fh(clp, &args->cbl_fh);
+ lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
if (!lo)
- return NFS4ERR_NOMATCHING_LAYOUT;
+ goto out;
ino = lo->plh_inode;
spin_lock(&ino->i_lock);
@@ -179,6 +183,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
pnfs_free_lseg_list(&free_me_list);
pnfs_put_layout_hdr(lo);
iput(ino);
+out:
return rv;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a48fe4b84b6..d9f3d067cd15 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -69,21 +69,28 @@ const struct address_space_operations nfs_dir_aops = {
static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
{
+ struct nfs_inode *nfsi = NFS_I(dir);
struct nfs_open_dir_context *ctx;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (ctx != NULL) {
ctx->duped = 0;
- ctx->attr_gencount = NFS_I(dir)->attr_gencount;
+ ctx->attr_gencount = nfsi->attr_gencount;
ctx->dir_cookie = 0;
ctx->dup_cookie = 0;
ctx->cred = get_rpccred(cred);
+ spin_lock(&dir->i_lock);
+ list_add(&ctx->list, &nfsi->open_files);
+ spin_unlock(&dir->i_lock);
return ctx;
}
return ERR_PTR(-ENOMEM);
}
-static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
+static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
{
+ spin_lock(&dir->i_lock);
+ list_del(&ctx->list);
+ spin_unlock(&dir->i_lock);
put_rpccred(ctx->cred);
kfree(ctx);
}
@@ -126,7 +133,7 @@ out:
static int
nfs_closedir(struct inode *inode, struct file *filp)
{
- put_nfs_open_dir_context(filp->private_data);
+ put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data);
return 0;
}
@@ -306,10 +313,9 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
if (printk_ratelimit()) {
pr_notice("NFS: directory %pD2 contains a readdir loop."
"Please contact your server vendor. "
- "The file: %s has duplicate cookie %llu\n",
- desc->file,
- array->array[i].string.name,
- *desc->dir_cookie);
+ "The file: %.*s has duplicate cookie %llu\n",
+ desc->file, array->array[i].string.len,
+ array->array[i].string.name, *desc->dir_cookie);
}
status = -ELOOP;
goto out;
@@ -437,6 +443,22 @@ void nfs_advise_use_readdirplus(struct inode *dir)
set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
}
+/*
+ * This function is mainly for use by nfs_getattr().
+ *
+ * If this is an 'ls -l', we want to force use of readdirplus.
+ * Do this by checking if there is an active file descriptor
+ * and calling nfs_advise_use_readdirplus, then forcing a
+ * cache flush.
+ */
+void nfs_force_use_readdirplus(struct inode *dir)
+{
+ if (!list_empty(&NFS_I(dir)->open_files)) {
+ nfs_advise_use_readdirplus(dir);
+ nfs_zap_mapping(dir, dir->i_mapping);
+ }
+}
+
static
void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
{
@@ -815,6 +837,17 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
goto out;
}
+static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
+{
+ struct nfs_inode *nfsi = NFS_I(dir);
+
+ if (nfs_attribute_cache_expired(dir))
+ return true;
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ return true;
+ return false;
+}
+
/* The file offset position represents the dirent entry number. A
last cookie cache takes care of the common case of reading the
whole directory.
@@ -847,7 +880,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
nfs_block_sillyrename(dentry);
- if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
+ if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
goto out;
@@ -1911,6 +1944,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct dentry *dentry = NULL, *rehash = NULL;
+ struct rpc_task *task;
int error = -EBUSY;
dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
@@ -1958,8 +1992,16 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode != NULL)
NFS_PROTO(new_inode)->return_delegation(new_inode);
- error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name);
+ task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+ if (IS_ERR(task)) {
+ error = PTR_ERR(task);
+ goto out;
+ }
+
+ error = rpc_wait_for_completion_task(task);
+ if (error == 0)
+ error = task->tk_status;
+ rpc_put_task(task);
nfs_mark_for_revalidate(old_inode);
out:
if (rehash)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5bb790a69c71..284ca901fe16 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -617,6 +617,7 @@ out:
static const struct vm_operations_struct nfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = nfs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c4702baa22b8..0c438973f3c8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -588,6 +588,25 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
+static void nfs_request_parent_use_readdirplus(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+ parent = dget_parent(dentry);
+ nfs_force_use_readdirplus(parent->d_inode);
+ dput(parent);
+}
+
+static bool nfs_need_revalidate_inode(struct inode *inode)
+{
+ if (NFS_I(inode)->cache_validity &
+ (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+ return true;
+ if (nfs_attribute_cache_expired(inode))
+ return true;
+ return false;
+}
+
int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
@@ -616,10 +635,13 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
need_atime = 0;
- if (need_atime)
- err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
- else
- err = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (need_atime || nfs_need_revalidate_inode(inode)) {
+ struct nfs_server *server = NFS_SERVER(inode);
+
+ if (server->caps & NFS_CAP_READDIRPLUS)
+ nfs_request_parent_use_readdirplus(dentry);
+ err = __nfs_revalidate_inode(server, inode);
+ }
if (!err) {
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
@@ -961,9 +983,7 @@ int nfs_attribute_cache_expired(struct inode *inode)
*/
int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
- if (!(NFS_I(inode)->cache_validity &
- (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
- && !nfs_attribute_cache_expired(inode))
+ if (!nfs_need_revalidate_inode(inode))
return NFS_STALE(inode) ? -ESTALE : 0;
return __nfs_revalidate_inode(server, inode);
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b46cf5a67329..dd8bfc2e2464 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -301,6 +301,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const char *ip_addr);
/* dir.c */
+extern void nfs_force_use_readdirplus(struct inode *dir);
extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
@@ -474,6 +475,13 @@ extern int nfs_migrate_page(struct address_space *,
#define nfs_migrate_page NULL
#endif
+/* unlink.c */
+extern struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+ struct dentry *old_dentry, struct dentry *new_dentry,
+ void (*complete)(struct rpc_task *, struct nfs_renamedata *));
+extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
+
/* direct.c */
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index a462ef0fb5d6..db60149c4579 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -479,41 +479,6 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
}
static int
-nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
- struct inode *new_dir, struct qstr *new_name)
-{
- struct nfs_renameargs arg = {
- .old_dir = NFS_FH(old_dir),
- .old_name = old_name,
- .new_dir = NFS_FH(new_dir),
- .new_name = new_name,
- };
- struct nfs_renameres res;
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status = -ENOMEM;
-
- dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
-
- res.old_fattr = nfs_alloc_fattr();
- res.new_fattr = nfs_alloc_fattr();
- if (res.old_fattr == NULL || res.new_fattr == NULL)
- goto out;
-
- status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
- nfs_post_op_update_inode(old_dir, res.old_fattr);
- nfs_post_op_update_inode(new_dir, res.new_fattr);
-out:
- nfs_free_fattr(res.old_fattr);
- nfs_free_fattr(res.new_fattr);
- dprintk("NFS reply rename: %d\n", status);
- return status;
-}
-
-static int
nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
{
struct nfs3_linkargs arg = {
@@ -968,7 +933,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.unlink_setup = nfs3_proc_unlink_setup,
.unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
.unlink_done = nfs3_proc_unlink_done,
- .rename = nfs3_proc_rename,
.rename_setup = nfs3_proc_rename_setup,
.rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
.rename_done = nfs3_proc_rename_done,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a5b27c2d9689..e1d1badbe53c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -427,6 +427,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
extern void nfs_inode_find_state_and_recover(struct inode *inode,
const nfs4_stateid *stateid);
+extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *);
extern void nfs4_schedule_lease_recovery(struct nfs_client *);
extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
@@ -500,6 +501,16 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei
return memcmp(dst, src, sizeof(*dst)) == 0;
}
+static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+ return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0;
+}
+
+static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2)
+{
+ return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
+}
+
static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
{
return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0e46d3d1b6cc..aa9ef4876046 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -531,6 +531,13 @@ int nfs40_walk_client_list(struct nfs_client *new,
*result = pos;
dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
__func__, pos, atomic_read(&pos->cl_count));
+ goto out;
+ case -ERESTARTSYS:
+ case -ETIMEDOUT:
+ /* The callback path may have been inadvertently
+ * changed. Schedule recovery!
+ */
+ nfs4_schedule_path_down_recovery(pos);
default:
goto out;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 450bfedbe2f4..397be39c6dc8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1068,6 +1068,7 @@ static void nfs4_opendata_free(struct kref *kref)
dput(p->dentry);
nfs_sb_deactive(sb);
nfs_fattr_free_names(&p->f_attr);
+ kfree(p->f_attr.mdsthreshold);
kfree(p);
}
@@ -1137,12 +1138,71 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
nfs4_state_set_mode_locked(state, state->state | fmode);
}
-static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
+{
+ struct nfs_client *clp = state->owner->so_server->nfs_client;
+ bool need_recover = false;
+
+ if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly)
+ need_recover = true;
+ if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly)
+ need_recover = true;
+ if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr)
+ need_recover = true;
+ if (need_recover)
+ nfs4_state_mark_reclaim_nograce(clp, state);
+}
+
+static bool nfs_need_update_open_stateid(struct nfs4_state *state,
+ nfs4_stateid *stateid)
+{
+ if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
+ return true;
+ if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ nfs_test_and_clear_all_open_stateid(state);
+ return true;
+ }
+ if (nfs4_stateid_is_newer(stateid, &state->open_stateid))
+ return true;
+ return false;
+}
+
+static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+ nfs4_stateid *stateid, fmode_t fmode)
{
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ switch (fmode & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_WRITE:
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ break;
+ case FMODE_READ:
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ break;
+ case 0:
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_OPEN_STATE, &state->flags);
+ }
+ if (stateid == NULL)
+ return;
+ if (!nfs_need_update_open_stateid(state, stateid))
+ return;
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
nfs4_stateid_copy(&state->stateid, stateid);
nfs4_stateid_copy(&state->open_stateid, stateid);
- set_bit(NFS_OPEN_STATE, &state->flags);
+}
+
+static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+{
+ write_seqlock(&state->seqlock);
+ nfs_clear_open_stateid_locked(state, stateid, fmode);
+ write_sequnlock(&state->seqlock);
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
+ nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+}
+
+static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+{
switch (fmode) {
case FMODE_READ:
set_bit(NFS_O_RDONLY_STATE, &state->flags);
@@ -1153,13 +1213,11 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
case FMODE_READ|FMODE_WRITE:
set_bit(NFS_O_RDWR_STATE, &state->flags);
}
-}
-
-static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
-{
- write_seqlock(&state->seqlock);
- nfs_set_open_stateid_locked(state, stateid, fmode);
- write_sequnlock(&state->seqlock);
+ if (!nfs_need_update_open_stateid(state, stateid))
+ return;
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
+ nfs4_stateid_copy(&state->stateid, stateid);
+ nfs4_stateid_copy(&state->open_stateid, stateid);
}
static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
@@ -1217,6 +1275,8 @@ no_delegation:
__update_open_stateid(state, open_stateid, NULL, fmode);
ret = 1;
}
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
+ nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
return ret;
}
@@ -1450,12 +1510,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
struct nfs4_state *newstate;
int ret;
+ /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
/* memory barrier prior to reading state->n_* */
clear_bit(NFS_DELEGATED_STATE, &state->flags);
clear_bit(NFS_OPEN_STATE, &state->flags);
smp_rmb();
if (state->n_rdwr != 0) {
- clear_bit(NFS_O_RDWR_STATE, &state->flags);
ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
if (ret != 0)
return ret;
@@ -1463,7 +1526,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
return -ESTALE;
}
if (state->n_wronly != 0) {
- clear_bit(NFS_O_WRONLY_STATE, &state->flags);
ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
if (ret != 0)
return ret;
@@ -1471,7 +1533,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
return -ESTALE;
}
if (state->n_rdonly != 0) {
- clear_bit(NFS_O_RDONLY_STATE, &state->flags);
ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
if (ret != 0)
return ret;
@@ -2244,10 +2305,12 @@ static int _nfs4_do_open(struct inode *dir,
}
}
- if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
- opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
- if (!opendata->f_attr.mdsthreshold)
- goto err_free_label;
+ if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+ if (!opendata->f_attr.mdsthreshold) {
+ opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
+ if (!opendata->f_attr.mdsthreshold)
+ goto err_free_label;
+ }
opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
}
if (dentry->d_inode != NULL)
@@ -2275,11 +2338,10 @@ static int _nfs4_do_open(struct inode *dir,
if (opendata->file_created)
*opened |= FILE_CREATED;
- if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
+ if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
*ctx_th = opendata->f_attr.mdsthreshold;
- else
- kfree(opendata->f_attr.mdsthreshold);
- opendata->f_attr.mdsthreshold = NULL;
+ opendata->f_attr.mdsthreshold = NULL;
+ }
nfs4_label_free(olabel);
@@ -2289,7 +2351,6 @@ static int _nfs4_do_open(struct inode *dir,
err_free_label:
nfs4_label_free(olabel);
err_opendata_put:
- kfree(opendata->f_attr.mdsthreshold);
nfs4_opendata_put(opendata);
err_put_state_owner:
nfs4_put_state_owner(sp);
@@ -2479,26 +2540,6 @@ static void nfs4_free_closedata(void *data)
kfree(calldata);
}
-static void nfs4_close_clear_stateid_flags(struct nfs4_state *state,
- fmode_t fmode)
-{
- spin_lock(&state->owner->so_lock);
- clear_bit(NFS_O_RDWR_STATE, &state->flags);
- switch (fmode & (FMODE_READ|FMODE_WRITE)) {
- case FMODE_WRITE:
- clear_bit(NFS_O_RDONLY_STATE, &state->flags);
- break;
- case FMODE_READ:
- clear_bit(NFS_O_WRONLY_STATE, &state->flags);
- break;
- case 0:
- clear_bit(NFS_O_RDONLY_STATE, &state->flags);
- clear_bit(NFS_O_WRONLY_STATE, &state->flags);
- clear_bit(NFS_OPEN_STATE, &state->flags);
- }
- spin_unlock(&state->owner->so_lock);
-}
-
static void nfs4_close_done(struct rpc_task *task, void *data)
{
struct nfs4_closedata *calldata = data;
@@ -2517,9 +2558,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
if (calldata->roc)
pnfs_roc_set_barrier(state->inode,
calldata->roc_barrier);
- nfs_set_open_stateid(state, &calldata->res.stateid, 0);
+ nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
renew_lease(server, calldata->timestamp);
- break;
+ goto out_release;
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_OLD_STATEID:
@@ -2533,7 +2574,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
goto out_release;
}
}
- nfs4_close_clear_stateid_flags(state, calldata->arg.fmode);
+ nfs_clear_open_stateid(state, NULL, calldata->arg.fmode);
out_release:
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -3507,49 +3548,6 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
return 1;
}
-static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
- struct inode *new_dir, struct qstr *new_name)
-{
- struct nfs_server *server = NFS_SERVER(old_dir);
- struct nfs_renameargs arg = {
- .old_dir = NFS_FH(old_dir),
- .new_dir = NFS_FH(new_dir),
- .old_name = old_name,
- .new_name = new_name,
- };
- struct nfs_renameres res = {
- .server = server,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status = -ENOMEM;
-
- status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
- if (!status) {
- update_changeattr(old_dir, &res.old_cinfo);
- update_changeattr(new_dir, &res.new_cinfo);
- }
- return status;
-}
-
-static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
- struct inode *new_dir, struct qstr *new_name)
-{
- struct nfs4_exception exception = { };
- int err;
- do {
- err = _nfs4_proc_rename(old_dir, old_name,
- new_dir, new_name);
- trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err);
- err = nfs4_handle_exception(NFS_SERVER(old_dir), err,
- &exception);
- } while (exception.retry);
- return err;
-}
-
static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
{
struct nfs_server *server = NFS_SERVER(inode);
@@ -4884,6 +4882,20 @@ nfs4_init_uniform_client_string(const struct nfs_client *clp,
nodename);
}
+/*
+ * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback
+ * services. Advertise one based on the address family of the
+ * clientaddr.
+ */
+static unsigned int
+nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
+{
+ if (strchr(clp->cl_ipaddr, ':') != NULL)
+ return scnprintf(buf, len, "tcp6");
+ else
+ return scnprintf(buf, len, "tcp");
+}
+
/**
* nfs4_proc_setclientid - Negotiate client ID
* @clp: state data structure
@@ -4925,12 +4937,10 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
setclientid.sc_name,
sizeof(setclientid.sc_name));
/* cb_client4 */
- rcu_read_lock();
- setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
- sizeof(setclientid.sc_netid), "%s",
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_NETID));
- rcu_read_unlock();
+ setclientid.sc_netid_len =
+ nfs4_init_callback_netid(clp,
+ setclientid.sc_netid,
+ sizeof(setclientid.sc_netid));
setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
sizeof(setclientid.sc_uaddr), "%s.%u.%u",
clp->cl_ipaddr, port >> 8, port & 255);
@@ -8408,7 +8418,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.unlink_setup = nfs4_proc_unlink_setup,
.unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
.unlink_done = nfs4_proc_unlink_done,
- .rename = nfs4_proc_rename,
.rename_setup = nfs4_proc_rename_setup,
.rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
.rename_done = nfs4_proc_rename_done,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0deb32105ccf..2349518eef2c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1316,7 +1316,7 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st
return 1;
}
-static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
{
set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -2075,8 +2075,10 @@ again:
switch (status) {
case 0:
break;
- case -NFS4ERR_DELAY:
case -ETIMEDOUT:
+ if (clnt->cl_softrtry)
+ break;
+ case -NFS4ERR_DELAY:
case -EAGAIN:
ssleep(1);
case -NFS4ERR_STALE_CLIENTID:
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 72f3bf1754ef..73ce8d4fe2c8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -203,8 +203,7 @@ static int nfs4_stat_to_errno(int);
2 + encode_verifier_maxsz + 5 + \
nfs4_label_maxsz)
#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
- decode_verifier_maxsz + \
- nfs4_label_maxsz + nfs4_fattr_maxsz)
+ decode_verifier_maxsz)
#define encode_readlink_maxsz (op_encode_hdr_maxsz)
#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
#define encode_write_maxsz (op_encode_hdr_maxsz + \
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4755858e37a0..cb53d450ae32 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -662,7 +662,18 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
*/
static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
{
- return (s32)s1 - (s32)s2 > 0;
+ return (s32)(s1 - s2) > 0;
+}
+
+static void
+pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new,
+ struct list_head *free_me_list)
+{
+ if (nfs4_stateid_match_other(&lo->plh_stateid, new))
+ return;
+ /* Layout is new! Kill existing layout segments */
+ pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
}
/* update lo->plh_stateid with new if is more recent */
@@ -1315,6 +1326,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct nfs4_layoutget_res *res = &lgp->res;
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
+ LIST_HEAD(free_me);
int status = 0;
/* Inject layout blob into I/O device driver */
@@ -1341,6 +1353,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget_reply;
}
+ /* Check that the new stateid matches the old stateid */
+ pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
/* Done processing layoutget. Set the layout stateid */
pnfs_set_layout_stateid(lo, &res->stateid, false);
@@ -1355,6 +1369,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
}
spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me);
return lseg;
out:
return ERR_PTR(status);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fddbba2d9eff..e55ce9e8b034 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -357,30 +357,6 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
}
static int
-nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
- struct inode *new_dir, struct qstr *new_name)
-{
- struct nfs_renameargs arg = {
- .old_dir = NFS_FH(old_dir),
- .old_name = old_name,
- .new_dir = NFS_FH(new_dir),
- .new_name = new_name,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs_procedures[NFSPROC_RENAME],
- .rpc_argp = &arg,
- };
- int status;
-
- dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
- status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
- nfs_mark_for_revalidate(old_dir);
- nfs_mark_for_revalidate(new_dir);
- dprintk("NFS reply rename: %d\n", status);
- return status;
-}
-
-static int
nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
{
struct nfs_linkargs arg = {
@@ -745,7 +721,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.unlink_setup = nfs_proc_unlink_setup,
.unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
.unlink_done = nfs_proc_unlink_done,
- .rename = nfs_proc_rename,
.rename_setup = nfs_proc_rename_setup,
.rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
.rename_done = nfs_proc_rename_done,
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 11d78944de79..de54129336c6 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -14,6 +14,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/namei.h>
+#include <linux/fsnotify.h>
#include "internal.h"
#include "nfs4_fs.h"
@@ -353,8 +354,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
return;
}
- if (task->tk_status != 0)
- nfs_cancel_async_unlink(old_dentry);
+ if (data->complete)
+ data->complete(task, data);
}
/**
@@ -399,9 +400,10 @@ static const struct rpc_call_ops nfs_rename_ops = {
*
* It's expected that valid references to the dentries and inodes are held
*/
-static struct rpc_task *
+struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
- struct dentry *old_dentry, struct dentry *new_dentry)
+ struct dentry *old_dentry, struct dentry *new_dentry,
+ void (*complete)(struct rpc_task *, struct nfs_renamedata *))
{
struct nfs_renamedata *data;
struct rpc_message msg = { };
@@ -438,6 +440,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
data->new_dentry = dget(new_dentry);
nfs_fattr_init(&data->old_fattr);
nfs_fattr_init(&data->new_fattr);
+ data->complete = complete;
/* set up nfs_renameargs */
data->args.old_dir = NFS_FH(old_dir);
@@ -456,6 +459,27 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
return rpc_run_task(&task_setup_data);
}
+/*
+ * Perform tasks needed when a sillyrename is done such as cancelling the
+ * queued async unlink if it failed.
+ */
+static void
+nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ struct dentry *dentry = data->old_dentry;
+
+ if (task->tk_status != 0) {
+ nfs_cancel_async_unlink(dentry);
+ return;
+ }
+
+ /*
+ * vfs_unlink and the like do not issue this when a file is
+ * sillyrenamed, so do it here.
+ */
+ fsnotify_nameremove(dentry, 0);
+}
+
#define SILLYNAME_PREFIX ".nfs"
#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1)
#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1)
@@ -548,7 +572,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
}
/* run the rename task, undo unlink if it fails */
- task = nfs_async_rename(dir, dir, dentry, sdentry);
+ task = nfs_async_rename(dir, dir, dentry, sdentry,
+ nfs_complete_sillyrename);
if (IS_ERR(task)) {
error = -EBUSY;
nfs_cancel_async_unlink(dentry);
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index a812fd1b92a4..b481e1f5eecc 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -39,9 +39,13 @@ struct nfs4_acl;
struct svc_fh;
struct svc_rqst;
-/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
- * fit in a page: */
-#define NFS4_ACL_MAX 170
+/*
+ * Maximum ACL we'll accept from a client; chosen (somewhat
+ * arbitrarily) so that kmalloc'ing the ACL shouldn't require a
+ * high-order allocation. This allows 204 ACEs on x86_64:
+ */
+#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
+ / sizeof(struct nfs4_ace))
struct nfs4_acl *nfs4_acl_new(int);
int nfs4_acl_get_whotype(char *, u32);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index d190e33d0ec2..6f3f392d48af 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -542,7 +542,10 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
* up setting a 3-element effective posix ACL with all
* permissions zero.
*/
- nace = 4 + state->users->n + state->groups->n;
+ if (!state->users->n && !state->groups->n)
+ nace = 3;
+ else /* Note we also include a MASK ACE in this case: */
+ nace = 4 + state->users->n + state->groups->n;
pacl = posix_acl_alloc(nace, GFP_KERNEL);
if (!pacl)
return ERR_PTR(-ENOMEM);
@@ -586,9 +589,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
add_to_mask(state, &state->groups->aces[i].perms);
}
- pace++;
- pace->e_tag = ACL_MASK;
- low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
+ if (!state->users->n && !state->groups->n) {
+ pace++;
+ pace->e_tag = ACL_MASK;
+ low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
+ }
pace++;
pace->e_tag = ACL_OTHER;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7f05cd140de3..39c8ef875f91 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
*/
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/slab.h>
#include "nfsd.h"
@@ -635,6 +636,22 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc
}
}
+static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args)
+{
+ struct rpc_xprt *xprt;
+
+ if (args->protocol != XPRT_TRANSPORT_BC_TCP)
+ return rpc_create(args);
+
+ xprt = args->bc_xprt->xpt_bc_xprt;
+ if (xprt) {
+ xprt_get(xprt);
+ return rpc_create_xprt(args, xprt);
+ }
+
+ return rpc_create(args);
+}
+
static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
{
struct rpc_timeout timeparms = {
@@ -674,7 +691,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
args.authflavor = ses->se_cb_sec.flavor;
}
/* Create RPC client */
- client = rpc_create(&args);
+ client = create_backchannel_client(&args);
if (IS_ERR(client)) {
dprintk("NFSD: couldn't create callback client: %ld\n",
PTR_ERR(client));
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 82189b208af3..d543222babf3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1273,6 +1273,8 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
struct nfsd4_op *op;
struct nfsd4_operation *opdesc;
struct nfsd4_compound_state *cstate = &resp->cstate;
+ struct svc_fh *current_fh = &cstate->current_fh;
+ struct svc_fh *save_fh = &cstate->save_fh;
int slack_bytes;
u32 plen = 0;
__be32 status;
@@ -1288,11 +1290,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
resp->tag = args->tag;
resp->opcnt = 0;
resp->rqstp = rqstp;
- resp->cstate.minorversion = args->minorversion;
- resp->cstate.replay_owner = NULL;
- resp->cstate.session = NULL;
- fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
- fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
+ cstate->minorversion = args->minorversion;
+ cstate->replay_owner = NULL;
+ cstate->session = NULL;
+ fh_init(current_fh, NFS4_FHSIZE);
+ fh_init(save_fh, NFS4_FHSIZE);
/*
* Don't use the deferral mechanism for NFSv4; compounds make it
* too hard to avoid non-idempotency problems.
@@ -1345,20 +1347,28 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
opdesc = OPDESC(op);
- if (!cstate->current_fh.fh_dentry) {
+ if (!current_fh->fh_dentry) {
if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
op->status = nfserr_nofilehandle;
goto encode_op;
}
- } else if (cstate->current_fh.fh_export->ex_fslocs.migrated &&
+ } else if (current_fh->fh_export->ex_fslocs.migrated &&
!(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) {
op->status = nfserr_moved;
goto encode_op;
}
+ fh_clear_wcc(current_fh);
+
/* If op is non-idempotent */
if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
plen = opdesc->op_rsize_bop(rqstp, op);
+ /*
+ * If there's still another operation, make sure
+ * we'll have space to at least encode an error:
+ */
+ if (resp->opcnt < args->opcnt)
+ plen += COMPOUND_ERR_SLACK_SPACE;
op->status = nfsd4_check_resp_size(resp, plen);
}
@@ -1377,12 +1387,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
clear_current_stateid(cstate);
if (need_wrongsec_check(rqstp))
- op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
+ op->status = check_nfsd_access(current_fh->fh_export, rqstp);
}
encode_op:
/* Only from SEQUENCE */
- if (resp->cstate.status == nfserr_replay_cache) {
+ if (cstate->status == nfserr_replay_cache) {
dprintk("%s NFS4.1 replay from cache\n", __func__);
status = op->status;
goto out;
@@ -1411,10 +1421,10 @@ encode_op:
nfsd4_increment_op_stats(op->opnum);
}
- resp->cstate.status = status;
- fh_put(&resp->cstate.current_fh);
- fh_put(&resp->cstate.save_fh);
- BUG_ON(resp->cstate.replay_owner);
+ cstate->status = status;
+ fh_put(current_fh);
+ fh_put(save_fh);
+ BUG_ON(cstate->replay_owner);
out:
/* Reset deferral mechanism for RPC deferrals */
rqstp->rq_usedeferral = 1;
@@ -1523,7 +1533,8 @@ static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o
static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
- return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32);
+ return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
+ sizeof(__be32);
}
static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d5d070fbeb35..3ba65979a3cd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1538,7 +1538,7 @@ out_err:
}
/*
- * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size.
+ * Cache a reply. nfsd4_check_resp_size() has bounded the cache size.
*/
void
nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
@@ -1596,7 +1596,7 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
* The sequence operation is not cached because we can use the slot and
* session values.
*/
-__be32
+static __be32
nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
struct nfsd4_sequence *seq)
{
@@ -1605,9 +1605,8 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
dprintk("--> %s slot %p\n", __func__, slot);
- /* Either returns 0 or nfserr_retry_uncached */
status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
- if (status == nfserr_retry_uncached_rep)
+ if (status)
return status;
/* The sequence operation has been encoded, cstate->datap set. */
@@ -2287,7 +2286,8 @@ out:
if (!list_empty(&clp->cl_revoked))
seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
out_no_session:
- kfree(conn);
+ if (conn)
+ free_conn(conn);
spin_unlock(&nn->client_lock);
return status;
out_put_session:
@@ -3627,8 +3627,11 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
return nfserr_bad_stateid;
status = lookup_clientid(&stateid->si_opaque.so_clid, sessions,
nn, &cl);
- if (status == nfserr_stale_clientid)
+ if (status == nfserr_stale_clientid) {
+ if (sessions)
+ return nfserr_bad_stateid;
return nfserr_stale_stateid;
+ }
if (status)
return status;
*s = find_stateid_by_type(cl, stateid, typemask);
@@ -5062,7 +5065,6 @@ nfs4_state_destroy_net(struct net *net)
int i;
struct nfs4_client *clp = NULL;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- struct rb_node *node, *tmp;
for (i = 0; i < CLIENT_HASH_SIZE; i++) {
while (!list_empty(&nn->conf_id_hashtbl[i])) {
@@ -5071,13 +5073,11 @@ nfs4_state_destroy_net(struct net *net)
}
}
- node = rb_first(&nn->unconf_name_tree);
- while (node != NULL) {
- tmp = node;
- node = rb_next(tmp);
- clp = rb_entry(tmp, struct nfs4_client, cl_namenode);
- rb_erase(tmp, &nn->unconf_name_tree);
- destroy_client(clp);
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+ while (!list_empty(&nn->unconf_id_hashtbl[i])) {
+ clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
+ destroy_client(clp);
+ }
}
kfree(nn->sessionid_hashtbl);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 63f2395c57ed..2723c1badd01 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -294,7 +294,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
READ32(nace);
if (nace > NFS4_ACL_MAX)
- return nfserr_resource;
+ return nfserr_fbig;
*acl = nfs4_acl_new(nace);
if (*acl == NULL)
@@ -1222,7 +1222,6 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
}
write->wr_head.iov_base = p;
write->wr_head.iov_len = avail;
- WARN_ON(avail != (XDR_QUADLEN(avail) << 2));
write->wr_pagelist = argp->pagelist;
len = XDR_QUADLEN(write->wr_buflen) << 2;
@@ -2483,6 +2482,8 @@ out_acl:
goto out;
}
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+ if ((buflen -= 16) < 0)
+ goto out_resource;
WRITE32(3);
WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
@@ -2499,8 +2500,10 @@ out:
security_release_secctx(context, contextlen);
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
kfree(acl);
- if (tempfh)
+ if (tempfh) {
fh_put(tempfh);
+ kfree(tempfh);
+ }
return status;
out_nfserr:
status = nfserrno(err);
@@ -3471,6 +3474,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_test_stateid_id *stateid, *next;
__be32 *p;
+ if (nfserr)
+ return nfserr;
+
RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids));
*p++ = htonl(test_stateid->ts_num_ids);
@@ -3579,8 +3585,6 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
return 0;
session = resp->cstate.session;
- if (session == NULL)
- return 0;
if (xb->page_len == 0) {
length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
@@ -3620,9 +3624,17 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
!nfsd4_enc_ops[op->opnum]);
op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
- /* nfsd4_check_drc_limit guarantees enough room for error status */
+ /* nfsd4_check_resp_size guarantees enough room for error status */
if (!op->status)
op->status = nfsd4_check_resp_size(resp, 0);
+ if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) {
+ struct nfsd4_slot *slot = resp->cstate.slot;
+
+ if (slot->sl_flags & NFSD4_SLOT_CACHETHIS)
+ op->status = nfserr_rep_too_big_to_cache;
+ else
+ op->status = nfserr_rep_too_big;
+ }
if (so) {
so->so_replay.rp_status = op->status;
so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1);
@@ -3691,6 +3703,12 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp)
int
nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args)
{
+ if (rqstp->rq_arg.head[0].iov_len % 4) {
+ /* client is nuts */
+ dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)",
+ __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid));
+ return 0;
+ }
args->p = p;
args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
args->pagelist = rqstp->rq_arg.pages;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7f555179bf81..f34d9de802ab 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -699,6 +699,11 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net)
if (err != 0 || fd < 0)
return -EINVAL;
+ if (svc_alien_sock(net, fd)) {
+ printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__);
+ return -EINVAL;
+ }
+
err = nfsd_create_serv(net);
if (err != 0)
return err;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 30f34ab02137..479eb681c27c 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -282,7 +282,7 @@ void nfsd_lockd_shutdown(void);
* reason.
*/
#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
-#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
+#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */
#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 4775bc4896c8..ad67964d0bb1 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -133,6 +133,17 @@ fh_init(struct svc_fh *fhp, int maxsize)
#ifdef CONFIG_NFSD_V3
/*
+ * The wcc data stored in current_fh should be cleared
+ * between compound ops.
+ */
+static inline void
+fh_clear_wcc(struct svc_fh *fhp)
+{
+ fhp->fh_post_saved = 0;
+ fhp->fh_pre_saved = 0;
+}
+
+/*
* Fill in the pre_op attr for the wcc data
*/
static inline void
@@ -152,7 +163,8 @@ fill_pre_wcc(struct svc_fh *fhp)
extern void fill_post_wcc(struct svc_fh *);
#else
-#define fill_pre_wcc(ignored)
+#define fh_clear_wcc(ignored)
+#define fill_pre_wcc(ignored)
#define fill_post_wcc(notused)
#endif /* CONFIG_NFSD_V3 */
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b17d93214d01..9c769a47ac5a 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -152,7 +152,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
type = (stat->mode & S_IFMT);
*p++ = htonl(nfs_ftypes[type >> 12]);
- *p++ = htonl((u32) (stat->mode & S_IALLUGO));
+ *p++ = htonl((u32) stat->mode);
*p++ = htonl((u32) stat->nlink);
*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 915808b36df7..16f0673a423c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -404,6 +404,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
umode_t ftype = 0;
__be32 err;
int host_err;
+ bool get_write_count;
int size_change = 0;
if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
@@ -411,10 +412,18 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
if (iap->ia_valid & ATTR_SIZE)
ftype = S_IFREG;
+ /* Callers that do fh_verify should do the fh_want_write: */
+ get_write_count = !fhp->fh_dentry;
+
/* Get inode */
err = fh_verify(rqstp, fhp, ftype, accmode);
if (err)
goto out;
+ if (get_write_count) {
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ return nfserrno(host_err);
+ }
dentry = fhp->fh_dentry;
inode = dentry->d_inode;
@@ -1706,10 +1715,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
dput(odentry);
out_nfserr:
err = nfserrno(host_err);
-
- /* we cannot reply on fh_unlock on the two filehandles,
+ /*
+ * We cannot rely on fh_unlock on the two filehandles,
* as that would do the wrong thing if the two directories
- * were the same, so again we do it by hand
+ * were the same, so again we do it by hand.
*/
fill_post_wcc(ffhp);
fill_post_wcc(tfhp);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d278a0d03496..5ea7df305083 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -574,8 +574,6 @@ extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *,
struct nfsd4_setclientid_confirm *setclientid_confirm);
extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
-extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
- struct nfsd4_sequence *seq);
extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 08fdb77852ac..f3a82fbcae02 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -134,6 +134,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static const struct vm_operations_struct nilfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = nilfs_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index 807150e2c2b9..dd6103cc93c1 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -18,16 +18,9 @@
* distribution in the file COPYING); if not, write to the Free Software
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "debug.h"
-/*
- * A static buffer to hold the error string being displayed and a spinlock
- * to protect concurrent accesses to it.
- */
-static char err_buf[1024];
-static DEFINE_SPINLOCK(err_buf_lock);
-
/**
* __ntfs_warning - output a warning to the syslog
* @function: name of function outputting the warning
@@ -50,6 +43,7 @@ static DEFINE_SPINLOCK(err_buf_lock);
void __ntfs_warning(const char *function, const struct super_block *sb,
const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
int flen = 0;
@@ -59,17 +53,15 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
#endif
if (function)
flen = strlen(function);
- spin_lock(&err_buf_lock);
va_start(args, fmt);
- vsnprintf(err_buf, sizeof(err_buf), fmt, args);
- va_end(args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
if (sb)
- printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n",
- sb->s_id, flen ? function : "", err_buf);
+ pr_warn("(device %s): %s(): %pV\n",
+ sb->s_id, flen ? function : "", &vaf);
else
- printk(KERN_ERR "NTFS-fs warning: %s(): %s\n",
- flen ? function : "", err_buf);
- spin_unlock(&err_buf_lock);
+ pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
+ va_end(args);
}
/**
@@ -94,6 +86,7 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
void __ntfs_error(const char *function, const struct super_block *sb,
const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
int flen = 0;
@@ -103,17 +96,15 @@ void __ntfs_error(const char *function, const struct super_block *sb,
#endif
if (function)
flen = strlen(function);
- spin_lock(&err_buf_lock);
va_start(args, fmt);
- vsnprintf(err_buf, sizeof(err_buf), fmt, args);
- va_end(args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
if (sb)
- printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n",
- sb->s_id, flen ? function : "", err_buf);
+ pr_err("(device %s): %s(): %pV\n",
+ sb->s_id, flen ? function : "", &vaf);
else
- printk(KERN_ERR "NTFS-fs error: %s(): %s\n",
- flen ? function : "", err_buf);
- spin_unlock(&err_buf_lock);
+ pr_err("%s(): %pV\n", flen ? function : "", &vaf);
+ va_end(args);
}
#ifdef DEBUG
@@ -124,6 +115,7 @@ int debug_msgs = 0;
void __ntfs_debug (const char *file, int line, const char *function,
const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
int flen = 0;
@@ -131,13 +123,11 @@ void __ntfs_debug (const char *file, int line, const char *function,
return;
if (function)
flen = strlen(function);
- spin_lock(&err_buf_lock);
va_start(args, fmt);
- vsnprintf(err_buf, sizeof(err_buf), fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
va_end(args);
- printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line,
- flen ? function : "", err_buf);
- spin_unlock(&err_buf_lock);
}
/* Dump a runlist. Caller has to provide synchronisation for @rl. */
@@ -149,12 +139,12 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
if (!debug_msgs)
return;
- printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n");
+ pr_debug("Dumping runlist (values in hex):\n");
if (!rl) {
- printk(KERN_DEBUG "Run list not present.\n");
+ pr_debug("Run list not present.\n");
return;
}
- printk(KERN_DEBUG "VCN LCN Run length\n");
+ pr_debug("VCN LCN Run length\n");
for (i = 0; ; i++) {
LCN lcn = (rl + i)->lcn;
@@ -163,13 +153,13 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
if (index > -LCN_ENOENT - 1)
index = 3;
- printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n",
+ pr_debug("%-16Lx %s %-16Lx%s\n",
(long long)(rl + i)->vcn, lcn_str[index],
(long long)(rl + i)->length,
(rl + i)->length ? "" :
" (runlist end)");
} else
- printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n",
+ pr_debug("%-16Lx %-16Lx %-16Lx%s\n",
(long long)(rl + i)->vcn,
(long long)(rl + i)->lcn,
(long long)(rl + i)->length,
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 53c27eaf2307..61bf091e32a8 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -48,7 +48,12 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl);
#else /* !DEBUG */
-#define ntfs_debug(f, a...) do {} while (0)
+#define ntfs_debug(fmt, ...) \
+do { \
+ if (0) \
+ no_printk(fmt, ##__VA_ARGS__); \
+} while (0)
+
#define ntfs_debug_dump_runlist(rl) do {} while (0)
#endif /* !DEBUG */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 9d8153ebacfb..f47af5e6e230 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
iput(bvi);
skip_large_index_stuff:
/* Setup the operations for this index inode. */
- vi->i_op = NULL;
- vi->i_fop = NULL;
vi->i_mapping->a_ops = &ntfs_mst_aops;
vi->i_blocks = ni->allocated_size >> 9;
/*
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index bd5610d48242..9de2491f2926 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -19,6 +19,7 @@
* distribution in the file COPYING); if not, write to the Free Software
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/stddef.h>
#include <linux/init.h>
@@ -1896,7 +1897,7 @@ get_ctx_vol_failed:
vol->minor_ver = vi->minor_ver;
ntfs_attr_put_search_ctx(ctx);
unmap_mft_record(NTFS_I(vol->vol_ino));
- printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver,
+ pr_info("volume version %i.%i.\n", vol->major_ver,
vol->minor_ver);
if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
@@ -3095,7 +3096,7 @@ static int __init init_ntfs_fs(void)
int err = 0;
/* This may be ugly but it results in pretty output so who cares. (-8 */
- printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/"
+ pr_info("driver " NTFS_VERSION " [Flags: R/"
#ifdef NTFS_RW
"W"
#else
@@ -3115,16 +3116,15 @@ static int __init init_ntfs_fs(void)
sizeof(ntfs_index_context), 0 /* offset */,
SLAB_HWCACHE_ALIGN, NULL /* ctor */);
if (!ntfs_index_ctx_cache) {
- printk(KERN_CRIT "NTFS: Failed to create %s!\n",
- ntfs_index_ctx_cache_name);
+ pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
goto ictx_err_out;
}
ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
sizeof(ntfs_attr_search_ctx), 0 /* offset */,
SLAB_HWCACHE_ALIGN, NULL /* ctor */);
if (!ntfs_attr_ctx_cache) {
- printk(KERN_CRIT "NTFS: Failed to create %s!\n",
- ntfs_attr_ctx_cache_name);
+ pr_crit("NTFS: Failed to create %s!\n",
+ ntfs_attr_ctx_cache_name);
goto actx_err_out;
}
@@ -3132,8 +3132,7 @@ static int __init init_ntfs_fs(void)
(NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!ntfs_name_cache) {
- printk(KERN_CRIT "NTFS: Failed to create %s!\n",
- ntfs_name_cache_name);
+ pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
goto name_err_out;
}
@@ -3141,8 +3140,7 @@ static int __init init_ntfs_fs(void)
sizeof(ntfs_inode), 0,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
if (!ntfs_inode_cache) {
- printk(KERN_CRIT "NTFS: Failed to create %s!\n",
- ntfs_inode_cache_name);
+ pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
goto inode_err_out;
}
@@ -3151,15 +3149,14 @@ static int __init init_ntfs_fs(void)
SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
ntfs_big_inode_init_once);
if (!ntfs_big_inode_cache) {
- printk(KERN_CRIT "NTFS: Failed to create %s!\n",
- ntfs_big_inode_cache_name);
+ pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
goto big_inode_err_out;
}
/* Register the ntfs sysctls. */
err = ntfs_sysctl(1);
if (err) {
- printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n");
+ pr_crit("Failed to register NTFS sysctls!\n");
goto sysctl_err_out;
}
@@ -3168,7 +3165,7 @@ static int __init init_ntfs_fs(void)
ntfs_debug("NTFS driver registered successfully.");
return 0; /* Success! */
}
- printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n");
+ pr_crit("Failed to register NTFS filesystem driver!\n");
/* Unregister the ntfs sysctls. */
ntfs_sysctl(0);
@@ -3184,8 +3181,7 @@ actx_err_out:
kmem_cache_destroy(ntfs_index_ctx_cache);
ictx_err_out:
if (!err) {
- printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver "
- "registration...\n");
+ pr_crit("Aborting NTFS filesystem driver registration...\n");
err = -ENOMEM;
}
return err;
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index a4b07730b2e1..b7f57271d49c 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
}
static struct kobj_attribute attr_version =
- __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
+ __ATTR(interface_revision, S_IRUGO, version_show, NULL);
static struct attribute *o2cb_attrs[] = {
&attr_version.attr,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index eb649d23a4de..c6b90e670389 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -137,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] =
static void o2net_sc_connect_completed(struct work_struct *work);
static void o2net_rx_until_empty(struct work_struct *work);
static void o2net_shutdown_sc(struct work_struct *work);
-static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_listen_data_ready(struct sock *sk);
static void o2net_sc_send_keep_req(struct work_struct *work);
static void o2net_idle_timer(unsigned long data);
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
@@ -597,9 +597,9 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
/* see o2net_register_callbacks() */
-static void o2net_data_ready(struct sock *sk, int bytes)
+static void o2net_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
read_lock(&sk->sk_callback_lock);
if (sk->sk_user_data) {
@@ -613,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
}
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ ready(sk);
}
/* see o2net_register_callbacks() */
@@ -916,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
- int ret;
- mm_segment_t oldfs;
- struct kvec vec = {
- .iov_len = len,
- .iov_base = data,
- };
- struct msghdr msg = {
- .msg_iovlen = 1,
- .msg_iov = (struct iovec *)&vec,
- .msg_flags = MSG_DONTWAIT,
- };
-
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
- set_fs(oldfs);
-
- return ret;
+ struct kvec vec = { .iov_len = len, .iov_base = data, };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
+ return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
}
static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
size_t veclen, size_t total)
{
int ret;
- mm_segment_t oldfs;
- struct msghdr msg = {
- .msg_iov = (struct iovec *)vec,
- .msg_iovlen = veclen,
- };
+ struct msghdr msg;
if (sock == NULL) {
ret = -EINVAL;
goto out;
}
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_sendmsg(sock, &msg, total);
- set_fs(oldfs);
- if (ret != total) {
- mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
- total);
- if (ret >= 0)
- ret = -EPIPE; /* should be smarter, I bet */
- goto out;
- }
-
- ret = 0;
+ ret = kernel_sendmsg(sock, &msg, vec, veclen, total);
+ if (likely(ret == total))
+ return 0;
+ mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total);
+ if (ret >= 0)
+ ret = -EPIPE; /* should be smarter, I bet */
out:
- if (ret < 0)
- mlog(0, "returning error: %d\n", ret);
+ mlog(0, "returning error: %d\n", ret);
return ret;
}
@@ -1953,9 +1926,9 @@ static void o2net_accept_many(struct work_struct *work)
cond_resched();
}
-static void o2net_listen_data_ready(struct sock *sk, int bytes)
+static void o2net_listen_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
read_lock(&sk->sk_callback_lock);
ready = sk->sk_user_data;
@@ -1978,7 +1951,6 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
*/
if (sk->sk_state == TCP_LISTEN) {
- mlog(ML_TCP, "bytes: %d\n", bytes);
queue_work(o2net_wq, &o2net_listen_work);
} else {
ready = NULL;
@@ -1987,7 +1959,7 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
out:
read_unlock(&sk->sk_callback_lock);
if (ready != NULL)
- ready(sk, bytes);
+ ready(sk);
}
static int o2net_open_listening_sock(__be32 addr, __be16 port)
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4cbcb65784a3..dc024367110a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -165,7 +165,7 @@ struct o2net_sock_container {
/* original handlers for the sockets */
void (*sc_state_change)(struct sock *sk);
- void (*sc_data_ready)(struct sock *sk, int bytes);
+ void (*sc_data_ready)(struct sock *sk);
u32 sc_msg_key;
u16 sc_msg_type;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ff33c5ef87f2..8970dcf74de5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2367,15 +2367,18 @@ relock:
if (direct_io) {
written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
- ppos, count, ocount);
+ count, ocount);
if (written < 0) {
ret = written;
goto out_dio;
}
} else {
+ struct iov_iter from;
+ iov_iter_init(&from, iov, nr_segs, count, 0);
current->backing_dev_info = file->f_mapping->backing_dev_info;
- written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
- ppos, count, 0);
+ written = generic_perform_write(file, &from, *ppos);
+ if (likely(written >= 0))
+ iocb->ki_pos = *ppos + written;
current->backing_dev_info = NULL;
}
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 5c8343fe7438..83f1a665ae97 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -496,7 +496,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_max_locking_protocol =
- __ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
+ __ATTR(max_locking_protocol, S_IRUGO,
ocfs2_max_locking_protocol_show, NULL);
static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
@@ -528,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
- __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
+ __ATTR(loaded_cluster_plugins, S_IRUGO,
ocfs2_loaded_cluster_plugins_show, NULL);
static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
@@ -550,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
- __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
+ __ATTR(active_cluster_plugin, S_IRUGO,
ocfs2_active_cluster_plugin_show, NULL);
static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
@@ -599,7 +599,7 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
static struct kobj_attribute ocfs2_attr_cluster_stack =
- __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
+ __ATTR(cluster_stack, S_IRUGO | S_IWUSR,
ocfs2_cluster_stack_show,
ocfs2_cluster_stack_store);
diff --git a/fs/open.c b/fs/open.c
index 7b823daa6a93..9d64679cec73 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -652,35 +652,6 @@ out:
return error;
}
-/*
- * You have to be very careful that these write
- * counts get cleaned up in error cases and
- * upon __fput(). This should probably never
- * be called outside of __dentry_open().
- */
-static inline int __get_file_write_access(struct inode *inode,
- struct vfsmount *mnt)
-{
- int error;
- error = get_write_access(inode);
- if (error)
- return error;
- /*
- * Do not take mount writer counts on
- * special files since no writes to
- * the mount itself will occur.
- */
- if (!special_file(inode->i_mode)) {
- /*
- * Balanced in __fput()
- */
- error = __mnt_want_write(mnt);
- if (error)
- put_write_access(inode);
- }
- return error;
-}
-
int open_check_o_direct(struct file *f)
{
/* NB: we're sure to have correct a_ops only after f_op->open */
@@ -705,26 +676,28 @@ static int do_dentry_open(struct file *f,
f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
- if (unlikely(f->f_flags & O_PATH))
- f->f_mode = FMODE_PATH;
-
path_get(&f->f_path);
inode = f->f_inode = f->f_path.dentry->d_inode;
- if (f->f_mode & FMODE_WRITE) {
- error = __get_file_write_access(inode, f->f_path.mnt);
- if (error)
- goto cleanup_file;
- if (!special_file(inode->i_mode))
- file_take_write(f);
- }
-
f->f_mapping = inode->i_mapping;
- if (unlikely(f->f_mode & FMODE_PATH)) {
+ if (unlikely(f->f_flags & O_PATH)) {
+ f->f_mode = FMODE_PATH;
f->f_op = &empty_fops;
return 0;
}
+ if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
+ error = get_write_access(inode);
+ if (unlikely(error))
+ goto cleanup_file;
+ error = __mnt_want_write(f->f_path.mnt);
+ if (unlikely(error)) {
+ put_write_access(inode);
+ goto cleanup_file;
+ }
+ f->f_mode |= FMODE_WRITER;
+ }
+
/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
if (S_ISREG(inode->i_mode))
f->f_mode |= FMODE_ATOMIC_POS;
@@ -761,18 +734,9 @@ static int do_dentry_open(struct file *f,
cleanup_all:
fops_put(f->f_op);
- if (f->f_mode & FMODE_WRITE) {
+ if (f->f_mode & FMODE_WRITER) {
put_write_access(inode);
- if (!special_file(inode->i_mode)) {
- /*
- * We don't consider this a real
- * mnt_want/drop_write() pair
- * because it all happenend right
- * here, so just reset the state.
- */
- file_reset_write(f);
- __mnt_drop_write(f->f_path.mnt);
- }
+ __mnt_drop_write(f->f_path.mnt);
}
cleanup_file:
path_put(&f->f_path);
diff --git a/fs/pipe.c b/fs/pipe.c
index 78fd0d0788db..034bffac3f97 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -142,55 +142,6 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
return 0;
}
-static int
-pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
- int atomic)
-{
- unsigned long copy;
-
- while (len > 0) {
- while (!iov->iov_len)
- iov++;
- copy = min_t(unsigned long, len, iov->iov_len);
-
- if (atomic) {
- if (__copy_to_user_inatomic(iov->iov_base, from, copy))
- return -EFAULT;
- } else {
- if (copy_to_user(iov->iov_base, from, copy))
- return -EFAULT;
- }
- from += copy;
- len -= copy;
- iov->iov_base += copy;
- iov->iov_len -= copy;
- }
- return 0;
-}
-
-/*
- * Attempt to pre-fault in the user memory, so we can use atomic copies.
- * Returns the number of bytes not faulted in.
- */
-static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
-{
- while (!iov->iov_len)
- iov++;
-
- while (len > 0) {
- unsigned long this_len;
-
- this_len = min_t(unsigned long, len, iov->iov_len);
- if (fault_in_pages_writeable(iov->iov_base, this_len))
- break;
-
- len -= this_len;
- iov++;
- }
-
- return len;
-}
-
/*
* Pre-fault in the user memory, so we can use atomic copies.
*/
@@ -226,52 +177,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
}
/**
- * generic_pipe_buf_map - virtually map a pipe buffer
- * @pipe: the pipe that the buffer belongs to
- * @buf: the buffer that should be mapped
- * @atomic: whether to use an atomic map
- *
- * Description:
- * This function returns a kernel virtual address mapping for the
- * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
- * and the caller has to be careful not to fault before calling
- * the unmap function.
- *
- * Note that this function calls kmap_atomic() if @atomic != 0.
- */
-void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf, int atomic)
-{
- if (atomic) {
- buf->flags |= PIPE_BUF_FLAG_ATOMIC;
- return kmap_atomic(buf->page);
- }
-
- return kmap(buf->page);
-}
-EXPORT_SYMBOL(generic_pipe_buf_map);
-
-/**
- * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
- * @pipe: the pipe that the buffer belongs to
- * @buf: the buffer that should be unmapped
- * @map_data: the data that the mapping function returned
- *
- * Description:
- * This function undoes the mapping that ->map() provided.
- */
-void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf, void *map_data)
-{
- if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
- buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
- kunmap_atomic(map_data);
- } else
- kunmap(buf->page);
-}
-EXPORT_SYMBOL(generic_pipe_buf_unmap);
-
-/**
* generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
* @pipe: the pipe that the buffer belongs to
* @buf: the buffer to attempt to steal
@@ -351,8 +256,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release);
static const struct pipe_buf_operations anon_pipe_buf_ops = {
.can_merge = 1,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -361,8 +264,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
static const struct pipe_buf_operations packet_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -379,12 +280,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
ssize_t ret;
struct iovec *iov = (struct iovec *)_iov;
size_t total_len;
+ struct iov_iter iter;
total_len = iov_length(iov, nr_segs);
/* Null read succeeds. */
if (unlikely(total_len == 0))
return 0;
+ iov_iter_init(&iter, iov, nr_segs, total_len, 0);
+
do_wakeup = 0;
ret = 0;
__pipe_lock(pipe);
@@ -394,9 +298,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
int curbuf = pipe->curbuf;
struct pipe_buffer *buf = pipe->bufs + curbuf;
const struct pipe_buf_operations *ops = buf->ops;
- void *addr;
size_t chars = buf->len;
- int error, atomic;
+ size_t written;
+ int error;
if (chars > total_len)
chars = total_len;
@@ -408,21 +312,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
break;
}
- atomic = !iov_fault_in_pages_write(iov, chars);
-redo:
- addr = ops->map(pipe, buf, atomic);
- error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
- ops->unmap(pipe, buf, addr);
- if (unlikely(error)) {
- /*
- * Just retry with the slow path if we failed.
- */
- if (atomic) {
- atomic = 0;
- goto redo;
- }
+ written = copy_page_to_iter(buf->page, buf->offset, chars, &iter);
+ if (unlikely(written < chars)) {
if (!ret)
- ret = error;
+ ret = -EFAULT;
break;
}
ret += chars;
@@ -538,10 +431,16 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
iov_fault_in_pages_read(iov, chars);
redo1:
- addr = ops->map(pipe, buf, atomic);
+ if (atomic)
+ addr = kmap_atomic(buf->page);
+ else
+ addr = kmap(buf->page);
error = pipe_iov_copy_from_user(offset + addr, iov,
chars, atomic);
- ops->unmap(pipe, buf, addr);
+ if (atomic)
+ kunmap_atomic(addr);
+ else
+ kunmap(buf->page);
ret = error;
do_wakeup = 1;
if (error) {
diff --git a/fs/pnode.c b/fs/pnode.c
index 88396df725b4..302bf22c4a30 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m,
}
}
-/*
- * return the source mount to be used for cloning
- *
- * @dest the current destination mount
- * @last_dest the last seen destination mount
- * @last_src the last seen source mount
- * @type return CL_SLAVE if the new mount has to be
- * cloned as a slave.
- */
-static struct mount *get_source(struct mount *dest,
- struct mount *last_dest,
- struct mount *last_src,
- int *type)
+static struct mount *next_group(struct mount *m, struct mount *origin)
{
- struct mount *p_last_src = NULL;
- struct mount *p_last_dest = NULL;
-
- while (last_dest != dest->mnt_master) {
- p_last_dest = last_dest;
- p_last_src = last_src;
- last_dest = last_dest->mnt_master;
- last_src = last_src->mnt_master;
+ while (1) {
+ while (1) {
+ struct mount *next;
+ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ return first_slave(m);
+ next = next_peer(m);
+ if (m->mnt_group_id == origin->mnt_group_id) {
+ if (next == origin)
+ return NULL;
+ } else if (m->mnt_slave.next != &next->mnt_slave)
+ break;
+ m = next;
+ }
+ /* m is the last peer */
+ while (1) {
+ struct mount *master = m->mnt_master;
+ if (m->mnt_slave.next != &master->mnt_slave_list)
+ return next_slave(m);
+ m = next_peer(master);
+ if (master->mnt_group_id == origin->mnt_group_id)
+ break;
+ if (master->mnt_slave.next == &m->mnt_slave)
+ break;
+ m = master;
+ }
+ if (m == origin)
+ return NULL;
}
+}
- if (p_last_dest) {
- do {
- p_last_dest = next_peer(p_last_dest);
- } while (IS_MNT_NEW(p_last_dest));
- /* is that a peer of the earlier? */
- if (dest == p_last_dest) {
- *type = CL_MAKE_SHARED;
- return p_last_src;
+/* all accesses are serialized by namespace_sem */
+static struct user_namespace *user_ns;
+static struct mount *last_dest, *last_source, *dest_master;
+static struct mountpoint *mp;
+static struct hlist_head *list;
+
+static int propagate_one(struct mount *m)
+{
+ struct mount *child;
+ int type;
+ /* skip ones added by this propagate_mnt() */
+ if (IS_MNT_NEW(m))
+ return 0;
+ /* skip if mountpoint isn't covered by it */
+ if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
+ return 0;
+ if (m->mnt_group_id == last_dest->mnt_group_id) {
+ type = CL_MAKE_SHARED;
+ } else {
+ struct mount *n, *p;
+ for (n = m; ; n = p) {
+ p = n->mnt_master;
+ if (p == dest_master || IS_MNT_MARKED(p)) {
+ while (last_dest->mnt_master != p) {
+ last_source = last_source->mnt_master;
+ last_dest = last_source->mnt_parent;
+ }
+ if (n->mnt_group_id != last_dest->mnt_group_id) {
+ last_source = last_source->mnt_master;
+ last_dest = last_source->mnt_parent;
+ }
+ break;
+ }
}
+ type = CL_SLAVE;
+ /* beginning of peer group among the slaves? */
+ if (IS_MNT_SHARED(m))
+ type |= CL_MAKE_SHARED;
}
- /* slave of the earlier, then */
- *type = CL_SLAVE;
- /* beginning of peer group among the slaves? */
- if (IS_MNT_SHARED(dest))
- *type |= CL_MAKE_SHARED;
- return last_src;
+
+ /* Notice when we are propagating across user namespaces */
+ if (m->mnt_ns->user_ns != user_ns)
+ type |= CL_UNPRIVILEGED;
+ child = copy_tree(last_source, last_source->mnt.mnt_root, type);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+ mnt_set_mountpoint(m, mp, child);
+ last_dest = m;
+ last_source = child;
+ if (m->mnt_master != dest_master) {
+ read_seqlock_excl(&mount_lock);
+ SET_MNT_MARK(m->mnt_master);
+ read_sequnlock_excl(&mount_lock);
+ }
+ hlist_add_head(&child->mnt_hash, list);
+ return 0;
}
/*
@@ -222,56 +270,48 @@ static struct mount *get_source(struct mount *dest,
int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
struct mount *source_mnt, struct hlist_head *tree_list)
{
- struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
- struct mount *m, *child;
+ struct mount *m, *n;
int ret = 0;
- struct mount *prev_dest_mnt = dest_mnt;
- struct mount *prev_src_mnt = source_mnt;
- HLIST_HEAD(tmp_list);
-
- for (m = propagation_next(dest_mnt, dest_mnt); m;
- m = propagation_next(m, dest_mnt)) {
- int type;
- struct mount *source;
-
- if (IS_MNT_NEW(m))
- continue;
-
- source = get_source(m, prev_dest_mnt, prev_src_mnt, &type);
-
- /* Notice when we are propagating across user namespaces */
- if (m->mnt_ns->user_ns != user_ns)
- type |= CL_UNPRIVILEGED;
-
- child = copy_tree(source, source->mnt.mnt_root, type);
- if (IS_ERR(child)) {
- ret = PTR_ERR(child);
- tmp_list = *tree_list;
- tmp_list.first->pprev = &tmp_list.first;
- INIT_HLIST_HEAD(tree_list);
+
+ /*
+ * we don't want to bother passing tons of arguments to
+ * propagate_one(); everything is serialized by namespace_sem,
+ * so globals will do just fine.
+ */
+ user_ns = current->nsproxy->mnt_ns->user_ns;
+ last_dest = dest_mnt;
+ last_source = source_mnt;
+ mp = dest_mp;
+ list = tree_list;
+ dest_master = dest_mnt->mnt_master;
+
+ /* all peers of dest_mnt, except dest_mnt itself */
+ for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
+ ret = propagate_one(n);
+ if (ret)
goto out;
- }
+ }
- if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) {
- mnt_set_mountpoint(m, dest_mp, child);
- hlist_add_head(&child->mnt_hash, tree_list);
- } else {
- /*
- * This can happen if the parent mount was bind mounted
- * on some subdirectory of a shared/slave mount.
- */
- hlist_add_head(&child->mnt_hash, &tmp_list);
- }
- prev_dest_mnt = m;
- prev_src_mnt = child;
+ /* all slave groups */
+ for (m = next_group(dest_mnt, dest_mnt); m;
+ m = next_group(m, dest_mnt)) {
+ /* everything in that slave group */
+ n = m;
+ do {
+ ret = propagate_one(n);
+ if (ret)
+ goto out;
+ n = next_peer(n);
+ } while (n != m);
}
out:
- lock_mount_hash();
- while (!hlist_empty(&tmp_list)) {
- child = hlist_entry(tmp_list.first, struct mount, mnt_hash);
- umount_tree(child, 0);
+ read_seqlock_excl(&mount_lock);
+ hlist_for_each_entry(n, tree_list, mnt_hash) {
+ m = n->mnt_parent;
+ if (m->mnt_master != dest_mnt->mnt_master)
+ CLEAR_MNT_MARK(m->mnt_master);
}
- unlock_mount_hash();
+ read_sequnlock_excl(&mount_lock);
return ret;
}
diff --git a/fs/pnode.h b/fs/pnode.h
index fc28a27fa892..4a246358b031 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -16,6 +16,9 @@
#define IS_MNT_NEW(m) (!(m)->mnt_ns)
#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
+#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
+#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
+#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
#define CL_EXPIRE 0x01
#define CL_SLAVE 0x02
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 656e401794de..64db2bceac59 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -138,8 +138,8 @@ static const char * const task_state_array[] = {
"D (disk sleep)", /* 2 */
"T (stopped)", /* 4 */
"t (tracing stop)", /* 8 */
- "Z (zombie)", /* 16 */
- "X (dead)", /* 32 */
+ "X (dead)", /* 16 */
+ "Z (zombie)", /* 32 */
};
static inline const char *get_task_state(struct task_struct *tsk)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b9760628e1fd..2d696b0c93bf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -200,41 +200,9 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
-static int proc_pid_cmdline(struct task_struct *task, char * buffer)
+static int proc_pid_cmdline(struct task_struct *task, char *buffer)
{
- int res = 0;
- unsigned int len;
- struct mm_struct *mm = get_task_mm(task);
- if (!mm)
- goto out;
- if (!mm->arg_end)
- goto out_mm; /* Shh! No looking before we're done */
-
- len = mm->arg_end - mm->arg_start;
-
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
-
- res = access_process_vm(task, mm->arg_start, buffer, len, 0);
-
- // If the nul at the end of args has been overwritten, then
- // assume application is using setproctitle(3).
- if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
- len = strnlen(buffer, res);
- if (len < res) {
- res = len;
- } else {
- len = mm->env_end - mm->env_start;
- if (len > PAGE_SIZE - res)
- len = PAGE_SIZE - res;
- res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
- res = strnlen(buffer, res);
- }
- }
-out_mm:
- mmput(mm);
-out:
- return res;
+ return get_cmdline(task, buffer, PAGE_SIZE);
}
static int proc_pid_auxv(struct task_struct *task, char *buffer)
@@ -1236,6 +1204,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
if (*end)
return -EINVAL;
+ if (make_it_fail < 0 || make_it_fail > 1)
+ return -EINVAL;
+
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
@@ -2588,7 +2559,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("environ", S_IRUSR, proc_environ_operations),
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
- ONE("personality", S_IRUGO, proc_pid_personality),
+ ONE("personality", S_IRUSR, proc_pid_personality),
INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2598,7 +2569,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- INF("syscall", S_IRUGO, proc_pid_syscall),
+ INF("syscall", S_IRUSR, proc_pid_syscall),
#endif
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
@@ -2617,7 +2588,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
- REG("pagemap", S_IRUGO, proc_pagemap_operations),
+ REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2626,7 +2597,7 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
- ONE("stack", S_IRUGO, proc_pid_stack),
+ ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -2927,14 +2898,14 @@ static const struct pid_entry tid_base_stuff[] = {
REG("environ", S_IRUSR, proc_environ_operations),
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
- ONE("personality", S_IRUGO, proc_pid_personality),
+ ONE("personality", S_IRUSR, proc_pid_personality),
INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- INF("syscall", S_IRUGO, proc_pid_syscall),
+ INF("syscall", S_IRUSR, proc_pid_syscall),
#endif
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tid_stat),
@@ -2955,7 +2926,7 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_tid_smaps_operations),
- REG("pagemap", S_IRUGO, proc_pagemap_operations),
+ REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2964,7 +2935,7 @@ static const struct pid_entry tid_base_stuff[] = {
INF("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
- ONE("stack", S_IRUGO, proc_pid_stack),
+ ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, proc_pid_schedstat),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 985ea881b5bc..0788d093f5d8 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -11,6 +11,7 @@
#include <linux/proc_fs.h>
+#include "../mount.h"
#include "internal.h"
#include "fd.h"
@@ -48,8 +49,9 @@ static int seq_show(struct seq_file *m, void *v)
}
if (!ret) {
- seq_printf(m, "pos:\t%lli\nflags:\t0%o\n",
- (long long)file->f_pos, f_flags);
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+ (long long)file->f_pos, f_flags,
+ real_mount(file->f_path.mnt)->mnt_id);
if (file->f_op->show_fdinfo)
ret = file->f_op->show_fdinfo(m, file);
fput(file);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8f20e3404fd2..0adbc02d60e3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -47,7 +47,7 @@ static void proc_evict_inode(struct inode *inode)
pde_put(de);
head = PROC_I(inode)->sysctl;
if (head) {
- rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
+ RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
sysctl_head_put(head);
}
/* Release any associated namespace */
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 136e548d9567..7445af0b1aa3 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -73,7 +73,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
available += pagecache;
/*
- * Part of the reclaimable swap consists of items that are in use,
+ * Part of the reclaimable slab consists of items that are in use,
* and cannot be freed. Cap this estimate at the low watermark.
*/
available += global_page_state(NR_SLAB_RECLAIMABLE) -
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 9ae46b87470d..89026095f2b5 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
struct task_struct *task;
void *ns;
char name[50];
- int len = -EACCES;
+ int res = -EACCES;
task = get_proc_task(inode);
if (!task)
@@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out_put_task;
- len = -ENOENT;
+ res = -ENOENT;
ns = ns_ops->get(task);
if (!ns)
goto out_put_task;
snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
- len = strlen(name);
-
- if (len > buflen)
- len = buflen;
- if (copy_to_user(buffer, name, len))
- len = -EFAULT;
-
+ res = readlink_copy(buffer, buflen, name);
ns_ops->put(ns);
out_put_task:
put_task_struct(task);
out:
- return len;
+ return res;
}
static const struct inode_operations proc_ns_link_inode_operations = {
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ffeb202ec942..4348bb8907c2 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
if (!tgid)
return -ENOENT;
sprintf(tmp, "%d", tgid);
- return vfs_readlink(dentry,buffer,buflen,tmp);
+ return readlink_copy(buffer, buflen, tmp);
}
static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fb52b548080d..442177b1119a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
@@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
/*
* We remember last_addr rather than next_addr to hit with
- * mmap_cache most of the time. We have zero last_addr at
+ * vmacache most of the time. We have zero last_addr at
* the beginning and also after lseek. We will have -1 last_addr
* after the end of the vmas.
*/
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 88d4585b30f1..6a8e785b29da 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -484,7 +484,6 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
phdr_ptr->p_memsz = real_sz;
if (real_sz == 0) {
pr_warn("Warning: Zero PT_NOTE entries found\n");
- return -EINVAL;
}
}
@@ -671,7 +670,6 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
phdr_ptr->p_memsz = real_sz;
if (real_sz == 0) {
pr_warn("Warning: Zero PT_NOTE entries found\n");
- return -EINVAL;
}
}
@@ -1118,4 +1116,3 @@ void vmcore_cleanup(void)
}
free_elfcorebuf();
}
-EXPORT_SYMBOL_GPL(vmcore_cleanup);
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 7be26f03a3f5..1a81373947f3 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -267,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
p->root = root;
p->m.poll_event = ns->event;
p->show = show;
+ p->cached_event = ~0ULL;
return 0;
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 880fd9884366..c51df1dd237e 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -8,9 +8,10 @@ config QUOTA
help
If you say Y here, you will be able to set per user limits for disk
usage (also called disk quotas). Currently, it works for the
- ext2, ext3, and reiserfs file system. ext3 also supports journalled
- quotas for which you don't need to run quotacheck(8) after an unclean
- shutdown.
+ ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems.
+ Note that gfs2 and xfs use their own quota system.
+ Ext3, ext4 and reiserfs also support journaled quotas for which
+ you don't need to run quotacheck(8) after an unclean shutdown.
For further details, read the Quota mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>, or the documentation provided
with the quota tools. Probably the quota support is only useful for
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 1fd2051109a3..af677353a3f5 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -125,6 +125,7 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
int d_reclen;
char *d_name;
ino_t d_ino;
+ loff_t cur_pos = deh_offset(deh);
if (!de_visible(deh))
/* it is hidden entry */
@@ -196,8 +197,9 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
if (local_buf != small_buf) {
kfree(local_buf);
}
- // next entry should be looked for with such offset
- next_pos = deh_offset(deh) + 1;
+
+ /* deh_offset(deh) may be invalid now. */
+ next_pos = cur_pos + 1;
if (item_moved(&tmp_ih, &path_to_entry)) {
set_cpu_key_k_offset(&pos_key,
diff --git a/fs/splice.c b/fs/splice.c
index 12028fa41def..9bc07d2b53cf 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -136,8 +136,6 @@ error:
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = page_cache_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = page_cache_pipe_buf_steal,
@@ -156,8 +154,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = user_page_pipe_buf_steal,
@@ -547,8 +543,6 @@ EXPORT_SYMBOL(generic_file_splice_read);
static const struct pipe_buf_operations default_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -564,8 +558,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a socket and similar. */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_nosteal,
@@ -767,13 +759,13 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
goto out;
if (buf->page != page) {
- char *src = buf->ops->map(pipe, buf, 1);
+ char *src = kmap_atomic(buf->page);
char *dst = kmap_atomic(page);
memcpy(dst + offset, src + buf->offset, this_len);
flush_dcache_page(page);
kunmap_atomic(dst);
- buf->ops->unmap(pipe, buf, src);
+ kunmap_atomic(src);
}
ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
page, fsdata);
@@ -1067,9 +1059,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
void *data;
loff_t tmp = sd->pos;
- data = buf->ops->map(pipe, buf, 0);
+ data = kmap(buf->page);
ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
- buf->ops->unmap(pipe, buf, data);
+ kunmap(buf->page);
return ret;
}
@@ -1528,116 +1520,48 @@ static int get_iovec_page_array(const struct iovec __user *iov,
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
{
- char *src;
- int ret;
-
- /*
- * See if we can use the atomic maps, by prefaulting in the
- * pages and doing an atomic copy
- */
- if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
- src = buf->ops->map(pipe, buf, 1);
- ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
- sd->len);
- buf->ops->unmap(pipe, buf, src);
- if (!ret) {
- ret = sd->len;
- goto out;
- }
- }
-
- /*
- * No dice, use slow non-atomic map and copy
- */
- src = buf->ops->map(pipe, buf, 0);
-
- ret = sd->len;
- if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
- ret = -EFAULT;
-
- buf->ops->unmap(pipe, buf, src);
-out:
- if (ret > 0)
- sd->u.userptr += ret;
- return ret;
+ int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
+ return n == sd->len ? n : -EFAULT;
}
/*
* For lack of a better implementation, implement vmsplice() to userspace
* as a simple copy of the pipes pages to the user iov.
*/
-static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
+static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
unsigned long nr_segs, unsigned int flags)
{
struct pipe_inode_info *pipe;
struct splice_desc sd;
- ssize_t size;
- int error;
long ret;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ ssize_t count = 0;
pipe = get_pipe_info(file);
if (!pipe)
return -EBADF;
- pipe_lock(pipe);
-
- error = ret = 0;
- while (nr_segs) {
- void __user *base;
- size_t len;
-
- /*
- * Get user address base and length for this iovec.
- */
- error = get_user(base, &iov->iov_base);
- if (unlikely(error))
- break;
- error = get_user(len, &iov->iov_len);
- if (unlikely(error))
- break;
-
- /*
- * Sanity check this iovec. 0 read succeeds.
- */
- if (unlikely(!len))
- break;
- if (unlikely(!base)) {
- error = -EFAULT;
- break;
- }
-
- if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
- error = -EFAULT;
- break;
- }
-
- sd.len = 0;
- sd.total_len = len;
- sd.flags = flags;
- sd.u.userptr = base;
- sd.pos = 0;
-
- size = __splice_from_pipe(pipe, &sd, pipe_to_user);
- if (size < 0) {
- if (!ret)
- ret = size;
-
- break;
- }
-
- ret += size;
+ ret = rw_copy_check_uvector(READ, uiov, nr_segs,
+ ARRAY_SIZE(iovstack), iovstack, &iov);
+ if (ret <= 0)
+ return ret;
- if (size < len)
- break;
+ iov_iter_init(&iter, iov, nr_segs, count, 0);
- nr_segs--;
- iov++;
- }
+ sd.len = 0;
+ sd.total_len = count;
+ sd.flags = flags;
+ sd.u.data = &iter;
+ sd.pos = 0;
+ pipe_lock(pipe);
+ ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
pipe_unlock(pipe);
- if (!ret)
- ret = error;
+ if (iov != iovstack)
+ kfree(iov);
return ret;
}
diff --git a/fs/super.c b/fs/super.c
index e9dc3c3fe159..48377f7463c0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -800,7 +800,10 @@ void emergency_remount(void)
static DEFINE_IDA(unnamed_dev_ida);
static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
-static int unnamed_dev_start = 0; /* don't bother trying below it */
+/* Many userspace utilities consider an FSID of 0 invalid.
+ * Always return at least 1 from get_anon_bdev.
+ */
+static int unnamed_dev_start = 1;
int get_anon_bdev(dev_t *p)
{
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1b8b91b67fdb..28cc1acd5439 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -453,95 +453,3 @@ void sysfs_remove_bin_file(struct kobject *kobj,
kernfs_remove_by_name(kobj->sd, attr->attr.name);
}
EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
-
-struct sysfs_schedule_callback_struct {
- struct list_head workq_list;
- struct kobject *kobj;
- void (*func)(void *);
- void *data;
- struct module *owner;
- struct work_struct work;
-};
-
-static struct workqueue_struct *sysfs_workqueue;
-static DEFINE_MUTEX(sysfs_workq_mutex);
-static LIST_HEAD(sysfs_workq);
-static void sysfs_schedule_callback_work(struct work_struct *work)
-{
- struct sysfs_schedule_callback_struct *ss = container_of(work,
- struct sysfs_schedule_callback_struct, work);
-
- (ss->func)(ss->data);
- kobject_put(ss->kobj);
- module_put(ss->owner);
- mutex_lock(&sysfs_workq_mutex);
- list_del(&ss->workq_list);
- mutex_unlock(&sysfs_workq_mutex);
- kfree(ss);
-}
-
-/**
- * sysfs_schedule_callback - helper to schedule a callback for a kobject
- * @kobj: object we're acting for.
- * @func: callback function to invoke later.
- * @data: argument to pass to @func.
- * @owner: module owning the callback code
- *
- * sysfs attribute methods must not unregister themselves or their parent
- * kobject (which would amount to the same thing). Attempts to do so will
- * deadlock, since unregistration is mutually exclusive with driver
- * callbacks.
- *
- * Instead methods can call this routine, which will attempt to allocate
- * and schedule a workqueue request to call back @func with @data as its
- * argument in the workqueue's process context. @kobj will be pinned
- * until @func returns.
- *
- * Returns 0 if the request was submitted, -ENOMEM if storage could not
- * be allocated, -ENODEV if a reference to @owner isn't available,
- * -EAGAIN if a callback has already been scheduled for @kobj.
- */
-int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
- void *data, struct module *owner)
-{
- struct sysfs_schedule_callback_struct *ss, *tmp;
-
- if (!try_module_get(owner))
- return -ENODEV;
-
- mutex_lock(&sysfs_workq_mutex);
- list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
- if (ss->kobj == kobj) {
- module_put(owner);
- mutex_unlock(&sysfs_workq_mutex);
- return -EAGAIN;
- }
- mutex_unlock(&sysfs_workq_mutex);
-
- if (sysfs_workqueue == NULL) {
- sysfs_workqueue = create_singlethread_workqueue("sysfsd");
- if (sysfs_workqueue == NULL) {
- module_put(owner);
- return -ENOMEM;
- }
- }
-
- ss = kmalloc(sizeof(*ss), GFP_KERNEL);
- if (!ss) {
- module_put(owner);
- return -ENOMEM;
- }
- kobject_get(kobj);
- ss->kobj = kobj;
- ss->func = func;
- ss->data = data;
- ss->owner = owner;
- INIT_WORK(&ss->work, sysfs_schedule_callback_work);
- INIT_LIST_HEAD(&ss->workq_list);
- mutex_lock(&sysfs_workq_mutex);
- list_add_tail(&ss->workq_list, &sysfs_workq);
- mutex_unlock(&sysfs_workq_mutex);
- queue_work(sysfs_workqueue, &ss->work);
- return 0;
-}
-EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 123c79b7261e..4f34dbae823d 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1538,6 +1538,7 @@ out_unlock:
static const struct vm_operations_struct ubifs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = ubifs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1037637957c7..d2c170f8b035 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -171,7 +171,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
} else
up_write(&iinfo->i_data_sem);
- retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ retval = __generic_file_aio_write(iocb, iov, nr_segs);
mutex_unlock(&inode->i_mutex);
if (retval > 0) {
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 64f2b7334d08..3286db047a40 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -175,7 +175,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
udf_inode_cachep = kmem_cache_create("udf_inode_cache",
sizeof(struct udf_inode_info),
@@ -505,6 +505,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
while ((p = strsep(&options, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
int token;
+ unsigned n;
if (!*p)
continue;
@@ -516,7 +517,10 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
case Opt_bs:
if (match_int(&args[0], &option))
return 0;
- uopt->blocksize = option;
+ n = option;
+ if (n != 512 && n != 1024 && n != 2048 && n != 4096)
+ return 0;
+ uopt->blocksize = n;
uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
break;
case Opt_unhide:
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index a7ea492ae660..0ab1de4b39a5 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -38,7 +38,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned cgno, bit, end_bit, bbase, blkmap, i;
@@ -46,7 +45,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
UFSD("ENTER, fragment %llu, count %u\n",
(unsigned long long)fragment, count);
@@ -135,7 +133,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned overflow, cgno, bit, end_bit, i;
@@ -143,7 +140,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
UFSD("ENTER, fragment %llu, count %u\n",
(unsigned long long)fragment, count);
@@ -499,7 +495,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned cgno, fragno, fragoff, count, fragsize, i;
@@ -509,7 +504,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first (uspi);
count = newcount - oldcount;
cgno = ufs_dtog(uspi, fragment);
@@ -577,7 +571,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned oldcg, i, j, k, allocsize;
@@ -588,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
oldcg = cgno;
/*
@@ -690,7 +682,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cylinder_group * ucg;
u64 result, blkno;
@@ -698,7 +689,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
ucg = ubh_get_ucg(UCPI_UBH(ucpi));
if (goal == 0) {
@@ -794,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
};
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- struct ufs_super_block_first *usb1;
struct ufs_cylinder_group *ucg;
unsigned start, length, loc;
unsigned pos, want, blockmap, mask, end;
@@ -803,7 +792,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx,
(unsigned long long)goal, count);
- usb1 = ubh_get_usb_first (uspi);
ucg = ubh_get_ucg(UCPI_UBH(ucpi));
if (goal)
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index d0426d74817b..98f7211599ff 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -57,7 +57,6 @@ void ufs_free_inode (struct inode * inode)
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
int is_directory;
@@ -67,7 +66,6 @@ void ufs_free_inode (struct inode * inode)
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
ino = inode->i_ino;
@@ -175,7 +173,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
struct super_block * sb;
struct ufs_sb_info * sbi;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
struct inode * inode;
@@ -195,7 +192,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
ufsi = UFS_I(inode);
sbi = UFS_SB(sb);
uspi = sbi->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
mutex_lock(&sbi->s_lock);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index b8c6791f046f..c1183f9f69dc 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -524,11 +524,9 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
struct ufs_buffer_head * ubh;
unsigned char * base, * space;
unsigned size, blks, i;
- struct ufs_super_block_third *usb3;
UFSD("ENTER\n");
- usb3 = ubh_get_usb_third(uspi);
/*
* Read cs structures from (usually) first data block
* on the device.
@@ -1390,15 +1388,11 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct super_block *sb = dentry->d_sb;
struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
unsigned flags = UFS_SB(sb)->s_flags;
- struct ufs_super_block_first *usb1;
- struct ufs_super_block_second *usb2;
struct ufs_super_block_third *usb3;
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
lock_ufs(sb);
- usb1 = ubh_get_usb_first(uspi);
- usb2 = ubh_get_usb_second(uspi);
usb3 = ubh_get_usb_third(uspi);
if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
@@ -1454,7 +1448,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
sizeof(struct ufs_inode_info),
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 75df77d09f75..0479c32c5eb1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1344,6 +1344,14 @@ __xfs_get_blocks(
/*
* If this is O_DIRECT or the mpage code calling tell them how large
* the mapping is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the
+ * mapping for blocks beyond EOF must be marked new so that sub block
+ * regions can be correctly zeroed. We can't do this for mappings within
+ * EOF unless the mapping was just allocated or is unwritten, otherwise
+ * the callers would overwrite existing data with zeros. Hence we have
+ * to split the mapping into a range up to and including EOF, and a
+ * second mapping for beyond EOF.
*/
if (direct || size > (1 << inode->i_blkbits)) {
xfs_off_t mapping_size;
@@ -1354,6 +1362,12 @@ __xfs_get_blocks(
ASSERT(mapping_size > 0);
if (mapping_size > size)
mapping_size = size;
+ if (offset < i_size_read(inode) &&
+ offset + mapping_size >= i_size_read(inode)) {
+ /* limit mapping to block that spans EOF */
+ mapping_size = roundup_64(i_size_read(inode) - offset,
+ 1 << inode->i_blkbits);
+ }
if (mapping_size > LONG_MAX)
mapping_size = LONG_MAX;
@@ -1566,6 +1580,16 @@ xfs_vm_write_failed(
xfs_vm_kill_delalloc_range(inode, block_offset,
block_offset + bh->b_size);
+
+ /*
+ * This buffer does not contain data anymore. make sure anyone
+ * who finds it knows that for certain.
+ */
+ clear_buffer_delay(bh);
+ clear_buffer_uptodate(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_dirty(bh);
}
}
@@ -1599,12 +1623,21 @@ xfs_vm_write_begin(
status = __block_write_begin(page, pos, len, xfs_get_blocks);
if (unlikely(status)) {
struct inode *inode = mapping->host;
+ size_t isize = i_size_read(inode);
xfs_vm_write_failed(inode, page, pos, len);
unlock_page(page);
- if (pos + len > i_size_read(inode))
- truncate_pagecache(inode, i_size_read(inode));
+ /*
+ * If the write is beyond EOF, we only want to kill blocks
+ * allocated in this write, not blocks that were previously
+ * written successfully.
+ */
+ if (pos + len > isize) {
+ ssize_t start = max_t(ssize_t, pos, isize);
+
+ truncate_pagecache_range(inode, start, pos + len);
+ }
page_cache_release(page);
page = NULL;
@@ -1615,9 +1648,12 @@ xfs_vm_write_begin(
}
/*
- * On failure, we only need to kill delalloc blocks beyond EOF because they
- * will never be written. For blocks within EOF, generic_write_end() zeros them
- * so they are safe to leave alone and be written with all the other valid data.
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
*/
STATIC int
xfs_vm_write_end(
@@ -1640,8 +1676,11 @@ xfs_vm_write_end(
loff_t to = pos + len;
if (to > isize) {
- truncate_pagecache(inode, isize);
+ /* only kill blocks in this write beyond EOF */
+ if (pos > isize)
+ isize = pos;
xfs_vm_kill_delalloc_range(inode, isize, to);
+ truncate_pagecache_range(inode, isize, to);
}
}
return ret;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 5b6092ef51ef..f0efc7e970ef 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5413,6 +5413,7 @@ xfs_bmap_shift_extents(
int whichfork = XFS_DATA_FORK;
int logflags;
xfs_filblks_t blockcount = 0;
+ int total_extents;
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5429,7 +5430,6 @@ xfs_bmap_shift_extents(
ASSERT(current_ext != NULL);
ifp = XFS_IFORK_PTR(ip, whichfork);
-
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
/* Read in all the extents */
error = xfs_iread_extents(tp, ip, whichfork);
@@ -5456,7 +5456,6 @@ xfs_bmap_shift_extents(
/* We are going to change core inode */
logflags = XFS_ILOG_CORE;
-
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
@@ -5467,8 +5466,14 @@ xfs_bmap_shift_extents(
logflags |= XFS_ILOG_DEXT;
}
- while (nexts++ < num_exts &&
- *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) {
+ /*
+ * There may be delalloc extents in the data fork before the range we
+ * are collapsing out, so we cannot
+ * use the count of real extents here. Instead we have to calculate it
+ * from the incore fork.
+ */
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ while (nexts++ < num_exts && *current_ext < total_extents) {
gotp = xfs_iext_get_ext(ifp, *current_ext);
xfs_bmbt_get_all(gotp, &got);
@@ -5556,10 +5561,11 @@ xfs_bmap_shift_extents(
}
(*current_ext)++;
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
}
/* Check if we are done */
- if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork))
+ if (*current_ext == total_extents)
*done = 1;
del_cursor:
@@ -5568,6 +5574,5 @@ del_cursor:
error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
xfs_trans_log_inode(tp, ip, logflags);
-
return error;
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 01f6a646caa1..296160b8e78c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1418,6 +1418,8 @@ xfs_zero_file_space(
xfs_off_t end_boundary;
int error;
+ trace_xfs_zero_file_space(ip);
+
granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
/*
@@ -1432,9 +1434,18 @@ xfs_zero_file_space(
ASSERT(end_boundary <= offset + len);
if (start_boundary < end_boundary - 1) {
- /* punch out the page cache over the conversion range */
+ /*
+ * punch out delayed allocation blocks and the page cache over
+ * the conversion range
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, start_boundary),
+ XFS_B_TO_FSB(mp, end_boundary - start_boundary));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
truncate_pagecache_range(VFS_I(ip), start_boundary,
end_boundary - 1);
+
/* convert the blocks */
error = xfs_alloc_file_space(ip, start_boundary,
end_boundary - start_boundary - 1,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 107f2fdfe41f..cb10a0aaab3a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1372,21 +1372,29 @@ xfs_buf_iorequest(
xfs_buf_wait_unpin(bp);
xfs_buf_hold(bp);
- /* Set the count to 1 initially, this will stop an I/O
+ /*
+ * Set the count to 1 initially, this will stop an I/O
* completion callout which happens before we have started
* all the I/O from calling xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
_xfs_buf_ioapply(bp);
- _xfs_buf_ioend(bp, 1);
+ /*
+ * If _xfs_buf_ioapply failed, we'll get back here with
+ * only the reference we took above. _xfs_buf_ioend will
+ * drop it to zero, so we'd better not queue it for later,
+ * or we'll free it before it's done.
+ */
+ _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
xfs_buf_rele(bp);
}
/*
* Waits for I/O to complete on the buffer supplied. It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer. It
- * returns the I/O error code, if any, or 0 if there was no error.
+ * no I/O is pending or there is already a pending error on the buffer, in which
+ * case nothing will ever complete. It returns the I/O error code, if any, or
+ * 0 if there was no error.
*/
int
xfs_buf_iowait(
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 3cb528c4f27c..951a2321ee01 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -679,7 +679,7 @@ xfs_file_dio_aio_write(
goto out;
if (mapping->nrpages) {
- ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
pos, -1);
if (ret)
goto out;
@@ -699,7 +699,7 @@ xfs_file_dio_aio_write(
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
ret = generic_file_direct_write(iocb, iovp,
- &nr_segs, pos, &iocb->ki_pos, count, ocount);
+ &nr_segs, pos, count, ocount);
out:
xfs_rw_iunlock(ip, iolock);
@@ -715,7 +715,7 @@ xfs_file_buffered_aio_write(
const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos,
- size_t ocount)
+ size_t count)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -724,7 +724,7 @@ xfs_file_buffered_aio_write(
ssize_t ret;
int enospc = 0;
int iolock = XFS_IOLOCK_EXCL;
- size_t count = ocount;
+ struct iov_iter from;
xfs_rw_ilock(ip, iolock);
@@ -732,14 +732,15 @@ xfs_file_buffered_aio_write(
if (ret)
goto out;
+ iov_iter_init(&from, iovp, nr_segs, count, 0);
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
write_retry:
trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_buffered_write(iocb, iovp, nr_segs,
- pos, &iocb->ki_pos, count, 0);
-
+ ret = generic_perform_write(file, &from, pos);
+ if (likely(ret >= 0))
+ iocb->ki_pos = pos + ret;
/*
* If we just got an ENOSPC, try to write back all dirty inodes to
* convert delalloc space to free up some of the excess reserved
@@ -1491,6 +1492,7 @@ const struct file_operations xfs_dir_file_operations = {
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = xfs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5e7a38fa6ee6..768087bedbac 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1334,7 +1334,8 @@ int
xfs_create_tmpfile(
struct xfs_inode *dp,
struct dentry *dentry,
- umode_t mode)
+ umode_t mode,
+ struct xfs_inode **ipp)
{
struct xfs_mount *mp = dp->i_mount;
struct xfs_inode *ip = NULL;
@@ -1402,7 +1403,6 @@ xfs_create_tmpfile(
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
ip->i_d.di_nlink--;
- d_tmpfile(dentry, VFS_I(ip));
error = xfs_iunlink(tp, ip);
if (error)
goto out_trans_abort;
@@ -1415,6 +1415,7 @@ xfs_create_tmpfile(
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
+ *ipp = ip;
return 0;
out_trans_abort:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 396cc1fafd0d..f2fcde52b66d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -334,7 +334,7 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
- umode_t mode);
+ umode_t mode, struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bcfe61202115..0b18776b075e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -271,32 +271,6 @@ xfs_open_by_handle(
return error;
}
-/*
- * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's
- * unused first argument.
- */
-STATIC int
-do_readlink(
- char __user *buffer,
- int buflen,
- const char *link)
-{
- int len;
-
- len = PTR_ERR(link);
- if (IS_ERR(link))
- goto out;
-
- len = strlen(link);
- if (len > (unsigned) buflen)
- len = buflen;
- if (copy_to_user(buffer, link, len))
- len = -EFAULT;
- out:
- return len;
-}
-
-
int
xfs_readlink_by_handle(
struct file *parfilp,
@@ -334,7 +308,7 @@ xfs_readlink_by_handle(
error = -xfs_readlink(XFS_I(dentry->d_inode), link);
if (error)
goto out_kfree;
- error = do_readlink(hreq->ohandle, olen, link);
+ error = readlink_copy(hreq->ohandle, olen, link);
if (error)
goto out_kfree;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 89b07e43ca28..ef1ca010f417 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1053,11 +1053,25 @@ xfs_vn_tmpfile(
struct dentry *dentry,
umode_t mode)
{
- int error;
+ int error;
+ struct xfs_inode *ip;
+ struct inode *inode;
- error = xfs_create_tmpfile(XFS_I(dir), dentry, mode);
+ error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
+ if (unlikely(error))
+ return -error;
- return -error;
+ inode = VFS_I(ip);
+
+ error = xfs_init_security(inode, dir, &dentry->d_name);
+ if (unlikely(error)) {
+ iput(inode);
+ return -error;
+ }
+
+ d_tmpfile(dentry, inode);
+
+ return 0;
}
static const struct inode_operations xfs_inode_operations = {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 8497a00e399d..08624dc67317 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1181,11 +1181,14 @@ xlog_iodone(xfs_buf_t *bp)
/* log I/O is always issued ASYNC */
ASSERT(XFS_BUF_ISASYNC(bp));
xlog_state_done_syncing(iclog, aborted);
+
/*
- * do not reference the buffer (bp) here as we could race
- * with it being freed after writing the unmount record to the
- * log.
+ * drop the buffer lock now that we are done. Nothing references
+ * the buffer after this, so an unmount waiting on this lock can now
+ * tear it down safely. As such, it is unsafe to reference the buffer
+ * (bp) after the unlock as we could race with it being freed.
*/
+ xfs_buf_unlock(bp);
}
/*
@@ -1368,8 +1371,16 @@ xlog_alloc_log(
bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
if (!bp)
goto out_free_log;
- bp->b_iodone = xlog_iodone;
+
+ /*
+ * The iclogbuf buffer locks are held over IO but we are not going to do
+ * IO yet. Hence unlock the buffer so that the log IO path can grab it
+ * when appropriately.
+ */
ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_unlock(bp);
+
+ bp->b_iodone = xlog_iodone;
log->l_xbuf = bp;
spin_lock_init(&log->l_icloglock);
@@ -1398,6 +1409,9 @@ xlog_alloc_log(
if (!bp)
goto out_free_iclog;
+ ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_unlock(bp);
+
bp->b_iodone = xlog_iodone;
iclog->ic_bp = bp;
iclog->ic_data = bp->b_addr;
@@ -1422,7 +1436,6 @@ xlog_alloc_log(
iclog->ic_callback_tail = &(iclog->ic_callback);
iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
- ASSERT(xfs_buf_islocked(iclog->ic_bp));
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -1631,6 +1644,12 @@ xlog_cksum(
* we transition the iclogs to IOERROR state *after* flushing all existing
* iclogs to disk. This is because we don't want anymore new transactions to be
* started or completed afterwards.
+ *
+ * We lock the iclogbufs here so that we can serialise against IO completion
+ * during unmount. We might be processing a shutdown triggered during unmount,
+ * and that can occur asynchronously to the unmount thread, and hence we need to
+ * ensure that completes before tearing down the iclogbufs. Hence we need to
+ * hold the buffer lock across the log IO to acheive that.
*/
STATIC int
xlog_bdstrat(
@@ -1638,6 +1657,7 @@ xlog_bdstrat(
{
struct xlog_in_core *iclog = bp->b_fspriv;
+ xfs_buf_lock(bp);
if (iclog->ic_state & XLOG_STATE_IOERROR) {
xfs_buf_ioerror(bp, EIO);
xfs_buf_stale(bp);
@@ -1645,7 +1665,8 @@ xlog_bdstrat(
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
- * doing it here.
+ * doing it here. Similarly, IO completion will unlock the
+ * buffer, so we don't do it here.
*/
return 0;
}
@@ -1847,14 +1868,28 @@ xlog_dealloc_log(
xlog_cil_destroy(log);
/*
- * always need to ensure that the extra buffer does not point to memory
- * owned by another log buffer before we free it.
+ * Cycle all the iclogbuf locks to make sure all log IO completion
+ * is done before we tear down these buffers.
*/
+ iclog = log->l_iclog;
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ xfs_buf_lock(iclog->ic_bp);
+ xfs_buf_unlock(iclog->ic_bp);
+ iclog = iclog->ic_next;
+ }
+
+ /*
+ * Always need to ensure that the extra buffer does not point to memory
+ * owned by another log buffer before we free it. Also, cycle the lock
+ * first to ensure we've completed IO on it.
+ */
+ xfs_buf_lock(log->l_xbuf);
+ xfs_buf_unlock(log->l_xbuf);
xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
xfs_buf_free(log->l_xbuf);
iclog = log->l_iclog;
- for (i=0; i<log->l_iclog_bufs; i++) {
+ for (i = 0; i < log->l_iclog_bufs; i++) {
xfs_buf_free(iclog->ic_bp);
next_iclog = iclog->ic_next;
kmem_free(iclog);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a4ae41c179a8..65d8c793a25c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink);
DEFINE_INODE_EVENT(xfs_inactive_symlink);
DEFINE_INODE_EVENT(xfs_alloc_file_space);
DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_zero_file_space);
DEFINE_INODE_EVENT(xfs_collapse_file_space);
DEFINE_INODE_EVENT(xfs_readdir);
#ifdef CONFIG_XFS_POSIX_ACL