summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/affs/file.c5
-rw-r--r--fs/autofs4/autofs_i.h72
-rw-r--r--fs/autofs4/dev-ioctl.c57
-rw-r--r--fs/autofs4/expire.c84
-rw-r--r--fs/autofs4/init.c10
-rw-r--r--fs/autofs4/inode.c52
-rw-r--r--fs/autofs4/root.c163
-rw-r--r--fs/autofs4/symlink.c11
-rw-r--r--fs/autofs4/waitq.c78
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/block_dev.c55
-rw-r--r--fs/btrfs/async-thread.c2
-rw-r--r--fs/btrfs/backref.c10
-rw-r--r--fs/btrfs/compression.c6
-rw-r--r--fs/btrfs/delayed-inode.c3
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/disk-io.c4
-rw-r--r--fs/btrfs/extent_io.c45
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/free-space-tree.c18
-rw-r--r--fs/btrfs/inode.c52
-rw-r--r--fs/btrfs/ioctl.c119
-rw-r--r--fs/btrfs/relocation.c3
-rw-r--r--fs/btrfs/root-tree.c10
-rw-r--r--fs/btrfs/sysfs.c35
-rw-r--r--fs/btrfs/sysfs.h5
-rw-r--r--fs/btrfs/tests/btrfs-tests.c10
-rw-r--r--fs/btrfs/tests/extent-io-tests.c12
-rw-r--r--fs/btrfs/tests/inode-tests.c8
-rw-r--r--fs/btrfs/tree-log.c14
-rw-r--r--fs/buffer.c24
-rw-r--r--fs/ceph/addr.c4
-rw-r--r--fs/ceph/caps.c27
-rw-r--r--fs/ceph/file.c6
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ceph/mds_client.c16
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifsencrypt.c2
-rw-r--r--fs/cifs/cifsfs.c1
-rw-r--r--fs/cifs/cifsfs.h12
-rw-r--r--fs/cifs/cifssmb.c21
-rw-r--r--fs/cifs/connect.c3
-rw-r--r--fs/cifs/smb2pdu.c24
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/dax.c53
-rw-r--r--fs/dcache.c20
-rw-r--r--fs/devpts/inode.c20
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/efivarfs/file.c70
-rw-r--r--fs/efivarfs/inode.c30
-rw-r--r--fs/efivarfs/internal.h3
-rw-r--r--fs/efivarfs/super.c16
-rw-r--r--fs/eventpoll.c38
-rw-r--r--fs/ext2/file.c19
-rw-r--r--fs/ext2/inode.c16
-rw-r--r--fs/ext4/balloc.c7
-rw-r--r--fs/ext4/crypto.c56
-rw-r--r--fs/ext4/dir.c13
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/extents.c4
-rw-r--r--fs/ext4/file.c28
-rw-r--r--fs/ext4/ialloc.c6
-rw-r--r--fs/ext4/inode.c78
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/move_extent.c16
-rw-r--r--fs/ext4/namei.c26
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/fs-writeback.c39
-rw-r--r--fs/hpfs/namei.c31
-rw-r--r--fs/inode.c6
-rw-r--r--fs/jffs2/README.Locking5
-rw-r--r--fs/jffs2/build.c75
-rw-r--r--fs/jffs2/dir.c11
-rw-r--r--fs/jffs2/file.c39
-rw-r--r--fs/jffs2/gc.c17
-rw-r--r--fs/jffs2/nodelist.h6
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c22
-rw-r--r--fs/ncpfs/dir.c2
-rw-r--r--fs/nfs/blocklayout/extent_tree.c10
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c2
-rw-r--r--fs/nfs/nfs42proc.c119
-rw-r--r--fs/nfs/nfs4proc.c4
-rw-r--r--fs/nfs/pnfs.c175
-rw-r--r--fs/nfs/pnfs.h4
-rw-r--r--fs/notify/mark.c53
-rw-r--r--fs/ocfs2/aops.c1
-rw-r--r--fs/ocfs2/cluster/heartbeat.c24
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h26
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c13
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c127
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c42
-rw-r--r--fs/ocfs2/dlm/dlmthread.c13
-rw-r--r--fs/ocfs2/mmap.c4
-rw-r--r--fs/ocfs2/super.c5
-rw-r--r--fs/overlayfs/dir.c10
-rw-r--r--fs/overlayfs/inode.c2
-rw-r--r--fs/overlayfs/super.c13
-rw-r--r--fs/pnode.c9
-rw-r--r--fs/proc/task_mmu.c73
-rw-r--r--fs/proc/task_nommu.c49
-rw-r--r--fs/read_write.c9
-rw-r--r--fs/super.c1
-rw-r--r--fs/timerfd.c2
-rw-r--r--fs/userfaultfd.c6
-rw-r--r--fs/xattr.c6
-rw-r--r--fs/xfs/xfs_aops.c17
-rw-r--r--fs/xfs/xfs_aops.h1
-rw-r--r--fs/xfs/xfs_bmap_util.c3
-rw-r--r--fs/xfs/xfs_log_recover.c275
114 files changed, 1950 insertions, 1038 deletions
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0548c53f41d5..22fc7c802d69 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -511,8 +511,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
page->index, to);
BUG_ON(to > PAGE_CACHE_SIZE);
- kmap(page);
- data = page_address(page);
bsize = AFFS_SB(sb)->s_data_blksize;
tmp = page->index << PAGE_CACHE_SHIFT;
bidx = tmp / bsize;
@@ -524,14 +522,15 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
BUG_ON(pos + tmp > to || tmp > bsize);
+ data = kmap_atomic(page);
memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
+ kunmap_atomic(data);
affs_brelse(bh);
bidx++;
pos += tmp;
boff = 0;
}
flush_dcache_page(page);
- kunmap(page);
return 0;
}
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c37149b929be..f0d268b97d19 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -1,15 +1,11 @@
-/* -*- c -*- ------------------------------------------------------------- *
- *
- * linux/fs/autofs/autofs_i.h
- *
- * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
/* Internal header file for autofs */
@@ -35,28 +31,23 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/* #define DEBUG */
-#define DPRINTK(fmt, ...) \
- pr_debug("pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_WARN(fmt, ...) \
- printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_ERROR(fmt, ...) \
- printk(KERN_ERR "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-/* Unified info structure. This is pointed to by both the dentry and
- inode structures. Each file in the filesystem has an instance of this
- structure. It holds a reference to the dentry, so dentries are never
- flushed while the file exists. All name lookups are dealt with at the
- dentry level, although the filesystem can interfere in the validation
- process. Readdir is implemented by traversing the dentry lists. */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
+
+/*
+ * Unified info structure. This is pointed to by both the dentry and
+ * inode structures. Each file in the filesystem has an instance of this
+ * structure. It holds a reference to the dentry, so dentries are never
+ * flushed while the file exists. All name lookups are dealt with at the
+ * dentry level, although the filesystem can interfere in the validation
+ * process. Readdir is implemented by traversing the dentry lists.
+ */
struct autofs_info {
struct dentry *dentry;
struct inode *inode;
@@ -78,7 +69,7 @@ struct autofs_info {
kgid_t gid;
};
-#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */
#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
* for expiry, so RCU_walk is
* not permitted
@@ -140,10 +131,11 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
}
/* autofs4_oz_mode(): do we see the man behind the curtain? (The
- processes which do manipulations for us in user space sees the raw
- filesystem without "magic".) */
-
-static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
+ * processes which do manipulations for us in user space sees the raw
+ * filesystem without "magic".)
+ */
+static inline int autofs4_oz_mode(struct autofs_sb_info *sbi)
+{
return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
@@ -154,12 +146,12 @@ void autofs4_free_ino(struct autofs_info *);
int is_autofs4_dentry(struct dentry *);
int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *,
- struct autofs_packet_expire __user *);
+ struct autofs_sb_info *,
+ struct autofs_packet_expire __user *);
int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int when);
int autofs4_expire_multi(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *, int __user *);
+ struct autofs_sb_info *, int __user *);
struct dentry *autofs4_expire_direct(struct super_block *sb,
struct vfsmount *mnt,
struct autofs_sb_info *sbi, int how);
@@ -224,8 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
/* Queue management functions */
-int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
-int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
+int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
void autofs4_catatonic_mode(struct autofs_sb_info *);
static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
@@ -242,37 +234,37 @@ static inline void __autofs4_add_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
}
- return;
}
static inline void autofs4_add_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
spin_lock(&sbi->lookup_lock);
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static inline void autofs4_del_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!list_empty(&ino->expiring))
list_del_init(&ino->expiring);
spin_unlock(&sbi->lookup_lock);
}
- return;
}
extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ac7d921ed984..c7fcc7438843 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -72,13 +72,13 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
{
int err = 0;
- if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
- (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
- AUTOFS_WARN("ioctl control interface version mismatch: "
- "kernel(%u.%u), user(%u.%u), cmd(%d)",
- AUTOFS_DEV_IOCTL_VERSION_MAJOR,
- AUTOFS_DEV_IOCTL_VERSION_MINOR,
- param->ver_major, param->ver_minor, cmd);
+ if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
+ (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
+ pr_warn("ioctl control interface version mismatch: "
+ "kernel(%u.%u), user(%u.%u), cmd(%d)\n",
+ AUTOFS_DEV_IOCTL_VERSION_MAJOR,
+ AUTOFS_DEV_IOCTL_VERSION_MINOR,
+ param->ver_major, param->ver_minor, cmd);
err = -EINVAL;
}
@@ -93,7 +93,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
* Copy parameter control struct, including a possible path allocated
* at the end of the struct.
*/
-static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+static struct autofs_dev_ioctl *
+ copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
struct autofs_dev_ioctl tmp, *res;
@@ -116,7 +117,6 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
{
kfree(param);
- return;
}
/*
@@ -129,24 +129,24 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
err = check_dev_ioctl_version(cmd, param);
if (err) {
- AUTOFS_WARN("invalid device control module version "
- "supplied for cmd(0x%08x)", cmd);
+ pr_warn("invalid device control module version "
+ "supplied for cmd(0x%08x)\n", cmd);
goto out;
}
if (param->size > sizeof(*param)) {
err = invalid_str(param->path, param->size - sizeof(*param));
if (err) {
- AUTOFS_WARN(
- "path string terminator missing for cmd(0x%08x)",
+ pr_warn(
+ "path string terminator missing for cmd(0x%08x)\n",
cmd);
goto out;
}
err = check_name(param->path);
if (err) {
- AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
- cmd);
+ pr_warn("invalid path supplied for cmd(0x%08x)\n",
+ cmd);
goto out;
}
}
@@ -197,7 +197,9 @@ static int find_autofs_mount(const char *pathname,
void *data)
{
struct path path;
- int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+ int err;
+
+ err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
if (err)
return err;
err = -ENOENT;
@@ -225,6 +227,7 @@ static int test_by_dev(struct path *path, void *p)
static int test_by_type(struct path *path, void *p)
{
struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+
return ino && ino->sbi->type & *(unsigned *)p;
}
@@ -370,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
new_pid = get_task_pid(current, PIDTYPE_PGID);
if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
- AUTOFS_WARN("Not allowed to change PID namespace");
+ pr_warn("not allowed to change PID namespace\n");
err = -EINVAL;
goto out;
}
@@ -456,8 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp,
err = 0;
autofs4_expire_wait(path.dentry, 0);
spin_lock(&sbi->fs_lock);
- param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
- param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
+ param->requester.uid =
+ from_kuid_munged(current_user_ns(), ino->uid);
+ param->requester.gid =
+ from_kgid_munged(current_user_ns(), ino->gid);
spin_unlock(&sbi->fs_lock);
}
path_put(&path);
@@ -619,7 +624,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
}
/* ioctl dispatcher */
-static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+static int _autofs_dev_ioctl(unsigned int command,
+ struct autofs_dev_ioctl __user *user)
{
struct autofs_dev_ioctl *param;
struct file *fp;
@@ -655,7 +661,7 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use
fn = lookup_dev_ioctl(cmd);
if (!fn) {
- AUTOFS_WARN("unknown command 0x%08x", command);
+ pr_warn("unknown command 0x%08x\n", command);
return -ENOTTY;
}
@@ -711,6 +717,7 @@ out:
static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
{
int err;
+
err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
return (long) err;
}
@@ -733,8 +740,8 @@ static const struct file_operations _dev_ioctl_fops = {
static struct miscdevice _autofs_dev_ioctl_misc = {
.minor = AUTOFS_MINOR,
- .name = AUTOFS_DEVICE_NAME,
- .fops = &_dev_ioctl_fops
+ .name = AUTOFS_DEVICE_NAME,
+ .fops = &_dev_ioctl_fops
};
MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
@@ -747,7 +754,7 @@ int __init autofs_dev_ioctl_init(void)
r = misc_register(&_autofs_dev_ioctl_misc);
if (r) {
- AUTOFS_ERROR("misc_register failed for control device");
+ pr_err("misc_register failed for control device\n");
return r;
}
@@ -757,6 +764,4 @@ int __init autofs_dev_ioctl_init(void)
void autofs_dev_ioctl_exit(void)
{
misc_deregister(&_autofs_dev_ioctl_misc);
- return;
}
-
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1cebc3c52fa5..9510d8d2e9cd 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/expire.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include "autofs_i.h"
@@ -18,7 +14,7 @@ static unsigned long now;
/* Check if a dentry can be expired */
static inline int autofs4_can_expire(struct dentry *dentry,
- unsigned long timeout, int do_now)
+ unsigned long timeout, int do_now)
{
struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -41,7 +37,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
struct path path = {.mnt = mnt, .dentry = dentry};
int status = 1;
- DPRINTK("dentry %p %pd", dentry, dentry);
+ pr_debug("dentry %p %pd\n", dentry, dentry);
path_get(&path);
@@ -58,14 +54,16 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
/* Update the expiry counter if fs is busy */
if (!may_umount_tree(path.mnt)) {
- struct autofs_info *ino = autofs4_dentry_ino(top);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(top);
ino->last_used = jiffies;
goto done;
}
status = 0;
done:
- DPRINTK("returning = %d", status);
+ pr_debug("returning = %d\n", status);
path_put(&path);
return status;
}
@@ -74,7 +72,7 @@ done:
* Calculate and dget next entry in the subdirs list under root.
*/
static struct dentry *get_next_positive_subdir(struct dentry *prev,
- struct dentry *root)
+ struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
@@ -121,7 +119,7 @@ cont:
* Calculate and dget next entry in top down tree traversal.
*/
static struct dentry *get_next_positive_dentry(struct dentry *prev,
- struct dentry *root)
+ struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
@@ -187,15 +185,17 @@ again:
* autofs submounts.
*/
static int autofs4_direct_busy(struct vfsmount *mnt,
- struct dentry *top,
- unsigned long timeout,
- int do_now)
+ struct dentry *top,
+ unsigned long timeout,
+ int do_now)
{
- DPRINTK("top %p %pd", top, top);
+ pr_debug("top %p %pd\n", top, top);
/* If it's busy update the expiry counters */
if (!may_umount_tree(mnt)) {
- struct autofs_info *ino = autofs4_dentry_ino(top);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(top);
if (ino)
ino->last_used = jiffies;
return 1;
@@ -208,7 +208,8 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
return 0;
}
-/* Check a directory tree of mount points for busyness
+/*
+ * Check a directory tree of mount points for busyness
* The tree is not busy iff no mountpoints are busy
*/
static int autofs4_tree_busy(struct vfsmount *mnt,
@@ -219,7 +220,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
struct autofs_info *top_ino = autofs4_dentry_ino(top);
struct dentry *p;
- DPRINTK("top %p %pd", top, top);
+ pr_debug("top %p %pd\n", top, top);
/* Negative dentry - give up */
if (!simple_positive(top))
@@ -227,7 +228,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
p = NULL;
while ((p = get_next_positive_dentry(p, top))) {
- DPRINTK("dentry %p %pd", p, p);
+ pr_debug("dentry %p %pd\n", p, p);
/*
* Is someone visiting anywhere in the subtree ?
@@ -273,11 +274,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
{
struct dentry *p;
- DPRINTK("parent %p %pd", parent, parent);
+ pr_debug("parent %p %pd\n", parent, parent);
p = NULL;
while ((p = get_next_positive_dentry(p, parent))) {
- DPRINTK("dentry %p %pd", p, p);
+ pr_debug("dentry %p %pd\n", p, p);
if (d_mountpoint(p)) {
/* Can we umount this guy */
@@ -362,7 +363,7 @@ static struct dentry *should_expire(struct dentry *dentry,
* offset (autofs-5.0+).
*/
if (d_mountpoint(dentry)) {
- DPRINTK("checking mountpoint %p %pd", dentry, dentry);
+ pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, dentry))
@@ -375,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry,
}
if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
- DPRINTK("checking symlink %p %pd", dentry, dentry);
+ pr_debug("checking symlink %p %pd\n", dentry, dentry);
/*
* A symlink can't be "busy" in the usual sense so
* just check last used for expire timeout.
@@ -404,6 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry,
} else {
/* Path walk currently on this dentry? */
struct dentry *expired;
+
ino_count = atomic_read(&ino->count) + 1;
if (d_count(dentry) > ino_count)
return NULL;
@@ -471,7 +473,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
return NULL;
found:
- DPRINTK("returning %p %pd", expired, expired);
+ pr_debug("returning %p %pd\n", expired, expired);
ino->flags |= AUTOFS_INF_EXPIRING;
smp_mb();
ino->flags &= ~AUTOFS_INF_NO_RCU;
@@ -503,12 +505,12 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
if (ino->flags & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
- DPRINTK("waiting for expire %p name=%pd", dentry, dentry);
+ pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
status = autofs4_wait(sbi, dentry, NFY_NONE);
wait_for_completion(&ino->expire_complete);
- DPRINTK("expire done status=%d", status);
+ pr_debug("expire done status=%d\n", status);
if (d_unhashed(dentry))
return -EAGAIN;
@@ -522,21 +524,22 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
/* Perform an expiry operation */
int autofs4_expire_run(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- struct autofs_packet_expire __user *pkt_p)
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ struct autofs_packet_expire __user *pkt_p)
{
struct autofs_packet_expire pkt;
struct autofs_info *ino;
struct dentry *dentry;
int ret = 0;
- memset(&pkt,0,sizeof pkt);
+ memset(&pkt, 0, sizeof(pkt));
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = autofs_ptype_expire;
- if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL)
+ dentry = autofs4_expire_indirect(sb, mnt, sbi, 0);
+ if (!dentry)
return -EAGAIN;
pkt.len = dentry->d_name.len;
@@ -544,7 +547,7 @@ int autofs4_expire_run(struct super_block *sb,
pkt.name[pkt.len] = '\0';
dput(dentry);
- if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+ if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
ret = -EFAULT;
spin_lock(&sbi->fs_lock);
@@ -573,7 +576,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_info *ino = autofs4_dentry_ino(dentry);
/* This is synchronous because it makes the daemon a
- little easier */
+ * little easier
+ */
ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
@@ -588,8 +592,10 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
return ret;
}
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
- more to be done */
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more to be done.
+ */
int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int __user *arg)
{
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index b3db517e89ec..8cf0e63389ae 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/module.h>
#include <linux/init.h>
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a3ae0b2aeb5a..61b21051bd5a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -24,7 +20,9 @@
struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
{
- struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+ struct autofs_info *ino;
+
+ ino = kzalloc(sizeof(*ino), GFP_KERNEL);
if (ino) {
INIT_LIST_HEAD(&ino->active);
INIT_LIST_HEAD(&ino->expiring);
@@ -62,7 +60,7 @@ void autofs4_kill_sb(struct super_block *sb)
put_pid(sbi->oz_pgrp);
}
- DPRINTK("shutting down");
+ pr_debug("shutting down\n");
kill_litter_super(sb);
if (sbi)
kfree_rcu(sbi, rcu);
@@ -94,7 +92,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",direct");
else
seq_printf(m, ",indirect");
-
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (sbi->pipe)
+ seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
+ else
+ seq_printf(m, ",pipe_ino=-1");
+#endif
return 0;
}
@@ -147,6 +150,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
while ((p = strsep(&options, ",")) != NULL) {
int token;
+
if (!*p)
continue;
@@ -204,9 +208,9 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
int autofs4_fill_super(struct super_block *s, void *data, int silent)
{
- struct inode * root_inode;
- struct dentry * root;
- struct file * pipe;
+ struct inode *root_inode;
+ struct dentry *root;
+ struct file *pipe;
int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
@@ -217,7 +221,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
- DPRINTK("starting up, sbi = %p",sbi);
+ pr_debug("starting up, sbi = %p\n", sbi);
s->s_fs_info = sbi;
sbi->magic = AUTOFS_SBI_MAGIC;
@@ -266,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
&pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
&sbi->max_proto)) {
- printk("autofs: called with bogus options\n");
+ pr_err("called with bogus options\n");
goto fail_dput;
}
if (pgrp_set) {
sbi->oz_pgrp = find_get_pid(pgrp);
if (!sbi->oz_pgrp) {
- pr_warn("autofs: could not find process group %d\n",
+ pr_err("could not find process group %d\n",
pgrp);
goto fail_dput;
}
@@ -290,10 +294,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
/* Couldn't this be tested earlier? */
if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
- printk("autofs: kernel does not match daemon version "
+ pr_err("kernel does not match daemon version "
"daemon (%d, %d) kernel (%d, %d)\n",
- sbi->min_proto, sbi->max_proto,
- AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
+ sbi->min_proto, sbi->max_proto,
+ AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
goto fail_dput;
}
@@ -304,11 +308,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->version = sbi->max_proto;
sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
+ pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
if (!pipe) {
- printk("autofs: could not open pipe file descriptor\n");
+ pr_err("could not open pipe file descriptor\n");
goto fail_dput;
}
ret = autofs_prepare_pipe(pipe);
@@ -323,12 +327,12 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
*/
s->s_root = root;
return 0;
-
+
/*
* Failure ... clean up.
*/
fail_fput:
- printk("autofs: pipe file descriptor does not contain proper ops\n");
+ pr_err("pipe file descriptor does not contain proper ops\n");
fput(pipe);
/* fall through */
fail_dput:
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c6d7d3dbd52a..9328b5861c7a 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/capability.h>
#include <linux/errno.h>
@@ -23,16 +19,18 @@
#include "autofs_i.h"
-static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
-static int autofs4_dir_unlink(struct inode *,struct dentry *);
-static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
-static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs4_dir_unlink(struct inode *, struct dentry *);
+static int autofs4_dir_rmdir(struct inode *, struct dentry *);
+static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
-static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs4_root_compat_ioctl(struct file *,
+ unsigned int, unsigned long);
#endif
static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
+static struct dentry *autofs4_lookup(struct inode *,
+ struct dentry *, unsigned int);
static struct vfsmount *autofs4_d_automount(struct path *);
static int autofs4_d_manage(struct dentry *, bool);
static void autofs4_dentry_release(struct dentry *);
@@ -74,7 +72,9 @@ const struct dentry_operations autofs4_dentry_operations = {
static void autofs4_add_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!ino->active_count) {
@@ -84,13 +84,14 @@ static void autofs4_add_active(struct dentry *dentry)
ino->active_count++;
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static void autofs4_del_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
ino->active_count--;
@@ -100,7 +101,6 @@ static void autofs4_del_active(struct dentry *dentry)
}
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static int autofs4_dir_open(struct inode *inode, struct file *file)
@@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry);
+ pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
if (autofs4_oz_mode(sbi))
goto out;
@@ -138,7 +138,7 @@ static void autofs4_dentry_release(struct dentry *de)
struct autofs_info *ino = autofs4_dentry_ino(de);
struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
- DPRINTK("releasing %p", de);
+ pr_debug("releasing %p\n", de);
if (!ino)
return;
@@ -278,9 +278,9 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
if (ino->flags & AUTOFS_INF_PENDING) {
if (rcu_walk)
return -ECHILD;
- DPRINTK("waiting for mount name=%pd", dentry);
+ pr_debug("waiting for mount name=%pd\n", dentry);
status = autofs4_wait(sbi, dentry, NFY_MOUNT);
- DPRINTK("mount wait done status=%d", status);
+ pr_debug("mount wait done status=%d\n", status);
}
ino->last_used = jiffies;
return status;
@@ -320,7 +320,9 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
struct dentry *parent = dentry->d_parent;
struct autofs_info *ino;
- struct dentry *new = d_lookup(parent, &dentry->d_name);
+ struct dentry *new;
+
+ new = d_lookup(parent, &dentry->d_name);
if (!new)
return NULL;
ino = autofs4_dentry_ino(new);
@@ -338,7 +340,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %pd", dentry, dentry);
+ pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never triggers a mount. */
if (autofs4_oz_mode(sbi))
@@ -425,7 +427,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %pd", dentry, dentry);
+ pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
@@ -455,6 +457,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
* a mount-trap.
*/
struct inode *inode;
+
if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
return 0;
if (d_mountpoint(dentry))
@@ -494,13 +497,14 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
}
/* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+static struct dentry *autofs4_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
struct dentry *active;
- DPRINTK("name = %pd", dentry);
+ pr_debug("name = %pd\n", dentry);
/* File name too long to exist */
if (dentry->d_name.len > NAME_MAX)
@@ -508,14 +512,14 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
sbi = autofs4_sbi(dir->i_sb);
- DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
- current->pid, task_pgrp_nr(current), sbi->catatonic,
- autofs4_oz_mode(sbi));
+ pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
+ current->pid, task_pgrp_nr(current), sbi->catatonic,
+ autofs4_oz_mode(sbi));
active = autofs4_lookup_active(dentry);
- if (active) {
+ if (active)
return active;
- } else {
+ else {
/*
* A dentry that is not within the root can never trigger a
* mount operation, unless the directory already exists, so we
@@ -526,7 +530,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
return ERR_PTR(-ENOENT);
/* Mark entries in the root as mount triggers */
- if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
+ if (IS_ROOT(dentry->d_parent) &&
+ autofs_type_indirect(sbi->type))
__managed_dentry_set_managed(dentry);
ino = autofs4_new_ino(sbi);
@@ -554,7 +559,7 @@ static int autofs4_dir_symlink(struct inode *dir,
size_t size = strlen(symname);
char *cp;
- DPRINTK("%s <- %pd", symname, dentry);
+ pr_debug("%s <- %pd\n", symname, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
@@ -613,7 +618,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
-
+
/* This allows root to remove symlinks */
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -664,7 +669,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry)
if (IS_ROOT(parent->d_parent))
return;
managed_dentry_clear_managed(parent);
- return;
}
static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
@@ -687,7 +691,6 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
if (d_child->next == &parent->d_subdirs &&
d_child->prev == &parent->d_subdirs)
managed_dentry_set_managed(parent);
- return;
}
static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
@@ -695,8 +698,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
-
- DPRINTK("dentry %p, removing %pd", dentry, dentry);
+
+ pr_debug("dentry %p, removing %pd\n", dentry, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
@@ -728,7 +731,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int autofs4_dir_mkdir(struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -738,7 +742,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
if (!autofs4_oz_mode(sbi))
return -EACCES;
- DPRINTK("dentry %p, creating %pd", dentry, dentry);
+ pr_debug("dentry %p, creating %pd\n", dentry, dentry);
BUG_ON(!ino);
@@ -768,14 +772,18 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
/* Get/set timeout ioctl() operation */
#ifdef CONFIG_COMPAT
static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
- compat_ulong_t __user *p)
+ compat_ulong_t __user *p)
{
- int rv;
unsigned long ntimeout;
+ int rv;
- if ((rv = get_user(ntimeout, p)) ||
- (rv = put_user(sbi->exp_timeout/HZ, p)))
- return rv;
+ rv = get_user(ntimeout, p);
+ if (rv)
+ goto error;
+
+ rv = put_user(sbi->exp_timeout/HZ, p);
+ if (rv)
+ goto error;
if (ntimeout > UINT_MAX/HZ)
sbi->exp_timeout = 0;
@@ -783,18 +791,24 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
sbi->exp_timeout = ntimeout * HZ;
return 0;
+error:
+ return rv;
}
#endif
static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
- unsigned long __user *p)
+ unsigned long __user *p)
{
- int rv;
unsigned long ntimeout;
+ int rv;
- if ((rv = get_user(ntimeout, p)) ||
- (rv = put_user(sbi->exp_timeout/HZ, p)))
- return rv;
+ rv = get_user(ntimeout, p);
+ if (rv)
+ goto error;
+
+ rv = put_user(sbi->exp_timeout/HZ, p);
+ if (rv)
+ goto error;
if (ntimeout > ULONG_MAX/HZ)
sbi->exp_timeout = 0;
@@ -802,16 +816,20 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
sbi->exp_timeout = ntimeout * HZ;
return 0;
+error:
+ return rv;
}
/* Return protocol version */
-static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protover(struct autofs_sb_info *sbi,
+ int __user *p)
{
return put_user(sbi->version, p);
}
/* Return protocol sub version */
-static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
+ int __user *p)
{
return put_user(sbi->sub_version, p);
}
@@ -826,7 +844,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
if (may_umount(mnt))
status = 1;
- DPRINTK("returning %d", status);
+ pr_debug("returning %d\n", status);
status = put_user(status, p);
@@ -834,9 +852,9 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
}
/* Identify autofs4_dentries - this is so we can tell if there's
- an extra dentry refcount or not. We only hold a refcount on the
- dentry if its non-negative (ie, d_inode != NULL)
-*/
+ * an extra dentry refcount or not. We only hold a refcount on the
+ * dentry if its non-negative (ie, d_inode != NULL)
+ */
int is_autofs4_dentry(struct dentry *dentry)
{
return dentry && d_really_is_positive(dentry) &&
@@ -854,21 +872,21 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
void __user *p = (void __user *)arg;
- DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u",
- cmd,arg,sbi,task_pgrp_nr(current));
+ pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",
+ cmd, arg, sbi, task_pgrp_nr(current));
if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
_IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
return -ENOTTY;
-
+
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
-
- switch(cmd) {
+
+ switch (cmd) {
case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
- return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0);
+ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
- return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
+ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
autofs4_catatonic_mode(sbi);
return 0;
@@ -888,13 +906,15 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
/* return a single thing to expire */
case AUTOFS_IOC_EXPIRE:
- return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p);
+ return autofs4_expire_run(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
/* same as above, but can send multiple expires through pipe */
case AUTOFS_IOC_EXPIRE_MULTI:
- return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p);
+ return autofs4_expire_multi(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
default:
- return -ENOSYS;
+ return -EINVAL;
}
}
@@ -902,12 +922,13 @@ static long autofs4_root_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+
return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
}
#ifdef CONFIG_COMPAT
static long autofs4_root_compat_ioctl(struct file *filp,
- unsigned int cmd, unsigned long arg)
+ unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
int ret;
@@ -916,7 +937,7 @@ static long autofs4_root_compat_ioctl(struct file *filp,
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
else
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
- (unsigned long)compat_ptr(arg));
+ (unsigned long) compat_ptr(arg));
return ret;
}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 84e037d1d129..99aab00dc217 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include "autofs_i.h"
@@ -18,6 +14,7 @@ static const char *autofs4_get_link(struct dentry *dentry,
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
+
if (!dentry)
return ERR_PTR(-ECHILD);
sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 35b755e79c2d..0146d911f468 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/slab.h>
#include <linux/time.h>
@@ -18,7 +14,8 @@
#include "autofs_i.h"
/* We make this a static variable rather than a part of the superblock; it
- is better if we don't reassign numbers easily even across filesystems */
+ * is better if we don't reassign numbers easily even across filesystems
+ */
static autofs_wqt_t autofs4_next_wait_queue = 1;
/* These are the signals we allow interrupting a pending mount */
@@ -34,7 +31,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
return;
}
- DPRINTK("entering catatonic mode");
+ pr_debug("entering catatonic mode\n");
sbi->catatonic = 1;
wq = sbi->queues;
@@ -69,17 +66,19 @@ static int autofs4_write(struct autofs_sb_info *sbi,
set_fs(KERNEL_DS);
mutex_lock(&sbi->pipe_mutex);
- while (bytes &&
- (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) {
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
+ while (bytes && wr) {
data += wr;
bytes -= wr;
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
}
mutex_unlock(&sbi->pipe_mutex);
set_fs(fs);
/* Keep the currently executing process from receiving a
- SIGPIPE unless it was already supposed to get one */
+ * SIGPIPE unless it was already supposed to get one
+ */
if (wr == -EPIPE && !sigpipe) {
spin_lock_irqsave(&current->sighand->siglock, flags);
sigdelset(&current->pending.signal, SIGPIPE);
@@ -89,7 +88,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
return (bytes > 0);
}
-
+
static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct autofs_wait_queue *wq,
int type)
@@ -102,10 +101,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct file *pipe = NULL;
size_t pktsz;
- DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
- (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
+ pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
+ (unsigned long) wq->wait_queue_token,
+ wq->name.len, wq->name.name, type);
- memset(&pkt,0,sizeof pkt); /* For security reasons */
+ memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = type;
@@ -126,7 +126,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
}
case autofs_ptype_expire_multi:
{
- struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi;
+ struct autofs_packet_expire_multi *ep =
+ &pkt.v4_pkt.expire_multi;
pktsz = sizeof(*ep);
@@ -163,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
break;
}
default:
- printk("autofs4_notify_daemon: bad type %d!\n", type);
+ pr_warn("bad type %d!\n", type);
mutex_unlock(&sbi->wq_mutex);
return;
}
@@ -231,7 +232,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
if (wq->name.hash == qstr->hash &&
wq->name.len == qstr->len &&
wq->name.name &&
- !memcmp(wq->name.name, qstr->name, qstr->len))
+ !memcmp(wq->name.name, qstr->name, qstr->len))
break;
}
return wq;
@@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
static int validate_request(struct autofs_wait_queue **wait,
struct autofs_sb_info *sbi,
struct qstr *qstr,
- struct dentry*dentry, enum autofs_notify notify)
+ struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
struct autofs_info *ino;
@@ -322,8 +323,10 @@ static int validate_request(struct autofs_wait_queue **wait,
* continue on and create a new request.
*/
if (!IS_ROOT(dentry)) {
- if (d_really_is_positive(dentry) && d_unhashed(dentry)) {
+ if (d_unhashed(dentry) &&
+ d_really_is_positive(dentry)) {
struct dentry *parent = dentry->d_parent;
+
new = d_lookup(parent, &dentry->d_name);
if (new)
dentry = new;
@@ -340,8 +343,8 @@ static int validate_request(struct autofs_wait_queue **wait,
return 1;
}
-int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
- enum autofs_notify notify)
+int autofs4_wait(struct autofs_sb_info *sbi,
+ struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
struct qstr qstr;
@@ -411,7 +414,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
if (!wq) {
/* Create a new wait queue */
- wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
+ wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
if (!wq) {
kfree(qstr.name);
mutex_unlock(&sbi->wq_mutex);
@@ -450,17 +453,19 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
autofs_ptype_expire_indirect;
}
- DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
- (unsigned long) wq->wait_queue_token, wq->name.len,
- wq->name.name, notify);
+ pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
- /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
+ /*
+ * autofs4_notify_daemon() may block; it will unlock ->wq_mutex
+ */
autofs4_notify_daemon(sbi, wq, type);
} else {
wq->wait_ctr++;
- DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
- (unsigned long) wq->wait_queue_token, wq->name.len,
- wq->name.name, notify);
+ pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
mutex_unlock(&sbi->wq_mutex);
kfree(qstr.name);
}
@@ -471,12 +476,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
*/
if (wq->name.name) {
/* Block all but "shutdown" signals while waiting */
- sigset_t oldset;
+ unsigned long shutdown_sigs_mask;
unsigned long irqflags;
+ sigset_t oldset;
spin_lock_irqsave(&current->sighand->siglock, irqflags);
oldset = current->blocked;
- siginitsetinv(&current->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]);
+ shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0];
+ siginitsetinv(&current->blocked, shutdown_sigs_mask);
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
@@ -487,7 +494,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
} else {
- DPRINTK("skipped sleeping");
+ pr_debug("skipped sleeping\n");
}
status = wq->status;
@@ -562,4 +569,3 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
return 0;
}
-
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 051ea4809c14..7d914c67a9d0 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -653,7 +653,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
- random_variable = (unsigned long) get_random_int();
+ random_variable = get_random_long();
random_variable &= STACK_RND_MASK;
random_variable <<= PAGE_SHIFT;
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7b9cd49622b1..826b164a4b5b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1201,7 +1201,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
+ if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
+ bdev->bd_inode->i_flags = S_DAX;
+ else
+ bdev->bd_inode->i_flags = 0;
+
if (!partno) {
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
@@ -1693,13 +1697,24 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ if (dax_mapping(mapping)) {
+ struct block_device *bdev = I_BDEV(mapping->host);
+
+ return dax_writeback_mapping_range(mapping, bdev, wbc);
+ }
+ return generic_writepages(mapping, wbc);
+}
+
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
.readpages = blkdev_readpages,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
- .writepages = generic_writepages,
+ .writepages = blkdev_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
.is_dirty_writeback = buffer_check_dirty_writeback,
@@ -1730,43 +1745,25 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return __dax_fault(vma, vmf, blkdev_get_block, NULL);
}
-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned int flags)
-{
- return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-}
-
-static void blkdev_vm_open(struct vm_area_struct *vma)
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
{
- struct inode *bd_inode = bdev_file_inode(vma->vm_file);
- struct block_device *bdev = I_BDEV(bd_inode);
-
- inode_lock(bd_inode);
- bdev->bd_map_count++;
- inode_unlock(bd_inode);
+ return dax_pfn_mkwrite(vma, vmf);
}
-static void blkdev_vm_close(struct vm_area_struct *vma)
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
{
- struct inode *bd_inode = bdev_file_inode(vma->vm_file);
- struct block_device *bdev = I_BDEV(bd_inode);
-
- inode_lock(bd_inode);
- bdev->bd_map_count--;
- inode_unlock(bd_inode);
+ return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
}
static const struct vm_operations_struct blkdev_dax_vm_ops = {
- .open = blkdev_vm_open,
- .close = blkdev_vm_close,
.fault = blkdev_dax_fault,
.pmd_fault = blkdev_dax_pmd_fault,
- .pfn_mkwrite = blkdev_dax_fault,
+ .pfn_mkwrite = blkdev_dax_pfn_mkwrite,
};
static const struct vm_operations_struct blkdev_default_vm_ops = {
- .open = blkdev_vm_open,
- .close = blkdev_vm_close,
.fault = filemap_fault,
.map_pages = filemap_map_pages,
};
@@ -1774,18 +1771,14 @@ static const struct vm_operations_struct blkdev_default_vm_ops = {
static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *bd_inode = bdev_file_inode(file);
- struct block_device *bdev = I_BDEV(bd_inode);
file_accessed(file);
- inode_lock(bd_inode);
- bdev->bd_map_count++;
if (IS_DAX(bd_inode)) {
vma->vm_ops = &blkdev_dax_vm_ops;
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &blkdev_default_vm_ops;
}
- inode_unlock(bd_inode);
return 0;
}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 88d9af3d4581..5fb60ea7eee2 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -328,8 +328,8 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
list_add_tail(&work->ordered_list, &wq->ordered_list);
spin_unlock_irqrestore(&wq->list_lock, flags);
}
- queue_work(wq->normal_wq, &work->normal_work);
trace_btrfs_work_queued(work);
+ queue_work(wq->normal_wq, &work->normal_work);
}
void btrfs_queue_work(struct btrfs_workqueue *wq,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b90cd3776f8e..f6dac40f87ff 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1406,7 +1406,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
read_extent_buffer(eb, dest + bytes_left,
name_off, name_len);
if (eb != eb_in) {
- btrfs_tree_read_unlock_blocking(eb);
+ if (!path->skip_locking)
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
}
ret = btrfs_find_item(fs_root, path, parent, 0,
@@ -1426,9 +1427,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
- atomic_inc(&eb->refs);
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ if (!path->skip_locking)
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ path->nodes[0] = NULL;
+ path->locks[0] = 0;
}
btrfs_release_path(path);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c473c42d7d6c..3346cd8f9910 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -637,11 +637,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
faili = nr_pages - 1;
cb->nr_pages = nr_pages;
- /* In the parent-locked case, we only locked the range we are
- * interested in. In all other cases, we can opportunistically
- * cache decompressed data that goes beyond the requested range. */
- if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
- add_ra_bio_pages(inode, em_start + em_len, cb);
+ add_ra_bio_pages(inode, em_start + em_len, cb);
/* include any pages we added in add_ra-bio_pages */
uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0be47e4b8136..b57daa895cea 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1689,7 +1689,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
*
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list)
+ struct list_head *ins_list, bool *emitted)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
@@ -1733,6 +1733,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
if (over)
return 1;
+ *emitted = true;
}
return 0;
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f70119f25421..0167853c84ae 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
int btrfs_should_delete_dir_index(struct list_head *del_list,
u64 index);
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list);
+ struct list_head *ins_list, bool *emitted);
/* for init */
int __init btrfs_delayed_inode_init(void);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dd08e29f5117..5699bbc23feb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -182,6 +182,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
{ .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" },
+ { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
{ .id = 0, .name_stem = "tree" },
};
@@ -930,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
if (bio_flags & EXTENT_BIO_TREE_LOG)
return 0;
#ifdef CONFIG_X86
- if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
+ if (static_cpu_has(X86_FEATURE_XMM4_2))
return 0;
#endif
return 1;
@@ -1787,7 +1788,6 @@ static int cleaner_kthread(void *arg)
int again;
struct btrfs_trans_handle *trans;
- set_freezable();
do {
again = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2e7c97a3f344..392592dc7010 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2897,12 +2897,11 @@ static int __do_readpage(struct extent_io_tree *tree,
struct block_device *bdev;
int ret;
int nr = 0;
- int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
size_t pg_offset = 0;
size_t iosize;
size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
- unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+ unsigned long this_bio_flag = 0;
set_page_extent_mapped(page);
@@ -2942,18 +2941,16 @@ static int __do_readpage(struct extent_io_tree *tree,
kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- if (!parent_locked)
- unlock_extent_cached(tree, cur,
- cur + iosize - 1,
- &cached, GFP_NOFS);
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, get_extent, em_cached);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
- if (!parent_locked)
- unlock_extent(tree, cur, end);
+ unlock_extent(tree, cur, end);
break;
}
extent_offset = cur - em->start;
@@ -3038,12 +3035,9 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- if (parent_locked)
- free_extent_state(cached);
- else
- unlock_extent_cached(tree, cur,
- cur + iosize - 1,
- &cached, GFP_NOFS);
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3052,8 +3046,7 @@ static int __do_readpage(struct extent_io_tree *tree,
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
- if (!parent_locked)
- unlock_extent(tree, cur, cur + iosize - 1);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3063,8 +3056,7 @@ static int __do_readpage(struct extent_io_tree *tree,
*/
if (block_start == EXTENT_MAP_INLINE) {
SetPageError(page);
- if (!parent_locked)
- unlock_extent(tree, cur, cur + iosize - 1);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3083,8 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree,
*bio_flags = this_bio_flag;
} else {
SetPageError(page);
- if (!parent_locked)
- unlock_extent(tree, cur, cur + iosize - 1);
+ unlock_extent(tree, cur, cur + iosize - 1);
}
cur = cur + iosize;
pg_offset += iosize;
@@ -3213,20 +3204,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
return ret;
}
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num)
-{
- struct bio *bio = NULL;
- unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
- int ret;
-
- ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
- &bio_flags, READ, NULL);
- if (bio)
- ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
- return ret;
-}
-
static noinline void update_nr_written(struct page *page,
struct writeback_control *wbc,
unsigned long nr_written)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 0377413bd4b9..880d5292e972 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -29,7 +29,6 @@
*/
#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_TREE_LOG 2
-#define EXTENT_BIO_PARENT_LOCKED 4
#define EXTENT_BIO_FLAG_SHIFT 16
/* these are bit numbers for test/set bit */
@@ -210,8 +209,6 @@ static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
void extent_io_exit(void);
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 393e36bd5845..53dbeaf6ce94 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -153,6 +153,20 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
static unsigned long *alloc_bitmap(u32 bitmap_size)
{
+ void *mem;
+
+ /*
+ * The allocation size varies, observed numbers were < 4K up to 16K.
+ * Using vmalloc unconditionally would be too heavy, we'll try
+ * contiguous allocations first.
+ */
+ if (bitmap_size <= PAGE_SIZE)
+ return kzalloc(bitmap_size, GFP_NOFS);
+
+ mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN);
+ if (mem)
+ return mem;
+
return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL);
}
@@ -289,7 +303,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ret = 0;
out:
- vfree(bitmap);
+ kvfree(bitmap);
if (ret)
btrfs_abort_transaction(trans, root, ret);
return ret;
@@ -438,7 +452,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
ret = 0;
out:
- vfree(bitmap);
+ kvfree(bitmap);
if (ret)
btrfs_abort_transaction(trans, root, ret);
return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e28f3d4691af..d96f5cf38a2d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5717,6 +5717,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
char *name_ptr;
int name_len;
int is_curr = 0; /* ctx->pos points to the current index? */
+ bool emitted;
/* FIXME, use a real flag for deciding about the key type */
if (root->fs_info->tree_root == root)
@@ -5745,6 +5746,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (ret < 0)
goto err;
+ emitted = false;
while (1) {
leaf = path->nodes[0];
slot = path->slots[0];
@@ -5824,6 +5826,7 @@ skip:
if (over)
goto nopos;
+ emitted = true;
di_len = btrfs_dir_name_len(leaf, di) +
btrfs_dir_data_len(leaf, di) + sizeof(*di);
di_cur += di_len;
@@ -5836,11 +5839,20 @@ next:
if (key_type == BTRFS_DIR_INDEX_KEY) {
if (is_curr)
ctx->pos++;
- ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+ ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
if (ret)
goto nopos;
}
+ /*
+ * If we haven't emitted any dir entry, we must not touch ctx->pos as
+ * it was was set to the termination value in previous call. We assume
+ * that "." and ".." were emitted if we reach this point and set the
+ * termination value as well for an empty directory.
+ */
+ if (ctx->pos > 2 && !emitted)
+ goto nopos;
+
/* Reached end of directory/root. Bump pos past the last item. */
ctx->pos++;
@@ -7116,21 +7128,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
if (ret)
return ERR_PTR(ret);
- em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
- ins.offset, ins.offset, ins.offset, 0);
- if (IS_ERR(em)) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- return em;
- }
-
+ /*
+ * Create the ordered extent before the extent map. This is to avoid
+ * races with the fast fsync path that would lead to it logging file
+ * extent items that point to disk extents that were not yet written to.
+ * The fast fsync path collects ordered extents into a local list and
+ * then collects all the new extent maps, so we must create the ordered
+ * extent first and make sure the fast fsync path collects any new
+ * ordered extents after collecting new extent maps as well.
+ * The fsync path simply can not rely on inode_dio_wait() because it
+ * causes deadlock with AIO.
+ */
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- free_extent_map(em);
return ERR_PTR(ret);
}
+ em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+ ins.offset, ins.offset, ins.offset, 0);
+ if (IS_ERR(em)) {
+ struct btrfs_ordered_extent *oe;
+
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+ oe = btrfs_lookup_ordered_extent(inode, start);
+ ASSERT(oe);
+ if (WARN_ON(!oe))
+ return em;
+ set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
+ set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
+ btrfs_remove_ordered_extent(inode, oe);
+ /* Once for our lookup and once for the ordered extents tree. */
+ btrfs_put_ordered_extent(oe);
+ btrfs_put_ordered_extent(oe);
+ }
return em;
}
@@ -7954,6 +7986,7 @@ static void btrfs_endio_direct_read(struct bio *bio)
kfree(dip);
+ dio_bio->bi_error = bio->bi_error;
dio_end_io(dio_bio, bio->bi_error);
if (io_bio->end_io)
@@ -8008,6 +8041,7 @@ static void btrfs_endio_direct_write(struct bio *bio)
kfree(dip);
+ dio_bio->bi_error = bio->bi_error;
dio_end_io(dio_bio, bio->bi_error);
bio_put(bio);
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 952172ca7e45..48aee9846329 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2794,24 +2794,29 @@ out:
static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
{
struct page *page;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
page = grab_cache_page(inode->i_mapping, index);
if (!page)
- return NULL;
+ return ERR_PTR(-ENOMEM);
if (!PageUptodate(page)) {
- if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
- 0))
- return NULL;
+ int ret;
+
+ ret = btrfs_readpage(NULL, page);
+ if (ret)
+ return ERR_PTR(ret);
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
- return NULL;
+ return ERR_PTR(-EIO);
+ }
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ return ERR_PTR(-EAGAIN);
}
}
- unlock_page(page);
return page;
}
@@ -2823,17 +2828,31 @@ static int gather_extent_pages(struct inode *inode, struct page **pages,
pgoff_t index = off >> PAGE_CACHE_SHIFT;
for (i = 0; i < num_pages; i++) {
+again:
pages[i] = extent_same_get_page(inode, index + i);
- if (!pages[i])
- return -ENOMEM;
+ if (IS_ERR(pages[i])) {
+ int err = PTR_ERR(pages[i]);
+
+ if (err == -EAGAIN)
+ goto again;
+ pages[i] = NULL;
+ return err;
+ }
}
return 0;
}
-static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+static int lock_extent_range(struct inode *inode, u64 off, u64 len,
+ bool retry_range_locking)
{
- /* do any pending delalloc/csum calc on src, one way or
- another, and lock file content */
+ /*
+ * Do any pending delalloc/csum calculations on inode, one way or
+ * another, and lock file content.
+ * The locking order is:
+ *
+ * 1) pages
+ * 2) range in the inode's io tree
+ */
while (1) {
struct btrfs_ordered_extent *ordered;
lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
@@ -2851,8 +2870,11 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
if (ordered)
btrfs_put_ordered_extent(ordered);
+ if (!retry_range_locking)
+ return -EAGAIN;
btrfs_wait_ordered_range(inode, off, len);
}
+ return 0;
}
static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
@@ -2877,15 +2899,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len,
+ bool retry_range_locking)
{
+ int ret;
+
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
}
- lock_extent_range(inode1, loff1, len);
- lock_extent_range(inode2, loff2, len);
+ ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
+ if (ret)
+ return ret;
+ ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
+ if (ret)
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
+ loff1 + len - 1);
+ return ret;
}
struct cmp_pages {
@@ -2901,11 +2932,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
for (i = 0; i < cmp->num_pages; i++) {
pg = cmp->src_pages[i];
- if (pg)
+ if (pg) {
+ unlock_page(pg);
page_cache_release(pg);
+ }
pg = cmp->dst_pages[i];
- if (pg)
+ if (pg) {
+ unlock_page(pg);
page_cache_release(pg);
+ }
}
kfree(cmp->src_pages);
kfree(cmp->dst_pages);
@@ -2966,6 +3001,8 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
src_page = cmp->src_pages[i];
dst_page = cmp->dst_pages[i];
+ ASSERT(PageLocked(src_page));
+ ASSERT(PageLocked(dst_page));
addr = kmap_atomic(src_page);
dst_addr = kmap_atomic(dst_page);
@@ -3078,14 +3115,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
goto out_unlock;
}
+again:
ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
if (ret)
goto out_unlock;
if (same_inode)
- lock_extent_range(src, same_lock_start, same_lock_len);
+ ret = lock_extent_range(src, same_lock_start, same_lock_len,
+ false);
else
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
+ false);
+ /*
+ * If one of the inodes has dirty pages in the respective range or
+ * ordered extents, we need to flush dellaloc and wait for all ordered
+ * extents in the range. We must unlock the pages and the ranges in the
+ * io trees to avoid deadlocks when flushing delalloc (requires locking
+ * pages) and when waiting for ordered extents to complete (they require
+ * range locking).
+ */
+ if (ret == -EAGAIN) {
+ /*
+ * Ranges in the io trees already unlocked. Now unlock all
+ * pages before waiting for all IO to complete.
+ */
+ btrfs_cmp_data_free(&cmp);
+ if (same_inode) {
+ btrfs_wait_ordered_range(src, same_lock_start,
+ same_lock_len);
+ } else {
+ btrfs_wait_ordered_range(src, loff, len);
+ btrfs_wait_ordered_range(dst, dst_loff, len);
+ }
+ goto again;
+ }
+ ASSERT(ret == 0);
+ if (WARN_ON(ret)) {
+ /* ranges in the io trees already unlocked */
+ btrfs_cmp_data_free(&cmp);
+ return ret;
+ }
/* pass original length for comparison so we stay within i_size */
ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
@@ -3795,9 +3864,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
u64 lock_start = min_t(u64, off, destoff);
u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
- lock_extent_range(src, lock_start, lock_len);
+ ret = lock_extent_range(src, lock_start, lock_len, true);
} else {
- btrfs_double_extent_lock(src, off, inode, destoff, len);
+ ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
+ true);
+ }
+ ASSERT(ret == 0);
+ if (WARN_ON(ret)) {
+ /* ranges in the io trees already unlocked */
+ goto out_unlock;
}
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fd1c4d982463..2bd0011450df 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -575,7 +575,8 @@ static int is_cowonly_root(u64 root_objectid)
root_objectid == BTRFS_TREE_LOG_OBJECTID ||
root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
root_objectid == BTRFS_UUID_TREE_OBJECTID ||
- root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
+ root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
return 1;
return 0;
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7cf8509deda7..2c849b08a91b 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -310,8 +310,16 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
+ /*
+ * The root might have been inserted already, as before we look
+ * for orphan roots, log replay might have happened, which
+ * triggers a transaction commit and qgroup accounting, which
+ * in turn reads and inserts fs roots while doing backref
+ * walking.
+ */
+ if (err == -EEXIST)
+ err = 0;
if (err) {
- BUG_ON(err == -EEXIST);
btrfs_free_fs_root(root);
break;
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e0ac85949067..539e7b5e3f86 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -202,6 +202,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -213,6 +214,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(raid56),
BTRFS_FEAT_ATTR_PTR(skinny_metadata),
BTRFS_FEAT_ATTR_PTR(no_holes),
+ BTRFS_FEAT_ATTR_PTR(free_space_tree),
NULL
};
@@ -780,6 +782,39 @@ failure:
return error;
}
+
+/*
+ * Change per-fs features in /sys/fs/btrfs/UUID/features to match current
+ * values in superblock. Call after any changes to incompat/compat_ro flags
+ */
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+ u64 bit, enum btrfs_feature_set set)
+{
+ struct btrfs_fs_devices *fs_devs;
+ struct kobject *fsid_kobj;
+ u64 features;
+ int ret;
+
+ if (!fs_info)
+ return;
+
+ features = get_features(fs_info, set);
+ ASSERT(bit & supported_feature_masks[set]);
+
+ fs_devs = fs_info->fs_devices;
+ fsid_kobj = &fs_devs->fsid_kobj;
+
+ if (!fsid_kobj->state_initialized)
+ return;
+
+ /*
+ * FIXME: this is too heavy to update just one value, ideally we'd like
+ * to use sysfs_update_group but some refactoring is needed first.
+ */
+ sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+ ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+}
+
static int btrfs_init_debugfs(void)
{
#ifdef CONFIG_DEBUG_FS
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 9c09522125a6..d7da1a4c2f6c 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -56,7 +56,7 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \
#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
- BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature)
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
@@ -90,4 +90,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
struct kobject *parent);
int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+ u64 bit, enum btrfs_feature_set set);
+
#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index b1d920b30070..0e1e61a7ec23 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -82,18 +82,18 @@ void btrfs_destroy_test_fs(void)
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
{
struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
- GFP_NOFS);
+ GFP_KERNEL);
if (!fs_info)
return fs_info;
fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
- GFP_NOFS);
+ GFP_KERNEL);
if (!fs_info->fs_devices) {
kfree(fs_info);
return NULL;
}
fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
- GFP_NOFS);
+ GFP_KERNEL);
if (!fs_info->super_copy) {
kfree(fs_info->fs_devices);
kfree(fs_info);
@@ -180,11 +180,11 @@ btrfs_alloc_dummy_block_group(unsigned long length)
{
struct btrfs_block_group_cache *cache;
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ cache = kzalloc(sizeof(*cache), GFP_KERNEL);
if (!cache)
return NULL;
cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
- GFP_NOFS);
+ GFP_KERNEL);
if (!cache->free_space_ctl) {
kfree(cache);
return NULL;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index e29fa297e053..669b58201e36 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -94,7 +94,7 @@ static int test_find_delalloc(void)
* test.
*/
for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
test_msg("Failed to allocate test page\n");
ret = -ENOMEM;
@@ -113,7 +113,7 @@ static int test_find_delalloc(void)
* |--- delalloc ---|
* |--- search ---|
*/
- set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+ set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL);
start = 0;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -144,7 +144,7 @@ static int test_find_delalloc(void)
test_msg("Couldn't find the locked page\n");
goto out_bits;
}
- set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+ set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -199,7 +199,7 @@ static int test_find_delalloc(void)
*
* We are re-using our test_start from above since it works out well.
*/
- set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+ set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -262,7 +262,7 @@ static int test_find_delalloc(void)
}
ret = 0;
out_bits:
- clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
+ clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL);
out:
if (locked_page)
page_cache_release(locked_page);
@@ -360,7 +360,7 @@ static int test_eb_bitmaps(void)
test_msg("Running extent buffer bitmap tests\n");
- bitmap = kmalloc(len, GFP_NOFS);
+ bitmap = kmalloc(len, GFP_KERNEL);
if (!bitmap) {
test_msg("Couldn't allocate test bitmap\n");
return -ENOMEM;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 5de55fdd28bc..e2d3da02deee 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -974,7 +974,7 @@ static int test_extent_accounting(void)
(BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
EXTENT_DELALLOC | EXTENT_DIRTY |
EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
- NULL, GFP_NOFS);
+ NULL, GFP_KERNEL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1045,7 +1045,7 @@ static int test_extent_accounting(void)
BTRFS_MAX_EXTENT_SIZE+8191,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
- NULL, GFP_NOFS);
+ NULL, GFP_KERNEL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1079,7 +1079,7 @@ static int test_extent_accounting(void)
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
- NULL, GFP_NOFS);
+ NULL, GFP_KERNEL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1096,7 +1096,7 @@ out:
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
- NULL, GFP_NOFS);
+ NULL, GFP_KERNEL);
iput(inode);
btrfs_free_dummy_root(root);
return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9d2f..978c3a810893 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *path,
struct list_head *logged_list,
- struct btrfs_log_ctx *ctx)
+ struct btrfs_log_ctx *ctx,
+ const u64 start,
+ const u64 end)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
}
list_sort(NULL, &extents, extent_cmp);
-
+ /*
+ * Collect any new ordered extents within the range. This is to
+ * prevent logging file extent items without waiting for the disk
+ * location they point to being written. We do this only to deal
+ * with races against concurrent lockless direct IO writes.
+ */
+ btrfs_get_logged_extents(inode, logged_list, start, end);
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ log_extents:
goto out_unlock;
}
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- &logged_list, ctx);
+ &logged_list, ctx, start, end);
if (ret) {
err = ret;
goto out_unlock;
diff --git a/fs/buffer.c b/fs/buffer.c
index e1632abb4ca9..33be29675358 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -621,17 +621,17 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
* If warn is true, then emit a warning if the page is not uptodate and has
* not been truncated.
*
- * The caller must hold mem_cgroup_begin_page_stat() lock.
+ * The caller must hold lock_page_memcg().
*/
static void __set_page_dirty(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, int warn)
+ int warn)
{
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
@@ -666,7 +666,6 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping,
int __set_page_dirty_buffers(struct page *page)
{
int newly_dirty;
- struct mem_cgroup *memcg;
struct address_space *mapping = page_mapping(page);
if (unlikely(!mapping))
@@ -683,17 +682,17 @@ int __set_page_dirty_buffers(struct page *page)
} while (bh != head);
}
/*
- * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
- * per-memcg dirty page counters.
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
*/
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, memcg, 1);
+ __set_page_dirty(page, mapping, 1);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1167,15 +1166,14 @@ void mark_buffer_dirty(struct buffer_head *bh)
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
struct address_space *mapping = NULL;
- struct mem_cgroup *memcg;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
mapping = page_mapping(page);
if (mapping)
- __set_page_dirty(page, mapping, memcg, 0);
+ __set_page_dirty(page, mapping, 0);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c22213789090..19adeb0ef82a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1756,6 +1756,10 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
u32 pool;
int ret, flags;
+ /* does not support pool namespace yet */
+ if (ci->i_pool_ns_len)
+ return -EIO;
+
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
NOPOOLPERM))
return 0;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cdbf8cf3d52c..6fe0ad26a7df 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2753,7 +2753,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
void *inline_data, int inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap, int issued)
+ struct ceph_cap *cap, int issued,
+ u32 pool_ns_len)
__releases(ci->i_ceph_lock)
__releases(mdsc->snap_rwsem)
{
@@ -2873,6 +2874,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
/* file layout may have changed */
ci->i_layout = grant->layout;
+ ci->i_pool_ns_len = pool_ns_len;
+
/* size/truncate_seq? */
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(grant->truncate_seq),
@@ -3411,6 +3414,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u32 inline_len = 0;
void *snaptrace;
size_t snaptrace_len;
+ u32 pool_ns_len = 0;
void *p, *end;
dout("handle_caps from mds%d\n", mds);
@@ -3463,6 +3467,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
p += inline_len;
}
+ if (le16_to_cpu(msg->hdr.version) >= 8) {
+ u64 flush_tid;
+ u32 caller_uid, caller_gid;
+ u32 osd_epoch_barrier;
+ /* version >= 5 */
+ ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+ /* version >= 6 */
+ ceph_decode_64_safe(&p, end, flush_tid, bad);
+ /* version >= 7 */
+ ceph_decode_32_safe(&p, end, caller_uid, bad);
+ ceph_decode_32_safe(&p, end, caller_gid, bad);
+ /* version >= 8 */
+ ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+ }
+
/* lookup ino */
inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode);
@@ -3518,7 +3537,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
&cap, &issued);
handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued);
+ msg->middle, session, cap, issued,
+ pool_ns_len);
if (realm)
ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
@@ -3542,7 +3562,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
issued |= __ceph_caps_dirty(ci);
handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued);
+ msg->middle, session, cap, issued,
+ pool_ns_len);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 86a9c383955e..eb9028e8cfc5 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -698,8 +698,8 @@ static void ceph_aio_retry_work(struct work_struct *work)
req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
false, GFP_NOFS);
- if (IS_ERR(req)) {
- ret = PTR_ERR(req);
+ if (!req) {
+ ret = -ENOMEM;
req = orig_req;
goto out;
}
@@ -716,7 +716,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
snapc, CEPH_NOSNAP, &aio_req->mtime);
- ceph_put_snap_context(snapc);
ceph_osdc_put_request(orig_req);
req->r_callback = ceph_aio_complete_req;
@@ -731,6 +730,7 @@ out:
ceph_aio_complete_req(req, NULL);
}
+ ceph_put_snap_context(snapc);
kfree(aio_work);
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index fb4ba2e4e2a5..5849b88bbed3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ ci->i_pool_ns_len = 0;
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -756,6 +757,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
ci->i_layout = info->layout;
+ ci->i_pool_ns_len = iinfo->pool_ns_len;
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index e7b130a637f9..911d64d865f1 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -100,6 +100,14 @@ static int parse_reply_info_in(void **p, void *end,
} else
info->inline_version = CEPH_INLINE_NONE;
+ if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ *p += info->pool_ns_len;
+ } else {
+ info->pool_ns_len = 0;
+ }
+
return 0;
bad:
return err;
@@ -2298,6 +2306,14 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
+ /* deny access to directories with pool_ns layouts */
+ if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
+ ceph_inode(req->r_inode)->i_pool_ns_len)
+ return -EIO;
+ if (req->r_locked_dir &&
+ ceph_inode(req->r_locked_dir)->i_pool_ns_len)
+ return -EIO;
+
/* issue */
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ccf11ef0ca87..37712ccffcc6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -44,6 +44,7 @@ struct ceph_mds_reply_info_in {
u64 inline_version;
u32 inline_len;
char *inline_data;
+ u32 pool_ns_len;
};
/*
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 75b7d125ce66..9c458eb52245 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -287,6 +287,7 @@ struct ceph_inode_info {
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
+ size_t i_pool_ns_len;
char *i_symlink;
/* for dirs */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 7dc886c9a78f..e956cba94338 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -175,7 +175,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
* string to the length of the original string to allow for worst case.
*/
md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
- mountdata = kzalloc(md_len + 1, GFP_KERNEL);
+ mountdata = kzalloc(md_len + sizeof("ip=") + 1, GFP_KERNEL);
if (mountdata == NULL) {
rc = -ENOMEM;
goto compose_mount_options_err;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d41165433260..4897dacf8944 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -715,7 +715,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
if (!ses->auth_key.response) {
- rc = ENOMEM;
+ rc = -ENOMEM;
ses->auth_key.len = 0;
goto setup_ntlmv2_rsp_ret;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c48ca13673e3..2eea40353e60 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1013,7 +1013,6 @@ const struct file_operations cifs_file_strict_ops = {
.llseek = cifs_llseek,
.unlocked_ioctl = cifs_ioctl,
.clone_file_range = cifs_clone_file_range,
- .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 68c4547528c4..83aac8ba50b0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -31,19 +31,15 @@
* so that it will fit. We use hash_64 to convert the value to 31 bits, and
* then add 1, to ensure that we don't end up with a 0 as the value.
*/
-#if BITS_PER_LONG == 64
static inline ino_t
cifs_uniqueid_to_ino_t(u64 fileid)
{
+ if ((sizeof(ino_t)) < (sizeof(u64)))
+ return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
+
return (ino_t)fileid;
+
}
-#else
-static inline ino_t
-cifs_uniqueid_to_ino_t(u64 fileid)
-{
- return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
-}
-#endif
extern struct file_system_type cifs_fs_type;
extern const struct address_space_operations cifs_addr_ops;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 90b4f9f7de66..76fcb50295a3 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1396,11 +1396,10 @@ openRetry:
* current bigbuf.
*/
static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+discard_remaining_data(struct TCP_Server_Info *server)
{
unsigned int rfclen = get_rfc1002_length(server->smallbuf);
int remaining = rfclen + 4 - server->total_read;
- struct cifs_readdata *rdata = mid->callback_data;
while (remaining > 0) {
int length;
@@ -1414,10 +1413,20 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
remaining -= length;
}
- dequeue_mid(mid, rdata->result);
return 0;
}
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+ int length;
+ struct cifs_readdata *rdata = mid->callback_data;
+
+ length = discard_remaining_data(server);
+ dequeue_mid(mid, rdata->result);
+ return length;
+}
+
int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
@@ -1446,6 +1455,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return length;
server->total_read += length;
+ if (server->ops->is_status_pending &&
+ server->ops->is_status_pending(buf, server, 0)) {
+ discard_remaining_data(server);
+ return -1;
+ }
+
/* Was the SMB read successful? */
rdata->result = server->ops->map_error(buf, false);
if (rdata->result != 0) {
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4fbd92d2e113..a763cd3d9e7c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2999,8 +2999,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
if (ses_init_buf) {
ses_init_buf->trailer.session_req.called_len = 32;
- if (server->server_RFC1001_name &&
- server->server_RFC1001_name[0] != 0)
+ if (server->server_RFC1001_name[0] != 0)
rfc1002mangle(ses_init_buf->trailer.
session_req.called_name,
server->server_RFC1001_name,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 10f8d5cf5681..42e1f440eb1e 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1106,21 +1106,25 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
{
char *data_offset;
struct create_context *cc;
- unsigned int next = 0;
+ unsigned int next;
+ unsigned int remaining;
char *name;
data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
+ remaining = le32_to_cpu(rsp->CreateContextsLength);
cc = (struct create_context *)data_offset;
- do {
- cc = (struct create_context *)((char *)cc + next);
+ while (remaining >= sizeof(struct create_context)) {
name = le16_to_cpu(cc->NameOffset) + (char *)cc;
- if (le16_to_cpu(cc->NameLength) != 4 ||
- strncmp(name, "RqLs", 4)) {
- next = le32_to_cpu(cc->Next);
- continue;
- }
- return server->ops->parse_lease_buf(cc, epoch);
- } while (next != 0);
+ if (le16_to_cpu(cc->NameLength) == 4 &&
+ strncmp(name, "RqLs", 4) == 0)
+ return server->ops->parse_lease_buf(cc, epoch);
+
+ next = le32_to_cpu(cc->Next);
+ if (!next)
+ break;
+ remaining -= next;
+ cc = (struct create_context *)((char *)cc + next);
+ }
return 0;
}
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a5b8eb69a8f4..6402eaf8ab95 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1261,6 +1261,9 @@ COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
COMPATIBLE_IOCTL(HCIINQUIRY)
COMPATIBLE_IOCTL(HCIUARTSETPROTO)
COMPATIBLE_IOCTL(HCIUARTGETPROTO)
+COMPATIBLE_IOCTL(HCIUARTGETDEVICE)
+COMPATIBLE_IOCTL(HCIUARTSETFLAGS)
+COMPATIBLE_IOCTL(HCIUARTGETFLAGS)
COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
diff --git a/fs/dax.c b/fs/dax.c
index 4fd6b0c5c6b5..bbb2ad783770 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -58,16 +58,35 @@ static void dax_unmap_atomic(struct block_device *bdev,
blk_queue_exit(bdev->bd_queue);
}
+struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+{
+ struct page *page = alloc_pages(GFP_KERNEL, 0);
+ struct blk_dax_ctl dax = {
+ .size = PAGE_SIZE,
+ .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
+ };
+ long rc;
+
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ rc = dax_map_atomic(bdev, &dax);
+ if (rc < 0)
+ return ERR_PTR(rc);
+ memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
+ dax_unmap_atomic(bdev, &dax);
+ return page;
+}
+
/*
- * dax_clear_blocks() is called from within transaction context from XFS,
+ * dax_clear_sectors() is called from within transaction context from XFS,
* and hence this means the stack from this point must follow GFP_NOFS
* semantics for all operations.
*/
-int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
+int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
{
- struct block_device *bdev = inode->i_sb->s_bdev;
struct blk_dax_ctl dax = {
- .sector = block << (inode->i_blkbits - 9),
+ .sector = _sector,
.size = _size,
};
@@ -89,7 +108,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
wmb_pmem();
return 0;
}
-EXPORT_SYMBOL_GPL(dax_clear_blocks);
+EXPORT_SYMBOL_GPL(dax_clear_sectors);
/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
@@ -338,7 +357,8 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
void *entry;
WARN_ON_ONCE(pmd_entry && !dirty);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ if (dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
spin_lock_irq(&mapping->tree_lock);
@@ -464,11 +484,10 @@ static int dax_writeback_one(struct block_device *bdev,
* end]. This is required by data integrity operations to ensure file data is
* on persistent storage prior to completion of the operation.
*/
-int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
- loff_t end)
+int dax_writeback_mapping_range(struct address_space *mapping,
+ struct block_device *bdev, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- struct block_device *bdev = inode->i_sb->s_bdev;
pgoff_t start_index, end_index, pmd_index;
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
@@ -479,8 +498,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;
- start_index = start >> PAGE_CACHE_SHIFT;
- end_index = end >> PAGE_CACHE_SHIFT;
+ if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
+ return 0;
+
+ start_index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end_index = wbc->range_end >> PAGE_CACHE_SHIFT;
pmd_index = DAX_PMD_INDEX(start_index);
rcu_read_lock();
@@ -1034,6 +1056,7 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct file *file = vma->vm_file;
+ int error;
/*
* We pass NO_SECTOR to dax_radix_entry() because we expect that a
@@ -1043,7 +1066,13 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
* saves us from having to make a call to get_block() here to look
* up the sector.
*/
- dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
+ error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
+ true);
+
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (error)
+ return VM_FAULT_SIGBUS;
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/fs/dcache.c b/fs/dcache.c
index 92d5140de851..2398f9f94337 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -269,9 +269,6 @@ static inline int dname_external(const struct dentry *dentry)
return dentry->d_name.name != dentry->d_iname;
}
-/*
- * Make sure other CPUs see the inode attached before the type is set.
- */
static inline void __d_set_inode_and_type(struct dentry *dentry,
struct inode *inode,
unsigned type_flags)
@@ -279,28 +276,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
unsigned flags;
dentry->d_inode = inode;
- smp_wmb();
flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
flags |= type_flags;
WRITE_ONCE(dentry->d_flags, flags);
}
-/*
- * Ideally, we want to make sure that other CPUs see the flags cleared before
- * the inode is detached, but this is really a violation of RCU principles
- * since the ordering suggests we should always set inode before flags.
- *
- * We should instead replace or discard the entire dentry - but that sucks
- * performancewise on mass deletion/rename.
- */
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
unsigned flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
WRITE_ONCE(dentry->d_flags, flags);
- smp_wmb();
dentry->d_inode = NULL;
}
@@ -370,9 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
+
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
- dentry_rcuwalk_invalidate(dentry);
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@@ -1758,8 +1747,9 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
spin_lock(&dentry->d_lock);
if (inode)
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
- dentry_rcuwalk_invalidate(dentry);
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
fsnotify_d_instantiate(dentry, inode);
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1f107fd51328..655f21f99160 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -575,6 +575,26 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
mutex_unlock(&allocated_ptys_lock);
}
+/*
+ * pty code needs to hold extra references in case of last /dev/tty close
+ */
+
+void devpts_add_ref(struct inode *ptmx_inode)
+{
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+
+ atomic_inc(&sb->s_active);
+ ihold(ptmx_inode);
+}
+
+void devpts_del_ref(struct inode *ptmx_inode)
+{
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+
+ iput(ptmx_inode);
+ deactivate_super(sb);
+}
+
/**
* devpts_pty_new -- create a new inode in /dev/pts/
* @ptmx_inode: inode of the master
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1b2f7ffc8b84..d6a9012d42ad 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -472,8 +472,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
dio->io_error = -EIO;
if (dio->is_async && dio->rw == READ && dio->should_dirty) {
- bio_check_pages_dirty(bio); /* transfers ownership */
err = bio->bi_error;
+ bio_check_pages_dirty(bio); /* transfers ownership */
} else {
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index c424e4813ec8..d48e0d261d78 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -10,6 +10,7 @@
#include <linux/efi.h>
#include <linux/fs.h>
#include <linux/slab.h>
+#include <linux/mount.h>
#include "internal.h"
@@ -103,9 +104,78 @@ out_free:
return size;
}
+static int
+efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_mapping->host;
+ unsigned int i_flags;
+ unsigned int flags = 0;
+
+ i_flags = inode->i_flags;
+ if (i_flags & S_IMMUTABLE)
+ flags |= FS_IMMUTABLE_FL;
+
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+efivarfs_ioc_setxflags(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_mapping->host;
+ unsigned int flags;
+ unsigned int i_flags = 0;
+ int error;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (copy_from_user(&flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ if (flags & ~FS_IMMUTABLE_FL)
+ return -EOPNOTSUPP;
+
+ if (!capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ if (flags & FS_IMMUTABLE_FL)
+ i_flags |= S_IMMUTABLE;
+
+
+ error = mnt_want_write_file(file);
+ if (error)
+ return error;
+
+ inode_lock(inode);
+ inode_set_flags(inode, i_flags, S_IMMUTABLE);
+ inode_unlock(inode);
+
+ mnt_drop_write_file(file);
+
+ return 0;
+}
+
+long
+efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+ void __user *arg = (void __user *)p;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return efivarfs_ioc_getxflags(file, arg);
+ case FS_IOC_SETFLAGS:
+ return efivarfs_ioc_setxflags(file, arg);
+ }
+
+ return -ENOTTY;
+}
+
const struct file_operations efivarfs_file_operations = {
.open = simple_open,
.read = efivarfs_file_read,
.write = efivarfs_file_write,
.llseek = no_llseek,
+ .unlocked_ioctl = efivarfs_file_ioctl,
};
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 3381b9da9ee6..e2ab6d0497f2 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -15,7 +15,8 @@
#include "internal.h"
struct inode *efivarfs_get_inode(struct super_block *sb,
- const struct inode *dir, int mode, dev_t dev)
+ const struct inode *dir, int mode,
+ dev_t dev, bool is_removable)
{
struct inode *inode = new_inode(sb);
@@ -23,6 +24,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_fop = &efivarfs_file_operations;
@@ -102,22 +104,17 @@ static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
static int efivarfs_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
- struct inode *inode;
+ struct inode *inode = NULL;
struct efivar_entry *var;
int namelen, i = 0, err = 0;
+ bool is_removable = false;
if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len))
return -EINVAL;
- inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0);
- if (!inode)
- return -ENOMEM;
-
var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
- if (!var) {
- err = -ENOMEM;
- goto out;
- }
+ if (!var)
+ return -ENOMEM;
/* length of the variable name itself: remove GUID and separator */
namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
@@ -125,6 +122,16 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1,
&var->var.VendorGuid);
+ if (efivar_variable_is_removable(var->var.VendorGuid,
+ dentry->d_name.name, namelen))
+ is_removable = true;
+
+ inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0, is_removable);
+ if (!inode) {
+ err = -ENOMEM;
+ goto out;
+ }
+
for (i = 0; i < namelen; i++)
var->var.VariableName[i] = dentry->d_name.name[i];
@@ -138,7 +145,8 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
out:
if (err) {
kfree(var);
- iput(inode);
+ if (inode)
+ iput(inode);
}
return err;
}
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index b5ff16addb7c..b4505188e799 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -15,7 +15,8 @@ extern const struct file_operations efivarfs_file_operations;
extern const struct inode_operations efivarfs_dir_inode_operations;
extern bool efivarfs_valid_name(const char *str, int len);
extern struct inode *efivarfs_get_inode(struct super_block *sb,
- const struct inode *dir, int mode, dev_t dev);
+ const struct inode *dir, int mode, dev_t dev,
+ bool is_removable);
extern struct list_head efivarfs_list;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index b8a564f29107..dd029d13ea61 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -118,8 +118,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
struct dentry *dentry, *root = sb->s_root;
unsigned long size = 0;
char *name;
- int len, i;
+ int len;
int err = -ENOMEM;
+ bool is_removable = false;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
@@ -128,15 +129,17 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
memcpy(entry->var.VariableName, name16, name_size);
memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t));
- len = ucs2_strlen(entry->var.VariableName);
+ len = ucs2_utf8size(entry->var.VariableName);
/* name, plus '-', plus GUID, plus NUL*/
name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL);
if (!name)
goto fail;
- for (i = 0; i < len; i++)
- name[i] = entry->var.VariableName[i] & 0xFF;
+ ucs2_as_utf8(name, entry->var.VariableName, len);
+
+ if (efivar_variable_is_removable(entry->var.VendorGuid, name, len))
+ is_removable = true;
name[len] = '-';
@@ -144,7 +147,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
- inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0);
+ inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0,
+ is_removable);
if (!inode)
goto fail_name;
@@ -200,7 +204,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_d_op = &efivarfs_d_ops;
sb->s_time_gran = 1;
- inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+ inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true);
if (!inode)
return -ENOMEM;
inode->i_op = &efivarfs_dir_inode_operations;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ae1dbcf47e97..cde60741cad2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -94,6 +94,11 @@
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
+#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
+
/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4
@@ -1068,7 +1073,22 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* wait list.
*/
if (waitqueue_active(&ep->wq)) {
- ewake = 1;
+ if ((epi->event.events & EPOLLEXCLUSIVE) &&
+ !((unsigned long)key & POLLFREE)) {
+ switch ((unsigned long)key & EPOLLINOUT_BITS) {
+ case POLLIN:
+ if (epi->event.events & POLLIN)
+ ewake = 1;
+ break;
+ case POLLOUT:
+ if (epi->event.events & POLLOUT)
+ ewake = 1;
+ break;
+ case 0:
+ ewake = 1;
+ break;
+ }
+ }
wake_up_locked(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
@@ -1875,9 +1895,13 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
- if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD ||
- (op == EPOLL_CTL_ADD && is_file_epoll(tf.file))))
- goto error_tgt_fput;
+ if (epds.events & EPOLLEXCLUSIVE) {
+ if (op == EPOLL_CTL_MOD)
+ goto error_tgt_fput;
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+ (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+ goto error_tgt_fput;
+ }
/*
* At this point it is safe to assume that the "private_data" contains
@@ -1950,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
break;
case EPOLL_CTL_MOD:
if (epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_modify(ep, epi, &epds);
+ if (!(epi->event.events & EPOLLEXCLUSIVE)) {
+ epds.events |= POLLERR | POLLHUP;
+ error = ep_modify(ep, epi, &epds);
+ }
} else
error = -ENOENT;
break;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2c88d683cd91..c1400b109805 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -80,23 +80,6 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
return ret;
}
-static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vma->vm_file);
- struct ext2_inode_info *ei = EXT2_I(inode);
- int ret;
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
- down_read(&ei->dax_sem);
-
- ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL);
-
- up_read(&ei->dax_sem);
- sb_end_pagefault(inode->i_sb);
- return ret;
-}
-
static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
struct vm_fault *vmf)
{
@@ -124,7 +107,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
.pmd_fault = ext2_dax_pmd_fault,
- .page_mkwrite = ext2_dax_mkwrite,
+ .page_mkwrite = ext2_dax_fault,
.pfn_mkwrite = ext2_dax_pfn_mkwrite,
};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 338eefda70c6..6bd58e6ff038 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -737,8 +737,10 @@ static int ext2_get_blocks(struct inode *inode,
* so that it's not found by another thread before it's
* initialised
*/
- err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
- 1 << inode->i_blkbits);
+ err = dax_clear_sectors(inode->i_sb->s_bdev,
+ le32_to_cpu(chain[depth-1].key) <<
+ (inode->i_blkbits - 9),
+ 1 << inode->i_blkbits);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
@@ -874,6 +876,14 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
+#ifdef CONFIG_FS_DAX
+ if (dax_mapping(mapping)) {
+ return dax_writeback_mapping_range(mapping,
+ mapping->host->i_sb->s_bdev,
+ wbc);
+ }
+#endif
+
return mpage_writepages(mapping, wbc, ext2_get_block);
}
@@ -1296,7 +1306,7 @@ void ext2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_NOATIME;
if (flags & EXT2_DIRSYNC_FL)
inode->i_flags |= S_DIRSYNC;
- if (test_opt(inode->i_sb, DAX))
+ if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
inode->i_flags |= S_DAX;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index ec0668a60678..fe1f50fe764f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -191,7 +191,6 @@ static int ext4_init_block_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -442,14 +441,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
- if (err)
+ if (err) {
+ ext4_error(sb, "Failed to init block bitmap for group "
+ "%u: %d", block_group, err);
goto out;
+ }
goto verify;
}
ext4_unlock_group(sb, block_group);
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 845c22a5a744..edc053a81914 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -465,3 +465,59 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
return size;
return 0;
}
+
+/*
+ * Validate dentries for encrypted directories to make sure we aren't
+ * potentially caching stale data after a key has been added or
+ * removed.
+ */
+static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct ext4_crypt_info *ci = EXT4_I(dir)->i_crypt_info;
+ int dir_has_key, cached_with_key;
+
+ if (!ext4_encrypted_inode(dir))
+ return 0;
+
+ if (ci && ci->ci_keyring_key &&
+ (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+ (1 << KEY_FLAG_REVOKED) |
+ (1 << KEY_FLAG_DEAD))))
+ ci = NULL;
+
+ /* this should eventually be an flag in d_flags */
+ cached_with_key = dentry->d_fsdata != NULL;
+ dir_has_key = (ci != NULL);
+
+ /*
+ * If the dentry was cached without the key, and it is a
+ * negative dentry, it might be a valid name. We can't check
+ * if the key has since been made available due to locking
+ * reasons, so we fail the validation so ext4_lookup() can do
+ * this check.
+ *
+ * We also fail the validation if the dentry was created with
+ * the key present, but we no longer have the key, or vice versa.
+ */
+ if ((!cached_with_key && d_is_negative(dentry)) ||
+ (!cached_with_key && dir_has_key) ||
+ (cached_with_key && !dir_has_key)) {
+#if 0 /* Revalidation debug */
+ char buf[80];
+ char *cp = simple_dname(dentry, buf, sizeof(buf));
+
+ if (IS_ERR(cp))
+ cp = (char *) "???";
+ pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata,
+ cached_with_key, d_is_negative(dentry),
+ dir_has_key);
+#endif
+ return 0;
+ }
+ return 1;
+}
+
+const struct dentry_operations ext4_encrypted_d_ops = {
+ .d_revalidate = ext4_d_revalidate,
+};
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 1d1bca74f844..33f5e2a50cf8 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -111,6 +111,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
int dir_has_error = 0;
struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
+ if (ext4_encrypted_inode(inode)) {
+ err = ext4_get_encryption_info(inode);
+ if (err && err != -ENOKEY)
+ return err;
+ }
+
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(file, ctx);
if (err != ERR_BAD_DX_DIR) {
@@ -157,8 +163,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
index, 1);
file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ bh = NULL;
+ goto errout;
+ }
}
if (!bh) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0662b285dc8a..157b458a69d4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2302,6 +2302,7 @@ struct page *ext4_encrypt(struct inode *inode,
int ext4_decrypt(struct page *page);
int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
+extern const struct dentry_operations ext4_encrypted_d_ops;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
int ext4_init_crypto(void);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0ffabaf90aa5..3753ceb0b0dd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3928,7 +3928,7 @@ static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path **ppath, int flags,
- unsigned int allocated, ext4_fsblk_t newblock)
+ unsigned int allocated)
{
struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
@@ -4347,7 +4347,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
allocated = convert_initialized_extent(
handle, inode, map, &path,
- flags, allocated, newblock);
+ flags, allocated);
goto out2;
} else if (!ext4_ext_is_unwritten(ex))
goto out;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1126436dada1..4cd318f31cbe 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -262,23 +262,8 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
return result;
}
-static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- int err;
- struct inode *inode = file_inode(vma->vm_file);
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
- err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL);
- up_read(&EXT4_I(inode)->i_mmap_sem);
- sb_end_pagefault(inode->i_sb);
-
- return err;
-}
-
/*
- * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
* handler we check for races agaist truncate. Note that since we cycle through
* i_mmap_sem, we are sure that also any hole punching that began before we
* were called is finished by now and so if it included part of the file we
@@ -311,7 +296,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.pmd_fault = ext4_dax_pmd_fault,
- .page_mkwrite = ext4_dax_mkwrite,
+ .page_mkwrite = ext4_dax_fault,
.pfn_mkwrite = ext4_dax_pfn_mkwrite,
};
#else
@@ -350,6 +335,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct vfsmount *mnt = filp->f_path.mnt;
+ struct inode *dir = filp->f_path.dentry->d_parent->d_inode;
struct path path;
char buf[64], *cp;
int ret;
@@ -393,6 +379,14 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
if (ext4_encryption_info(inode) == NULL)
return -ENOKEY;
}
+ if (ext4_encrypted_inode(dir) &&
+ !ext4_is_child_context_consistent_with_parent(dir, inode)) {
+ ext4_warning(inode->i_sb,
+ "Inconsistent encryption contexts: %lu/%lu\n",
+ (unsigned long) dir->i_ino,
+ (unsigned long) inode->i_ino);
+ return -EPERM;
+ }
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3fcfd50a2e8a..acc0ad56bf2f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,7 +76,6 @@ static int ext4_init_inode_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -191,8 +190,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
- if (err)
+ if (err) {
+ ext4_error(sb, "Failed to init inode bitmap for group "
+ "%u: %d", block_group, err);
goto out;
+ }
return bh;
}
ext4_unlock_group(sb, block_group);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 83bc8bfb3bea..aee960b1af34 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -686,6 +686,34 @@ out_sem:
return retval;
}
+/*
+ * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
+ * we have to be careful as someone else may be manipulating b_state as well.
+ */
+static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
+{
+ unsigned long old_state;
+ unsigned long new_state;
+
+ flags &= EXT4_MAP_FLAGS;
+
+ /* Dummy buffer_head? Set non-atomically. */
+ if (!bh->b_page) {
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
+ return;
+ }
+ /*
+ * Someone else may be modifying b_state. Be careful! This is ugly but
+ * once we get rid of using bh as a container for mapping information
+ * to pass to / from get_block functions, this can go away.
+ */
+ do {
+ old_state = READ_ONCE(bh->b_state);
+ new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
+ } while (unlikely(
+ cmpxchg(&bh->b_state, old_state, new_state) != old_state));
+}
+
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
@@ -722,7 +750,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ext4_io_end_t *io_end = ext4_inode_aio(inode);
map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ ext4_update_bh_state(bh, map.m_flags);
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -1685,7 +1713,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return ret;
map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ ext4_update_bh_state(bh, map.m_flags);
if (buffer_unwritten(bh)) {
/* A delayed write to unwritten bh should be marked
@@ -2450,6 +2478,10 @@ static int ext4_writepages(struct address_space *mapping,
trace_ext4_writepages(inode, wbc);
+ if (dax_mapping(mapping))
+ return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+ wbc);
+
/*
* No pages to write? This is mainly a kludge to avoid starting
* a transaction for special inodes like journal inode on last iput()
@@ -3253,29 +3285,29 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* case, we allocate an io_end structure to hook to the iocb.
*/
iocb->private = NULL;
- ext4_inode_aio_set(inode, NULL);
- if (!is_sync_kiocb(iocb)) {
- io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end) {
- ret = -ENOMEM;
- goto retake_lock;
- }
- /*
- * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
- */
- iocb->private = ext4_get_io_end(io_end);
- /*
- * we save the io structure for current async direct
- * IO, so that later ext4_map_blocks() could flag the
- * io structure whether there is a unwritten extents
- * needs to be converted when IO is completed.
- */
- ext4_inode_aio_set(inode, io_end);
- }
-
if (overwrite) {
get_block_func = ext4_get_block_overwrite;
} else {
+ ext4_inode_aio_set(inode, NULL);
+ if (!is_sync_kiocb(iocb)) {
+ io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end) {
+ ret = -ENOMEM;
+ goto retake_lock;
+ }
+ /*
+ * Grab reference for DIO. Will be dropped in
+ * ext4_end_io_dio()
+ */
+ iocb->private = ext4_get_io_end(io_end);
+ /*
+ * we save the io structure for current async direct
+ * IO, so that later ext4_map_blocks() could flag the
+ * io structure whether there is a unwritten extents
+ * needs to be converted when IO is completed.
+ */
+ ext4_inode_aio_set(inode, io_end);
+ }
get_block_func = ext4_get_block_write;
dio_flags = DIO_LOCKING;
}
@@ -4127,7 +4159,7 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (test_opt(inode->i_sb, DAX))
+ if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
new_fl |= S_DAX;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0f6c36922c24..eae5917c534e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -208,7 +208,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
{
struct ext4_inode_info *ei = EXT4_I(inode);
handle_t *handle = NULL;
- int err = EPERM, migrate = 0;
+ int err = -EPERM, migrate = 0;
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
unsigned int jflag;
@@ -583,6 +583,11 @@ group_extend_out:
"Online defrag not supported with bigalloc");
err = -EOPNOTSUPP;
goto mext_out;
+ } else if (IS_DAX(inode)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with DAX");
+ err = -EOPNOTSUPP;
+ goto mext_out;
}
err = mnt_want_write_file(filp);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 61eaf74dca37..4424b7bf8ac6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2285,7 +2285,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
if (group == 0)
seq_puts(seq, "#group: free frags first ["
" 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
- " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]");
+ " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
sizeof(struct ext4_group_info);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index fb6f11709ae6..4098acc701c3 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -265,11 +265,12 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
ext4_lblk_t orig_blk_offset, donor_blk_offset;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int tmp_data_size, data_size, replaced_size;
- int err2, jblocks, retries = 0;
+ int i, err2, jblocks, retries = 0;
int replaced_count = 0;
int from = data_offset_in_page << orig_inode->i_blkbits;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
struct super_block *sb = orig_inode->i_sb;
+ struct buffer_head *bh = NULL;
/*
* It needs twice the amount of ordinary journal buffers because
@@ -380,8 +381,17 @@ data_copy:
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- *err = __block_write_begin(pagep[0], from, replaced_size,
- ext4_get_block);
+ if (!page_has_buffers(pagep[0]))
+ create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0);
+ bh = page_buffers(pagep[0]);
+ for (i = 0; i < data_offset_in_page; i++)
+ bh = bh->b_this_page;
+ for (i = 0; i < block_len_in_page; i++) {
+ *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
+ if (*err < 0)
+ break;
+ bh = bh->b_this_page;
+ }
if (!*err)
*err = block_commit_write(pagep[0], from, from + replaced_size);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 06574dd77614..48e4b8907826 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1558,6 +1558,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct ext4_dir_entry_2 *de;
struct buffer_head *bh;
+ if (ext4_encrypted_inode(dir)) {
+ int res = ext4_get_encryption_info(dir);
+
+ /*
+ * This should be a properly defined flag for
+ * dentry->d_flags when we uplift this to the VFS.
+ * d_fsdata is set to (void *) 1 if if the dentry is
+ * created while the directory was encrypted and we
+ * don't have access to the key.
+ */
+ dentry->d_fsdata = NULL;
+ if (ext4_encryption_info(dir))
+ dentry->d_fsdata = (void *) 1;
+ d_set_d_op(dentry, &ext4_encrypted_d_ops);
+ if (res && res != -ENOKEY)
+ return ERR_PTR(res);
+ }
+
if (dentry->d_name.len > EXT4_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -1585,11 +1603,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
return ERR_PTR(-EFSCORRUPTED);
}
if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
- (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)) &&
+ (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!ext4_is_child_context_consistent_with_parent(dir,
inode)) {
+ int nokey = ext4_encrypted_inode(inode) &&
+ !ext4_encryption_info(inode);
+
iput(inode);
+ if (nokey)
+ return ERR_PTR(-ENOKEY);
ext4_warning(inode->i_sb,
"Inconsistent encryption contexts: %lu/%lu\n",
(unsigned long) dir->i_ino,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ad62d7acc315..34038e3598d5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -198,7 +198,7 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
if (flex_gd == NULL)
goto out3;
- if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+ if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
goto out2;
flex_gd->count = flexbg_size;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6915c950e6e8..5c46ed9f3e14 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
/* one round can affect upto 5 slots */
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
void __inode_attach_wb(struct inode *inode, struct page *page)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -424,6 +427,8 @@ skip_switch:
iput(inode);
kfree(isw);
+
+ atomic_dec(&isw_nr_in_flight);
}
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -433,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
/* needs to grab bh-unsafe locks, bounce to work item */
INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- schedule_work(&isw->work);
+ queue_work(isw_wq, &isw->work);
}
/**
@@ -469,7 +474,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+ if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+ inode->i_state & (I_WB_SWITCH | I_FREEING) ||
inode_to_wb(inode) == isw->new_wb) {
spin_unlock(&inode->i_lock);
goto out_free;
@@ -480,6 +486,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
ihold(inode);
isw->inode = inode;
+ atomic_inc(&isw_nr_in_flight);
+
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the mapping's
@@ -840,6 +848,33 @@ restart:
wb_put(last_wb);
}
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches. An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished. As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+ if (atomic_read(&isw_nr_in_flight)) {
+ synchronize_rcu();
+ flush_workqueue(isw_wq);
+ }
+}
+
+static int __init cgroup_writeback_init(void)
+{
+ isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ if (!isw_wq)
+ return -ENOMEM;
+ return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static struct bdi_writeback *
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 506765afa1a3..bb8d67e2740a 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -376,12 +376,11 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
dnode_secno dno;
int r;
- int rep = 0;
int err;
hpfs_lock(dir->i_sb);
hpfs_adjust_length(name, &len);
-again:
+
err = -ENOENT;
de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
if (!de)
@@ -401,33 +400,9 @@ again:
hpfs_error(dir->i_sb, "there was error when removing dirent");
err = -EFSERROR;
break;
- case 2: /* no space for deleting, try to truncate file */
-
+ case 2: /* no space for deleting */
err = -ENOSPC;
- if (rep++)
- break;
-
- dentry_unhash(dentry);
- if (!d_unhashed(dentry)) {
- hpfs_unlock(dir->i_sb);
- return -ENOSPC;
- }
- if (generic_permission(inode, MAY_WRITE) ||
- !S_ISREG(inode->i_mode) ||
- get_write_access(inode)) {
- d_rehash(dentry);
- } else {
- struct iattr newattrs;
- /*pr_info("truncating file before delete.\n");*/
- newattrs.ia_size = 0;
- newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
- err = notify_change(dentry, &newattrs, NULL);
- put_write_access(inode);
- if (!err)
- goto again;
- }
- hpfs_unlock(dir->i_sb);
- return -ENOSPC;
+ break;
default:
drop_nlink(inode);
err = 0;
diff --git a/fs/inode.c b/fs/inode.c
index 9f62db3bcc3e..69b8b526c194 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -154,6 +154,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_rdev = 0;
inode->dirtied_when = 0;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ inode->i_wb_frn_winner = 0;
+ inode->i_wb_frn_avg_time = 0;
+ inode->i_wb_frn_history = 0;
+#endif
+
if (security_inode_alloc(inode))
goto out;
spin_lock_init(&inode->i_lock);
diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index 3ea36554107f..8918ac905a3b 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -2,10 +2,6 @@
JFFS2 LOCKING DOCUMENTATION
---------------------------
-At least theoretically, JFFS2 does not require the Big Kernel Lock
-(BKL), which was always helpfully obtained for it by Linux 2.4 VFS
-code. It has its own locking, as described below.
-
This document attempts to describe the existing locking rules for
JFFS2. It is not expected to remain perfectly up to date, but ought to
be fairly close.
@@ -69,6 +65,7 @@ Ordering constraints:
any f->sem held.
2. Never attempt to lock two file mutexes in one thread.
No ordering rules have been made for doing so.
+ 3. Never lock a page cache page with f->sem held.
erase_completion_lock spinlock
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 0ae91ad6df2d..b288c8ae1236 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -50,7 +50,8 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c)
static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
- struct jffs2_inode_cache *ic)
+ struct jffs2_inode_cache *ic,
+ int *dir_hardlinks)
{
struct jffs2_full_dirent *fd;
@@ -69,19 +70,21 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n",
fd->name, fd->ino, ic->ino);
jffs2_mark_node_obsolete(c, fd->raw);
+ /* Clear the ic/raw union so it doesn't cause problems later. */
+ fd->ic = NULL;
continue;
}
+ /* From this point, fd->raw is no longer used so we can set fd->ic */
+ fd->ic = child_ic;
+ child_ic->pino_nlink++;
+ /* If we appear (at this stage) to have hard-linked directories,
+ * set a flag to trigger a scan later */
if (fd->type == DT_DIR) {
- if (child_ic->pino_nlink) {
- JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n",
- fd->name, fd->ino, ic->ino);
- /* TODO: What do we do about it? */
- } else {
- child_ic->pino_nlink = ic->ino;
- }
- } else
- child_ic->pino_nlink++;
+ child_ic->flags |= INO_FLAGS_IS_DIR;
+ if (child_ic->pino_nlink > 1)
+ *dir_hardlinks = 1;
+ }
dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino);
/* Can't free scan_dents so far. We might need them in pass 2 */
@@ -95,8 +98,7 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
*/
static int jffs2_build_filesystem(struct jffs2_sb_info *c)
{
- int ret;
- int i;
+ int ret, i, dir_hardlinks = 0;
struct jffs2_inode_cache *ic;
struct jffs2_full_dirent *fd;
struct jffs2_full_dirent *dead_fds = NULL;
@@ -120,7 +122,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
/* Now scan the directory tree, increasing nlink according to every dirent found. */
for_each_inode(i, c, ic) {
if (ic->scan_dents) {
- jffs2_build_inode_pass1(c, ic);
+ jffs2_build_inode_pass1(c, ic, &dir_hardlinks);
cond_resched();
}
}
@@ -156,6 +158,20 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
}
dbg_fsbuild("pass 2a complete\n");
+
+ if (dir_hardlinks) {
+ /* If we detected directory hardlinks earlier, *hopefully*
+ * they are gone now because some of the links were from
+ * dead directories which still had some old dirents lying
+ * around and not yet garbage-collected, but which have
+ * been discarded above. So clear the pino_nlink field
+ * in each directory, so that the final scan below can
+ * print appropriate warnings. */
+ for_each_inode(i, c, ic) {
+ if (ic->flags & INO_FLAGS_IS_DIR)
+ ic->pino_nlink = 0;
+ }
+ }
dbg_fsbuild("freeing temporary data structures\n");
/* Finally, we can scan again and free the dirent structs */
@@ -163,6 +179,33 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
while(ic->scan_dents) {
fd = ic->scan_dents;
ic->scan_dents = fd->next;
+ /* We do use the pino_nlink field to count nlink of
+ * directories during fs build, so set it to the
+ * parent ino# now. Now that there's hopefully only
+ * one. */
+ if (fd->type == DT_DIR) {
+ if (!fd->ic) {
+ /* We'll have complained about it and marked the coresponding
+ raw node obsolete already. Just skip it. */
+ continue;
+ }
+
+ /* We *have* to have set this in jffs2_build_inode_pass1() */
+ BUG_ON(!(fd->ic->flags & INO_FLAGS_IS_DIR));
+
+ /* We clear ic->pino_nlink ∀ directories' ic *only* if dir_hardlinks
+ * is set. Otherwise, we know this should never trigger anyway, so
+ * we don't do the check. And ic->pino_nlink still contains the nlink
+ * value (which is 1). */
+ if (dir_hardlinks && fd->ic->pino_nlink) {
+ JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u is also hard linked from dir ino #%u\n",
+ fd->name, fd->ino, ic->ino, fd->ic->pino_nlink);
+ /* Should we unlink it from its previous parent? */
+ }
+
+ /* For directories, ic->pino_nlink holds that parent inode # */
+ fd->ic->pino_nlink = ic->ino;
+ }
jffs2_free_full_dirent(fd);
}
ic->scan_dents = NULL;
@@ -241,11 +284,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c,
/* Reduce nlink of the child. If it's now zero, stick it on the
dead_fds list to be cleaned up later. Else just free the fd */
-
- if (fd->type == DT_DIR)
- child_ic->pino_nlink = 0;
- else
- child_ic->pino_nlink--;
+ child_ic->pino_nlink--;
if (!child_ic->pino_nlink) {
dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n",
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index d211b8e18566..30c4c9ebb693 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
__func__, ret);
- /* Might as well let the VFS know */
- d_instantiate(new_dentry, d_inode(old_dentry));
- ihold(d_inode(old_dentry));
+ /*
+ * We can't keep the target in dcache after that.
+ * For one thing, we can't afford dentry aliases for directories.
+ * For another, if there was a victim, we _can't_ set new inode
+ * for that sucker and we have to trigger mount eviction - the
+ * caller won't do it on its own since we are returning an error.
+ */
+ d_invalidate(new_dentry);
new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
return ret;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index c5ac5944bc1b..cad86bac3453 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -137,39 +137,33 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct page *pg;
struct inode *inode = mapping->host;
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
- struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
- struct jffs2_raw_inode ri;
- uint32_t alloc_len = 0;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
uint32_t pageofs = index << PAGE_CACHE_SHIFT;
int ret = 0;
- jffs2_dbg(1, "%s()\n", __func__);
-
- if (pageofs > inode->i_size) {
- ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
- ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
- if (ret)
- return ret;
- }
-
- mutex_lock(&f->sem);
pg = grab_cache_page_write_begin(mapping, index, flags);
- if (!pg) {
- if (alloc_len)
- jffs2_complete_reservation(c);
- mutex_unlock(&f->sem);
+ if (!pg)
return -ENOMEM;
- }
*pagep = pg;
- if (alloc_len) {
+ jffs2_dbg(1, "%s()\n", __func__);
+
+ if (pageofs > inode->i_size) {
/* Make new hole frag from old EOF to new page */
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+ struct jffs2_raw_inode ri;
struct jffs2_full_dnode *fn;
+ uint32_t alloc_len;
jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
(unsigned int)inode->i_size, pageofs);
+ ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+ ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+ if (ret)
+ goto out_page;
+
+ mutex_lock(&f->sem);
memset(&ri, 0, sizeof(ri));
ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -196,6 +190,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
if (IS_ERR(fn)) {
ret = PTR_ERR(fn);
jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
goto out_page;
}
ret = jffs2_add_full_dnode_to_inode(c, f, fn);
@@ -210,10 +205,12 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
jffs2_mark_node_obsolete(c, fn->raw);
jffs2_free_full_dnode(fn);
jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
goto out_page;
}
jffs2_complete_reservation(c);
inode->i_size = pageofs;
+ mutex_unlock(&f->sem);
}
/*
@@ -222,18 +219,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
* case of a short-copy.
*/
if (!PageUptodate(pg)) {
+ mutex_lock(&f->sem);
ret = jffs2_do_readpage_nolock(inode, pg);
+ mutex_unlock(&f->sem);
if (ret)
goto out_page;
}
- mutex_unlock(&f->sem);
jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
return ret;
out_page:
unlock_page(pg);
page_cache_release(pg);
- mutex_unlock(&f->sem);
return ret;
}
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 5a2dec2b064c..95d5880a63ee 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1296,14 +1296,17 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
BUG_ON(start > orig_start);
}
- /* First, use readpage() to read the appropriate page into the page cache */
- /* Q: What happens if we actually try to GC the _same_ page for which commit_write()
- * triggered garbage collection in the first place?
- * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the
- * page OK. We'll actually write it out again in commit_write, which is a little
- * suboptimal, but at least we're correct.
- */
+ /* The rules state that we must obtain the page lock *before* f->sem, so
+ * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's
+ * actually going to *change* so we're safe; we only allow reading.
+ *
+ * It is important to note that jffs2_write_begin() will ensure that its
+ * page is marked Uptodate before allocating space. That means that if we
+ * end up here trying to GC the *same* page that jffs2_write_begin() is
+ * trying to write out, read_cache_page() will not deadlock. */
+ mutex_unlock(&f->sem);
pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg);
+ mutex_lock(&f->sem);
if (IS_ERR(pg_ptr)) {
pr_warn("read_cache_page() returned error: %ld\n",
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index fa35ff79ab35..0637271f3770 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -194,6 +194,7 @@ struct jffs2_inode_cache {
#define INO_STATE_CLEARING 6 /* In clear_inode() */
#define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */
+#define INO_FLAGS_IS_DIR 0x02 /* is a directory */
#define RAWNODE_CLASS_INODE_CACHE 0
#define RAWNODE_CLASS_XATTR_DATUM 1
@@ -249,7 +250,10 @@ struct jffs2_readinode_info
struct jffs2_full_dirent
{
- struct jffs2_raw_node_ref *raw;
+ union {
+ struct jffs2_raw_node_ref *raw;
+ struct jffs2_inode_cache *ic; /* Just during part of build */
+ };
struct jffs2_full_dirent *next;
uint32_t version;
uint32_t ino; /* == zero for unlink */
diff --git a/fs/mpage.c b/fs/mpage.c
index 1480d3a18037..6bd9fd90964e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -24,6 +24,7 @@
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
+#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
@@ -366,7 +367,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
map_bh.b_state = 0;
map_bh.b_size = 0;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = list_entry(pages->prev, struct page, lru);
+ struct page *page = lru_to_page(pages);
prefetchw(&page->flags);
list_del(&page->lru);
diff --git a/fs/namei.c b/fs/namei.c
index f624d132e01e..9c590e0f66e9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1712,6 +1712,11 @@ static inline int should_follow_link(struct nameidata *nd, struct path *link,
return 0;
if (!follow)
return 0;
+ /* make sure that d_is_symlink above matches inode */
+ if (nd->flags & LOOKUP_RCU) {
+ if (read_seqcount_retry(&link->dentry->d_seq, seq))
+ return -ECHILD;
+ }
return pick_link(nd, link, inode, seq);
}
@@ -1743,11 +1748,11 @@ static int walk_component(struct nameidata *nd, int flags)
if (err < 0)
return err;
- inode = d_backing_inode(path.dentry);
seq = 0; /* we are already out of RCU mode */
err = -ENOENT;
if (d_is_negative(path.dentry))
goto out_path_put;
+ inode = d_backing_inode(path.dentry);
}
if (flags & WALK_PUT)
@@ -3192,12 +3197,12 @@ retry_lookup:
return error;
BUG_ON(nd->flags & LOOKUP_RCU);
- inode = d_backing_inode(path.dentry);
seq = 0; /* out of RCU mode, so the value doesn't matter */
if (unlikely(d_is_negative(path.dentry))) {
path_to_nameidata(&path, nd);
return -ENOENT;
}
+ inode = d_backing_inode(path.dentry);
finish_lookup:
if (nd->depth)
put_link(nd);
@@ -3206,11 +3211,6 @@ finish_lookup:
if (unlikely(error))
return error;
- if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
- path_to_nameidata(&path, nd);
- return -ELOOP;
- }
-
if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
path_to_nameidata(&path, nd);
} else {
@@ -3229,6 +3229,10 @@ finish_open:
return error;
}
audit_inode(nd->name, nd->path.dentry, 0);
+ if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
+ error = -ELOOP;
+ goto out;
+ }
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
@@ -3273,6 +3277,10 @@ opened:
goto exit_fput;
}
out:
+ if (unlikely(error > 0)) {
+ WARN_ON(1);
+ error = -EINVAL;
+ }
if (got_write)
mnt_drop_write(nd->path.mnt);
path_put(&save_parent);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 26c2de2de13f..b7f8eaeea5d8 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
d_rehash(newdent);
} else {
spin_lock(&dentry->d_lock);
- NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+ NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
spin_unlock(&dentry->d_lock);
}
} else {
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index c59a59c37f3d..35ab51c04814 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -476,6 +476,7 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
for (i = 0; i < nr_pages; i++)
put_page(arg->layoutupdate_pages[i]);
+ vfree(arg->start_p);
kfree(arg->layoutupdate_pages);
} else {
put_page(arg->layoutupdate_page);
@@ -559,10 +560,15 @@ retry:
if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
void *p = start_p, *end = p + arg->layoutupdate_len;
+ struct page *page = NULL;
int i = 0;
- for ( ; p < end; p += PAGE_SIZE)
- arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+ arg->start_p = start_p;
+ for ( ; p < end; p += PAGE_SIZE) {
+ page = vmalloc_to_page(p);
+ arg->layoutupdate_pages[i++] = page;
+ get_page(page);
+ }
}
dprintk("%s found %zu ranges\n", __func__, count);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 5bcd92d50e82..0cb1abd535e3 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1215,7 +1215,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
hdr->pgio_mirror_idx + 1,
&hdr->pgio_mirror_idx))
goto out_eagain;
- set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED,
&hdr->lseg->pls_layout->plh_flags);
pnfs_read_resend_pnfs(hdr);
return task->tk_status;
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 29898a9550fa..eb370460ce20 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -412,7 +412,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
OP_ILLEGAL, GFP_NOIO);
if (!fail_return) {
if (ff_layout_has_available_ds(lseg))
- set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED,
&lseg->pls_layout->plh_flags);
else
pnfs_error_mark_layout_for_return(ino, lseg);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index bd25dc7077f7..dff83460e5a6 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -16,29 +16,8 @@
#define NFSDBG_FACILITY NFSDBG_PROC
-static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
- fmode_t fmode)
-{
- struct nfs_open_context *open;
- struct nfs_lock_context *lock;
- int ret;
-
- open = get_nfs_open_context(nfs_file_open_context(file));
- lock = nfs_get_lock_context(open);
- if (IS_ERR(lock)) {
- put_nfs_open_context(open);
- return PTR_ERR(lock);
- }
-
- ret = nfs4_set_rw_stateid(dst, open, lock, fmode);
-
- nfs_put_lock_context(lock);
- put_nfs_open_context(open);
- return ret;
-}
-
static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
- loff_t offset, loff_t len)
+ struct nfs_lock_context *lock, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(filep);
struct nfs_server *server = NFS_SERVER(inode);
@@ -56,7 +35,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
msg->rpc_argp = &args;
msg->rpc_resp = &res;
- status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE);
+ status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context,
+ lock, FMODE_WRITE);
if (status)
return status;
@@ -78,15 +58,26 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
{
struct nfs_server *server = NFS_SERVER(file_inode(filep));
struct nfs4_exception exception = { };
+ struct nfs_lock_context *lock;
int err;
+ lock = nfs_get_lock_context(nfs_file_open_context(filep));
+ if (IS_ERR(lock))
+ return PTR_ERR(lock);
+
+ exception.inode = file_inode(filep);
+ exception.state = lock->open_context->state;
+
do {
- err = _nfs42_proc_fallocate(msg, filep, offset, len);
- if (err == -ENOTSUPP)
- return -EOPNOTSUPP;
+ err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
err = nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
+ nfs_put_lock_context(lock);
return err;
}
@@ -135,7 +126,8 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
return err;
}
-static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+static loff_t _nfs42_proc_llseek(struct file *filep,
+ struct nfs_lock_context *lock, loff_t offset, int whence)
{
struct inode *inode = file_inode(filep);
struct nfs42_seek_args args = {
@@ -156,7 +148,8 @@ static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
if (!nfs_server_capable(inode, NFS_CAP_SEEK))
return -ENOTSUPP;
- status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
+ status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context,
+ lock, FMODE_READ);
if (status)
return status;
@@ -175,17 +168,28 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
struct nfs_server *server = NFS_SERVER(file_inode(filep));
struct nfs4_exception exception = { };
+ struct nfs_lock_context *lock;
loff_t err;
+ lock = nfs_get_lock_context(nfs_file_open_context(filep));
+ if (IS_ERR(lock))
+ return PTR_ERR(lock);
+
+ exception.inode = file_inode(filep);
+ exception.state = lock->open_context->state;
+
do {
- err = _nfs42_proc_llseek(filep, offset, whence);
+ err = _nfs42_proc_llseek(filep, lock, offset, whence);
if (err >= 0)
break;
- if (err == -ENOTSUPP)
- return -EOPNOTSUPP;
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
err = nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
+ nfs_put_lock_context(lock);
return err;
}
@@ -298,8 +302,9 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
}
static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
- struct file *dst_f, loff_t src_offset,
- loff_t dst_offset, loff_t count)
+ struct file *dst_f, struct nfs_lock_context *src_lock,
+ struct nfs_lock_context *dst_lock, loff_t src_offset,
+ loff_t dst_offset, loff_t count)
{
struct inode *src_inode = file_inode(src_f);
struct inode *dst_inode = file_inode(dst_f);
@@ -320,11 +325,13 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
msg->rpc_argp = &args;
msg->rpc_resp = &res;
- status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ);
+ status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+ src_lock, FMODE_READ);
if (status)
return status;
- status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE);
+ status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+ dst_lock, FMODE_WRITE);
if (status)
return status;
@@ -349,22 +356,48 @@ int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
};
struct inode *inode = file_inode(src_f);
struct nfs_server *server = NFS_SERVER(file_inode(src_f));
- struct nfs4_exception exception = { };
- int err;
+ struct nfs_lock_context *src_lock;
+ struct nfs_lock_context *dst_lock;
+ struct nfs4_exception src_exception = { };
+ struct nfs4_exception dst_exception = { };
+ int err, err2;
if (!nfs_server_capable(inode, NFS_CAP_CLONE))
return -EOPNOTSUPP;
+ src_lock = nfs_get_lock_context(nfs_file_open_context(src_f));
+ if (IS_ERR(src_lock))
+ return PTR_ERR(src_lock);
+
+ src_exception.inode = file_inode(src_f);
+ src_exception.state = src_lock->open_context->state;
+
+ dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f));
+ if (IS_ERR(dst_lock)) {
+ err = PTR_ERR(dst_lock);
+ goto out_put_src_lock;
+ }
+
+ dst_exception.inode = file_inode(dst_f);
+ dst_exception.state = dst_lock->open_context->state;
+
do {
- err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset,
- dst_offset, count);
+ err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock,
+ src_offset, dst_offset, count);
if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ break;
}
- err = nfs4_handle_exception(server, err, &exception);
- } while (exception.retry);
- return err;
+ err2 = nfs4_handle_exception(server, err, &src_exception);
+ err = nfs4_handle_exception(server, err, &dst_exception);
+ if (!err)
+ err = err2;
+ } while (src_exception.retry || dst_exception.retry);
+ nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+ nfs_put_lock_context(src_lock);
+ return err;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4bfc33ad0563..14881594dd07 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2466,9 +2466,9 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
dentry = d_add_unique(dentry, igrab(state->inode));
if (dentry == NULL) {
dentry = opendata->dentry;
- } else if (dentry != ctx->dentry) {
+ } else {
dput(ctx->dentry);
- ctx->dentry = dget(dentry);
+ ctx->dentry = dentry;
}
nfs_set_verifier(dentry,
nfs_save_change_attribute(d_inode(opendata->dir)));
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3592cc34a20..2fa483e6dbe2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -52,9 +52,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
*/
static LIST_HEAD(pnfs_modules_tbl);
-static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
- enum pnfs_iomode iomode, bool sync);
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
@@ -243,6 +241,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
struct inode *inode = lo->plh_inode;
+ pnfs_layoutreturn_before_put_layout_hdr(lo);
+
if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
if (!list_empty(&lo->plh_segs))
WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
@@ -252,6 +252,27 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
}
}
+/*
+ * Mark a pnfs_layout_hdr and all associated layout segments as invalid
+ *
+ * In order to continue using the pnfs_layout_hdr, a full recovery
+ * is required.
+ * Note that caller must hold inode->i_lock.
+ */
+static int
+pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list)
+{
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range);
+}
+
static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
@@ -345,58 +366,6 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}
-/* Return true if layoutreturn is needed */
-static bool
-pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
- struct pnfs_layout_segment *lseg)
-{
- struct pnfs_layout_segment *s;
-
- if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
- return false;
-
- list_for_each_entry(s, &lo->plh_segs, pls_list)
- if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
- return false;
-
- return true;
-}
-
-static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
-{
- if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
- return false;
- lo->plh_return_iomode = 0;
- pnfs_get_layout_hdr(lo);
- clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
- return true;
-}
-
-static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
- struct pnfs_layout_hdr *lo, struct inode *inode)
-{
- lo = lseg->pls_layout;
- inode = lo->plh_inode;
-
- spin_lock(&inode->i_lock);
- if (pnfs_layout_need_return(lo, lseg)) {
- nfs4_stateid stateid;
- enum pnfs_iomode iomode;
- bool send;
-
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
- iomode = lo->plh_return_iomode;
- send = pnfs_prepare_layoutreturn(lo);
- spin_unlock(&inode->i_lock);
- if (send) {
- /* Send an async layoutreturn so we dont deadlock */
- pnfs_send_layoutreturn(lo, &stateid, iomode, false);
- }
- } else
- spin_unlock(&inode->i_lock);
-}
-
void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
@@ -410,15 +379,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
atomic_read(&lseg->pls_refcount),
test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
- /* Handle the case where refcount != 1 */
- if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
- return;
-
lo = lseg->pls_layout;
inode = lo->plh_inode;
- /* Do we need a layoutreturn? */
- if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
- pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
@@ -613,9 +575,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
spin_lock(&nfsi->vfs_inode.i_lock);
lo = nfsi->layout;
if (lo) {
- lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
- pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
pnfs_get_layout_hdr(lo);
+ pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
spin_unlock(&nfsi->vfs_inode.i_lock);
@@ -676,11 +637,6 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
{
struct pnfs_layout_hdr *lo;
struct inode *inode;
- struct pnfs_layout_range range = {
- .iomode = IOMODE_ANY,
- .offset = 0,
- .length = NFS4_MAX_UINT64,
- };
LIST_HEAD(lseg_list);
int ret = 0;
@@ -695,11 +651,11 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
spin_lock(&inode->i_lock);
list_del_init(&lo->plh_bulk_destroy);
- lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
- if (is_bulk_recall)
- set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
+ if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
+ if (is_bulk_recall)
+ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
ret = -EAGAIN;
+ }
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&lseg_list);
/* Free all lsegs that are attached to commit buckets */
@@ -937,6 +893,17 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}
+static bool
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+{
+ if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ return false;
+ lo->plh_return_iomode = 0;
+ pnfs_get_layout_hdr(lo);
+ clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ return true;
+}
+
static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync)
@@ -971,6 +938,48 @@ out:
return status;
}
+/* Return true if layoutreturn is needed */
+static bool
+pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_layout_segment *s;
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ return false;
+
+ /* Defer layoutreturn until all lsegs are done */
+ list_for_each_entry(s, &lo->plh_segs, pls_list) {
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
+ return false;
+ }
+
+ return true;
+}
+
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode= lo->plh_inode;
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ return;
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_need_return(lo)) {
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+ bool send;
+
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ iomode = lo->plh_return_iomode;
+ send = pnfs_prepare_layoutreturn(lo);
+ spin_unlock(&inode->i_lock);
+ if (send) {
+ /* Send an async layoutreturn so we dont deadlock */
+ pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ }
+ } else
+ spin_unlock(&inode->i_lock);
+}
+
/*
* Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
* when the layout segment list is empty.
@@ -1091,7 +1100,7 @@ bool pnfs_roc(struct inode *ino)
nfs4_stateid_copy(&stateid, &lo->plh_stateid);
/* always send layoutreturn if being marked so */
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
&lo->plh_flags))
layoutreturn = pnfs_prepare_layoutreturn(lo);
@@ -1744,8 +1753,19 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
if (lo->plh_return_iomode != 0)
iomode = IOMODE_ANY;
lo->plh_return_iomode = iomode;
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
}
+/**
+ * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
+ * @lo: pointer to layout header
+ * @tmp_list: list header to be used with pnfs_free_lseg_list()
+ * @return_range: describe layout segment ranges to be returned
+ *
+ * This function is mainly intended for use by layoutrecall. It attempts
+ * to free the layout segment immediately, or else to mark it for return
+ * as soon as its reference count drops to zero.
+ */
int
pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
@@ -1768,12 +1788,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg, lseg->pls_range.iomode,
lseg->pls_range.offset,
lseg->pls_range.length);
+ if (mark_lseg_invalid(lseg, tmp_list))
+ continue;
+ remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
pnfs_set_plh_return_iomode(lo, return_range->iomode);
- if (!mark_lseg_invalid(lseg, tmp_list))
- remaining++;
- set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags);
}
return remaining;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9f4e2a47f4aa..1ac1db5f6dad 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,8 +94,8 @@ enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
- NFS_LAYOUT_RETURN, /* Return this layout ASAP */
- NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
+ NFS_LAYOUT_RETURN, /* layoutreturn in progress */
+ NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
};
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index cfcbf114676e..7115c5d7d373 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,7 +91,14 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
+#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */
+
struct srcu_struct fsnotify_mark_srcu;
+static DEFINE_SPINLOCK(destroy_lock);
+static LIST_HEAD(destroy_list);
+
+static void fsnotify_mark_destroy(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -165,19 +172,10 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
atomic_dec(&group->num_marks);
}
-static void
-fsnotify_mark_free_rcu(struct rcu_head *rcu)
-{
- struct fsnotify_mark *mark;
-
- mark = container_of(rcu, struct fsnotify_mark, g_rcu);
- fsnotify_put_mark(mark);
-}
-
/*
- * Free fsnotify mark. The freeing is actually happening from a call_srcu
- * callback. Caller must have a reference to the mark or be protected by
- * fsnotify_mark_srcu.
+ * Free fsnotify mark. The freeing is actually happening from a kthread which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
*/
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
@@ -192,7 +190,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
- call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
/*
* Some groups like to know that marks are being freed. This is a
@@ -388,7 +390,12 @@ err:
spin_unlock(&mark->lock);
- call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
+
return ret;
}
@@ -491,3 +498,21 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
atomic_set(&mark->refcnt, 1);
mark->free_mark = free_mark;
}
+
+static void fsnotify_mark_destroy(struct work_struct *work)
+{
+ struct fsnotify_mark *mark, *next;
+ struct list_head private_destroy_list;
+
+ spin_lock(&destroy_lock);
+ /* exchange the list head */
+ list_replace_init(&destroy_list, &private_destroy_list);
+ spin_unlock(&destroy_lock);
+
+ synchronize_srcu(&fsnotify_mark_srcu);
+
+ list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
+ list_del_init(&mark->g_list);
+ fsnotify_put_mark(mark);
+ }
+}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 794fd1587f34..cda0361e95a4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -956,6 +956,7 @@ clean_orphan:
tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
update_isize, end);
if (tmp_ret < 0) {
+ ocfs2_inode_unlock(inode, 1);
ret = tmp_ret;
mlog_errno(ret);
brelse(di_bh);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a3cc6d2fc896..ef6a2ec494de 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -287,7 +287,6 @@ struct o2hb_bio_wait_ctxt {
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
- unsigned long flags;
struct o2hb_region *reg =
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
@@ -297,14 +296,14 @@ static void o2hb_write_timeout(struct work_struct *work)
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
if (o2hb_global_heartbeat_active()) {
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
failed = bitmap_weight(o2hb_failed_region_bitmap,
O2NM_MAX_REGIONS);
quorum = bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS);
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
quorum, failed);
@@ -1254,15 +1253,15 @@ static const struct file_operations o2hb_debug_fops = {
void o2hb_exit(void)
{
- kfree(o2hb_db_livenodes);
- kfree(o2hb_db_liveregions);
- kfree(o2hb_db_quorumregions);
- kfree(o2hb_db_failedregions);
debugfs_remove(o2hb_debug_failedregions);
debugfs_remove(o2hb_debug_quorumregions);
debugfs_remove(o2hb_debug_liveregions);
debugfs_remove(o2hb_debug_livenodes);
debugfs_remove(o2hb_debug_dir);
+ kfree(o2hb_db_livenodes);
+ kfree(o2hb_db_liveregions);
+ kfree(o2hb_db_quorumregions);
+ kfree(o2hb_db_failedregions);
}
static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
@@ -1438,13 +1437,15 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slots);
- kfree(reg->hr_db_regnum);
- kfree(reg->hr_db_livenodes);
debugfs_remove(reg->hr_debug_livenodes);
debugfs_remove(reg->hr_debug_regnum);
debugfs_remove(reg->hr_debug_elapsed_time);
debugfs_remove(reg->hr_debug_pinned);
debugfs_remove(reg->hr_debug_dir);
+ kfree(reg->hr_db_livenodes);
+ kfree(reg->hr_db_regnum);
+ kfree(reg->hr_debug_elapsed_time);
+ kfree(reg->hr_debug_pinned);
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
@@ -2423,11 +2424,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
int o2hb_check_node_heartbeating_no_sem(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long flags;
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 68c607e63ff6..004f2cbe8f71 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
#define DLM_LOCK_RES_DROPPING_REF 0x00000040
#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
+#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
/* max milliseconds to wait to sync up a network failure with a node death */
#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -451,6 +452,7 @@ enum {
DLM_QUERY_REGION = 519,
DLM_QUERY_NODEINFO = 520,
DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
+ DLM_DEREF_LOCKRES_DONE = 522,
};
struct dlm_reco_node_data
@@ -545,7 +547,7 @@ struct dlm_master_requery
* };
*
* from ../cluster/tcp.h
- * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
+ * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
* (roughly 4080 bytes)
* and sizeof(dlm_migratable_lockres) = 112 bytes
* and sizeof(dlm_migratable_lock) = 16 bytes
@@ -586,7 +588,7 @@ struct dlm_migratable_lockres
/* from above, 128 bytes
* for some undetermined future use */
-#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \
+#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \
DLM_MIG_LOCKRES_MAX_LEN)
struct dlm_create_lock
@@ -782,6 +784,20 @@ struct dlm_deref_lockres
u8 name[O2NM_MAX_NAME_LEN];
};
+enum {
+ DLM_DEREF_RESPONSE_DONE = 0,
+ DLM_DEREF_RESPONSE_INPROG = 1,
+};
+
+struct dlm_deref_lockres_done {
+ u32 pad1;
+ u16 pad2;
+ u8 node_idx;
+ u8 namelen;
+
+ u8 name[O2NM_MAX_NAME_LEN];
+};
+
static inline enum dlm_status
__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
{
@@ -789,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
assert_spin_locked(&res->spinlock);
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
status = DLM_RECOVERING;
else if (res->state & DLM_LOCK_RES_MIGRATING)
status = DLM_MIGRATING;
@@ -968,6 +985,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -1009,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
{
__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING|
DLM_LOCK_RES_MIGRATING));
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2ee7fe747cea..12e064b8be9a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* - Message DLM_QUERY_NODEINFO added to allow online node removes
* New in version 1.2:
* - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
+ * New in version 1.3:
+ * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the
+ * refmap is cleared
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
- .pv_minor = 2,
+ .pv_minor = 3,
};
#define DLM_DOMAIN_BACKOFF_MS 200
@@ -1396,7 +1399,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
unsigned int map_size)
{
int status, tmpstat;
- unsigned int node;
+ int node;
if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
sizeof(unsigned long))) {
@@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
sizeof(struct dlm_exit_domain),
dlm_begin_exit_domain_handler,
dlm, NULL, &dlm->dlm_domain_handlers);
+ if (status)
+ goto bail;
+ status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ sizeof(struct dlm_deref_lockres_done),
+ dlm_deref_lockres_done_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
bail:
if (status)
dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9477d6e1de37..9aed6e202201 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
dlm_print_one_lock_resource(res);
BUG();
}
- return ret;
+ return ret ? ret : r;
}
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
res->lockname.len, res->lockname.name, node);
dlm_print_one_lock_resource(res);
}
- ret = 0;
+ ret = DLM_DEREF_RESPONSE_DONE;
goto done;
}
@@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&dlm->work_lock);
queue_work(dlm->dlm_worker, &dlm->dispatched_work);
- return 0;
+ return DLM_DEREF_RESPONSE_INPROG;
done:
if (res)
@@ -2375,6 +2375,122 @@ done:
return ret;
}
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data)
+{
+ struct dlm_ctxt *dlm = data;
+ struct dlm_deref_lockres_done *deref
+ = (struct dlm_deref_lockres_done *)msg->buf;
+ struct dlm_lock_resource *res = NULL;
+ char *name;
+ unsigned int namelen;
+ int ret = -EINVAL;
+ u8 node;
+ unsigned int hash;
+
+ if (!dlm_grab(dlm))
+ return 0;
+
+ name = deref->name;
+ namelen = deref->namelen;
+ node = deref->node_idx;
+
+ if (namelen > DLM_LOCKID_NAME_MAX) {
+ mlog(ML_ERROR, "Invalid name length!");
+ goto done;
+ }
+ if (deref->node_idx >= O2NM_MAX_NODES) {
+ mlog(ML_ERROR, "Invalid node number: %u\n", node);
+ goto done;
+ }
+
+ hash = dlm_lockid_hash(name, namelen);
+
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
+ if (!res) {
+ spin_unlock(&dlm->spinlock);
+ mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
+ dlm->name, namelen, name);
+ goto done;
+ }
+
+ spin_lock(&res->spinlock);
+ BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
+ if (!list_empty(&res->purge)) {
+ mlog(0, "%s: Removing res %.*s from purgelist\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ list_del_init(&res->purge);
+ dlm_lockres_put(res);
+ dlm->purge_count--;
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ __dlm_unhash_lockres(dlm, res);
+
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
+ /* lockres is not in the hash now. drop the flag and wake up
+ * any processes waiting in dlm_get_lock_resource.
+ */
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+
+ dlm_lockres_put(res);
+
+ spin_unlock(&dlm->spinlock);
+
+done:
+ dlm_put(dlm);
+ return ret;
+}
+
+static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res, u8 node)
+{
+ struct dlm_deref_lockres_done deref;
+ int ret = 0, r;
+ const char *lockname;
+ unsigned int namelen;
+
+ lockname = res->lockname.name;
+ namelen = res->lockname.len;
+ BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+ memset(&deref, 0, sizeof(deref));
+ deref.node_idx = dlm->node_num;
+ deref.namelen = namelen;
+ memcpy(deref.name, lockname, namelen);
+
+ ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ &deref, sizeof(deref), node, &r);
+ if (ret < 0) {
+ mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
+ " to node %u\n", dlm->name, namelen,
+ lockname, ret, node);
+ } else if (r < 0) {
+ /* ignore the error */
+ mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+ dlm->name, namelen, lockname, node, r);
+ dlm_print_one_lock_resource(res);
+ }
+}
+
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
{
struct dlm_ctxt *dlm;
@@ -2395,6 +2511,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
}
spin_unlock(&res->spinlock);
+ dlm_drop_lockres_ref_done(dlm, res, node);
+
if (cleared) {
mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
dlm->name, res->lockname.len, res->lockname.name, node);
@@ -2432,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
return 0;
/* delay migration when the lockres is in RECOCERING state */
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
return 0;
if (res->owner != dlm->node_num)
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index c5bdf02c213b..cd38488a10fc 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1403,12 +1403,24 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
* and RECOVERY flag changed when it completes. */
hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
spin_lock(&dlm->spinlock);
- res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
+ res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
hash);
if (res) {
/* this will get a ref on res */
/* mark it as recovering/migrating and hash it */
spin_lock(&res->spinlock);
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(0, "%s: node is attempting to migrate "
+ "lockres %.*s, but marked as dropping "
+ " ref!\n", dlm->name,
+ mres->lockname_len, mres->lockname);
+ ret = -EINVAL;
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ dlm_lockres_put(res);
+ goto leave;
+ }
+
if (mres->flags & DLM_MRES_RECOVERY) {
res->state |= DLM_LOCK_RES_RECOVERING;
} else {
@@ -2163,6 +2175,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, bucket, hash_node) {
+ if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ }
+
if (!(res->state & DLM_LOCK_RES_RECOVERING))
continue;
@@ -2300,6 +2319,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, freed, dead_node);
__dlm_print_one_lock_resource(res);
}
+ res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
} else if (test_bit(dead_node, res->refmap)) {
mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2367,6 +2387,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
break;
}
}
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ dead_node);
spin_unlock(&res->spinlock);
continue;
}
@@ -2375,14 +2397,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
- mlog(ML_NOTICE, "%s: res %.*s, Skip "
- "recovery as it is being freed\n",
- dlm->name, res->lockname.len,
- res->lockname.name);
- } else
- dlm_move_lockres_to_recovery_list(dlm,
- res);
-
+ mlog(0, "%s:%.*s: owned by "
+ "dead node %u, this node was "
+ "dropping its ref when it died. "
+ "continue, dropping the flag.\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, dead_node);
+ }
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index c5f6c241ecd7..68d239ba0c63 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
return 0;
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
return 0;
/* Another node has this resource with this node as the master */
@@ -202,6 +203,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
dlm->purge_count--;
}
+ if (!master && ret != 0) {
+ mlog(0, "%s: deref %.*s in progress or master goes down\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ spin_unlock(&res->spinlock);
+ return;
+ }
+
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
dlm->name, res->lockname.len, res->lockname.name);
@@ -700,7 +708,8 @@ static int dlm_thread(void *data)
* dirty for a short while. */
BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
- DLM_LOCK_RES_RECOVERING)) {
+ DLM_LOCK_RES_RECOVERING |
+ DLM_LOCK_RES_RECOVERY_WAITING)) {
/* move it to the tail and keep going */
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9581d190f6e1..77ebc2bc1cca 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index faa1365097bc..302854ee0985 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -236,6 +236,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
struct ocfs2_recovery_map *rm = osb->recovery_map;
struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
int i, out = 0;
+ unsigned long flags;
out += snprintf(buf + out, len - out,
"%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -271,14 +272,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
cconn->cc_version.pv_minor);
}
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
out += snprintf(buf + out, len - out,
"%10s => Pid: %d Count: %lu WakeSeq: %lu "
"WorkSeq: %lu\n", "DownCnvt",
(osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
osb->blocked_lock_count, osb->dc_wake_sequence,
osb->dc_work_sequence);
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
spin_lock(&osb->osb_lock);
out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index ed95272d57a6..52f6de5d40a9 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -618,7 +618,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
- d_drop(dentry);
+ if (!err)
+ d_drop(dentry);
inode_unlock(dir);
return err;
@@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
+ /*
+ * Old dentry now lives in different location. Dentries in
+ * lowerstack are stale. We cannot drop them here because
+ * access to them is lockless. This could be only pure upper
+ * or opaque directory - numlower is zero. Or upper non-dir
+ * entry - its pureness is tracked by flag opaque.
+ */
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 49e204560655..a4ff5d0d7db9 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -65,6 +65,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
inode_lock(upperdentry->d_inode);
err = notify_change(upperdentry, attr, NULL);
+ if (!err)
+ ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
inode_unlock(upperdentry->d_inode);
}
ovl_drop_write(dentry);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 8d826bd56b26..619ad4b016d2 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
if (oe->__upperdentry) {
type = __OVL_PATH_UPPER;
- if (oe->numlower) {
- if (S_ISDIR(dentry->d_inode->i_mode))
- type |= __OVL_PATH_MERGE;
- } else if (!oe->opaque) {
+ /*
+ * Non-dir dentry can hold lower dentry from previous
+ * location. Its purity depends only on opaque flag.
+ */
+ if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
+ type |= __OVL_PATH_MERGE;
+ else if (!oe->opaque)
type |= __OVL_PATH_PURE;
- }
} else {
if (oe->numlower > 1)
type |= __OVL_PATH_MERGE;
@@ -341,6 +343,7 @@ static const struct dentry_operations ovl_dentry_operations = {
static const struct dentry_operations ovl_reval_dentry_operations = {
.d_release = ovl_dentry_release,
+ .d_select_inode = ovl_d_select_inode,
.d_revalidate = ovl_dentry_revalidate,
.d_weak_revalidate = ovl_dentry_weak_revalidate,
};
diff --git a/fs/pnode.c b/fs/pnode.c
index 6367e1e435c6..c524fdddc7fb 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -202,6 +202,11 @@ static struct mount *last_dest, *last_source, *dest_master;
static struct mountpoint *mp;
static struct hlist_head *list;
+static inline bool peers(struct mount *m1, struct mount *m2)
+{
+ return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
+}
+
static int propagate_one(struct mount *m)
{
struct mount *child;
@@ -212,7 +217,7 @@ static int propagate_one(struct mount *m)
/* skip if mountpoint isn't covered by it */
if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
return 0;
- if (m->mnt_group_id == last_dest->mnt_group_id) {
+ if (peers(m, last_dest)) {
type = CL_MAKE_SHARED;
} else {
struct mount *n, *p;
@@ -223,7 +228,7 @@ static int propagate_one(struct mount *m)
last_source = last_source->mnt_master;
last_dest = last_source->mnt_parent;
}
- if (n->mnt_group_id != last_dest->mnt_group_id) {
+ if (!peers(n, last_dest)) {
last_source = last_source->mnt_master;
last_dest = last_source->mnt_parent;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 85d16c67c33e..fa95ab2d3674 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -259,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
-static pid_t pid_of_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma, bool is_pid)
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static int is_stack(struct proc_maps_private *priv,
+ struct vm_area_struct *vma, int is_pid)
{
- struct inode *inode = priv->inode;
- struct task_struct *task;
- pid_t ret = 0;
+ int stack = 0;
+
+ if (is_pid) {
+ stack = vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack;
+ } else {
+ struct inode *inode = priv->inode;
+ struct task_struct *task;
- rcu_read_lock();
- task = pid_task(proc_pid(inode), PIDTYPE_PID);
- if (task) {
- task = task_of_stack(task, vma, is_pid);
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task)
- ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+ stack = vma_is_stack_for_task(vma, task);
+ rcu_read_unlock();
}
- rcu_read_unlock();
-
- return ret;
+ return stack;
}
static void
@@ -335,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
name = arch_vma_name(vma);
if (!name) {
- pid_t tid;
-
if (!mm) {
name = "[vdso]";
goto done;
@@ -348,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
- tid = pid_of_stack(priv, vma, is_pid);
- if (tid != 0) {
- /*
- * Thread stack in /proc/PID/task/TID/maps or
- * the main process stack.
- */
- if (!is_pid || (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack)) {
- name = "[stack]";
- } else {
- /* Thread stack in /proc/PID/maps */
- seq_pad(m, ' ');
- seq_printf(m, "[stack:%d]", tid);
- }
- }
+ if (is_stack(priv, vma, is_pid))
+ name = "[stack]";
}
done:
@@ -1552,18 +1543,19 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
+ pte_t huge_pte = huge_ptep_get(pte);
struct numa_maps *md;
struct page *page;
- if (!pte_present(*pte))
+ if (!pte_present(huge_pte))
return 0;
- page = pte_page(*pte);
+ page = pte_page(huge_pte);
if (!page)
return 0;
md = walk->private;
- gather_stats(page, md, pte_dirty(*pte), 1);
+ gather_stats(page, md, pte_dirty(huge_pte), 1);
return 0;
}
@@ -1617,19 +1609,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
seq_file_path(m, file, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_puts(m, " heap");
- } else {
- pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
- if (tid != 0) {
- /*
- * Thread stack in /proc/PID/task/TID/maps or
- * the main process stack.
- */
- if (!is_pid || (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack))
- seq_puts(m, " stack");
- else
- seq_printf(m, " stack:%d", tid);
- }
+ } else if (is_stack(proc_priv, vma, is_pid)) {
+ seq_puts(m, " stack");
}
if (is_vm_hugetlb_page(vma))
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index e0d64c92e4f6..faacb0c0d857 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static pid_t pid_of_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma, bool is_pid)
+static int is_stack(struct proc_maps_private *priv,
+ struct vm_area_struct *vma, int is_pid)
{
- struct inode *inode = priv->inode;
- struct task_struct *task;
- pid_t ret = 0;
-
- rcu_read_lock();
- task = pid_task(proc_pid(inode), PIDTYPE_PID);
- if (task) {
- task = task_of_stack(task, vma, is_pid);
+ struct mm_struct *mm = vma->vm_mm;
+ int stack = 0;
+
+ if (is_pid) {
+ stack = vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack;
+ } else {
+ struct inode *inode = priv->inode;
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task)
- ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+ stack = vma_is_stack_for_task(vma, task);
+ rcu_read_unlock();
}
- rcu_read_unlock();
-
- return ret;
+ return stack;
}
/*
@@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
if (file) {
seq_pad(m, ' ');
seq_file_path(m, file, "");
- } else if (mm) {
- pid_t tid = pid_of_stack(priv, vma, is_pid);
-
- if (tid != 0) {
- seq_pad(m, ' ');
- /*
- * Thread stack in /proc/PID/task/TID/maps or
- * the main process stack.
- */
- if (!is_pid || (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack))
- seq_printf(m, "[stack]");
- else
- seq_printf(m, "[stack:%d]", tid);
- }
+ } else if (mm && is_stack(priv, vma, is_pid)) {
+ seq_pad(m, ' ');
+ seq_printf(m, "[stack]");
}
seq_putc(m, '\n');
diff --git a/fs/read_write.c b/fs/read_write.c
index 324ec271cc4e..dadf24e5c95b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -17,6 +17,7 @@
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
+#include <linux/fs.h>
#include "internal.h"
#include <asm/uaccess.h>
@@ -183,7 +184,7 @@ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case SEEK_SET: case SEEK_CUR:
return generic_file_llseek_size(file, offset, whence,
- ~0ULL, 0);
+ OFFSET_MAX, 0);
default:
return -EINVAL;
}
@@ -1532,10 +1533,12 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
- (file_out->f_flags & O_APPEND) ||
- !file_in->f_op->clone_file_range)
+ (file_out->f_flags & O_APPEND))
return -EBADF;
+ if (!file_in->f_op->clone_file_range)
+ return -EOPNOTSUPP;
+
ret = clone_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;
diff --git a/fs/super.c b/fs/super.c
index 1182af8fd5ff..74914b1bae70 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
sb->s_flags &= ~MS_ACTIVE;
fsnotify_unmount_inodes(sb);
+ cgroup_writeback_umount();
evict_inodes(sb);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b94fa6c3c6eb..053818dd6c18 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -153,7 +153,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
if (isalarm(ctx))
remaining = alarm_expires_remaining(&ctx->t.alarm);
else
- remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+ remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr);
return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 50311703135b..66cdb44616d5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -287,6 +287,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
goto out;
/*
+ * We don't do userfault handling for the final child pid update.
+ */
+ if (current->flags & PF_EXITING)
+ goto out;
+
+ /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
diff --git a/fs/xattr.c b/fs/xattr.c
index 07d0e47f6a7f..4861322e28e8 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -940,7 +940,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
bool trusted = capable(CAP_SYS_ADMIN);
struct simple_xattr *xattr;
ssize_t remaining_size = size;
- int err;
+ int err = 0;
#ifdef CONFIG_FS_POSIX_ACL
if (inode->i_acl) {
@@ -965,11 +965,11 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
err = xattr_list_one(&buffer, &remaining_size, xattr->name);
if (err)
- return err;
+ break;
}
spin_unlock(&xattrs->lock);
- return size - remaining_size;
+ return err ? err : size - remaining_size;
}
/*
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 379c089fb051..5c57b7b40728 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -55,7 +55,7 @@ xfs_count_page_state(
} while ((bh = bh->b_this_page) != head);
}
-STATIC struct block_device *
+struct block_device *
xfs_find_bdev_for_inode(
struct inode *inode)
{
@@ -1208,6 +1208,10 @@ xfs_vm_writepages(
struct writeback_control *wbc)
{
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+ if (dax_mapping(mapping))
+ return dax_writeback_mapping_range(mapping,
+ xfs_find_bdev_for_inode(mapping->host), wbc);
+
return generic_writepages(mapping, wbc);
}
@@ -1953,7 +1957,6 @@ xfs_vm_set_page_dirty(
loff_t end_offset;
loff_t offset;
int newly_dirty;
- struct mem_cgroup *memcg;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
@@ -1974,10 +1977,10 @@ xfs_vm_set_page_dirty(
} while (bh != head);
}
/*
- * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
- * per-memcg dirty page counters.
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
*/
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
@@ -1988,13 +1991,13 @@ xfs_vm_set_page_dirty(
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f6ffc9ae5ceb..a4343c63fb38 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -62,5 +62,6 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
extern void xfs_count_page_state(struct page *, int *, int *);
+extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 45ec9e40150c..6c876012b2e5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -75,7 +75,8 @@ xfs_zero_extent(
ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
if (IS_DAX(VFS_I(ip)))
- return dax_clear_blocks(VFS_I(ip), block, size);
+ return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
+ sector, size);
/*
* let the block layer decide on the fastest method of
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index da37beb76f6e..be5568839442 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1109,27 +1109,10 @@ xlog_verify_head(
bool tmp_wrapped;
/*
- * Search backwards through the log looking for the log record header
- * block. This wraps all the way back around to the head so something is
- * seriously wrong if we can't find it.
- */
- found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk,
- rhead, wrapped);
- if (found < 0)
- return found;
- if (!found) {
- xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
- return -EIO;
- }
-
- *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
-
- /*
- * Now that we have a tail block, check the head of the log for torn
- * writes. Search again until we hit the tail or the maximum number of
- * log record I/Os that could have been in flight at one time. Use a
- * temporary buffer so we don't trash the rhead/bp pointer from the
- * call above.
+ * Check the head of the log for torn writes. Search backwards from the
+ * head until we hit the tail or the maximum number of log record I/Os
+ * that could have been in flight at one time. Use a temporary buffer so
+ * we don't trash the rhead/bp pointers from the caller.
*/
tmp_bp = xlog_get_bp(log, 1);
if (!tmp_bp)
@@ -1216,6 +1199,115 @@ xlog_verify_head(
}
/*
+ * Check whether the head of the log points to an unmount record. In other
+ * words, determine whether the log is clean. If so, update the in-core state
+ * appropriately.
+ */
+static int
+xlog_check_unmount_rec(
+ struct xlog *log,
+ xfs_daddr_t *head_blk,
+ xfs_daddr_t *tail_blk,
+ struct xlog_rec_header *rhead,
+ xfs_daddr_t rhead_blk,
+ struct xfs_buf *bp,
+ bool *clean)
+{
+ struct xlog_op_header *op_head;
+ xfs_daddr_t umount_data_blk;
+ xfs_daddr_t after_umount_blk;
+ int hblks;
+ int error;
+ char *offset;
+
+ *clean = false;
+
+ /*
+ * Look for unmount record. If we find it, then we know there was a
+ * clean unmount. Since 'i' could be the last block in the physical
+ * log, we convert to a log block before comparing to the head_blk.
+ *
+ * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
+ * below. We won't want to clear the unmount record if there is one, so
+ * we pass the lsn of the unmount record rather than the block after it.
+ */
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ int h_size = be32_to_cpu(rhead->h_size);
+ int h_version = be32_to_cpu(rhead->h_version);
+
+ if ((h_version & XLOG_VERSION_2) &&
+ (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+ hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+ if (h_size % XLOG_HEADER_CYCLE_SIZE)
+ hblks++;
+ } else {
+ hblks = 1;
+ }
+ } else {
+ hblks = 1;
+ }
+ after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+ after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
+ if (*head_blk == after_umount_blk &&
+ be32_to_cpu(rhead->h_num_logops) == 1) {
+ umount_data_blk = rhead_blk + hblks;
+ umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
+ error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+ if (error)
+ return error;
+
+ op_head = (struct xlog_op_header *)offset;
+ if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
+ /*
+ * Set tail and last sync so that newly written log
+ * records will point recovery to after the current
+ * unmount record.
+ */
+ xlog_assign_atomic_lsn(&log->l_tail_lsn,
+ log->l_curr_cycle, after_umount_blk);
+ xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+ log->l_curr_cycle, after_umount_blk);
+ *tail_blk = after_umount_blk;
+
+ *clean = true;
+ }
+ }
+
+ return 0;
+}
+
+static void
+xlog_set_state(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ struct xlog_rec_header *rhead,
+ xfs_daddr_t rhead_blk,
+ bool bump_cycle)
+{
+ /*
+ * Reset log values according to the state of the log when we
+ * crashed. In the case where head_blk == 0, we bump curr_cycle
+ * one because the next write starts a new cycle rather than
+ * continuing the cycle of the last good log record. At this
+ * point we have guaranteed that all partial log records have been
+ * accounted for. Therefore, we know that the last good log record
+ * written was complete and ended exactly on the end boundary
+ * of the physical log.
+ */
+ log->l_prev_block = rhead_blk;
+ log->l_curr_block = (int)head_blk;
+ log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
+ if (bump_cycle)
+ log->l_curr_cycle++;
+ atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+ atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+ xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+ xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+}
+
+/*
* Find the sync block number or the tail of the log.
*
* This will be the block number of the last record to have its
@@ -1238,22 +1330,20 @@ xlog_find_tail(
xfs_daddr_t *tail_blk)
{
xlog_rec_header_t *rhead;
- xlog_op_header_t *op_head;
char *offset = NULL;
xfs_buf_t *bp;
int error;
- xfs_daddr_t umount_data_blk;
- xfs_daddr_t after_umount_blk;
xfs_daddr_t rhead_blk;
xfs_lsn_t tail_lsn;
- int hblks;
bool wrapped = false;
+ bool clean = false;
/*
* Find previous log record
*/
if ((error = xlog_find_head(log, head_blk)))
return error;
+ ASSERT(*head_blk < INT_MAX);
bp = xlog_get_bp(log, 1);
if (!bp)
@@ -1271,100 +1361,75 @@ xlog_find_tail(
}
/*
- * Trim the head block back to skip over torn records. We can have
- * multiple log I/Os in flight at any time, so we assume CRC failures
- * back through the previous several records are torn writes and skip
- * them.
+ * Search backwards through the log looking for the log record header
+ * block. This wraps all the way back around to the head so something is
+ * seriously wrong if we can't find it.
*/
- ASSERT(*head_blk < INT_MAX);
- error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
- &rhead, &wrapped);
- if (error)
- goto done;
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+ &rhead_blk, &rhead, &wrapped);
+ if (error < 0)
+ return error;
+ if (!error) {
+ xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ return -EIO;
+ }
+ *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
/*
- * Reset log values according to the state of the log when we
- * crashed. In the case where head_blk == 0, we bump curr_cycle
- * one because the next write starts a new cycle rather than
- * continuing the cycle of the last good log record. At this
- * point we have guaranteed that all partial log records have been
- * accounted for. Therefore, we know that the last good log record
- * written was complete and ended exactly on the end boundary
- * of the physical log.
+ * Set the log state based on the current head record.
*/
- log->l_prev_block = rhead_blk;
- log->l_curr_block = (int)*head_blk;
- log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
- if (wrapped)
- log->l_curr_cycle++;
- atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
- atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
- xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
- BBTOB(log->l_curr_block));
- xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
- BBTOB(log->l_curr_block));
+ xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
/*
- * Look for unmount record. If we find it, then we know there
- * was a clean unmount. Since 'i' could be the last block in
- * the physical log, we convert to a log block before comparing
- * to the head_blk.
+ * Look for an unmount record at the head of the log. This sets the log
+ * state to determine whether recovery is necessary.
+ */
+ error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
+ rhead_blk, bp, &clean);
+ if (error)
+ goto done;
+
+ /*
+ * Verify the log head if the log is not clean (e.g., we have anything
+ * but an unmount record at the head). This uses CRC verification to
+ * detect and trim torn writes. If discovered, CRC failures are
+ * considered torn writes and the log head is trimmed accordingly.
*
- * Save the current tail lsn to use to pass to
- * xlog_clear_stale_blocks() below. We won't want to clear the
- * unmount record if there is one, so we pass the lsn of the
- * unmount record rather than the block after it.
+ * Note that we can only run CRC verification when the log is dirty
+ * because there's no guarantee that the log data behind an unmount
+ * record is compatible with the current architecture.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- int h_size = be32_to_cpu(rhead->h_size);
- int h_version = be32_to_cpu(rhead->h_version);
+ if (!clean) {
+ xfs_daddr_t orig_head = *head_blk;
- if ((h_version & XLOG_VERSION_2) &&
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- hblks++;
- } else {
- hblks = 1;
- }
- } else {
- hblks = 1;
- }
- after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
- after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
- tail_lsn = atomic64_read(&log->l_tail_lsn);
- if (*head_blk == after_umount_blk &&
- be32_to_cpu(rhead->h_num_logops) == 1) {
- umount_data_blk = rhead_blk + hblks;
- umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
- error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+ error = xlog_verify_head(log, head_blk, tail_blk, bp,
+ &rhead_blk, &rhead, &wrapped);
if (error)
goto done;
- op_head = (xlog_op_header_t *)offset;
- if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
- /*
- * Set tail and last sync so that newly written
- * log records will point recovery to after the
- * current unmount record.
- */
- xlog_assign_atomic_lsn(&log->l_tail_lsn,
- log->l_curr_cycle, after_umount_blk);
- xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
- log->l_curr_cycle, after_umount_blk);
- *tail_blk = after_umount_blk;
-
- /*
- * Note that the unmount was clean. If the unmount
- * was not clean, we need to know this to rebuild the
- * superblock counters from the perag headers if we
- * have a filesystem using non-persistent counters.
- */
- log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+ /* update in-core state again if the head changed */
+ if (*head_blk != orig_head) {
+ xlog_set_state(log, *head_blk, rhead, rhead_blk,
+ wrapped);
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
+ error = xlog_check_unmount_rec(log, head_blk, tail_blk,
+ rhead, rhead_blk, bp,
+ &clean);
+ if (error)
+ goto done;
}
}
/*
+ * Note that the unmount was clean. If the unmount was not clean, we
+ * need to know this to rebuild the superblock counters from the perag
+ * headers if we have a filesystem using non-persistent counters.
+ */
+ if (clean)
+ log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+
+ /*
* Make sure that there are no blocks in front of the head
* with the same cycle number as the head. This can happen
* because we allow multiple outstanding log writes concurrently,
@@ -4491,7 +4556,7 @@ xlog_recover_process(
* know precisely what failed.
*/
if (pass == XLOG_RECOVER_CRCPASS) {
- if (rhead->h_crc && crc != le32_to_cpu(rhead->h_crc))
+ if (rhead->h_crc && crc != rhead->h_crc)
return -EFSBADCRC;
return 0;
}
@@ -4502,7 +4567,7 @@ xlog_recover_process(
* zero CRC check prevents warnings from being emitted when upgrading
* the kernel from one that does not add CRCs by default.
*/
- if (crc != le32_to_cpu(rhead->h_crc)) {
+ if (crc != rhead->h_crc) {
if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",